Commit 144d790b authored by fenglei.tian

add cuda kernel

parent e63322d9
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------
#include <algorithm>
#include <map>
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
namespace cuda
{
namespace kernel
{
void emit_abs(float* in, float* out, size_t count)
{
}
void emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes)
{
}
//
// For the reference kernel this is based on, see ngraph/runtime/kernel/concat.hpp.
//
void emit_concat(codegen::CodeWriter& writer,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
size_t concatenation_axis)
{
}
void emit_replace_slice(
codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& arg1, // replacement value
const std::string& out,
const Shape& arg1_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides)
{
}
void emit_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides)
{
}
void emit_reshape(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisVector& arg0_axis_order)
{
}
void emit_sum(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& reduction_axes)
{
}
}
}
}
}
}
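Note: all of the kernel emitter bodies above are empty stubs in this commit. For orientation only, here is a minimal sketch of what an elementwise emitter in this style might write through the CodeWriter, using just the operator<< and indent members exercised later in this commit; emit_abs_sketch and the generated loop are illustrative assumptions, not code from this change, and the sketch assumes the includes already pulled in by gpu_kernel_emitters.hpp.

// Hypothetical sketch (not part of this commit): emit a C-style loop for the
// generated function body, computing out[i] = fabs(arg0[i]) over count elements.
void emit_abs_sketch(codegen::CodeWriter& writer,
                     const std::string& arg0,
                     const std::string& out,
                     size_t count)
{
    writer << "for (size_t i = 0; i < " << count << "; i++)\n";
    writer << "{\n";
    writer.indent++;
    writer << out << "[i] = fabs(" << arg0 << "[i]);\n";
    writer.indent--;
    writer << "}\n";
}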
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------
#pragma once
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
namespace kernel
{
void emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes);
void emit_concat(codegen::CodeWriter& writer,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
const size_t concatenation_axis);
void emit_replace_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& arg1, // replacement value
const std::string& out,
const Shape& arg1_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_reshape(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisVector& arg0_axis_order);
void emit_sum(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& reduction_axes);
}
}
}
}
@@ -52,24 +52,24 @@ using namespace ngraph;
#define NVRTC_SAFE_CALL(x) \
do { \
    nvrtcResult result = x; \
    if (result != NVRTC_SUCCESS) { \
        std::cerr << "\nerror: " #x " failed with error " \
                  << nvrtcGetErrorString(result) << '\n'; \
        exit(1); \
    } \
} while(0)
#define CUDA_SAFE_CALL(x) \
do { \
    CUresult result = x; \
    if (result != CUDA_SUCCESS) { \
        const char *msg; \
        cuGetErrorName(result, &msg); \
        std::cerr << "\nerror: " #x " failed with error " \
                  << msg << '\n'; \
        exit(1); \
    } \
} while(0)
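Both macros wrap their bodies in do { ... } while(0) so that an invocation followed by a semicolon parses as a single statement. A small illustration of why that matters (use_gpu is a hypothetical flag, not something defined in this file):

// Without the do/while(0) wrapper, the macro would expand to a brace block
// followed by a stray ';', and the 'else' below would fail to compile.
if (use_gpu)
    CUDA_SAFE_CALL(cuInit(0));
else
    std::cout << "falling back to host execution\n";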
@@ -96,59 +96,59 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
}
})";
size_t numBlocks = 4; size_t numThreads = 4;
// Create an instance of nvrtcProgram with the code string.
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,       // prog
                                   op_abs,      // buffer
                                   "op_abs.cu", // name
                                   0,           // numHeaders
                                   NULL,        // headers
                                   NULL));      // includeNames
const char *opts[] = {"--gpu-architecture=compute_35",
                      "--relocatable-device-code=true"};
nvrtcResult compileResult = nvrtcCompileProgram(prog,  // prog
                                                2,     // numOptions
                                                opts); // options
// Obtain compilation log from the program.
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char *log = new char[logSize];
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
std::cout << log << '\n';
delete[] log;
if (compileResult != NVRTC_SUCCESS) {
    exit(1);
}
size_t ptxSize;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
char *ptx = new char[ptxSize];
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
// Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
// Load the generated PTX and get a handle to the parent kernel.
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction cuda_op_abs_kernel;
CUDA_SAFE_CALL(cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
// CUDA_SAFE_CALL(cuLinkCreate(0, 0, 0, &linkState));
//CUDA_SAFE_CALL(cuLinkAddFile(linkState, CU_JIT_INPUT_LIBRARY, ' ', 0, 0, 0));
//CUDA_SAFE_CALL(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void *)ptx, ptxSize, "dynamic_parallelism.ptx", 0, 0, 0));
//size_t cubinSize;
//void *cubin;
//CUDA_SAFE_CALL(cuLinkComplete(linkState, &cubin, &cubinSize));
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));
writer << "{ // " << n->get_name() << "\n";
@@ -157,32 +157,33 @@ CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));
writer << "if(count == 0) return;\n";
writer << "void *argsList[] = {(void *)" << args[0].get_name() << ", (void *)" << out[0].get_name() << ", &count};\n";
writer << "//cuLaunchKernel(cuda_op_abs_kernel, count, 1, 1, 1, 1, 1, 0, NULL, argsList, 0);\n";
writer.indent--;
writer << "}\n";
// Generate input for execution, and create output buffers.
//size_t nt = numBlocks * numThreads;
//size_t bufferSize = nt * sizeof(float);
//float *hOut = new float[nt];
//float *hIn = new float[nt];
//for(int i = 0; i < nt; i++) hIn[i] = -i;
//
//CUdeviceptr dOut, dIn;
//cuMemAlloc(&dOut, bufferSize);
//cuMemAlloc(&dIn, bufferSize);
//cuMemcpyHtoD(dIn, hIn, bufferSize);
//
// Execute parent kernel.
//void *argst[] = {&dIn, &dOut, &nt};
// CUDA_SAFE_CALL(
//     cuLaunchKernel(kernel,
//                    numBlocks , 1, 1, // grid dim
//                    numThreads, 1, 1, // block dim
//                    0, NULL,          // shared mem and stream
//                    argst, 0));       // arguments
//CUDA_SAFE_CALL(cuCtxSynchronize());
// Retrieve and print output.
//cuMemcpyDtoH(hOut, dOut, bufferSize);
//for (size_t i = 0; i < nt; ++i) { std::cout << hOut[i] << '\n'; }
// Release resources.
//cuMemFree(dOut);
//cuModuleUnload(module);
}
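For reference, the commented-out block above corresponds to a host-side launch of the JIT-compiled kernel. Assembled into one sequence, it would look roughly like the sketch below; this is illustrative only. cuda_op_abs_kernel and module come from the compilation code earlier in EmitAbs, while the element count and buffer sizes here are made-up values.

// Illustrative launch path for the JIT-compiled "cuda_op_abs" kernel.
size_t count = 16;
size_t bufferSize = count * sizeof(float);
float* hIn = new float[count];
float* hOut = new float[count];
for (size_t i = 0; i < count; i++) hIn[i] = -static_cast<float>(i);

CUdeviceptr dIn, dOut;
CUDA_SAFE_CALL(cuMemAlloc(&dIn, bufferSize));
CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize));
CUDA_SAFE_CALL(cuMemcpyHtoD(dIn, hIn, bufferSize));

// One thread per element; kernel arguments are passed by address.
void* argsList[] = {&dIn, &dOut, &count};
CUDA_SAFE_CALL(cuLaunchKernel(cuda_op_abs_kernel,
                              static_cast<unsigned int>(count), 1, 1, // grid dim
                              1, 1, 1,                                // block dim
                              0, NULL,                                // shared mem, stream
                              argsList, NULL));                       // args, extra
CUDA_SAFE_CALL(cuCtxSynchronize());
CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize));

CUDA_SAFE_CALL(cuMemFree(dIn));
CUDA_SAFE_CALL(cuMemFree(dOut));
CUDA_SAFE_CALL(cuModuleUnload(module));
delete[] hIn;
delete[] hOut;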
void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
@@ -242,7 +243,7 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
/*
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
if (arg0_shape.empty() || arg1_shape.empty())
@@ -342,11 +343,11 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
// clang-format on
writer.indent--;
writer << "}\n";
}
else
{
// General ND Call?
}
}
*/
}