Commit 164a3ba8 authored by Chris Sullivan, committed by Scott Cyphers

Upgrade nGraph support to CUDA 9.1 (#1009)

* Output warnings and compiler errors from NVRTC; updated the avg_pool kernel for CUDA 9.

* Added a define guard so the compatibility defines for the deprecated functions only apply if CUDA_VERSION < 9000. Updated the shfl_xor invocation.
parent 8e6b9a20
@@ -483,7 +483,8 @@ size_t runtime::gpu::CUDAEmitter::build_avg_pool(const GPURuntimeContext* ctx,
if (include_pad == false)
{
// count the number of (non-padded) elements
writer << "pool_size += __popc(__ballot(within_tensor_bounds));\n";
writer << "pool_size += __popc(__ballot_sync(0xffffffff, "
"within_tensor_bounds));\n";
}
// this will need to change to k->c once
// feature pooling support is added
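For reference, a minimal standalone sketch (not ngraph's emitted kernel) of the counting idiom this hunk migrates: under CUDA 9, __ballot_sync takes an explicit participation mask (0xffffffff for a full warp) and returns a bitmask of the lanes whose predicate is true, which __popc then counts. The kernel name, launch shape, and output layout below are assumptions for illustration.

    // Illustrative only: count, per warp, how many lanes are "within bounds",
    // using the CUDA 9 *_sync warp intrinsics with a full participation mask.
    __global__ void count_valid_lanes(int n, int* warp_counts)
    {
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        bool within_tensor_bounds = (tid < n);

        // All 32 lanes vote; bit i of the ballot is lane i's predicate.
        unsigned mask = __ballot_sync(0xffffffff, within_tensor_bounds);

        // Population count of the ballot = number of in-bounds lanes in this warp.
        int pool_size = __popc(mask);

        // Lane 0 of each warp records the count (assumes blockDim.x % 32 == 0).
        if ((threadIdx.x & 31) == 0)
            warp_counts[tid / 32] = pool_size;
    }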
@@ -503,7 +504,7 @@ size_t runtime::gpu::CUDAEmitter::build_avg_pool(const GPURuntimeContext* ctx,
writer << "for (int i = 16; i > 0; i >>= 1)\n";
writer.block_begin();
{
writer << "sum += __shfl_xor(sum,i);\n";
writer << "sum += __shfl_xor_sync(0xffffffff,sum,i,32);\n";
}
writer.block_end();
// write result to output
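The loop above is the standard XOR "butterfly" warp reduction: each step exchanges values with the lane whose index differs by i, so after log2(32) = 5 steps every lane holds the warp-wide sum. A hedged standalone sketch with the CUDA 9 __shfl_xor_sync intrinsic (helper and kernel names are illustrative, not ngraph's):

    // Illustrative only: warp-wide sum via the XOR butterfly shuffle pattern.
    __device__ __forceinline__ float warp_reduce_sum(float sum)
    {
        for (int i = 16; i > 0; i >>= 1)
            sum += __shfl_xor_sync(0xffffffff, sum, i, 32);
        return sum;  // every lane now holds the full warp sum
    }

    __global__ void per_warp_sums(const float* in, float* out)
    {
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        float s = warp_reduce_sum(in[tid]);
        if ((threadIdx.x & 31) == 0)   // one write per warp
            out[tid / 32] = s;
    }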
@@ -713,11 +714,19 @@ void runtime::gpu::CUDAEmitter::print_tensor_from_gpu(codegen::CodeWriter& write
std::string runtime::gpu::CUDAEmitter::include_helpers()
{
std::stringstream ss;
#if defined(CUDA_VERSION) && CUDA_VERSION < 9000
ss << R"(
#define __ballot_sync(mask, predicate) __ballot(predicate)
#define __shfl_down_sync(mask, val, delta, width) __shfl_down(val, delta, width)
#define __shfl_xor_sync(mask, val, laneMask, width) __shfl_xor(val, laneMask, width)
)";
#endif
// div64: fast integer division via magic multiplication and shifting
// if the divisor is a power of 2, magic will be 1 and only shifting
// is required (predicate p in div64)
// load: helper to load from constant memory for fast access
std::stringstream ss;
ss << R"(
__device__ __forceinline__ int div64(int value, int magic, int shift)
{
......
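The div64 body is elided here. As a rough, range-restricted illustration of the magic-multiply-and-shift technique the comment describes (not ngraph's actual div64, whose magic and shift are computed elsewhere), the host can precompute magic = ceil(2^32 / d) for a fixed divisor d, and the device then replaces the division with one multiply and one shift; for power-of-two divisors magic degenerates to 1 and only the shift remains. All names and the narrow value range below are assumptions for illustration; production implementations typically keep magic in 32 bits and use __umulhi.

    #include <cstdint>

    // Host side: precompute magic/shift for a fixed divisor d.
    // Deliberately narrow sketch: exact for 0 <= value < 2^16 and 1 <= d < 2^16.
    inline void make_magic(uint32_t d, uint64_t& magic, uint32_t& shift)
    {
        if ((d & (d - 1)) == 0)            // power of two: shift only
        {
            magic = 1;
            shift = 0;
            while ((1u << shift) < d)      // shift = log2(d)
                ++shift;
        }
        else
        {
            magic = ((uint64_t{1} << 32) + d - 1) / d;  // ceil(2^32 / d)
            shift = 32;
        }
    }

    // Device side: value / d == (value * magic) >> shift within the range above.
    __device__ __forceinline__ uint32_t fast_div(uint32_t value, uint64_t magic, uint32_t shift)
    {
        return (magic == 1) ? (value >> shift)
                            : static_cast<uint32_t>((value * magic) >> shift);
    }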
@@ -14,6 +14,7 @@
* limitations under the License.
*******************************************************************************/
#include <cstring>
#include <iostream>
#include <string>
@@ -31,18 +32,39 @@ std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionBuilder::get(const std::st
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
kernel.c_str(),
"op.cu",
"ngraph.cu",
0, // numHeaders
NULL, // headers
NULL)); // includeNames
nvrtcResult compile_result = nvrtcCompileProgram(prog, number_of_options, options);
// output compiler log helper
auto emit_log = [&prog]() {
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char* log = static_cast<char*>(malloc(sizeof(char) * logSize + 1));
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
log[logSize] = '\x0';
if (std::strlen(log) >= 2)
{
std::cerr << log;
}
free(log);
};
// throw if compilation was not successful
if (compile_result != NVRTC_SUCCESS)
{
throw std::runtime_error("compile error: \n" + kernel + "\n options");
std::cerr << "Compile error: \n" + kernel;
// output compiler errors
emit_log();
throw std::runtime_error("NVRTC compilation failure.");
}
// output any compiler warnings
emit_log();
// retrieve the intermediate PTX
size_t ptx_size;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
char* ptx = new char[ptx_size];
@@ -51,6 +73,7 @@ std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionBuilder::get(const std::st
ptx)); // Load the generated PTX and get a handle to the parent kernel.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Destroy the program.
// extract the compiled function
CUmodule module;
CUfunction function;
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
......
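For context, a minimal standalone sketch of the NVRTC flow this file wraps: create a program, compile it, always fetch the log (it carries warnings on success and errors on failure), then retrieve the PTX and load it through the CUDA driver API. The toy kernel, the compute_60 option, and the simplified error handling below are assumptions for illustration, not ngraph's CudaFunctionBuilder; it links against libnvrtc and libcuda.

    #include <cuda.h>
    #include <nvrtc.h>
    #include <cstring>
    #include <iostream>
    #include <vector>

    int main()
    {
        const char* kernel = R"(
    extern "C" __global__ void fill(float* out, float value)
    {
        out[blockIdx.x * blockDim.x + threadIdx.x] = value;
    })";

        nvrtcProgram prog;
        nvrtcCreateProgram(&prog, kernel, "ngraph.cu", 0, nullptr, nullptr);

        const char* options[] = {"--gpu-architecture=compute_60"};
        nvrtcResult compile_result = nvrtcCompileProgram(prog, 1, options);

        // Always fetch the log: warnings on success, errors on failure.
        size_t log_size = 0;
        nvrtcGetProgramLogSize(prog, &log_size);
        std::vector<char> log(log_size + 1, '\0');
        nvrtcGetProgramLog(prog, log.data());
        if (std::strlen(log.data()) >= 2)     // skip effectively empty logs
            std::cerr << log.data();
        if (compile_result != NVRTC_SUCCESS)
            return 1;

        // Retrieve the generated PTX.
        size_t ptx_size = 0;
        nvrtcGetPTXSize(prog, &ptx_size);
        std::vector<char> ptx(ptx_size);
        nvrtcGetPTX(prog, ptx.data());
        nvrtcDestroyProgram(&prog);

        // Load the PTX with the driver API and look up the kernel.
        cuInit(0);
        CUdevice dev;
        cuDeviceGet(&dev, 0);
        CUcontext ctx;
        cuCtxCreate(&ctx, 0, dev);
        CUmodule module;
        cuModuleLoadDataEx(&module, ptx.data(), 0, nullptr, nullptr);
        CUfunction function;
        cuModuleGetFunction(&function, module, "fill");
        std::cout << "compiled and loaded kernel 'fill'\n";
        cuModuleUnload(module);
        cuCtxDestroy(ctx);
        return 0;
    }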
abc_int64
abc_tbb
aliased_output
avg_pool_1d_1channel_1image
avg_pool_1d_1channel_2image
avg_pool_1d_2channel_2image
avg_pool_2d_1channel_1image_padded
avg_pool_2d_1channel_1image_strided
avg_pool_2d_2channel_2image
avg_pool_2d_2channel_2image_padded
avg_pool_2d_2channel_2image_padded_3x3
avg_pool_2d_2channel_2image_padded_3x3_strided
avg_pool_2d_2channel_2image_padded_3x3_strided_uneven
avg_pool_2d_2channel_2image_padded_only_above
avg_pool_2d_2channel_2image_padded_only_below
backwards_avgpool_n1_c1_hw2x2
backwards_avgpool_n1_c1_hw4x4
backwards_avgpool_n2_c2_hw2x2_win_2x2_str_1x1_padding_numeric
backwards_avgpool_n2_c2_hw4x4
backwards_avgpool_n2_c2_hw4x4_numeric
backwards_avgpool_n2_c2_hw4x4_win_2x2_str_1x1_numeric
backwards_broadcast0
backwards_broadcast1
backwards_concat_axis_0
......