Commit 164a3ba8 authored by Chris Sullivan, committed by Scott Cyphers

Upgrade nGraph support to CUDA 9.1 (#1009)

* Output warnings and compiler errors from NVRTC; updated the avg_pool kernel for CUDA 9.

* Added a define guard so the compatibility defines for the deprecated functions only apply if CUDA_VERSION < 9000. Updated the shfl_xor invocation.
parent 8e6b9a20
@@ -483,7 +483,8 @@ size_t runtime::gpu::CUDAEmitter::build_avg_pool(const GPURuntimeContext* ctx,
if (include_pad == false)
{
// count the number of (non-padded) elements
writer << "pool_size += __popc(__ballot(within_tensor_bounds));\n";
writer << "pool_size += __popc(__ballot_sync(0xffffffff, "
"within_tensor_bounds));\n";
}
// this will need to change to k->c once
// feature pooling support is added
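For reference, a minimal standalone sketch (not ngraph's emitted kernel) of the counting idiom this hunk migrates: under CUDA 9, __ballot_sync takes an explicit participation mask (0xffffffff for a full warp) and returns a bitmask of the lanes whose predicate is true, which __popc then counts. The kernel name, launch shape, and output layout below are assumptions for illustration.

    // Illustrative only: count, per warp, how many lanes are "within bounds",
    // using the CUDA 9 *_sync warp intrinsics with a full participation mask.
    __global__ void count_valid_lanes(int n, int* warp_counts)
    {
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        bool within_tensor_bounds = (tid < n);

        // All 32 lanes vote; bit i of the ballot is lane i's predicate.
        unsigned mask = __ballot_sync(0xffffffff, within_tensor_bounds);

        // Population count of the ballot = number of in-bounds lanes in this warp.
        int pool_size = __popc(mask);

        // Lane 0 of each warp records the count (assumes blockDim.x % 32 == 0).
        if ((threadIdx.x & 31) == 0)
            warp_counts[tid / 32] = pool_size;
    }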
@@ -503,7 +504,7 @@ size_t runtime::gpu::CUDAEmitter::build_avg_pool(const GPURuntimeContext* ctx,
writer << "for (int i = 16; i > 0; i >>= 1)\n";
writer.block_begin();
{
writer << "sum += __shfl_xor(sum,i);\n";
writer << "sum += __shfl_xor_sync(0xffffffff,sum,i,32);\n";
}
writer.block_end();
// write result to output
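The loop above is the standard XOR "butterfly" warp reduction: each step exchanges values with the lane whose index differs by i, so after log2(32) = 5 steps every lane holds the warp-wide sum. A hedged standalone sketch with the CUDA 9 __shfl_xor_sync intrinsic (helper and kernel names are illustrative, not ngraph's):

    // Illustrative only: warp-wide sum via the XOR butterfly shuffle pattern.
    __device__ __forceinline__ float warp_reduce_sum(float sum)
    {
        for (int i = 16; i > 0; i >>= 1)
            sum += __shfl_xor_sync(0xffffffff, sum, i, 32);
        return sum;  // every lane now holds the full warp sum
    }

    __global__ void per_warp_sums(const float* in, float* out)
    {
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        float s = warp_reduce_sum(in[tid]);
        if ((threadIdx.x & 31) == 0)   // one write per warp
            out[tid / 32] = s;
    }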
@@ -713,11 +714,19 @@ void runtime::gpu::CUDAEmitter::print_tensor_from_gpu(codegen::CodeWriter& write
std::string runtime::gpu::CUDAEmitter::include_helpers()
{
std::stringstream ss;
#if defined(CUDA_VERSION) && CUDA_VERSION < 9000
ss << R"(
#define __ballot_sync(mask, predicate) __ballot(predicate)
#define __shfl_down_sync(mask, val, delta, width) __shfl_down(val, delta, width)
#define __shfl_xor_sync(mask, val, laneMask, width) __shfl_xor(val, laneMask, width)
)";
#endif
// div64: fast integer division via magic multiplication and shifting
// if the divisor is a power of 2, magic will be 1 and only shifting
// is required (predicate p in div64)
// load: helper to load from constant memory for fast access
std::stringstream ss;
ss << R"(
__device__ __forceinline__ int div64(int value, int magic, int shift)
{
......
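The div64 body is elided here. As a rough, range-restricted illustration of the magic-multiply-and-shift technique the comment describes (not ngraph's actual div64, whose magic and shift are computed elsewhere), the host can precompute magic = ceil(2^32 / d) for a fixed divisor d, and the device then replaces the division with one multiply and one shift; for power-of-two divisors magic degenerates to 1 and only the shift remains. All names and the narrow value range below are assumptions for illustration; production implementations typically keep magic in 32 bits and use __umulhi.

    #include <cstdint>

    // Host side: precompute magic/shift for a fixed divisor d.
    // Deliberately narrow sketch: exact for 0 <= value < 2^16 and 1 <= d < 2^16.
    inline void make_magic(uint32_t d, uint64_t& magic, uint32_t& shift)
    {
        if ((d & (d - 1)) == 0)            // power of two: shift only
        {
            magic = 1;
            shift = 0;
            while ((1u << shift) < d)      // shift = log2(d)
                ++shift;
        }
        else
        {
            magic = ((uint64_t{1} << 32) + d - 1) / d;  // ceil(2^32 / d)
            shift = 32;
        }
    }

    // Device side: value / d == (value * magic) >> shift within the range above.
    __device__ __forceinline__ uint32_t fast_div(uint32_t value, uint64_t magic, uint32_t shift)
    {
        return (magic == 1) ? (value >> shift)
                            : static_cast<uint32_t>((value * magic) >> shift);
    }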
@@ -14,6 +14,7 @@
* limitations under the License.
*******************************************************************************/
#include <cstring>
#include <iostream>
#include <string>
@@ -31,18 +32,39 @@ std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionBuilder::get(const std::st
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
kernel.c_str(),
"op.cu",
"ngraph.cu",
0, // numHeaders
NULL, // headers
NULL)); // includeNames
nvrtcResult compile_result = nvrtcCompileProgram(prog, number_of_options, options);
// output compiler log helper
auto emit_log = [&prog]() {
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char* log = static_cast<char*>(malloc(sizeof(char) * logSize + 1));
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
log[logSize] = '\x0';
if (std::strlen(log) >= 2)
{
std::cerr << log;
}
free(log);
};
// throw if compilation was not successful
if (compile_result != NVRTC_SUCCESS)
{
throw std::runtime_error("compile error: \n" + kernel + "\n options");
std::cerr << "Compile error: \n" + kernel;
// output compiler errors
emit_log();
throw std::runtime_error("NVRTC compilation failure.");
}
// output any compiler warnings
emit_log();
// retrieve the intermediate PTX
size_t ptx_size;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
char* ptx = new char[ptx_size];
@@ -51,6 +73,7 @@ std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionBuilder::get(const std::st
ptx)); // Load the generated PTX and get a handle to the parent kernel.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Destroy the program.
// extract the compiled function
CUmodule module;
CUfunction function;
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
......
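For context, a minimal standalone sketch of the NVRTC flow this file wraps: create a program, compile it, always fetch the log (it carries warnings on success and errors on failure), then retrieve the PTX and load it through the CUDA driver API. The toy kernel, the compute_60 option, and the simplified error handling below are assumptions for illustration, not ngraph's CudaFunctionBuilder; it links against libnvrtc and libcuda.

    #include <cuda.h>
    #include <nvrtc.h>
    #include <cstring>
    #include <iostream>
    #include <vector>

    int main()
    {
        const char* kernel = R"(
    extern "C" __global__ void fill(float* out, float value)
    {
        out[blockIdx.x * blockDim.x + threadIdx.x] = value;
    })";

        nvrtcProgram prog;
        nvrtcCreateProgram(&prog, kernel, "ngraph.cu", 0, nullptr, nullptr);

        const char* options[] = {"--gpu-architecture=compute_60"};
        nvrtcResult compile_result = nvrtcCompileProgram(prog, 1, options);

        // Always fetch the log: warnings on success, errors on failure.
        size_t log_size = 0;
        nvrtcGetProgramLogSize(prog, &log_size);
        std::vector<char> log(log_size + 1, '\0');
        nvrtcGetProgramLog(prog, log.data());
        if (std::strlen(log.data()) >= 2)     // skip effectively empty logs
            std::cerr << log.data();
        if (compile_result != NVRTC_SUCCESS)
            return 1;

        // Retrieve the generated PTX.
        size_t ptx_size = 0;
        nvrtcGetPTXSize(prog, &ptx_size);
        std::vector<char> ptx(ptx_size);
        nvrtcGetPTX(prog, ptx.data());
        nvrtcDestroyProgram(&prog);

        // Load the PTX with the driver API and look up the kernel.
        cuInit(0);
        CUdevice dev;
        cuDeviceGet(&dev, 0);
        CUcontext ctx;
        cuCtxCreate(&ctx, 0, dev);
        CUmodule module;
        cuModuleLoadDataEx(&module, ptx.data(), 0, nullptr, nullptr);
        CUfunction function;
        cuModuleGetFunction(&function, module, "fill");
        std::cout << "compiled and loaded kernel 'fill'\n";
        cuModuleUnload(module);
        cuCtxDestroy(ctx);
        return 0;
    }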
abc_int64
abc_tbb
aliased_output
avg_pool_1d_1channel_1image
avg_pool_1d_1channel_2image
avg_pool_1d_2channel_2image
avg_pool_2d_1channel_1image_padded
avg_pool_2d_1channel_1image_strided
avg_pool_2d_2channel_2image
avg_pool_2d_2channel_2image_padded
avg_pool_2d_2channel_2image_padded_3x3
avg_pool_2d_2channel_2image_padded_3x3_strided
avg_pool_2d_2channel_2image_padded_3x3_strided_uneven
avg_pool_2d_2channel_2image_padded_only_above
avg_pool_2d_2channel_2image_padded_only_below
backwards_avgpool_n1_c1_hw2x2
backwards_avgpool_n1_c1_hw4x4
backwards_avgpool_n2_c2_hw2x2_win_2x2_str_1x1_padding_numeric
backwards_avgpool_n2_c2_hw4x4
backwards_avgpool_n2_c2_hw4x4_numeric
backwards_avgpool_n2_c2_hw4x4_win_2x2_str_1x1_numeric
backwards_broadcast0
backwards_broadcast1
backwards_concat_axis_0
......