Commit f33317cc authored by Chris Sullivan, committed by Scott Cyphers

add dtype-generic load definitions and clean up nvrtc helpers (#1975)

* Refactor include_helpers into an nvrtc-specific helper file. Add templated define functions for coherent and non-coherent memory loads.

* Format

* const refs.

* Remove cast of zero.
parent af889535
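In practice, each kernel builder now emits the shared NVRTC helper text itself, plus a dtype-specific load definition where it needs one, instead of CUDAEmitter prepending include_helpers() to every kernel. A minimal sketch of the new pattern, with emit_example_kernel, kernel_name, and dtype as placeholder names (mirroring the get_broadcast_op and get_avg_pool changes below):

// Sketch only: the builder, not CUDAEmitter, prepends the common helpers and a
// per-dtype non-coherent load before writing the kernel signature.
void emit_example_kernel(codegen::CodeWriter& writer,
                         const std::string& kernel_name,
                         const std::string& dtype)
{
    writer << runtime::gpu::nvrtc::helpers();
    writer << runtime::gpu::nvrtc::define_non_coherent_load(dtype, "load");
    writer << "extern \"C\" __global__ void cuda_" << kernel_name << "(const " << dtype
           << "* in, " << dtype << "* out, uint32_t n)\n";
    // ... kernel body emitted here as in the existing builders ...
}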
@@ -1245,7 +1245,6 @@ size_t runtime::gpu::CUDAEmitter::build_avg_pool(const std::array<std::string, 2
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
writer << include_helpers();
CudaKernelBuilder::get_avg_pool(writer, kernel_name, dtypes, include_pad);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
@@ -1595,7 +1594,6 @@ size_t runtime::gpu::CUDAEmitter::build_softmax_divide(const std::vector<std::st
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
writer << include_helpers();
CudaKernelBuilder::get_softmax_divide_op(
writer, kernel_name, dtypes, axes_flag, input_shape.size());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
@@ -1711,7 +1709,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_to_nd(const std::vector<std::stri
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
writer << include_helpers();
if (kernel)
{
CudaKernelBuilder::get_device_helper(
@@ -1787,7 +1784,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_to_scalar(const std::vector<std::
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
writer << include_helpers();
if (kernel)
{
CudaKernelBuilder::get_device_helper(
@@ -1856,7 +1852,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_to_scalar_acc(const std::vector<s
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
writer << include_helpers();
if (kernel)
{
CudaKernelBuilder::get_device_helper(
@@ -2140,7 +2135,6 @@ size_t
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
writer << include_helpers();
if (kernel)
{
CudaKernelBuilder::get_device_helper(writer, op, kernel, dtypes);
@@ -2371,9 +2365,8 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const std::array<std::string,
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
writer << include_helpers();
runtime::gpu::CudaKernelBuilder::get_broadcast_op(
writer, kernel_name, args, result_shape.size());
writer, kernel_name, dtypes[0], args, result_shape.size());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
@@ -2753,7 +2746,6 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
writer << include_helpers();
runtime::gpu::CudaKernelBuilder::get_convolution_forward(writer,
kernel_name,
dtypes,
@@ -2841,109 +2833,6 @@ void runtime::gpu::CUDAEmitter::print_tensor_from_gpu(codegen::CodeWriter& write
writer.block_end();
}
std::string runtime::gpu::CUDAEmitter::include_helpers()
{
std::stringstream ss;
#if defined(CUDA_VERSION) && CUDA_VERSION < 9000
ss << R"(
#define WARP_SIZE 32
#define __ballot_sync(mask, predicate) __ballot(predicate)
#define __shfl_down_sync(mask, val, delta, width) __shfl_down(val, delta, width)
#define __shfl_xor_sync(mask, val, laneMask, width) __shfl_xor(val, laneMask, width)
)";
#endif
// add modern type definitions
ss << "typedef signed char int8_t;\n";
ss << "typedef signed short int16_t;\n";
ss << "typedef signed int int32_t;\n";
ss << "typedef signed long int int64_t;\n";
ss << "typedef unsigned char uint8_t;\n";
ss << "typedef unsigned short uint16_t;\n";
ss << "typedef unsigned int uint32_t;\n";
ss << "typedef unsigned long int uint64_t;\n";
ss << "\n";
// division_by_invariant_multiplication:
// fast integer division via invariant multiplication and shifting
// if value is a power of 2, magic will be 1 and only shifting
// is required (predicate p below)
// load: helper to load from constant memory for fast access
ss << R"(
__device__ __forceinline__ int division_by_invariant_multiplication(int value, int magic, int shift)
{
int result;
asm("{\n\t"
".reg .pred p;\n\t"
".reg .u64 res64;\n\t"
".reg .u32 lo32, hi32;\n\t"
"setp.ne.s32 p, %2, 1;\n\t"
"mul.wide.u32 res64, %1, %2;\n\t"
"mov.b64 {lo32, hi32}, res64;\n\t"
"selp.u32 hi32, hi32, %1, p;\n\t"
"shr.u32 %0, hi32, %3;\n\t"
"}" : "=r"(result) : "r"(value), "r"(magic), "r"(shift));
return result;
}
__device__ __forceinline__ void idiv_fast(int numerator, int denominator, float rcp,
int& result, int& remainder)
{
result = (int)((float)numerator * rcp);
remainder = numerator - (result * denominator);
result = (remainder >= denominator) ? (result + 1) : result;
remainder = (remainder >= denominator) ? (remainder - denominator) : remainder;
}
__device__ __forceinline__ int mod16(int numerator, int div, int maxdiv)
{
int res;
asm("vmad.s32.u32.u32 %0, -%1.h0, %2.h0, %3;" : "=r"(res) : "r"(div), "r"(maxdiv), "r"(numerator));
return res;
}
__device__ __forceinline__ int mad16(int a, int b, int c)
{
int res;
asm("vmad.s32.u32.u32 %0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c));
return res;
}
__device__ __forceinline__ int msub16(int a, int b, int c)
{
int res;
asm("vmad.s32.u32.u32 %0, %1.h0, %2.h0, -%3;" : "=r"(res) : "r"(a), "r"(b), "r"(c));
return res;
}
__device__ __forceinline__ float load(const float* __restrict__ in, int i=0, bool b=true)
{
float v = 0.0f;
if (b)
{
v = __ldg(in + i);
}
return v;
}
__device__ __forceinline__ int32_t load(const int32_t* __restrict__ in, int i=0, bool b=true)
{
int32_t v = 0;
if (b)
{
v = __ldg(in + i);
}
return v;
}
__device__ __forceinline__ int64_t load(const int64_t* __restrict__ in, int i=0, bool b=true)
{
int64_t v = 0;
if (b)
{
v = __ldg(in + i);
}
return v;
}
)";
return ss.str();
}
uint32_t runtime::gpu::CUDAEmitter::align_to_block_size(uint32_t threads, uint32_t block_size)
{
if (threads > (1u << 31) - 1)
@@ -18,6 +18,7 @@
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_args.hpp"
#include "ngraph/runtime/gpu/nvrtc/helpers.hpp"
#include "ngraph/runtime/gpu/type_info.hpp"
using namespace ngraph;
@@ -84,6 +85,7 @@ void runtime::gpu::CudaKernelBuilder::get_softmax_divide_op(
std::vector<size_t> axes_flag,
size_t rank)
{
writer << runtime::gpu::nvrtc::helpers();
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in0, "
<< data_types[1] << "* in1, " << data_types[2] << "* out,";
for (size_t i = 0; i < axes_flag.size(); i++)
@@ -136,6 +138,7 @@ void runtime::gpu::CudaKernelBuilder::get_ew_collective_op(
bool save_elementwise,
size_t rank)
{
writer << runtime::gpu::nvrtc::helpers();
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
@@ -336,6 +339,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
size_t out_rank,
size_t reduce_rank)
{
writer << runtime::gpu::nvrtc::helpers();
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
@@ -416,6 +420,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
const std::string& reduce_op,
uint32_t block_size_x)
{
writer << runtime::gpu::nvrtc::helpers();
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
@@ -509,6 +514,7 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_acc_op(
const std::vector<std::string>& data_types,
const std::string& reduce_op)
{
writer << runtime::gpu::nvrtc::helpers();
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
@@ -548,9 +554,12 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_acc_op(
void runtime::gpu::CudaKernelBuilder::get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
runtime::gpu::GPUKernelArgs& args,
const size_t rank)
{
writer << runtime::gpu::nvrtc::helpers();
writer << runtime::gpu::nvrtc::define_non_coherent_load(data_type, "load");
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
@@ -1131,6 +1140,8 @@ void runtime::gpu::CudaKernelBuilder::get_avg_pool(codegen::CodeWriter& writer,
const std::array<std::string, 2>& data_types,
bool include_pad)
{
writer << runtime::gpu::nvrtc::helpers();
writer << runtime::gpu::nvrtc::define_non_coherent_load(data_types[0], "load");
// In the pooling operation out = P(in) where in: NCDHW -> out: NKMPQ
// via pooling window: JTRS. Currently feature pooling
// is not supported and so K = C and J is unused
@@ -1249,6 +1260,7 @@ void runtime::gpu::CudaKernelBuilder::get_convolution_forward(
int sm_tile_size,
int reg_tile_size)
{
writer << runtime::gpu::nvrtc::helpers();
writer << "#define NUM_ROWS 8\n";
writer << "#define FILTER_SIZE " << filter_size << "\n";
writer << "#define SM_TILE_SIZE " << sm_tile_size << "\n";
@@ -46,6 +46,7 @@
static void get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
GPUKernelArgs& args,
const size_t rank);
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <cuda.h>
#include <sstream>
#include <string>
#include "ngraph/except.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
namespace nvrtc
{
std::string helpers();
std::string define_zero(const std::string& dtype,
const std::string& name = "zero_");
std::string define_vzero(std::string dtype,
const uint32_t& n,
const std::string& name = "zero_");
std::string define_coherent_load(const std::string& dtype,
const std::string& name = "load_");
std::string define_coherent_vload(std::string dtype,
const uint32_t& n,
const std::string& name = "load_");
std::string define_non_coherent_load(const std::string& dtype,
const std::string& name = "load_");
std::string define_non_coherent_vload(std::string dtype,
const uint32_t& n,
const std::string& name = "load_");
}
}
}
}
std::string ngraph::runtime::gpu::nvrtc::helpers()
{
std::stringstream ss;
#if defined(CUDA_VERSION) && CUDA_VERSION < 9000
ss << R"(
#define WARP_SIZE 32
#define __ballot_sync(mask, predicate) __ballot(predicate)
#define __shfl_down_sync(mask, val, delta, width) __shfl_down(val, delta, width)
#define __shfl_xor_sync(mask, val, laneMask, width) __shfl_xor(val, laneMask, width)
)";
#endif
// fixed-width integer typedefs (NVRTC does not provide <cstdint> by default)
ss << "typedef signed char int8_t;\n";
ss << "typedef signed short int16_t;\n";
ss << "typedef signed int int32_t;\n";
ss << "typedef signed long int int64_t;\n";
ss << "typedef unsigned char uint8_t;\n";
ss << "typedef unsigned short uint16_t;\n";
ss << "typedef unsigned int uint32_t;\n";
ss << "typedef unsigned long int uint64_t;\n";
ss << "\n";
// division_by_invariant_multiplication:
// fast integer division via invariant multiplication and shifting
// if value is a power of 2, magic will be 1 and only shifting
// is required (predicate p below)
ss << R"(
__device__ __forceinline__ int division_by_invariant_multiplication(int value, int magic, int shift)
{
int result;
asm("{\n\t"
".reg .pred p;\n\t"
".reg .u64 res64;\n\t"
".reg .u32 lo32, hi32;\n\t"
"setp.ne.s32 p, %2, 1;\n\t"
"mul.wide.u32 res64, %1, %2;\n\t"
"mov.b64 {lo32, hi32}, res64;\n\t"
"selp.u32 hi32, hi32, %1, p;\n\t"
"shr.u32 %0, hi32, %3;\n\t"
"}" : "=r"(result) : "r"(value), "r"(magic), "r"(shift));
return result;
}
__device__ __forceinline__ void idiv_fast(int numerator, int denominator, float rcp,
int& result, int& remainder)
{
result = (int)((float)numerator * rcp);
remainder = numerator - (result * denominator);
result = (remainder >= denominator) ? (result + 1) : result;
remainder = (remainder >= denominator) ? (remainder - denominator) : remainder;
}
__device__ __forceinline__ int mod16(int numerator, int div, int maxdiv)
{
int res;
asm("vmad.s32.u32.u32 %0, -%1.h0, %2.h0, %3;" : "=r"(res) : "r"(div), "r"(maxdiv), "r"(numerator));
return res;
}
__device__ __forceinline__ int mad16(int a, int b, int c)
{
int res;
asm("vmad.s32.u32.u32 %0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c));
return res;
}
__device__ __forceinline__ int msub16(int a, int b, int c)
{
int res;
asm("vmad.s32.u32.u32 %0, %1.h0, %2.h0, -%3;" : "=r"(res) : "r"(a), "r"(b), "r"(c));
return res;
}
)";
return ss.str();
}
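Aside, not part of this change: the magic and shift arguments consumed by division_by_invariant_multiplication above are precomputed on the host. A minimal sketch of that computation, following the standard Granlund-Montgomery construction and assuming 32-bit numerators and divisors whose magic fits in 32 bits (magic_u32 is an illustrative name, not an existing function):

#include <cstdint>
#include <stdexcept>
#include <utility>

// Sketch: find {magic, shift} such that for all n in [0, nmax]
//   n / d == (uint32_t)(((uint64_t)n * magic) >> 32) >> shift,
// which is what the PTX above evaluates. Power-of-two divisors return
// magic == 1 and take the shift-only path (predicate p in the PTX).
std::pair<uint64_t, uint64_t> magic_u32(uint64_t nmax, uint64_t d)
{
    if ((d & (d - 1)) == 0)
    {
        uint64_t shift = 0;
        while ((1ull << shift) < d)
        {
            ++shift;
        }
        return {1ull, shift};
    }
    // Granlund-Montgomery search: smallest exponent p >= 32 whose rounded-up
    // multiplier reproduces exact division for all numerators up to nmax.
    uint64_t nc = ((nmax + 1) / d) * d - 1;
    for (uint64_t p = 32; p < 64; ++p)
    {
        if ((1ull << p) > nc * (d - 1 - ((1ull << p) - 1) % d))
        {
            uint64_t magic = ((1ull << p) + d - 1 - ((1ull << p) - 1) % d) / d;
            return {magic, p - 32}; // caller should verify magic fits in 32 bits
        }
    }
    throw std::runtime_error("no magic number found for divisor");
}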
std::string ngraph::runtime::gpu::nvrtc::define_zero(const std::string& dtype,
const std::string& name)
{
std::stringstream ss;
ss << "__device__ __forceinline__ void " << name << "(" << dtype << "& a) { a = 0; }\n";
return ss.str();
}
std::string ngraph::runtime::gpu::nvrtc::define_vzero(std::string dtype,
const uint32_t& n,
const std::string& name)
{
std::stringstream ss;
if (n == 1 || n == 2 || n == 4)
{
static std::vector<std::string> assignment = {"a.x = ", "a.y = ", "a.z = ", "a.w = "};
dtype = dtype + std::to_string(n);
ss << "__device__ __forceinline__ void " << name << "(" << dtype << "& a) { ";
for (auto i = 0u; i < n; i++)
{
ss << assignment[i];
}
ss << "0; }\n";
}
else
{
throw ngraph_error("Invalid request for vector zero of " + dtype + std::to_string(n));
}
return ss.str();
}
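For example, define_vzero("float", 4) emits:

__device__ __forceinline__ void zero_(float4& a) { a.x = a.y = a.z = a.w = 0; }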
#define LOAD_C \
"__device__ __forceinline__ " << dtype << " " << name << "(const " << dtype \
<< "* __restrict__ in, int i=0, bool b=true) { " << dtype \
<< " v; zero_(v); if (b) v = in[i]; return v; }\n"
std::string ngraph::runtime::gpu::nvrtc::define_coherent_load(const std::string& dtype,
const std::string& name)
{
std::stringstream ss;
ss << define_zero(dtype);
ss << LOAD_C;
return ss.str();
}
std::string ngraph::runtime::gpu::nvrtc::define_coherent_vload(std::string dtype,
const uint32_t& n,
const std::string& name)
{
std::stringstream ss;
ss << define_vzero(dtype, n);
// the generated load operates on the vector type, e.g. float4 when n == 4
dtype = dtype + std::to_string(n);
ss << LOAD_C;
return ss.str();
}
#define LOAD_NC \
"__device__ __forceinline__ " << dtype << " " << name << "(const " << dtype \
<< "* __restrict__ in, int i=0, bool b=true) { " << dtype \
<< " v; zero_(v); if (b) v = __ldg(in + i); return v; }\n"
std::string ngraph::runtime::gpu::nvrtc::define_non_coherent_load(const std::string& dtype,
const std::string& name)
{
std::stringstream ss;
ss << define_zero(dtype);
ss << LOAD_NC;
return ss.str();
}
std::string ngraph::runtime::gpu::nvrtc::define_non_coherent_vload(std::string dtype,
const uint32_t& n,
const std::string& name)
{
std::stringstream ss;
ss << define_vzero(dtype, n);
// the generated load operates on the vector type, e.g. float4 when n == 4
dtype = dtype + std::to_string(n);
ss << LOAD_NC;
return ss.str();
}
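For reference, a call such as define_non_coherent_load("float", "load") (the form used by get_broadcast_op and get_avg_pool above) generates:

__device__ __forceinline__ void zero_(float& a) { a = 0; }
__device__ __forceinline__ float load(const float* __restrict__ in, int i=0, bool b=true) { float v; zero_(v); if (b) v = __ldg(in + i); return v; }

One dtype-parameterized definition thereby replaces the three hand-written float/int32_t/int64_t overloads removed from CUDAEmitter::include_helpers().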