Unverified Commit d3ea93e2 authored by Chris Sullivan's avatar Chris Sullivan Committed by GitHub

GPU elementwise emitters now respect input and output tensor types. (#633)

* GPU elementwise emitters now respect input and output tensor types.
This enables the use of binary comparison ops and op::Convert.

* Removed comments.

* All kernels now have type signature
even if the i/o tensors are equivalent type so that
kernels for specific type tensors are unique.

NGMX-391 #close 
parent 429eae9a
......@@ -18,18 +18,19 @@
using namespace ngraph;
void runtime::gpu::CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& writer,
void runtime::gpu::CudaKernelBuilder::get_elementwise_op(
codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& op,
const std::array<std::string, 2>& data_types,
const size_t& num_inputs)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(";
for (size_t i = 0; i < num_inputs; i++)
{
writer << data_type << "* in" << i << ", ";
writer << data_types[0] << "* in" << i << ", ";
}
writer << data_type << "* out,"
writer << data_types[1] << "* out,"
<< "size_t n)\n";
writer << "{\n";
writer.indent++;
......@@ -55,20 +56,21 @@ void runtime::gpu::CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& wr
return;
}
void runtime::gpu::CudaKernelBuilder::get_device_helper(codegen::CodeWriter& writer,
void runtime::gpu::CudaKernelBuilder::get_device_helper(
codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& math_kernel,
const std::array<std::string, 2>& data_types,
const size_t& num_inputs)
{
if (math_kernel.size())
{
writer << "__device__ " << data_type << " " << name << "(";
writer << "__device__ " << data_types[1] << " " << name << "(";
for (size_t i = 0; i < num_inputs - 1; i++)
{
writer << data_type << " x" << i << ", ";
writer << data_types[0] << " x" << i << ", ";
}
writer << data_type << " x" << num_inputs - 1;
writer << data_types[0] << " x" << num_inputs - 1;
writer << ")\n";
writer << "{\n";
writer.indent++;
......@@ -80,3 +82,16 @@ void runtime::gpu::CudaKernelBuilder::get_device_helper(codegen::CodeWriter& wri
}
return;
}
void runtime::gpu::CudaKernelBuilder::add_pod_typedefs(codegen::CodeWriter& writer)
{
writer << "typedef signed char int8_t;\n";
writer << "typedef signed short int16_t;\n";
writer << "typedef signed int int32_t;\n";
writer << "typedef signed long int int64_t;\n";
writer << "typedef unsigned char uint8_t;\n";
writer << "typedef unsigned short uint16_t;\n";
writer << "typedef unsigned int uint32_t;\n";
writer << "typedef unsigned long int uint64_t;\n";
writer << "\n";
}
......@@ -16,6 +16,7 @@
#pragma once
#include <array>
#include <string>
#include <vector>
......@@ -34,15 +35,16 @@ namespace ngraph
public:
static void get_elementwise_op(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& op,
const std::array<std::string, 2>& data_types,
const size_t& num_inputs);
static void get_device_helper(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& math_kernel,
const std::array<std::string, 2>& data_types,
const size_t& num_inputs);
static void add_pod_typedefs(codegen::CodeWriter& writer);
};
}
}
......
......@@ -21,6 +21,7 @@
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
using namespace ngraph;
void runtime::gpu::emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
{
......
......@@ -16,6 +16,9 @@
#pragma once
#include <array>
#include <string>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/coordinate.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
......@@ -36,33 +39,39 @@ namespace ngraph
template <typename T, typename... Inputs>
void emit_elementwise_op(std::string name,
std::array<std::string, 2> data_types,
size_t count,
CUdeviceptr out,
Inputs&&... inputs)
{
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
std::string type_signature = "_" + data_types[0] + "_" + data_types[1];
if (CudaFunctionPool::instance().get(name + type_signature) == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
std::string op_name = CudaOpMap<T>::op;
if (CudaOpMap<T>::math_kernel)
{
op_name += type_signature;
CudaKernelBuilder::get_device_helper(writer,
CudaOpMap<T>::op,
CudaOpMap<T>::type,
op_name,
CudaOpMap<T>::math_kernel,
data_types,
sizeof...(inputs));
}
CudaKernelBuilder::get_elementwise_op(
writer, name, CudaOpMap<T>::type, CudaOpMap<T>::op, sizeof...(inputs));
writer, name + type_signature, op_name, data_types, sizeof...(inputs));
std::string kernel = writer.get_code();
CudaFunctionPool::instance().set(name, kernel);
CudaFunctionPool::instance().set(name + type_signature, kernel);
}
//convert runtime ptr to driver api ptr
void* args_list[] = {&inputs..., &out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
CUDA_SAFE_CALL(
cuLaunchKernel(*CudaFunctionPool::instance().get(name + type_signature).get(),
count,
1,
1, // grid dim
......
......@@ -38,8 +38,6 @@ namespace ngraph
class Subtract;
class Divide;
class Sign;
// requires different input and output types
class Convert;
class Equal;
class NotEqual;
......@@ -47,8 +45,6 @@ namespace ngraph
class GreaterEq;
class Less;
class LessEq;
// Unimplemented or unused in favor of cuDNN impl.
class Max;
class Min;
class Negative;
......@@ -63,7 +59,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Abs>
{
static constexpr const char* op = "fabsf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -71,7 +66,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Acos>
{
static constexpr const char* op = "acosf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -79,7 +73,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Asin>
{
static constexpr const char* op = "asinf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -87,7 +80,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Atan>
{
static constexpr const char* op = "atanf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -95,7 +87,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Ceiling>
{
static constexpr const char* op = "ceilf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -103,7 +94,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Cos>
{
static constexpr const char* op = "cosf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -111,7 +101,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Cosh>
{
static constexpr const char* op = "coshf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -119,7 +108,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Exp>
{
static constexpr const char* op = "expf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -127,7 +115,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Floor>
{
static constexpr const char* op = "floorf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -135,7 +122,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Log>
{
static constexpr const char* op = "logf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -143,7 +129,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Max>
{
static constexpr const char* op = "fmaxf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -151,7 +136,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Min>
{
static constexpr const char* op = "fminf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -159,7 +143,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Sin>
{
static constexpr const char* op = "sinf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -167,7 +150,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Sinh>
{
static constexpr const char* op = "sinhf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -175,7 +157,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Sqrt>
{
static constexpr const char* op = "sqrtf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -183,7 +164,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Tan>
{
static constexpr const char* op = "tanf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -191,7 +171,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Tanh>
{
static constexpr const char* op = "tanhf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -199,7 +178,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Power>
{
static constexpr const char* op = "powf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -207,7 +185,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Subtract>
{
static constexpr const char* op = "subtractf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = "x0-x1";
};
......@@ -215,7 +192,6 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Divide>
{
static constexpr const char* op = "fdividef";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
......@@ -223,9 +199,57 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Sign>
{
static constexpr const char* op = "sign";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = "(x0 > 0) - (x0 < 0)";
};
template <>
struct CudaOpMap<ngraph::op::Convert>
{
static constexpr const char* op = "convert";
static constexpr const char* math_kernel = "x0";
};
template <>
struct CudaOpMap<ngraph::op::Equal>
{
static constexpr const char* op = "equal";
static constexpr const char* math_kernel = "x0 == x1";
};
template <>
struct CudaOpMap<ngraph::op::NotEqual>
{
static constexpr const char* op = "not_equal";
static constexpr const char* math_kernel = "x0 != x1";
};
template <>
struct CudaOpMap<ngraph::op::Greater>
{
static constexpr const char* op = "greater";
static constexpr const char* math_kernel = "x0 > x1";
};
template <>
struct CudaOpMap<ngraph::op::GreaterEq>
{
static constexpr const char* op = "greater_equal";
static constexpr const char* math_kernel = "x0 >= x1";
};
template <>
struct CudaOpMap<ngraph::op::Less>
{
static constexpr const char* op = "less";
static constexpr const char* math_kernel = "x0 < x1";
};
template <>
struct CudaOpMap<ngraph::op::LessEq>
{
static constexpr const char* op = "less_equal";
static constexpr const char* math_kernel = "x0 <= x1";
};
}
}
}
......@@ -97,15 +97,13 @@
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace gpu
{
void runtime::gpu::GPU_Emitter::EmitElementwise(
void GPU_Emitter::EmitElementwise(
GPU_ExternalFunction* external_function,
codegen::CodeWriter& writer,
const ngraph::Node* n,
......@@ -116,12 +114,14 @@ namespace ngraph
{
return;
}
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer << "ngraph::runtime::gpu::emit_elementwise_op<ngraph::op::"
<< n->description() << ">(\"" << n->description() << "\""
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
<< ", count"
<< ", (CUdeviceptr) " << out[0].get_name();
for (size_t i = 0; i < args.size(); i++)
......
......@@ -168,23 +168,23 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Concat>},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Equal>},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::GetOutputElement),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::GetOutputElement>},
{TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Greater>},
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::emit<ngraph::op::GreaterEq>},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Less>},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::emit<ngraph::op::LessEq>},
{TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Maximum>},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Minimum>},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Negative>},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::emit<ngraph::op::NotEqual>},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Select>},
{TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Broadcast>},
{TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convert>},
{TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Constant>},
{TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reshape>},
{TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::emit<ngraph::op::FunctionCall>},
......
......@@ -751,7 +751,6 @@ TEST(${BACKEND_NAME}, divide_by_zero_int32)
TEST(${BACKEND_NAME}, equal)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -1245,7 +1244,6 @@ TEST(${BACKEND_NAME}, dot_matrix_vector_int64)
TEST(${BACKEND_NAME}, greater)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -1269,7 +1267,6 @@ TEST(${BACKEND_NAME}, greater)
TEST(${BACKEND_NAME}, greatereq)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -1293,7 +1290,6 @@ TEST(${BACKEND_NAME}, greatereq)
TEST(${BACKEND_NAME}, less)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -1317,7 +1313,6 @@ TEST(${BACKEND_NAME}, less)
TEST(${BACKEND_NAME}, lesseq)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -1341,7 +1336,6 @@ TEST(${BACKEND_NAME}, lesseq)
TEST(${BACKEND_NAME}, lesseq_bool)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::boolean, shape);
auto B = make_shared<op::Parameter>(element::boolean, shape);
......@@ -1461,7 +1455,6 @@ TEST(${BACKEND_NAME}, negative)
TEST(${BACKEND_NAME}, notequal)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -1943,7 +1936,6 @@ TEST(${BACKEND_NAME}, broadcast_matrix_2)
TEST(${BACKEND_NAME}, convert_int32_float32)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shape);
auto f =
......@@ -1965,7 +1957,6 @@ TEST(${BACKEND_NAME}, convert_int32_float32)
TEST(${BACKEND_NAME}, convert_int32_bool)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shape);
auto f = make_shared<Function>(make_shared<op::Convert>(A, element::boolean),
......@@ -1987,7 +1978,6 @@ TEST(${BACKEND_NAME}, convert_int32_bool)
TEST(${BACKEND_NAME}, convert_float32_bool)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Convert>(A, element::boolean),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment