Commit 529362b5 authored by Chris Sullivan's avatar Chris Sullivan Committed by Robert Kimball

Abstraction for GPU unary elementwise ops (#587)

* straightforward gpu.cos implementation following previous patterns prior to refactor

* Generalized unary elementwise gpu op impl.. New unary elementwise ops can
be added to the type annotations in gpu_cuda_kernel_ops.hpp. Next step
is to refactor the llvm interface in gpu_emitters.hpp for similar generality.

* Added gpu_emitter.hpp:EmitUnaryElementwise.

Function adds cuda kernel based on ngraph::op::op_type::description.
This can service all unary elementwise ops run on the gpu.

* The following elementwise unary ops now use the EmitUnaryElementwise emitter:
* GPU.abs
* GPU.acos
* GPU.asin
* GPU.atan
* GPU.ceiling
* GPU.cos
* GPU.cosh
* GPU.exp
* GPU.floor
* GPU.log
* GPU.not
* GPU.sign
* GPU.sin
* GPU.sinh
* GPU.tan
* GPU.tanh
Unary elementwise ops Sign and Not need extra consideration.

* tanh test changed to test::all_close for fp comparison (also done for tan in commit 65fa7c6de34c8277fe2a4801644f6bb64574f4ff).

* GPU backend skips added for recent softmax test and updated aliased output test that uses op::Constant.

* code format update

* changed cuda builder interface names to unary/binary/arbitrary, added impl. note to gpu_cuda_kernel_ops, cleaned code format

* updated ngraph-cpp reference

* Fixing incorrect github conflict resolution.

* Added GPU emitter for op::Result.
For now it simply copies the output tensor.

All but 3 tests now pass. The remaining
failing tests are:
* GPU.dot_0_0
* GPU.dot_matrix_2x0_0x2
* GPU.dot_2x0_0

* Removed call to handle memory aliasing in gpu_external_function.

* fix gpu emitter bug that will return in the middle of function

* Merge pull request #609 from NervanaSystems/tfl/fix_return_bug

fix gpu emitter bug that will return in the middle of function

* GPU backend skips added for recent softmax test and updated aliased output test that uses op::Constant.
parent a02aab01
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*******************************************************************************/ *******************************************************************************/
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp" #include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
namespace ngraph namespace ngraph
...@@ -22,10 +21,10 @@ namespace ngraph ...@@ -22,10 +21,10 @@ namespace ngraph
{ {
namespace gpu namespace gpu
{ {
void CudaKernelBuilder::get_1_element_op(const std::string& name, void CudaKernelBuilder::get_unary_elementwise_op(const std::string& name,
const std::string& data_type, const std::string& data_type,
const std::string& op, const std::string& op,
std::string& kernel) std::string& kernel)
{ {
kernel = R"( kernel = R"(
extern "C" __global__ extern "C" __global__
...@@ -40,10 +39,10 @@ out[tid] =)" + op + "(in[tid]);\n" + ...@@ -40,10 +39,10 @@ out[tid] =)" + op + "(in[tid]);\n" +
return; return;
} }
void CudaKernelBuilder::get_2_element_op(const std::string& name, void CudaKernelBuilder::get_binary_elementwise_op(const std::string& name,
const std::string& data_type, const std::string& data_type,
const std::string& op, const std::string& op,
std::string& kernel) std::string& kernel)
{ {
kernel = R"( kernel = R"(
extern "C" __global__ extern "C" __global__
...@@ -60,10 +59,11 @@ out[tid] = in1[tid] )" + op + ...@@ -60,10 +59,11 @@ out[tid] = in1[tid] )" + op +
return; return;
} }
void CudaKernelBuilder::get_n_element_op(const std::string& name, void
const std::string& data_type, CudaKernelBuilder::get_arbitrary_elementwise_op(const std::string& name,
const std::vector<std::string>& ops, const std::string& data_type,
std::string& kernel) const std::vector<std::string>& ops,
std::string& kernel)
{ {
kernel = ""; kernel = "";
return; return;
......
...@@ -28,20 +28,20 @@ namespace ngraph ...@@ -28,20 +28,20 @@ namespace ngraph
class CudaKernelBuilder class CudaKernelBuilder
{ {
public: public:
static void get_1_element_op(const std::string& name, static void get_unary_elementwise_op(const std::string& name,
const std::string& data_type, const std::string& data_type,
const std::string& op, const std::string& op,
std::string& kernel); std::string& kernel);
static void get_2_element_op(const std::string& name, static void get_binary_elementwise_op(const std::string& name,
const std::string& data_type, const std::string& data_type,
const std::string& op, const std::string& op,
std::string& kernel); std::string& kernel);
static void get_n_element_op(const std::string& name, static void get_arbitrary_elementwise_op(const std::string& name,
const std::string& data_type, const std::string& data_type,
const std::vector<std::string>& ops, const std::vector<std::string>& ops,
std::string& kernel); std::string& kernel);
}; };
} }
} }
......
...@@ -17,10 +17,8 @@ ...@@ -17,10 +17,8 @@
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp" #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
namespace ngraph namespace ngraph
{ {
...@@ -28,40 +26,6 @@ namespace ngraph ...@@ -28,40 +26,6 @@ namespace ngraph
{ {
namespace gpu namespace gpu
{ {
void emit_abs(void* in, void* out, size_t count)
{
std::string name = "abs";
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
CudaKernelBuilder::get_1_element_op(name, "float", "fabsf", kernel);
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void emit_broadcast( void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count) void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
{ {
......
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
#include "ngraph/codegen/code_writer.hpp" #include "ngraph/codegen/code_writer.hpp"
#include "ngraph/coordinate.hpp" #include "ngraph/coordinate.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/strides.hpp" #include "ngraph/strides.hpp"
namespace ngraph namespace ngraph
...@@ -26,9 +29,46 @@ namespace ngraph ...@@ -26,9 +29,46 @@ namespace ngraph
{ {
namespace gpu namespace gpu
{ {
void emit_abs(void* in, void* out, size_t count); template <typename T>
struct CudaOpMap;
void emit_broadcast( void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count); void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count);
template <typename T>
void emit_unary_elementwise_op(void* in, void* out, size_t count, std::string name)
{
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
CudaKernelBuilder::get_unary_elementwise_op(
name, "float", CudaOpMap<T>::op, kernel);
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
} }
} }
} }
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
namespace ngraph
{
namespace op
{
class Abs;
class Acos;
class Asin;
class Atan;
class Ceiling;
class Cos;
class Cosh;
class Exp;
class Floor;
class Log;
class Sin;
class Sinh;
class Tan;
class Tanh;
// Unimplemented or unused in favor of cuDNN impl.
class Max;
class Min;
class Negative;
class Not;
class Sign;
class Sqrt;
}
namespace runtime
{
namespace gpu
{
template <>
struct CudaOpMap<ngraph::op::Abs>
{
static constexpr const char* op = "fabsf";
};
template <>
struct CudaOpMap<ngraph::op::Acos>
{
static constexpr const char* op = "acosf";
};
template <>
struct CudaOpMap<ngraph::op::Asin>
{
static constexpr const char* op = "asinf";
};
template <>
struct CudaOpMap<ngraph::op::Atan>
{
static constexpr const char* op = "atanf";
};
template <>
struct CudaOpMap<ngraph::op::Ceiling>
{
static constexpr const char* op = "ceilf";
};
template <>
struct CudaOpMap<ngraph::op::Cos>
{
static constexpr const char* op = "cosf";
};
template <>
struct CudaOpMap<ngraph::op::Cosh>
{
static constexpr const char* op = "coshf";
};
template <>
struct CudaOpMap<ngraph::op::Exp>
{
static constexpr const char* op = "expf";
};
template <>
struct CudaOpMap<ngraph::op::Floor>
{
static constexpr const char* op = "floorf";
};
template <>
struct CudaOpMap<ngraph::op::Log>
{
static constexpr const char* op = "logf";
};
template <>
struct CudaOpMap<ngraph::op::Max>
{
static constexpr const char* op = "fmaxf";
};
template <>
struct CudaOpMap<ngraph::op::Min>
{
static constexpr const char* op = "fminf";
};
template <>
struct CudaOpMap<ngraph::op::Sin>
{
static constexpr const char* op = "sinf";
};
template <>
struct CudaOpMap<ngraph::op::Sinh>
{
static constexpr const char* op = "sinhf";
};
template <>
struct CudaOpMap<ngraph::op::Sqrt>
{
static constexpr const char* op = "sqrtf";
};
template <>
struct CudaOpMap<ngraph::op::Tan>
{
static constexpr const char* op = "tanf";
};
template <>
struct CudaOpMap<ngraph::op::Tanh>
{
static constexpr const char* op = "tanhf";
};
}
}
}
...@@ -58,10 +58,11 @@ void runtime::gpu::GPU_Emitter::EmitNop(codegen::CodeWriter& writer, ...@@ -58,10 +58,11 @@ void runtime::gpu::GPU_Emitter::EmitNop(codegen::CodeWriter& writer,
{ {
} }
void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer, void runtime::gpu::GPU_Emitter::EmitUnaryElementwise(
const ngraph::Node* n, codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out) const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{ {
if (out[0].get_size() == 0) if (out[0].get_size() == 0)
{ {
...@@ -71,8 +72,9 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer, ...@@ -71,8 +72,9 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
writer.indent++; writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n"; writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n"; writer << "if(count == 0) return;\n";
writer << "ngraph::runtime::gpu::emit_abs((void*) " << args[0].get_name() << ", (void*) " writer << "ngraph::runtime::gpu::emit_unary_elementwise_op<ngraph::op::" << n->description()
<< out[0].get_name() << ", count);\n"; << ">((void*) " << args[0].get_name() << ", (void*) " << out[0].get_name()
<< ", count, \"" << n->description() << "\");\n";
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
} }
...@@ -293,14 +295,6 @@ void runtime::gpu::GPU_Emitter::EmitLessEq(codegen::CodeWriter& writer, ...@@ -293,14 +295,6 @@ void runtime::gpu::GPU_Emitter::EmitLessEq(codegen::CodeWriter& writer,
throw std::runtime_error(n->get_name() + " is not implemented."); throw std::runtime_error(n->get_name() + " is not implemented.");
} }
void runtime::gpu::GPU_Emitter::EmitLog(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer, void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
const ngraph::Node* n, const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
...@@ -632,14 +626,6 @@ void runtime::gpu::GPU_Emitter::EmitReduce(codegen::CodeWriter& writer, ...@@ -632,14 +626,6 @@ void runtime::gpu::GPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
throw std::runtime_error(n->get_name() + " is not implemented."); throw std::runtime_error(n->get_name() + " is not implemented.");
} }
void runtime::gpu::GPU_Emitter::EmitSign(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSlice(codegen::CodeWriter& writer, void runtime::gpu::GPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
const ngraph::Node* n, const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
...@@ -701,86 +687,6 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -701,86 +687,6 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "}\n"; writer << "}\n";
} }
void runtime::gpu::GPU_Emitter::EmitExp(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSin(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSinh(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitCos(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitCosh(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitTan(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitTanh(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitAsin(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitAcos(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitAtan(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitPower(codegen::CodeWriter& writer, void runtime::gpu::GPU_Emitter::EmitPower(codegen::CodeWriter& writer,
const ngraph::Node* n, const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
...@@ -806,22 +712,6 @@ void runtime::gpu::GPU_Emitter::EmitOneHot(codegen::CodeWriter& writer, ...@@ -806,22 +712,6 @@ void runtime::gpu::GPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
throw std::runtime_error(n->get_name() + " is not implemented."); throw std::runtime_error(n->get_name() + " is not implemented.");
} }
void runtime::gpu::GPU_Emitter::EmitCeiling(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitFloor(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSqrt(codegen::CodeWriter& writer, void runtime::gpu::GPU_Emitter::EmitSqrt(codegen::CodeWriter& writer,
const ngraph::Node* n, const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
...@@ -875,14 +765,6 @@ void runtime::gpu::GPU_Emitter::EmitConvolution( ...@@ -875,14 +765,6 @@ void runtime::gpu::GPU_Emitter::EmitConvolution(
throw std::runtime_error(n->get_name() + " is not implemented."); throw std::runtime_error(n->get_name() + " is not implemented.");
} }
void runtime::gpu::GPU_Emitter::EmitNot(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitMaxPool(codegen::CodeWriter& writer, void runtime::gpu::GPU_Emitter::EmitMaxPool(codegen::CodeWriter& writer,
const ngraph::Node* n, const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
......
...@@ -45,8 +45,8 @@ namespace ngraph ...@@ -45,8 +45,8 @@ namespace ngraph
static void EMITTER_DECL(EmitMultiply); static void EMITTER_DECL(EmitMultiply);
static void EMITTER_DECL(EmitGetOutputElement); static void EMITTER_DECL(EmitGetOutputElement);
static void EMITTER_DECL(EmitXLAGetTupleElement); static void EMITTER_DECL(EmitXLAGetTupleElement);
static void EMITTER_DECL(EmitUnaryElementwise);
static void EMITTER_DECL(EmitTuple); static void EMITTER_DECL(EmitTuple);
static void EMITTER_DECL(EmitAbs);
static void EMITTER_DECL(EmitConcat); static void EMITTER_DECL(EmitConcat);
static void EMITTER_DECL(EmitDivide); static void EMITTER_DECL(EmitDivide);
static void EMITTER_DECL(EmitEqual); static void EMITTER_DECL(EmitEqual);
...@@ -54,7 +54,6 @@ namespace ngraph ...@@ -54,7 +54,6 @@ namespace ngraph
static void EMITTER_DECL(EmitGreaterEq); static void EMITTER_DECL(EmitGreaterEq);
static void EMITTER_DECL(EmitLess); static void EMITTER_DECL(EmitLess);
static void EMITTER_DECL(EmitLessEq); static void EMITTER_DECL(EmitLessEq);
static void EMITTER_DECL(EmitLog);
static void EMITTER_DECL(EmitMaximum); static void EMITTER_DECL(EmitMaximum);
static void EMITTER_DECL(EmitMinimum); static void EMITTER_DECL(EmitMinimum);
static void EMITTER_DECL(EmitNegative); static void EMITTER_DECL(EmitNegative);
...@@ -67,27 +66,13 @@ namespace ngraph ...@@ -67,27 +66,13 @@ namespace ngraph
static void EMITTER_DECL(EmitReshape); static void EMITTER_DECL(EmitReshape);
static void EMITTER_DECL(EmitFunctionCall); static void EMITTER_DECL(EmitFunctionCall);
static void EMITTER_DECL(EmitReduce); static void EMITTER_DECL(EmitReduce);
static void EMITTER_DECL(EmitSign);
static void EMITTER_DECL(EmitSlice); static void EMITTER_DECL(EmitSlice);
static void EMITTER_DECL(EmitSum); static void EMITTER_DECL(EmitSum);
static void EMITTER_DECL(EmitExp);
static void EMITTER_DECL(EmitSin);
static void EMITTER_DECL(EmitSinh);
static void EMITTER_DECL(EmitCos);
static void EMITTER_DECL(EmitCosh);
static void EMITTER_DECL(EmitTan);
static void EMITTER_DECL(EmitTanh);
static void EMITTER_DECL(EmitAsin);
static void EMITTER_DECL(EmitAcos);
static void EMITTER_DECL(EmitAtan);
static void EMITTER_DECL(EmitPower); static void EMITTER_DECL(EmitPower);
static void EMITTER_DECL(EmitReplaceSlice); static void EMITTER_DECL(EmitReplaceSlice);
static void EMITTER_DECL(EmitOneHot); static void EMITTER_DECL(EmitOneHot);
static void EMITTER_DECL(EmitFloor);
static void EMITTER_DECL(EmitCeiling);
static void EMITTER_DECL(EmitSqrt); static void EMITTER_DECL(EmitSqrt);
static void EMITTER_DECL(EmitConvolution); static void EMITTER_DECL(EmitConvolution);
static void EMITTER_DECL(EmitNot);
static void EMITTER_DECL(EmitMaxPool); static void EMITTER_DECL(EmitMaxPool);
static void EMITTER_DECL(EmitReverse); static void EMITTER_DECL(EmitReverse);
static void EMITTER_DECL(EmitReduceWindow); static void EMITTER_DECL(EmitReduceWindow);
......
...@@ -151,7 +151,7 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -151,7 +151,7 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::EmitDot}, {TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::EmitDot},
{TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::EmitMultiply}, {TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::EmitMultiply},
{TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::EmitNop}, {TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::EmitNop},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitAbs}, {TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::EmitConcat}, {TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::EmitConcat},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitDivide}, {TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitDivide},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitEqual}, {TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitEqual},
...@@ -159,7 +159,7 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -159,7 +159,7 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitGreaterEq}, {TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitGreaterEq},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitLess}, {TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitLess},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitLessEq}, {TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitLessEq},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitLog}, {TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::EmitMaximum}, {TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::EmitMaximum},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::EmitMinimum}, {TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::EmitMinimum},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::EmitNegative}, {TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::EmitNegative},
...@@ -173,26 +173,26 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -173,26 +173,26 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::EmitReshape}, {TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::EmitReshape},
{TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::EmitFunctionCall}, {TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::EmitFunctionCall},
{TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::EmitReduce}, {TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::EmitReduce},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitSign}, {TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::EmitSlice}, {TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::EmitSlice},
{TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::EmitSum}, {TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::EmitSum},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitExp}, {TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitSin}, {TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitSinh}, {TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitCos}, {TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitCosh}, {TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitTan}, {TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitTanh}, {TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitAsin}, {TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitAcos}, {TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitAtan}, {TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::EmitReplaceSlice}, {TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::EmitReplaceSlice},
{TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::EmitOneHot}, {TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::EmitOneHot},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitFloor}, {TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitCeiling}, {TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::EmitSqrt}, {TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::EmitSqrt},
{TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::EmitConvolution}, {TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::EmitConvolution},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitNot}, {TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::EmitMaxPool}, {TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::EmitMaxPool},
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse}, {TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse},
{TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::EmitReduceWindow}, {TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::EmitReduceWindow},
...@@ -251,6 +251,7 @@ void runtime::gpu::GPU_ExternalFunction::compile() ...@@ -251,6 +251,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/pass/memory_layout.hpp" #include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp" #include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp" #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp" #include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
)"; )";
......
...@@ -336,7 +336,6 @@ TEST(${BACKEND_NAME}, abs) ...@@ -336,7 +336,6 @@ TEST(${BACKEND_NAME}, abs)
TEST(${BACKEND_NAME}, ceiling) TEST(${BACKEND_NAME}, ceiling)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2}; Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Ceiling>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Ceiling>(A), op::ParameterVector{A});
...@@ -777,7 +776,6 @@ TEST(${BACKEND_NAME}, equal) ...@@ -777,7 +776,6 @@ TEST(${BACKEND_NAME}, equal)
TEST(${BACKEND_NAME}, floor) TEST(${BACKEND_NAME}, floor)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2}; Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Floor>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Floor>(A), op::ParameterVector{A});
...@@ -1371,7 +1369,6 @@ TEST(${BACKEND_NAME}, lesseq_bool) ...@@ -1371,7 +1369,6 @@ TEST(${BACKEND_NAME}, lesseq_bool)
TEST(${BACKEND_NAME}, log) TEST(${BACKEND_NAME}, log)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2}; Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Log>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Log>(A), op::ParameterVector{A});
...@@ -2674,7 +2671,6 @@ TEST(${BACKEND_NAME}, reshape_6d) ...@@ -2674,7 +2671,6 @@ TEST(${BACKEND_NAME}, reshape_6d)
TEST(${BACKEND_NAME}, sin) TEST(${BACKEND_NAME}, sin)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sin>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Sin>(A), op::ParameterVector{A});
...@@ -2700,7 +2696,6 @@ TEST(${BACKEND_NAME}, sin) ...@@ -2700,7 +2696,6 @@ TEST(${BACKEND_NAME}, sin)
TEST(${BACKEND_NAME}, cos) TEST(${BACKEND_NAME}, cos)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Cos>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Cos>(A), op::ParameterVector{A});
...@@ -2726,7 +2721,6 @@ TEST(${BACKEND_NAME}, cos) ...@@ -2726,7 +2721,6 @@ TEST(${BACKEND_NAME}, cos)
TEST(${BACKEND_NAME}, tan) TEST(${BACKEND_NAME}, tan)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Tan>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Tan>(A), op::ParameterVector{A});
...@@ -2747,12 +2741,11 @@ TEST(${BACKEND_NAME}, tan) ...@@ -2747,12 +2741,11 @@ TEST(${BACKEND_NAME}, tan)
input.begin(), input.end(), input.begin(), [](float x) -> float { return tanf(x); }); input.begin(), input.end(), input.begin(), [](float x) -> float { return tanf(x); });
cf->call({a}, {result}); cf->call({a}, {result});
EXPECT_EQ(input, read_vector<float>(result)); EXPECT_TRUE(test::all_close(input, read_vector<float>(result)));
} }
TEST(${BACKEND_NAME}, asin) TEST(${BACKEND_NAME}, asin)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Asin>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Asin>(A), op::ParameterVector{A});
...@@ -2777,7 +2770,6 @@ TEST(${BACKEND_NAME}, asin) ...@@ -2777,7 +2770,6 @@ TEST(${BACKEND_NAME}, asin)
TEST(${BACKEND_NAME}, acos) TEST(${BACKEND_NAME}, acos)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Acos>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Acos>(A), op::ParameterVector{A});
...@@ -2802,7 +2794,6 @@ TEST(${BACKEND_NAME}, acos) ...@@ -2802,7 +2794,6 @@ TEST(${BACKEND_NAME}, acos)
TEST(${BACKEND_NAME}, atan) TEST(${BACKEND_NAME}, atan)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Atan>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Atan>(A), op::ParameterVector{A});
...@@ -2827,7 +2818,6 @@ TEST(${BACKEND_NAME}, atan) ...@@ -2827,7 +2818,6 @@ TEST(${BACKEND_NAME}, atan)
TEST(${BACKEND_NAME}, sinh) TEST(${BACKEND_NAME}, sinh)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sinh>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Sinh>(A), op::ParameterVector{A});
...@@ -2852,7 +2842,6 @@ TEST(${BACKEND_NAME}, sinh) ...@@ -2852,7 +2842,6 @@ TEST(${BACKEND_NAME}, sinh)
TEST(${BACKEND_NAME}, cosh) TEST(${BACKEND_NAME}, cosh)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Cosh>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Cosh>(A), op::ParameterVector{A});
...@@ -2877,7 +2866,6 @@ TEST(${BACKEND_NAME}, cosh) ...@@ -2877,7 +2866,6 @@ TEST(${BACKEND_NAME}, cosh)
TEST(${BACKEND_NAME}, tanh) TEST(${BACKEND_NAME}, tanh)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6}; Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Tanh>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Tanh>(A), op::ParameterVector{A});
...@@ -2897,12 +2885,11 @@ TEST(${BACKEND_NAME}, tanh) ...@@ -2897,12 +2885,11 @@ TEST(${BACKEND_NAME}, tanh)
input.begin(), input.end(), input.begin(), [](float x) -> float { return tanhf(x); }); input.begin(), input.end(), input.begin(), [](float x) -> float { return tanhf(x); });
cf->call({a}, {result}); cf->call({a}, {result});
EXPECT_EQ(input, read_vector<float>(result)); EXPECT_TRUE(test::all_close(input, read_vector<float>(result)));
} }
TEST(${BACKEND_NAME}, exp) TEST(${BACKEND_NAME}, exp)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{8}; Shape shape{8};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Exp>(A), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Exp>(A), op::ParameterVector{A});
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment