Commit a94d46d4 authored by Jayaram Bobba's avatar Jayaram Bobba

Merge remote-tracking branch 'origin/master' into jbobba/batchnorm-layouts

parents d6000754 9cca4073
......@@ -272,13 +272,6 @@ namespace ngraph
const char* cbeta = "0.0f";
if (args.size() > 2)
{
writer << "memcpy(" << out[0].get_name() << ", " << args[2].get_name() << ", "
<< out[0].get_size() * out[0].get_element_type().size() << ");\n";
cbeta = "1.0f";
}
writer << "cblas::cblas_sgemm("
<< "cblas::Layout::RowMajor, " << tranpose_a << tranpose_b << m << ", " << n
<< ", " << k << ",\n"
......@@ -287,6 +280,101 @@ namespace ngraph
<< " " << out[0].get_name() << ", " << max(1UL, arg2_shape[1])
<< ");\n";
if (args.size() > 2)
{
auto axes = cg->get_broadcast_axes();
if (axes.size() == 1)
{
if (*(axes.begin()) == 0)
{
writer << "static " << out[0].get_element_type().c_type_string()
<< " ones_row[" << arg2_shape[0] << "]"
<< " = { 1.0f";
for (size_t i = 1; i < arg2_shape[0]; ++i)
{
writer << ", 1.0f";
}
writer << "};\n";
writer << "cblas::cblas_sgemm("
<< "cblas::Layout::RowMajor, " << cnotranspose << cnotranspose
<< arg2_shape[0] << ", " << arg2_shape[1] << ", 1"
<< ",\n"
<< " 1.0f, ones_row, "
<< "1"
<< ", " << args[2].get_name() << ", " << max(1UL, arg2_shape[1])
<< ", "
<< "1.0f"
<< ",\n"
<< " " << out[0].get_name() << ", "
<< max(1UL, arg2_shape[1]) << ");\n";
}
else
{
writer << "static " << out[0].get_element_type().c_type_string()
<< " ones_col[" << arg2_shape[1] << "]"
<< " = { 1.0f";
for (size_t i = 1; i < arg2_shape[1]; ++i)
{
writer << ", 1.0f";
}
writer << "};\n";
writer << "cblas::cblas_sgemm("
<< "cblas::Layout::RowMajor, " << cnotranspose << ctranspose
<< arg2_shape[0] << ", " << arg2_shape[1] << ", 1"
<< ",\n"
<< " 1.0f, ones_col," << max(1UL, arg2_shape[1]) << ", "
<< args[2].get_name() << ", "
<< "1"
<< ", "
<< "1.0f"
<< ",\n"
<< " " << out[0].get_name() << ", "
<< max(1UL, arg2_shape[1]) << ");\n";
}
}
else
{
if (axes.size() != 2)
{
throw ngraph_error("unexpected broadcast rank");
}
writer << out[0].get_element_type().c_type_string() << " bias["
<< arg2_shape[1] << "]"
<< " = { " << args[2].get_name() << "[0]";
for (size_t i = 1; i < arg2_shape[1]; ++i)
{
writer << "," << args[2].get_name() << "[0]";
}
writer << "};\n";
writer << "static " << out[0].get_element_type().c_type_string()
<< " ones_scalar[" << arg2_shape[0] << "]"
<< " = { 1.0f";
for (size_t i = 1; i < arg2_shape[0]; ++i)
{
writer << ", 1.0f";
}
writer << "};\n";
writer << "cblas::cblas_sgemm("
<< "cblas::Layout::RowMajor, " << cnotranspose << cnotranspose
<< arg2_shape[0] << ", " << arg2_shape[1] << ", 1"
<< ",\n"
<< " 1.0f, ones_scalar, "
<< "1"
<< ", "
<< "bias"
<< ", " << max(1UL, arg2_shape[1]) << ", "
<< "1.0f"
<< ",\n"
<< " " << out[0].get_name() << ", " << max(1UL, arg2_shape[1])
<< ");\n";
}
}
writer.indent--;
writer << "}\n";
}
......
......@@ -32,7 +32,8 @@ std::shared_ptr<ngraph::Node>
m_shape_w,
m_shape_x,
m_transpose_w,
m_transpose_x);
m_transpose_x,
m_broadcast_axes);
}
ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
......@@ -41,7 +42,8 @@ ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
Shape shape_w,
Shape shape_x,
bool transpose_w,
bool transpose_x)
bool transpose_x,
AxisSet axes)
: RequiresTensorViewArgs("MatMulBias",
b == nullptr ? std::vector<std::shared_ptr<Node>>{W, x}
: std::vector<std::shared_ptr<Node>>{W, x, b})
......@@ -49,8 +51,24 @@ ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
, m_shape_x(shape_x)
, m_transpose_w(transpose_w)
, m_transpose_x(transpose_x)
, m_broadcast_axes(axes)
{
if (axes.size() == 0 && b != nullptr)
{
throw ngraph_error("Bias but no broadcast axes");
}
if (b == nullptr && axes.size() != 0)
{
throw ngraph_error("Broadcast axes but no bias");
}
if (axes.size() > 2)
{
throw ngraph_error("Broadcasting to > 2D tensor");
}
if (shape_w.size() != 2)
{
NGRAPH_DEBUG << "W shape = " << vector_to_string(shape_w);
......
......@@ -16,6 +16,7 @@
#pragma once
#include "ngraph/axis_set.hpp"
#include "ngraph/ops/util/requires_tensor_view_args.hpp"
namespace ngraph
......@@ -31,12 +32,14 @@ namespace ngraph
Shape shape_w,
Shape shape_x,
bool transpose_w,
bool transpose_x);
bool transpose_x,
AxisSet axes = AxisSet{});
bool get_is_arg0_transposed() const { return m_transpose_w; }
bool get_is_arg1_transposed() const { return m_transpose_x; }
Shape get_arg0_shape() const { return m_shape_w; }
Shape get_arg1_shape() const { return m_shape_x; }
const AxisSet& get_broadcast_axes() const { return m_broadcast_axes; }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
......@@ -45,6 +48,7 @@ namespace ngraph
Shape m_shape_x;
bool m_transpose_w;
bool m_transpose_x;
AxisSet m_broadcast_axes;
};
}
}
......@@ -134,12 +134,21 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_matmulbias_pattern()
<< m.match_root()->get_name();
auto mpattern = m.match_root(); //add
auto m_matmul = mpattern->get_input_op(0);
auto m_broadcast = mpattern->get_input_op(1);
auto m_matmul = std::dynamic_pointer_cast<op::MatmulBias>(mpattern->get_input_op(0));
auto m_broadcast = std::dynamic_pointer_cast<op::Broadcast>(mpattern->get_input_op(1));
auto m_bias = m_broadcast->get_input_op(0);
auto pattern_map = m.get_pattern_map();
return m_matmul->copy_with_new_args(
NodeVector{pattern_map[W], pattern_map[x], m_broadcast});
auto mmb = std::make_shared<op::MatmulBias>(pattern_map[W],
pattern_map[x],
m_bias,
m_matmul->get_arg0_shape(),
m_matmul->get_arg1_shape(),
m_matmul->get_is_arg0_transposed(),
m_matmul->get_is_arg1_transposed(),
m_broadcast->get_broadcast_axes());
return mmb;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(padd, callback);
......
......@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
namespace ngraph
......@@ -22,10 +21,10 @@ namespace ngraph
{
namespace gpu
{
void CudaKernelBuilder::get_1_element_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
void CudaKernelBuilder::get_unary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
{
kernel = R"(
extern "C" __global__
......@@ -40,10 +39,10 @@ out[tid] =)" + op + "(in[tid]);\n" +
return;
}
void CudaKernelBuilder::get_2_element_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
void CudaKernelBuilder::get_binary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
{
kernel = R"(
extern "C" __global__
......@@ -60,10 +59,11 @@ out[tid] = in1[tid] )" + op +
return;
}
void CudaKernelBuilder::get_n_element_op(const std::string& name,
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel)
void
CudaKernelBuilder::get_arbitrary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel)
{
kernel = "";
return;
......
......@@ -28,20 +28,20 @@ namespace ngraph
class CudaKernelBuilder
{
public:
static void get_1_element_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel);
static void get_unary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel);
static void get_2_element_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel);
static void get_binary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel);
static void get_n_element_op(const std::string& name,
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel);
static void get_arbitrary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel);
};
}
}
......
......@@ -17,10 +17,8 @@
#include <algorithm>
#include <map>
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
namespace ngraph
{
......@@ -28,40 +26,6 @@ namespace ngraph
{
namespace gpu
{
void emit_abs(void* in, void* out, size_t count)
{
std::string name = "abs";
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
CudaKernelBuilder::get_1_element_op(name, "float", "fabsf", kernel);
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
{
......
......@@ -18,6 +18,9 @@
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/coordinate.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/strides.hpp"
namespace ngraph
......@@ -26,9 +29,46 @@ namespace ngraph
{
namespace gpu
{
void emit_abs(void* in, void* out, size_t count);
template <typename T>
struct CudaOpMap;
void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count);
template <typename T>
void emit_unary_elementwise_op(void* in, void* out, size_t count, std::string name)
{
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
CudaKernelBuilder::get_unary_elementwise_op(
name, "float", CudaOpMap<T>::op, kernel);
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
}
}
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
namespace ngraph
{
namespace op
{
class Abs;
class Acos;
class Asin;
class Atan;
class Ceiling;
class Cos;
class Cosh;
class Exp;
class Floor;
class Log;
class Sin;
class Sinh;
class Tan;
class Tanh;
// Unimplemented or unused in favor of cuDNN impl.
class Max;
class Min;
class Negative;
class Not;
class Sign;
class Sqrt;
}
namespace runtime
{
namespace gpu
{
template <>
struct CudaOpMap<ngraph::op::Abs>
{
static constexpr const char* op = "fabsf";
};
template <>
struct CudaOpMap<ngraph::op::Acos>
{
static constexpr const char* op = "acosf";
};
template <>
struct CudaOpMap<ngraph::op::Asin>
{
static constexpr const char* op = "asinf";
};
template <>
struct CudaOpMap<ngraph::op::Atan>
{
static constexpr const char* op = "atanf";
};
template <>
struct CudaOpMap<ngraph::op::Ceiling>
{
static constexpr const char* op = "ceilf";
};
template <>
struct CudaOpMap<ngraph::op::Cos>
{
static constexpr const char* op = "cosf";
};
template <>
struct CudaOpMap<ngraph::op::Cosh>
{
static constexpr const char* op = "coshf";
};
template <>
struct CudaOpMap<ngraph::op::Exp>
{
static constexpr const char* op = "expf";
};
template <>
struct CudaOpMap<ngraph::op::Floor>
{
static constexpr const char* op = "floorf";
};
template <>
struct CudaOpMap<ngraph::op::Log>
{
static constexpr const char* op = "logf";
};
template <>
struct CudaOpMap<ngraph::op::Max>
{
static constexpr const char* op = "fmaxf";
};
template <>
struct CudaOpMap<ngraph::op::Min>
{
static constexpr const char* op = "fminf";
};
template <>
struct CudaOpMap<ngraph::op::Sin>
{
static constexpr const char* op = "sinf";
};
template <>
struct CudaOpMap<ngraph::op::Sinh>
{
static constexpr const char* op = "sinhf";
};
template <>
struct CudaOpMap<ngraph::op::Sqrt>
{
static constexpr const char* op = "sqrtf";
};
template <>
struct CudaOpMap<ngraph::op::Tan>
{
static constexpr const char* op = "tanf";
};
template <>
struct CudaOpMap<ngraph::op::Tanh>
{
static constexpr const char* op = "tanhf";
};
}
}
}
This diff is collapsed.
......@@ -45,8 +45,8 @@ namespace ngraph
static void EMITTER_DECL(EmitMultiply);
static void EMITTER_DECL(EmitGetOutputElement);
static void EMITTER_DECL(EmitXLAGetTupleElement);
static void EMITTER_DECL(EmitUnaryElementwise);
static void EMITTER_DECL(EmitTuple);
static void EMITTER_DECL(EmitAbs);
static void EMITTER_DECL(EmitConcat);
static void EMITTER_DECL(EmitDivide);
static void EMITTER_DECL(EmitEqual);
......@@ -54,7 +54,6 @@ namespace ngraph
static void EMITTER_DECL(EmitGreaterEq);
static void EMITTER_DECL(EmitLess);
static void EMITTER_DECL(EmitLessEq);
static void EMITTER_DECL(EmitLog);
static void EMITTER_DECL(EmitMaximum);
static void EMITTER_DECL(EmitMinimum);
static void EMITTER_DECL(EmitNegative);
......@@ -67,31 +66,18 @@ namespace ngraph
static void EMITTER_DECL(EmitReshape);
static void EMITTER_DECL(EmitFunctionCall);
static void EMITTER_DECL(EmitReduce);
static void EMITTER_DECL(EmitSign);
static void EMITTER_DECL(EmitSlice);
static void EMITTER_DECL(EmitSum);
static void EMITTER_DECL(EmitExp);
static void EMITTER_DECL(EmitSin);
static void EMITTER_DECL(EmitSinh);
static void EMITTER_DECL(EmitCos);
static void EMITTER_DECL(EmitCosh);
static void EMITTER_DECL(EmitTan);
static void EMITTER_DECL(EmitTanh);
static void EMITTER_DECL(EmitAsin);
static void EMITTER_DECL(EmitAcos);
static void EMITTER_DECL(EmitAtan);
static void EMITTER_DECL(EmitPower);
static void EMITTER_DECL(EmitReplaceSlice);
static void EMITTER_DECL(EmitOneHot);
static void EMITTER_DECL(EmitFloor);
static void EMITTER_DECL(EmitCeiling);
static void EMITTER_DECL(EmitSqrt);
static void EMITTER_DECL(EmitConvolution);
static void EMITTER_DECL(EmitNot);
static void EMITTER_DECL(EmitMaxPool);
static void EMITTER_DECL(EmitReverse);
static void EMITTER_DECL(EmitReduceWindow);
static void EMITTER_DECL(EmitSelectAndScatter);
static void EMITTER_DECL(EmitResult);
};
}
}
......
......@@ -151,7 +151,7 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::EmitDot},
{TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::EmitMultiply},
{TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::EmitNop},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitAbs},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::EmitConcat},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitDivide},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitEqual},
......@@ -159,7 +159,7 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitGreaterEq},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitLess},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitLessEq},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitLog},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::EmitMaximum},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::EmitMinimum},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::EmitNegative},
......@@ -173,30 +173,31 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::EmitReshape},
{TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::EmitFunctionCall},
{TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::EmitReduce},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitSign},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::EmitSlice},
{TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::EmitSum},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitExp},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitSin},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitSinh},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitCos},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitCosh},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitTan},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitTanh},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitAsin},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitAcos},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitAtan},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::EmitReplaceSlice},
{TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::EmitOneHot},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitFloor},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitCeiling},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::EmitSqrt},
{TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::EmitConvolution},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitNot},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::EmitMaxPool},
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse},
{TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::EmitReduceWindow},
{TI(ngraph::op::SelectAndScatter), &runtime::gpu::GPU_Emitter::EmitSelectAndScatter},
{TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::EmitResult},
};
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
......@@ -250,6 +251,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
)";
......@@ -346,12 +348,15 @@ using namespace std;
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
auto c_value_strings = c->get_value_strings();
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
<< tv->get_tensor().get_name() << "[" << c_value_strings.size() << "] =\n";
<< tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
<< "] =\n";
writer << "{\n";
writer.indent++;
writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
writer.indent--;
writer << "\n};\n\n";
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
<< tv->get_tensor().get_name() << ";\n";
m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
}
}
......@@ -485,6 +490,26 @@ using namespace std;
writer << "{\n";
writer.indent++;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv =
node->get_outputs()[0].get_tensor_view();
writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
writer << "{\n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name() << ", "
<< tv->get_tensor().get_name() << "_cpu, " << tv->get_tensor().size()
<< ");\n";
writer.indent--;
writer << "}\n";
}
}
}
bool temporaries_used = false;
size_t worst_case_tmp_size = 0;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
......@@ -657,7 +682,6 @@ using namespace std;
// Emit operation epilogue
if (!node->is_parameter() && !node->is_constant())
{
handle_output_alias(writer, *node, output_alias_map);
if (m_emit_timing)
{
emit_debug_function_exit(writer, node.get(), in, out);
......
......@@ -1305,6 +1305,7 @@ TEST(${BACKEND_NAME}, backwards_slice)
TEST(${BACKEND_NAME}, backwards_softmax_all)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1322,6 +1323,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_all)
TEST(${BACKEND_NAME}, backwards_softmax_axis)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1339,6 +1341,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_axis)
TEST(${BACKEND_NAME}, backwards_softmax_underflow)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1358,6 +1361,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_underflow)
TEST(${BACKEND_NAME}, backwards_softmax_3d)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......
......@@ -120,6 +120,7 @@ TEST(${BACKEND_NAME}, component_cleanup)
TEST(${BACKEND_NAME}, aliased_output)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -335,7 +336,6 @@ TEST(${BACKEND_NAME}, abs)
TEST(${BACKEND_NAME}, ceiling)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Ceiling>(A), op::ParameterVector{A});
......@@ -776,7 +776,6 @@ TEST(${BACKEND_NAME}, equal)
TEST(${BACKEND_NAME}, floor)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Floor>(A), op::ParameterVector{A});
......@@ -1370,7 +1369,6 @@ TEST(${BACKEND_NAME}, lesseq_bool)
TEST(${BACKEND_NAME}, log)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Log>(A), op::ParameterVector{A});
......@@ -2673,7 +2671,6 @@ TEST(${BACKEND_NAME}, reshape_6d)
TEST(${BACKEND_NAME}, sin)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sin>(A), op::ParameterVector{A});
......@@ -2699,7 +2696,6 @@ TEST(${BACKEND_NAME}, sin)
TEST(${BACKEND_NAME}, cos)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Cos>(A), op::ParameterVector{A});
......@@ -2725,7 +2721,6 @@ TEST(${BACKEND_NAME}, cos)
TEST(${BACKEND_NAME}, tan)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Tan>(A), op::ParameterVector{A});
......@@ -2746,12 +2741,11 @@ TEST(${BACKEND_NAME}, tan)
input.begin(), input.end(), input.begin(), [](float x) -> float { return tanf(x); });
cf->call({a}, {result});
EXPECT_EQ(input, read_vector<float>(result));
EXPECT_TRUE(test::all_close(input, read_vector<float>(result)));
}
TEST(${BACKEND_NAME}, asin)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Asin>(A), op::ParameterVector{A});
......@@ -2776,7 +2770,6 @@ TEST(${BACKEND_NAME}, asin)
TEST(${BACKEND_NAME}, acos)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Acos>(A), op::ParameterVector{A});
......@@ -2801,7 +2794,6 @@ TEST(${BACKEND_NAME}, acos)
TEST(${BACKEND_NAME}, atan)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Atan>(A), op::ParameterVector{A});
......@@ -2826,7 +2818,6 @@ TEST(${BACKEND_NAME}, atan)
TEST(${BACKEND_NAME}, sinh)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sinh>(A), op::ParameterVector{A});
......@@ -2851,7 +2842,6 @@ TEST(${BACKEND_NAME}, sinh)
TEST(${BACKEND_NAME}, cosh)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Cosh>(A), op::ParameterVector{A});
......@@ -2876,7 +2866,6 @@ TEST(${BACKEND_NAME}, cosh)
TEST(${BACKEND_NAME}, tanh)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{6};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Tanh>(A), op::ParameterVector{A});
......@@ -2896,12 +2885,11 @@ TEST(${BACKEND_NAME}, tanh)
input.begin(), input.end(), input.begin(), [](float x) -> float { return tanhf(x); });
cf->call({a}, {result});
EXPECT_EQ(input, read_vector<float>(result));
EXPECT_TRUE(test::all_close(input, read_vector<float>(result)));
}
TEST(${BACKEND_NAME}, exp)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{8};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Exp>(A), op::ParameterVector{A});
......@@ -8441,6 +8429,7 @@ TEST(${BACKEND_NAME}, relu_4Dbackprop)
TEST(${BACKEND_NAME}, softmax_all)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f =
......@@ -8473,6 +8462,7 @@ TEST(${BACKEND_NAME}, softmax_all)
TEST(${BACKEND_NAME}, softmax_axis)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{1}), op::ParameterVector{A});
......@@ -8501,6 +8491,7 @@ TEST(${BACKEND_NAME}, softmax_axis)
TEST(${BACKEND_NAME}, softmax_underflow)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{0}), op::ParameterVector{A});
......
......@@ -91,11 +91,89 @@ TEST(cpu_fusion, gemm_pattern)
ASSERT_EQ(n.get_pattern_map()[x], B);
ASSERT_EQ(n.get_pattern_map()[b], C);
auto cg =
make_shared<op::MatmulBias>(W, x, broadcast, W->get_shape(), x->get_shape(), false, false);
auto cg = make_shared<op::MatmulBias>(
W, x, C, W->get_shape(), x->get_shape(), false, false, AxisSet{0});
}
TEST(cpu_fusion, gemm_cpu_broadcast_row)
{
Shape shapeA{3, 2};
Shape shapeB{2, 3};
Shape shapeC{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shapeA);
auto B = make_shared<op::Parameter>(element::f32, shapeB);
auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3});
auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0});
auto cg = make_shared<op::MatmulBias>(
A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0});
auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
auto manager = runtime::Manager::get("CPU");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shapeA);
shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shapeB);
shared_ptr<runtime::TensorView> result =
backend->make_primary_tensor_view(element::f32, shapeC);
vector<float> dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f};
vector<float> dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f};
copy_data(a, dataA);
copy_data(b, dataB);
cf->call({a, b}, {result});
vector<float> expected{10, 28, 37, 109};
ASSERT_TRUE(read_vector<float>(result) == expected);
}
TEST(cpu_fusion, gemm_cpu)
TEST(cpu_fusion, gemm_cpu_broadcast_column)
{
Shape shapeA{3, 2};
Shape shapeB{2, 3};
Shape shapeC{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shapeA);
auto B = make_shared<op::Parameter>(element::f32, shapeB);
auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3});
auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{1});
auto cg = make_shared<op::MatmulBias>(
A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{1});
auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
auto manager = runtime::Manager::get("CPU");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shapeA);
shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shapeB);
shared_ptr<runtime::TensorView> result =
backend->make_primary_tensor_view(element::f32, shapeC);
vector<float> dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f};
vector<float> dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f};
copy_data(a, dataA);
copy_data(b, dataB);
cf->call({a, b}, {result});
vector<float> expected{10, 28, 37, 109};
ASSERT_TRUE(read_vector<float>(result) == expected);
}
TEST(cpu_fusion, gemm_cpu_broadcast_matrix)
{
Shape shapeA{3, 2};
Shape shapeB{2, 3};
......@@ -109,8 +187,8 @@ TEST(cpu_fusion, gemm_cpu)
auto one = op::Constant::create<float>(element::f32, Shape{}, std::vector<float>{1.0f});
auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0, 1});
auto cg =
make_shared<op::MatmulBias>(A, B, broadcast, A->get_shape(), B->get_shape(), true, true);
auto cg = make_shared<op::MatmulBias>(
A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0, 1});
auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
......@@ -212,7 +290,7 @@ TEST(cpu_fusion, cpu_fusion_pass_matmul_bias)
pass_manager.run_passes(func);
auto gmm = graph->get_input_op(0);
ASSERT_TRUE(std::dynamic_pointer_cast<op::MatmulBias>(gmm));
ASSERT_EQ(gmm->get_input_op(2), broadcast);
ASSERT_EQ(gmm->get_input_op(2), b);
}
TEST(cpu_fusion, cpu_fusion_pass_matmul_no_bias)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment