Unverified Commit 7df687c1 authored by Matthew Brookhart's avatar Matthew Brookhart Committed by GitHub

Switch from Eigen to OpenMP for loops for DS2 kernels (#345)

* speed up reduceslice with kernel emitter

* const-ify and fix a clang warning

* add elementwise ops, slice to for loops

* add broadcast codegen

* add Exp

* fix bugs introduced in eigen kernels

* fix another introduced bug in Eigen

* Fix an Atomic Bug with Sum, do some cleanup

* unit tests pass

* Add Reshape Op, passes Tests

* rewrite sum to correctly handle multi-threading

* Code Cleanup

* add some extra unary ops

* Address review comments

* fix an error in the review comment refactor

* Add Power op

* Add (most) of the Logic Ops

* Make Concat default to OpenMP kernel

* fix n-D reshape issue
parent 7e89f1bb
......@@ -43,6 +43,8 @@
using namespace std;
using namespace ngraph;
#define PREFER_EIGEN 0
static string eigen_vector_format(const runtime::cpu::TensorViewWrapper& tvi)
{
return "fmt::V{" + to_string(tvi.get_size()) + "}";
......@@ -69,6 +71,7 @@ void runtime::cpu::CPU_Emitter::EmitAdd(const ngraph::Node* n,
// the right alignment instead of Eigen::Unaligned
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << "Eigen::Map<Eigen::Array<" << out[0].get_element_type().c_type_string() << ", "
<< out[0].get_size() << ", 1>, Eigen::Unaligned> out(" << out[0].get_name() << ");\n";
m_out << "Eigen::Map<Eigen::Array<" << args[0].get_element_type().c_type_string() << ", "
......@@ -76,7 +79,14 @@ void runtime::cpu::CPU_Emitter::EmitAdd(const ngraph::Node* n,
m_out << "Eigen::Map<Eigen::Array<" << args[1].get_element_type().c_type_string() << ", "
<< args[1].get_size() << ", 1>, Eigen::Unaligned> arg1(" << args[1].get_name() << ");\n";
m_out << "out = arg0 + arg1;\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] + "
<< args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -166,9 +176,18 @@ void runtime::cpu::CPU_Emitter::EmitMultiply(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << " *\n"
<< " " << emit_array1d(args[1]) << ";\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] * "
<< args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -209,8 +228,16 @@ void runtime::cpu::CPU_Emitter::EmitAbs(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n";
m_out << "Eigen::abs(" << emit_array1d(args[0]) << ");\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = std::abs(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -221,6 +248,7 @@ void runtime::cpu::CPU_Emitter::EmitConcat(const ngraph::Node* n,
{
auto result_shape = out[0].get_shape();
#if PREFER_EIGEN == 1
if (result_shape.size() == 1)
{
m_out << "{ // " << n->get_name() << "\n";
......@@ -294,15 +322,35 @@ void runtime::cpu::CPU_Emitter::EmitConcat(const ngraph::Node* n,
arg_shapes.push_back(arg.get_shape());
}
kernels::emit_concat(m_out,
args[0].get_element_type().c_type_string(),
arg_names,
out[0].get_name(),
arg_shapes,
result_shape,
axis);
kernel::emit_concat(m_out,
args[0].get_element_type().c_type_string(),
arg_names,
out[0].get_name(),
arg_shapes,
result_shape,
axis);
}
}
#else
auto axis = (dynamic_cast<const op::Concat*>(n))->get_concatenation_axis();
std::vector<std::string> arg_names;
std::vector<Shape> arg_shapes;
for (auto arg : args)
{
arg_names.push_back(arg.get_name());
arg_shapes.push_back(arg.get_shape());
}
kernel::emit_concat(m_out,
args[0].get_element_type().c_type_string(),
arg_names,
out[0].get_name(),
arg_shapes,
result_shape,
axis);
#endif
}
void runtime::cpu::CPU_Emitter::EmitDivide(const ngraph::Node* n,
......@@ -321,9 +369,18 @@ void runtime::cpu::CPU_Emitter::EmitDivide(const ngraph::Node* n,
<< "[i] == 0) throw std::runtime_error(\"integer divide by zero\");\n";
m_out << "}\n";
}
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << " /\n"
<< " " << emit_array1d(args[1]) << ";\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] / "
<< args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -334,9 +391,18 @@ void runtime::cpu::CPU_Emitter::EmitEqual(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " (" << emit_array1d(args[0]) << " ==\n"
<< " " << emit_array1d(args[1]) << ").template cast<char>();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name()
<< "[i] == " << args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -347,9 +413,18 @@ void runtime::cpu::CPU_Emitter::EmitGreater(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << " xxx\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " (" << emit_array1d(args[0]) << " >\n"
<< " " << emit_array1d(args[1]) << ").template cast<char>();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] > "
<< args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -360,9 +435,18 @@ void runtime::cpu::CPU_Emitter::EmitGreaterEq(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " (" << emit_array1d(args[0]) << " >=\n"
<< " " << emit_array1d(args[1]) << ").template cast<char>();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name()
<< "[i] >= " << args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -373,9 +457,18 @@ void runtime::cpu::CPU_Emitter::EmitLess(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " (" << emit_array1d(args[0]) << " <\n"
<< " " << emit_array1d(args[1]) << ").template cast<char>();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] < "
<< args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -386,9 +479,18 @@ void runtime::cpu::CPU_Emitter::EmitLessEq(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " (" << emit_array1d(args[0]) << " <=\n"
<< " " << emit_array1d(args[1]) << ").template cast<char>();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name()
<< "[i] <= " << args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -399,8 +501,16 @@ void runtime::cpu::CPU_Emitter::EmitLog(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " Eigen::log(" << emit_array1d(args[0]) << ");\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = log(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -411,9 +521,19 @@ void runtime::cpu::CPU_Emitter::EmitMaximum(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".max(\n"
<< " " << emit_array1d(args[1]) << ");\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] > "
<< args[1].get_name() << "[i] ? " << args[0].get_name() << "[i] : " << args[1].get_name()
<< "[i] ;\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -424,9 +544,19 @@ void runtime::cpu::CPU_Emitter::EmitMinimum(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".min(\n"
<< " " << emit_array1d(args[1]) << ");\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] < "
<< args[1].get_name() << "[i] ? " << args[0].get_name() << "[i] : " << args[1].get_name()
<< "[i] ;\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -437,8 +567,16 @@ void runtime::cpu::CPU_Emitter::EmitNegative(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " -" << emit_array1d(args[0]) << ";\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = -" << args[0].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -449,9 +587,18 @@ void runtime::cpu::CPU_Emitter::EmitNotEqual(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " (" << emit_array1d(args[0]) << " !=\n"
<< " " << emit_array1d(args[1]) << ").template cast<char>();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name()
<< "[i] != " << args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -476,9 +623,18 @@ void runtime::cpu::CPU_Emitter::EmitSubtract(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << " -\n"
<< " " << emit_array1d(args[1]) << ";\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] - "
<< args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -489,6 +645,9 @@ void runtime::cpu::CPU_Emitter::EmitBroadcast(const ngraph::Node* n,
{
auto broadcast = static_cast<const op::Broadcast*>(n);
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
auto arg_shape = args[0].get_shape();
auto result_shape = out[0].get_shape();
......@@ -554,6 +713,17 @@ void runtime::cpu::CPU_Emitter::EmitBroadcast(const ngraph::Node* n,
m_out << " {" << join(result_shape) << "},\n";
m_out << " {" << join(broadcast->get_broadcast_axes()) << "});\n";
}
#else
kernel::emit_broadcast(m_out,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
out[0].get_name(),
args[0].get_shape(),
out[0].get_shape(),
broadcast->get_broadcast_axes());
#endif
m_out.indent--;
m_out << "}\n";
}
void runtime::cpu::CPU_Emitter::EmitConvert(const ngraph::Node* n,
......@@ -582,7 +752,9 @@ void runtime::cpu::CPU_Emitter::EmitReshape(const ngraph::Node* n,
const vector<runtime::cpu::TensorViewWrapper>& out)
{
auto reshape = static_cast<const op::Reshape*>(n);
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
auto arg_shape = args[0].get_shape();
auto arg_rank = arg_shape.size();
......@@ -645,6 +817,17 @@ void runtime::cpu::CPU_Emitter::EmitReshape(const ngraph::Node* n,
throw ngraph_error(
"Axis permutation in reshape is not implemented yet for tensors with rank>2");
}
#else
kernel::emit_reshape(m_out,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
out[0].get_name(),
args[0].get_shape(),
out[0].get_shape(),
reshape->get_input_order());
#endif
m_out.indent--;
m_out << "}\n";
}
void runtime::cpu::CPU_Emitter::EmitFunctionCall(
......@@ -860,6 +1043,9 @@ void runtime::cpu::CPU_Emitter::EmitSlice(const ngraph::Node* n,
{
const op::Slice* slice = static_cast<const op::Slice*>(n);
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
size_t arg_rank = args[0].get_shape().size();
const Coordinate& lower_bounds = slice->get_lower_bounds();
......@@ -919,6 +1105,19 @@ void runtime::cpu::CPU_Emitter::EmitSlice(const ngraph::Node* n,
m_out << " {" << join(slice->get_strides()) << "},\n";
m_out << " {" << join(out[0].get_shape()) << "});\n";
}
#else
kernel::emit_slice(m_out,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
out[0].get_name(),
args[0].get_shape(),
out[0].get_shape(),
slice->get_lower_bounds(),
slice->get_upper_bounds(),
slice->get_strides());
#endif
m_out.indent--;
m_out << "}\n";
}
void runtime::cpu::CPU_Emitter::EmitSum(const ngraph::Node* n,
......@@ -926,6 +1125,9 @@ void runtime::cpu::CPU_Emitter::EmitSum(const ngraph::Node* n,
const vector<runtime::cpu::TensorViewWrapper>& out)
{
const op::Sum* sum = static_cast<const op::Sum*>(n);
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
const Shape& arg_shape = args[0].get_shape();
size_t arg_rank = arg_shape.size();
const AxisSet& reduction_axes = sum->get_reduction_axes();
......@@ -977,6 +1179,17 @@ void runtime::cpu::CPU_Emitter::EmitSum(const ngraph::Node* n,
m_out << " {" << join(out[0].get_shape()) << "},\n";
m_out << " {" << join(sum->get_reduction_axes()) << "});\n";
}
#else
kernel::emit_sum(m_out,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
out[0].get_name(),
args[0].get_shape(),
out[0].get_shape(),
sum->get_reduction_axes());
#endif
m_out.indent--;
m_out << "}\n";
}
void runtime::cpu::CPU_Emitter::EmitExp(const ngraph::Node* n,
......@@ -985,8 +1198,16 @@ void runtime::cpu::CPU_Emitter::EmitExp(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".exp();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = exp(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -997,8 +1218,16 @@ void runtime::cpu::CPU_Emitter::EmitSin(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".sin();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = sin(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1009,8 +1238,16 @@ void runtime::cpu::CPU_Emitter::EmitSinh(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".sinh();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = sinh(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1021,8 +1258,16 @@ void runtime::cpu::CPU_Emitter::EmitCos(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".cos();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = cos(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1033,8 +1278,16 @@ void runtime::cpu::CPU_Emitter::EmitCosh(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".cosh();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = cosh(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1045,8 +1298,16 @@ void runtime::cpu::CPU_Emitter::EmitTan(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".tan();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = tan(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1061,6 +1322,9 @@ void runtime::cpu::CPU_Emitter::EmitTanh(const ngraph::Node* n,
// by models
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 0
m_out << "#pragma omp parallel for\n";
#endif
m_out << "for (size_t i=0; i<" << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = tanh(" << args[0].get_name() << "[i]);\n";
......@@ -1075,8 +1339,16 @@ void runtime::cpu::CPU_Emitter::EmitAsin(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".asin();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = asin(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1087,8 +1359,16 @@ void runtime::cpu::CPU_Emitter::EmitAcos(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".acos();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = acos(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1099,8 +1379,16 @@ void runtime::cpu::CPU_Emitter::EmitAtan(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".atan();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = atan(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1111,11 +1399,21 @@ void runtime::cpu::CPU_Emitter::EmitPower(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " = \n";
m_out.indent++;
m_out << emit_array1d(args[0]) << ".pow(\n ";
m_out << emit_array1d(args[1]) << ");\n";
m_out.indent -= 2;
m_out.indent--;
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = pow(" << args[0].get_name() << "[i], "
<< args[1].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1125,7 +1423,9 @@ void runtime::cpu::CPU_Emitter::EmitReplaceSlice(
const vector<runtime::cpu::TensorViewWrapper>& out)
{
auto replace_slice = static_cast<const op::Slice*>(n);
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
size_t arg0_rank = args[0].get_shape().size();
auto& lower_bounds = replace_slice->get_lower_bounds();
......@@ -1192,6 +1492,20 @@ void runtime::cpu::CPU_Emitter::EmitReplaceSlice(
m_out << " {" << join(replace_slice->get_strides()) << "},\n";
m_out << " {" << join(out[0].get_shape()) << "});\n";
}
#else
kernel::emit_replace_slice(m_out,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
args[1].get_name(),
out[0].get_name(),
args[1].get_shape(),
out[0].get_shape(),
replace_slice->get_lower_bounds(),
replace_slice->get_upper_bounds(),
replace_slice->get_strides());
#endif
m_out.indent--;
m_out << "}\n";
}
void runtime::cpu::CPU_Emitter::EmitOneHot(const ngraph::Node* n,
......@@ -1295,6 +1609,9 @@ void runtime::cpu::CPU_Emitter::EmitCeiling(const ngraph::Node* n,
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
size_t element_count = out[0].get_size();
#if PREFER_EIGEN == 0
m_out << "#pragma omp parallel for\n";
#endif
m_out << "for (size_t i = 0; i < " << element_count << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = ceil(" << args[0].get_name() << "[i]);\n";
......@@ -1310,6 +1627,9 @@ void runtime::cpu::CPU_Emitter::EmitFloor(const ngraph::Node* n,
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
size_t element_count = out[0].get_size();
#if PREFER_EIGEN == 0
m_out << "#pragma omp parallel for\n";
#endif
m_out << "for (size_t i = 0; i < " << element_count << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = floor(" << args[0].get_name() << "[i]);\n";
......@@ -1325,6 +1645,9 @@ void runtime::cpu::CPU_Emitter::EmitSqrt(const ngraph::Node* n,
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
size_t element_count = out[0].get_size();
#if PREFER_EIGEN == 0
m_out << "#pragma omp parallel for\n";
#endif
m_out << "for (size_t i = 0; i < " << element_count << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = sqrt(" << args[0].get_name() << "[i]);\n";
......
......@@ -11,24 +11,129 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include <algorithm>
#include <map>
#include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp"
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp"
#include "ngraph/runtime/cpu/cpu_kernel_utils.hpp"
using namespace ngraph;
using namespace ngraph::runtime::cpu::kernels;
using namespace ngraph::runtime::cpu::kernel;
// Function to take a vector of data, say 1,2,3 and return
// a string representing multi-index access, i.e "[1][2][3]".
// An empty vector yields the empty string.
// Takes the vector by const reference: the original passed by value,
// copying the container on every call for no benefit.
template <typename T>
std::string emit_bracketed_string(const std::vector<T>& data)
{
    std::stringstream ss;
    // The loop naturally produces "" for an empty vector, so no special case
    // is needed.
    for (const auto& s : data)
    {
        ss << "[" << s << "]";
    }
    return ss.str();
}
// Reinterpret a flat buffer as a C-style multi-dimensional array so the
// generated code can index it as nd_name[i][j]....  Emits one declaration
// line into `writer` and returns the freshly generated array name.
std::string recast_tmp_var(codegen::CodeWriter& writer,
                           const std::string& element_type,
                           const std::string& arg_name,
                           const Shape& arg_shape,
                           const std::string& tmp_name)
{
    // Fresh identifier for the reinterpreted view, and its "[d0][d1]..." suffix.
    const std::string nd_view = writer.generate_temporary_name(tmp_name);
    const std::string dims = emit_bracketed_string(arg_shape);

    // Emits: element_type(&nd_view)[d0][d1]... = *reinterpret_cast<element_type(*)[d0][d1]...>(arg_name);
    writer << element_type << "(&" << nd_view << ")" << dims << " = *reinterpret_cast<"
           << element_type << "(*)" << dims << ">(" << arg_name << ");\n";
    return nd_view;
}
// Emit the opening lines of one nested for-loop per entry of `top`, each
// loop running from bottom[i] (default 0) to top[i].  A fresh index
// variable is generated for each loop; the list of those variables is
// returned so close_for_loops can unwind them later.
std::vector<std::string>
    open_for_loops(codegen::CodeWriter& writer, const Shape& top, const Shape& bottom = {})
{
    // Default the lower bounds to all-zeros when none were supplied.
    const Shape lower = bottom.empty() ? Shape(top.size(), 0) : bottom;

    std::vector<std::string> loop_vars;
    loop_vars.reserve(top.size());
    for (size_t dim = 0; dim < top.size(); ++dim)
    {
        auto var = writer.generate_temporary_name("i");
        // Only the outermost loop (dim == 0) is flagged as a candidate for
        // parallelization by start_index_loop.
        writer << start_index_loop(var, lower[dim], top[dim], dim == 0);
        writer.indent++;
        loop_vars.push_back(var);
    }
    return loop_vars;
}
// Close the nested loops created by open_for_loops, innermost first.
void close_for_loops(codegen::CodeWriter& writer, const std::vector<std::string>& index_vars)
{
    for (auto it = index_vars.rbegin(); it != index_vars.rend(); ++it)
    {
        writer.indent--;
        writer << end_index_loop(*it);
    }
}
// Emit C code that broadcasts arg0 into out: each output element is copied
// from the input element whose index is the output index with the
// broadcast axes removed.
void ngraph::runtime::cpu::kernel::emit_broadcast(codegen::CodeWriter& writer,
                                                  const std::string& element_type,
                                                  const std::string& arg0, // broadcast source
                                                  const std::string& out,
                                                  const Shape& arg0_shape,
                                                  const Shape& out_shape,
                                                  const AxisSet& broadcast_axes)
{
    // View the flat buffers as N-d arrays so we can use [i][j]... indexing.
    auto src = recast_tmp_var(writer, element_type, arg0, arg0_shape, "source_nd");
    auto dst = recast_tmp_var(writer, element_type, out, out_shape, "dest_nd");

    // One generated loop per output dimension.
    auto loop_vars = open_for_loops(writer, out_shape);

    // The source index is the output index with broadcast axes dropped.
    std::vector<std::string> src_indexes;
    for (size_t axis = 0; axis < out_shape.size(); ++axis)
    {
        if (broadcast_axes.count(axis) == 0)
        {
            src_indexes.push_back(loop_vars[axis]);
        }
    }

    // Per-element copy.
    writer << dst << emit_bracketed_string(loop_vars) << " = " << src
           << emit_bracketed_string(src_indexes) << ";\n";

    close_for_loops(writer, loop_vars);
}
//
// For the reference kernel this is based on, see ngraph/runtime/kernel/concat.hpp.
//
void ngraph::runtime::cpu::kernels::emit_concat(codegen::CodeWriter& writer,
std::string element_type,
const std::vector<std::string> args,
std::string out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
size_t concatenation_axis)
void ngraph::runtime::cpu::kernel::emit_concat(codegen::CodeWriter& writer,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
size_t concatenation_axis)
{
size_t concatenation_pos = 0;
......@@ -49,3 +154,213 @@ void ngraph::runtime::cpu::kernels::emit_concat(codegen::CodeWriter& writer,
concatenation_pos += in_shapes[i][concatenation_axis];
}
}
// Emit C code for ReplaceSlice: the output starts as a copy of the context
// tensor (arg0), after which the replacement value (arg1) is written over
// the strided sub-region described by the bounds and strides.
void ngraph::runtime::cpu::kernel::emit_replace_slice(
    codegen::CodeWriter& writer,
    const std::string& element_type,
    const std::string& arg0, // replacement context
    const std::string& arg1, // replacement value
    const std::string& out,
    const Shape& arg1_shape,
    const Shape& out_shape,
    const Coordinate& lower_bounds,
    const Coordinate& upper_bounds,
    const Strides& strides)
{
    // First blanket-copy the whole context tensor into the output.
    CoordinateTransform whole(out_shape);
    emit_pointwise_copy(writer, element_type, arg0, out, whole, whole);

    // Then overwrite the target sub-region with the replacement value.
    CoordinateTransform value_region(arg1_shape);
    CoordinateTransform target_region(out_shape, lower_bounds, upper_bounds, strides);
    emit_pointwise_copy(writer, element_type, arg1, out, value_region, target_region);
}
// Emit C code for Slice: each output element is copied from the input
// position "lower_bound + loop_index * stride" along every non-degenerate
// axis; axes whose lower and upper bounds coincide are pinned at the lower
// bound and consume no loop variable.
void ngraph::runtime::cpu::kernel::emit_slice(codegen::CodeWriter& writer,
                                              const std::string& element_type,
                                              const std::string& arg0, // slice source
                                              const std::string& out,
                                              const Shape& arg0_shape,
                                              const Shape& out_shape,
                                              const Coordinate& lower_bounds,
                                              const Coordinate& upper_bounds,
                                              const Strides& strides)
{
    // View the flat buffers as N-d arrays.
    auto src = recast_tmp_var(writer, element_type, arg0, arg0_shape, "source_nd");
    auto dst = recast_tmp_var(writer, element_type, out, out_shape, "dest_nd");

    // One generated loop per output dimension.
    auto loop_vars = open_for_loops(writer, out_shape);

    // Build the source index expression for every input axis.
    std::vector<std::string> src_indexes;
    size_t next_loop = 0;
    for (size_t axis = 0; axis < lower_bounds.size(); ++axis)
    {
        if (lower_bounds[axis] == upper_bounds[axis])
        {
            // Degenerate axis: fixed at the lower bound.
            src_indexes.push_back(std::to_string(lower_bounds[axis]));
        }
        else
        {
            src_indexes.push_back(std::to_string(lower_bounds[axis]) + " + " +
                                  loop_vars[next_loop] + " * " + std::to_string(strides[axis]));
            ++next_loop;
        }
    }

    // Per-element copy from the strided source position.
    writer << dst << emit_bracketed_string(loop_vars) << " = " << src
           << emit_bracketed_string(src_indexes) << ";\n";

    close_for_loops(writer, loop_vars);
}
// Emit C code for Reshape: the input is walked with one loop per input
// dimension, in the order given by arg0_axis_order, and each element is
// written into the output viewed as a flat 1-D array.  Walking in permuted
// order while writing sequentially realizes the transpose.
void ngraph::runtime::cpu::kernel::emit_reshape(codegen::CodeWriter& writer,
                                                const std::string& element_type,
                                                const std::string& arg0, // replacement context
                                                const std::string& out,
                                                const Shape& arg0_shape,
                                                const Shape& out_shape,
                                                const AxisVector& arg0_axis_order)
{
    // get the total number of elements
    // NOTE(review): zero-valued dimensions are skipped in this product, so a
    // shape containing a 0 still yields a nonzero "size" for the 1-D recast
    // below -- presumably deliberate for empty tensors; confirm.
    size_t size = 1;
    for (auto x : out_shape)
    {
        if (x != 0)
            size *= x;
    }

    // create input and output arrays
    // The output is recast as a flat [size] array rather than out_shape.
    auto source_nd_name = recast_tmp_var(writer, element_type, arg0, arg0_shape, "source_nd");
    auto dest_nd_name = recast_tmp_var(writer, element_type, out, {size}, "dest_nd");

    // input_to_loop_pos maps loop position -> input axis; loop_to_input_pos
    // is its inverse (input axis -> loop position).
    std::map<size_t, size_t> input_to_loop_pos;
    std::map<size_t, size_t> loop_to_input_pos;

    // loop over the input in the order of arg0_axis_order
    int input_pos = 0;
    Shape ordered_input_shape;
    for (size_t i = 0; i < arg0_shape.size(); i++)
    {
        ordered_input_shape.push_back(arg0_shape[arg0_axis_order[i]]);
        input_to_loop_pos[input_pos] = arg0_axis_order[i];
        input_pos += 1;
    }

    // Invert the mapping so we can recover, for each input axis, which loop
    // variable iterates over it.
    for (auto kv : input_to_loop_pos)
    {
        loop_to_input_pos[kv.second] = kv.first;
    }

    // One generated loop per (permuted) input dimension.
    auto index_vars = open_for_loops(writer, ordered_input_shape);

    // write the output reshape as a 1D array by calculating the
    // position of the input iterators in the output array
    // The flat index is sum_i index_vars[i] * prod_{j>i} ordered_input_shape[j]
    // (row-major strides over the permuted shape).
    writer << dest_nd_name << "[ 0";
    for (size_t i = 0; i < arg0_shape.size(); i++)
    {
        writer << " + " << index_vars[i];
        // NOTE(review): zero dims are excluded from the stride product to
        // mirror the "size" computation above -- verify against n-D cases.
        for (auto j = i + 1; j < arg0_shape.size(); j++)
        {
            if (arg0_shape[j] > 0)
            {
                writer << " * " << ordered_input_shape[j];
            }
        }
    }
    // Read the source at the loop variables mapped back to input-axis order.
    writer << "] = " << source_nd_name;
    for (size_t i = 0; i < arg0_shape.size(); i++)
    {
        writer << "[" << index_vars[loop_to_input_pos[i]] << "]";
    }
    writer << ";\n";
    close_for_loops(writer, index_vars);
}
// Emit C code for Sum: zero the output, then accumulate every input element
// into the output slot addressed by the non-reduced axes.  Only the first
// non-reduced axis is parallelized, because iterations of reduced axes all
// write the same output slots and would race under OpenMP.
void ngraph::runtime::cpu::kernel::emit_sum(codegen::CodeWriter& writer,
                                            const std::string& element_type,
                                            const std::string& arg0, // sum source
                                            const std::string& out,
                                            const Shape& arg0_shape,
                                            const Shape& out_shape,
                                            const AxisSet& reduction_axes)
{
    // create input and output arrays
    auto source_nd_name = recast_tmp_var(writer, element_type, arg0, arg0_shape, "source_nd");
    auto dest_nd_name = recast_tmp_var(writer, element_type, out, out_shape, "dest_nd");

    // zero the output to make sure we don't have randomly initialized data
    if (out_shape.size() == 0)
    {
        // Scalar output (full reduction).
        writer << dest_nd_name << " = 0;\n";
    }
    else
    {
        auto output_vars = open_for_loops(writer, out_shape);
        writer << dest_nd_name << emit_bracketed_string(output_vars) << " = 0;\n";
        close_for_loops(writer, output_vars);
    }

    // If any input dimension is zero there is nothing to accumulate.
    if (std::find(arg0_shape.begin(), arg0_shape.end(), 0) == arg0_shape.end())
    {
        // create the iteration variables without writing the for loops yet;
        // the outer (parallel) loop must be chosen before any loop is opened
        std::vector<std::string> index_vars;
        index_vars.reserve(arg0_shape.size());
        for (size_t i = 0; i < arg0_shape.size(); i++)
        {
            index_vars.push_back(writer.generate_temporary_name("i"));
        }

        // Explicit sentinel instead of the previous `size_t x = -1; x != -1`
        // pattern, which relied on implicit signed->unsigned conversion and
        // triggers sign-compare warnings.
        constexpr size_t no_axis = static_cast<size_t>(-1);

        // calculate the output indexes based on what's being reduced, and
        // remember the first non-reduced axis (if any)
        size_t outer_arg_index = no_axis;
        std::vector<std::string> out_indexes;
        for (size_t i = 0; i < index_vars.size(); ++i)
        {
            if (reduction_axes.count(i) == 0)
            {
                if (out_indexes.size() == 0)
                {
                    outer_arg_index = i;
                }
                out_indexes.push_back(index_vars[i]);
            }
        }

        // make the first non-reduced axis our outer loop, optimize with openmp
        if (outer_arg_index != no_axis)
        {
            writer << start_index_loop(
                index_vars[outer_arg_index], 0, arg0_shape[outer_arg_index], true);
            writer.indent++;
        }

        // create the rest of the loops; don't parallelize them, since they
        // accumulate into shared output slots
        for (size_t i = 0; i < arg0_shape.size(); i++)
        {
            if (i != outer_arg_index)
            {
                writer << start_index_loop(index_vars[i], 0, arg0_shape[i], false);
                writer.indent++;
            }
        }

        writer << dest_nd_name << emit_bracketed_string(out_indexes) << " += " << source_nd_name
               << emit_bracketed_string(index_vars) << ";\n";

        close_for_loops(writer, index_vars);
    }
}
......@@ -23,15 +23,56 @@ namespace ngraph
{
namespace cpu
{
namespace kernels
namespace kernel
{
void emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes);
void emit_concat(codegen::CodeWriter& writer,
std::string element_type,
const std::vector<std::string> args,
std::string out,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
size_t concatenation_axis);
const size_t concatenation_axis);
void emit_replace_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& arg1, // replacement value
const std::string& out,
const Shape& arg1_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_reshape(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisVector& arg0_axis_order);
void emit_sum(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& reduction_axes);
}
}
}
......
......@@ -19,7 +19,7 @@
#include "ngraph/util.hpp"
using namespace ngraph;
using namespace ngraph::runtime::cpu::kernels;
using namespace ngraph::runtime::cpu::kernel;
//
// Given a coordinate transform and a vector of index expressions relative to
......@@ -38,8 +38,8 @@ using namespace ngraph::runtime::cpu::kernels;
//
//
std::vector<std::string>
ngraph::runtime::cpu::kernels::emit_multi_indices(CoordinateTransform trans,
std::vector<std::string> index_vars)
ngraph::runtime::cpu::kernel::emit_multi_indices(CoordinateTransform& trans,
const std::vector<std::string>& index_vars)
{
std::vector<std::string> result;
......@@ -90,8 +90,9 @@ std::vector<std::string>
// "((4 * ((k) * 2 + 5)) + (2 * ((i) * 2 + 3)) + ((j) * 2 + 4))"
//
//
std::string ngraph::runtime::cpu::kernels::emit_linear_index(CoordinateTransform trans,
std::vector<std::string> index_vars)
std::string
ngraph::runtime::cpu::kernel::emit_linear_index(CoordinateTransform& trans,
const std::vector<std::string>& index_vars)
{
std::vector<std::string> multi_indices = emit_multi_indices(trans, index_vars);
......@@ -122,10 +123,10 @@ std::string ngraph::runtime::cpu::kernels::emit_linear_index(CoordinateTransform
//
// Optionally emits an OpenMP parallel pragma, if "omp" is true.
//
std::string ngraph::runtime::cpu::kernels::start_index_loop(std::string index_var,
size_t start,
size_t end,
bool omp)
std::string ngraph::runtime::cpu::kernel::start_index_loop(const std::string& index_var,
size_t start,
size_t end,
bool omp)
{
std::stringstream ss;
......@@ -144,7 +145,7 @@ std::string ngraph::runtime::cpu::kernels::start_index_loop(std::string index_va
//
// Ends an indexing loop on the index variable [index_var].
//
std::string ngraph::runtime::cpu::kernels::end_index_loop(std::string index_var)
std::string ngraph::runtime::cpu::kernel::end_index_loop(const std::string& index_var)
{
std::stringstream ss;
......@@ -153,7 +154,7 @@ std::string ngraph::runtime::cpu::kernels::end_index_loop(std::string index_var)
return ss.str();
}
std::string ngraph::runtime::cpu::kernels::emit_nd_sizes(CoordinateTransform trans)
std::string ngraph::runtime::cpu::kernel::emit_nd_sizes(CoordinateTransform& trans)
{
std::stringstream ss;
......@@ -165,8 +166,8 @@ std::string ngraph::runtime::cpu::kernels::emit_nd_sizes(CoordinateTransform tra
return ss.str();
}
std::string ngraph::runtime::cpu::kernels::emit_nd_index(CoordinateTransform trans,
std::vector<std::string> index_vars)
std::string ngraph::runtime::cpu::kernel::emit_nd_index(CoordinateTransform& trans,
const std::vector<std::string>& index_vars)
{
std::stringstream ss;
......@@ -182,12 +183,12 @@ std::string ngraph::runtime::cpu::kernels::emit_nd_index(CoordinateTransform tra
// Emits a pointwise copy from source_buffer mediated by in_trans, to
// dest_buffer mediated by dest_trans.
//
void ngraph::runtime::cpu::kernels::emit_pointwise_copy(codegen::CodeWriter& writer,
std::string element_type,
std::string source_buffer,
std::string dest_buffer,
CoordinateTransform source_trans,
CoordinateTransform dest_trans)
void ngraph::runtime::cpu::kernel::emit_pointwise_copy(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& source_buffer,
const std::string& dest_buffer,
CoordinateTransform& source_trans,
CoordinateTransform& dest_trans)
{
std::vector<std::string> index_vars;
......
......@@ -24,24 +24,27 @@ namespace ngraph
{
namespace cpu
{
namespace kernels
namespace kernel
{
std::vector<std::string> emit_multi_indices(CoordinateTransform trans,
std::vector<std::string> index_vars);
std::string emit_linear_index(CoordinateTransform trans,
std::vector<std::string> index_vars);
std::string
start_index_loop(std::string index_var, size_t start, size_t end, bool omp);
std::string end_index_loop(std::string index_var);
std::string emit_nd_sizes(CoordinateTransform trans);
std::string emit_nd_index(CoordinateTransform trans,
std::vector<std::string> index_vars);
std::vector<std::string>
emit_multi_indices(CoordinateTransform& trans,
const std::vector<std::string>& index_vars);
std::string emit_linear_index(CoordinateTransform& trans,
const std::vector<std::string>& index_vars);
std::string start_index_loop(const std::string& index_var,
size_t start,
size_t end,
bool omp);
std::string end_index_loop(const std::string& index_var);
std::string emit_nd_sizes(CoordinateTransform& trans);
std::string emit_nd_index(CoordinateTransform& trans,
const std::vector<std::string>& index_vars);
void emit_pointwise_copy(codegen::CodeWriter& writer,
std::string element_type,
std::string source_buffer,
std::string dest_buffer,
CoordinateTransform source_trans,
CoordinateTransform dest_trans);
const std::string& element_type,
const std::string& source_buffer,
const std::string& dest_buffer,
CoordinateTransform& source_trans,
CoordinateTransform& dest_trans);
}
}
}
......
......@@ -2368,9 +2368,7 @@ TEST(${BACKEND_NAME}, reshape_m2m_dim_change_transpose)
// 198., 270., 206., 278., 214., 286., 199., 271., 207.,
// 279., 215., 287., 200., 272., 208., 280., 216., 288.])
//
// Disabled because it doesn't work on CPU yet.
//
TEST(DISABLED_${BACKEND_NAME}, reshape_6d)
TEST(${BACKEND_NAME}, reshape_6d)
{
vector<float> a_data(2 * 2 * 3 * 3 * 2 * 4);
for (int i = 0; i < 2 * 2 * 3 * 3 * 2 * 4; i++)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment