Commit ea64f5bf authored by Amy Zhuang's avatar Amy Zhuang Committed by Scott Cyphers

Use Eigen kernel for Gather when axis is 0. (#3014)

* Use Eigen kernel for Gather when axis is 0.

* Fix style error.

* Rename variables.

Fix bugs.

Use helper function.

* Add one unit test.

* Check if openmp is defined.

* Change unit tests.

* Use Eigen kernel for uint8_t type.

Add one uint8_t unit test.

Address PR feedback.

* Update cpu emitter.
parent 002aef50
......@@ -18,6 +18,7 @@
#include "ngraph/op/gather.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/gather.hpp"
#include "ngraph/runtime/reference/gather.hpp"
using namespace std;
......@@ -52,15 +53,49 @@ namespace ngraph
if (is_int64)
{
return
[&,
if ((args[0].get_element_type() == element::f32 ||
args[0].get_element_type() == element::f64 ||
args[0].get_element_type() == element::u8) &&
axis == 0)
{
std::function<decltype(runtime::cpu::kernel::gather_i64<float, 2, 2>)>
kernel;
SELECT_KERNEL_BY_2RANKS(kernel,
args[0].get_element_type(),
params_shape.size(),
out_shape.size(),
runtime::cpu::kernel::gather_i64);
return [&,
kernel,
params_shape,
indices_shape,
out_shape,
params_buffer_index,
indices_buffer_index,
out_buffer_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
kernel(ctx->buffer_data[params_buffer_index],
ctx->buffer_data[indices_buffer_index],
ctx->buffer_data[out_buffer_index],
params_shape,
indices_shape,
out_shape,
ectx->arena);
};
}
else
{
return [&,
params_shape,
indices_shape,
out_shape,
axis,
params_buffer_index,
indices_buffer_index,
out_buffer_index](CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
out_buffer_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
ngraph::runtime::reference::gather<T, int64_t>(
static_cast<T*>(ctx->buffer_data[params_buffer_index]),
static_cast<int64_t*>(ctx->buffer_data[indices_buffer_index]),
......@@ -71,17 +106,53 @@ namespace ngraph
axis);
};
}
}
else
{
return
[&,
if ((args[0].get_element_type() == element::f32 ||
args[0].get_element_type() == element::f64 ||
args[0].get_element_type() == element::u8) &&
axis == 0)
{
std::function<decltype(runtime::cpu::kernel::gather_i32<float, 2, 2>)>
kernel;
SELECT_KERNEL_BY_2RANKS(kernel,
args[0].get_element_type(),
params_shape.size(),
out_shape.size(),
runtime::cpu::kernel::gather_i32);
return [&,
kernel,
params_shape,
indices_shape,
out_shape,
params_buffer_index,
indices_buffer_index,
out_buffer_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
kernel(ctx->buffer_data[params_buffer_index],
ctx->buffer_data[indices_buffer_index],
ctx->buffer_data[out_buffer_index],
params_shape,
indices_shape,
out_shape,
ectx->arena);
};
}
else
{
return [&,
params_shape,
indices_shape,
out_shape,
axis,
params_buffer_index,
indices_buffer_index,
out_buffer_index](CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
out_buffer_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
ngraph::runtime::reference::gather<T, int32_t>(
static_cast<T*>(ctx->buffer_data[params_buffer_index]),
static_cast<int32_t*>(ctx->buffer_data[indices_buffer_index]),
......@@ -93,6 +164,7 @@ namespace ngraph
};
}
}
}
} // namespace
template <>
......
......@@ -266,6 +266,10 @@
{ \
SELECT_2RANKS(KV, double, R1, R2, K); \
} \
else if (ET == element::u8) \
{ \
SELECT_2RANKS(KV, uint8_t, R1, R2, K); \
} \
else \
{ \
throw ngraph_error("Unsupported element type " + ET.c_type_string() + " for kernel " #K); \
......
......@@ -1815,15 +1815,34 @@ namespace ngraph
}
writer.block_begin();
if ((args[0].get_element_type() == element::f64 ||
args[0].get_element_type() == element::f32 ||
args[0].get_element_type() == element::u8) &&
gather->get_axis() == 0)
{
writer << "cpu::kernel::gather<" << args[0].get_type() << ", "
<< args[1].get_element_type().c_type_string() << ", "
<< args[0].get_shape().size() << ", " << out[0].get_shape().size()
<< ">(" << args[0].get_name() << ",\n";
writer << " " << args[1].get_name() << ",\n";
writer << " " << out[0].get_name() << ",\n";
writer << " {" << join(args[0].get_shape()) << "},\n";
writer << " {" << join(args[1].get_shape()) << "},\n";
writer << " {" << join(out[0].get_shape()) << "},\n";
writer << " 0);\n";
}
else
{
writer << "reference::gather<" << args[0].get_type() << ", "
<< args[1].get_element_type().c_type_string() << ">(" << args[0].get_name()
<< ",\n";
<< args[1].get_element_type().c_type_string() << ">("
<< args[0].get_name() << ",\n";
writer << " " << args[1].get_name() << ",\n";
writer << " " << out[0].get_name() << ",\n";
writer << " {" << join(args[0].get_shape()) << "},\n";
writer << " {" << join(args[1].get_shape()) << "},\n";
writer << " {" << join(out[0].get_shape()) << "},\n";
writer << " " << gather->get_axis() << ");\n";
}
writer.block_end();
}
......
......@@ -225,6 +225,18 @@ namespace ngraph
template <typename ElementType>
void reference_erf(void* arg, void* out, size_t count);
template <typename ElementType,
typename IndicesType,
unsigned int Rank1,
unsigned int Rank2>
void gather(void* inputs,
void* indices,
void* output,
const Shape& inputs_shape,
const Shape& indices_shape,
const Shape& output_shape,
int arena);
template <typename ElementType,
typename IndicesType,
unsigned int Rank1,
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/coordinate.hpp"
#include "ngraph/runtime/cpu/cpu_executor.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
static void
get_leading_indices(const Shape& shape, int index, std::vector<int>& indices)
{
auto rank = shape.size();
std::vector<int> partial_sum(rank);
partial_sum[rank - 1] = 1;
for (int j = rank - 2; j >= 0; j--)
{
partial_sum[j] = partial_sum[j + 1] * shape[j + 1];
}
for (int j = 0; j < rank; j++)
{
indices[j] = index / partial_sum[j];
index = index % partial_sum[j];
}
}
template <typename ElementType,
typename IndicesType,
unsigned int Rank1,
unsigned int Rank2>
void gather(void* inputs,
void* indices,
void* output,
const Shape& inputs_shape,
const Shape& indices_shape,
const Shape& output_shape,
int arena)
{
Eigen::array<Eigen::Index, Rank1> in_dims;
Eigen::array<Eigen::Index, Rank2> out_dims;
for (int i = 0; i < Rank1; i++)
{
in_dims[i] = inputs_shape[i];
}
for (int i = 0; i < Rank2; i++)
{
out_dims[i] = output_shape[i];
}
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank2, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank1, Eigen::RowMajor>> in(
static_cast<ElementType*>(inputs), in_dims);
auto indices_ptr = static_cast<IndicesType*>(indices);
auto indices_rank = indices_shape.size();
if (indices_rank == 0)
{
Eigen::array<Eigen::Index, Rank1> in_extents, in_offsets;
Eigen::array<Eigen::Index, Rank2> out_extents, out_offsets;
for (int i = 0; i < Rank1; i++)
{
in_extents[i] = inputs_shape[i];
in_offsets[i] = 0;
}
in_extents[0] = 1;
in_offsets[0] = indices_ptr[0];
for (int i = 0; i < Rank2; i++)
{
out_extents[i] = output_shape[i];
out_offsets[i] = 0;
}
out.slice(out_offsets, out_extents)
.device(ngraph::runtime::cpu::executor::GetCPUExecutor().get_device(
arena)) = in.slice(in_offsets, in_extents).reshape(out_extents);
}
else
{
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i = 0; i < shape_size(indices_shape); i++)
{
// Declare these inside the loop for omp parallel
Eigen::array<Eigen::Index, Rank1> in_extents, in_offsets;
Eigen::array<Eigen::Index, Rank2> out_extents, out_offsets;
std::vector<int> leading_indices(indices_rank);
for (int r = 0; r < Rank1; r++)
{
in_extents[r] = inputs_shape[r];
in_offsets[r] = 0;
}
in_extents[0] = 1;
in_offsets[0] = indices_ptr[i];
for (int r = 0; r < Rank2; r++)
{
out_extents[r] = output_shape[r];
out_offsets[r] = 0;
}
get_leading_indices(indices_shape, i, leading_indices);
for (int j = 0; j < indices_rank; j++)
{
out_extents[j] = 1;
out_offsets[j] = leading_indices[j];
}
out.slice(out_offsets, out_extents)
.device(ngraph::runtime::cpu::executor::GetCPUExecutor().get_device(
arena)) = in.slice(in_offsets, in_extents).reshape(out_extents);
}
}
}
template <typename ElementType, unsigned int Rank1, unsigned int Rank2>
void gather_i64(void* inputs,
void* indices,
void* output,
const Shape& inputs_shape,
const Shape& indices_shape,
const Shape& output_shape,
int arena)
{
gather<ElementType, int64_t, Rank1, Rank2>(
inputs, indices, output, inputs_shape, indices_shape, output_shape, arena);
}
template <typename ElementType, unsigned int Rank1, unsigned int Rank2>
void gather_i32(void* inputs,
void* indices,
void* output,
const Shape& inputs_shape,
const Shape& indices_shape,
const Shape& output_shape,
int arena)
{
gather<ElementType, int32_t, Rank1, Rank2>(
inputs, indices, output, inputs_shape, indices_shape, output_shape, arena);
}
}
}
}
}
......@@ -142,7 +142,6 @@ erf
zero_sized_erf
model_erf
model_erf_int32
gather_no_axis
gather
gather_nd_scalar_from_2d
gather_nd_1d_from_2d
......@@ -154,9 +153,13 @@ gather_nd_batch_1d_from_2d
gather_nd_batch_scalar_from_3d
gather_nd_batch_1d_from_3d
gather_nd_batch_2d_from_3d
gather_scalar_indices_no_axis
gather_scalar_indices
gather_nd_single_indices
gather_scalar_indices
gather_scalar_indices_no_axis
gather_2d_indices_no_axis
gather_3d_indices_no_axis
gather_4d_indices_no_axis
gather_4d_indices_no_axis_uint8
gemm
gemm_broadcast_input_C
model_hardmax
......
......@@ -43,7 +43,6 @@ pad_reflect_2d_with_neg
batch_mat_mul_forward
backwards_batchmatmul_tensor2_tensor2
erf
gather_no_axis
gather
gather_nd_scalar_from_2d
gather_nd_1d_from_2d
......@@ -55,9 +54,13 @@ gather_nd_batch_1d_from_2d
gather_nd_batch_scalar_from_3d
gather_nd_batch_1d_from_3d
gather_nd_batch_2d_from_3d
gather_scalar_indices_no_axis
gather_scalar_indices
gather_nd_single_indices
gather_scalar_indices
gather_scalar_indices_no_axis
gather_2d_indices_no_axis
gather_3d_indices_no_axis
gather_4d_indices_no_axis
gather_4d_indices_no_axis_uint8
gemm
gemm_broadcast_input_C
normalize_across_chw_scalar_scale_4d
......
......@@ -102,7 +102,6 @@ embedding_lookup_10x1_arbitrary
embedding_lookup_10x1_arbitrary_index_type_int
embedding_lookup_10x1_arbitrary_index_type_int64
floor_int32
gather_no_axis
gather
gather_nd_scalar_from_2d
gather_nd_1d_from_2d
......@@ -114,9 +113,13 @@ gather_nd_batch_1d_from_2d
gather_nd_batch_scalar_from_3d
gather_nd_batch_1d_from_3d
gather_nd_batch_2d_from_3d
gather_scalar_indices_no_axis
gather_scalar_indices
gather_nd_single_indices
gather_scalar_indices
gather_scalar_indices_no_axis
gather_2d_indices_no_axis
gather_3d_indices_no_axis
gather_4d_indices_no_axis
gather_4d_indices_no_axis_uint8
scatter_add_4d_indices
scatter_add_3d_indices
scatter_add_2d_indices
......
......@@ -36,7 +36,107 @@ using namespace ngraph;
static string s_manifest = "${MANIFEST}";
NGRAPH_TEST(${BACKEND_NAME}, gather_no_axis)
NGRAPH_TEST(${BACKEND_NAME}, gather_4d_indices_no_axis_uint8)
{
Shape params_shape{3, 2};
Shape indices_shape{2, 2, 3, 4};
Shape out_shape{2, 2, 3, 4, 2};
auto P = make_shared<op::Parameter>(element::u8, params_shape);
auto I = make_shared<op::Parameter>(element::i32, indices_shape);
auto G = make_shared<op::Gather>(P, I);
auto f = make_shared<Function>(make_shared<op::GetOutputElement>(G, 0), ParameterVector{P, I});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto p = backend->create_tensor(element::u8, params_shape);
copy_data(p, vector<uint8_t>{10, 11, 20, 21, 30, 31});
auto i = backend->create_tensor(element::i32, indices_shape);
copy_data(i, vector<int32_t>{0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2,
0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2,
0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2});
auto result = backend->create_tensor(element::u8, out_shape);
auto c = backend->compile(f);
c->call_with_validate({result}, {p, i});
EXPECT_TRUE(test::all_close(
(vector<uint8_t>{10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31,
10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31,
10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31,
10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31,
10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31,
10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31}),
read_vector<uint8_t>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, gather_4d_indices_no_axis)
{
Shape params_shape{3, 2};
Shape indices_shape{2, 2, 3, 4};
Shape out_shape{2, 2, 3, 4, 2};
auto P = make_shared<op::Parameter>(element::f32, params_shape);
auto I = make_shared<op::Parameter>(element::i32, indices_shape);
auto G = make_shared<op::Gather>(P, I);
auto f = make_shared<Function>(make_shared<op::GetOutputElement>(G, 0), ParameterVector{P, I});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto p = backend->create_tensor(element::f32, params_shape);
copy_data(p, vector<float>{1.0f, 1.1f, 2.0f, 2.1f, 3.0f, 3.1f});
auto i = backend->create_tensor(element::i32, indices_shape);
copy_data(i, vector<int32_t>{0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2,
0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2,
0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2});
auto result = backend->create_tensor(element::f32, out_shape);
auto c = backend->compile(f);
c->call_with_validate({result}, {p, i});
EXPECT_TRUE(test::all_close_f(
(vector<float>{1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f,
2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f,
1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f,
2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f,
1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f,
2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f,
1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f,
2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f}),
read_vector<float>(result),
MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, gather_3d_indices_no_axis)
{
Shape params_shape{3, 2};
Shape indices_shape{2, 3, 4};
Shape out_shape{2, 3, 4, 2};
auto P = make_shared<op::Parameter>(element::f32, params_shape);
auto I = make_shared<op::Parameter>(element::i32, indices_shape);
auto G = make_shared<op::Gather>(P, I);
auto f = make_shared<Function>(make_shared<op::GetOutputElement>(G, 0), ParameterVector{P, I});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto p = backend->create_tensor(element::f32, params_shape);
copy_data(p, vector<float>{1.0f, 1.1f, 2.0f, 2.1f, 3.0f, 3.1f});
auto i = backend->create_tensor(element::i32, indices_shape);
copy_data(
i, vector<int32_t>{0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2});
auto result = backend->create_tensor(element::f32, out_shape);
auto c = backend->compile(f);
c->call_with_validate({result}, {p, i});
EXPECT_TRUE(test::all_close_f(
(vector<float>{1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f,
2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f,
1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f,
2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f}),
read_vector<float>(result),
MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, gather_2d_indices_no_axis)
{
Shape params_shape{3, 2};
Shape indices_shape{2, 2};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment