Commit 8fc481a3 authored by Nick Korovaiko, committed by Scott Cyphers

DEX Loop Kernel (updated) (#2156)

* one output

passing tests

clean up

fix build breaks

* move generators into a separate file
parent 56980738
src/ngraph/runtime/cpu/CMakeLists.txt
@@ -127,6 +127,8 @@ if (NGRAPH_HALIDE)
     set(SRC
         ${SRC}
         builder/halide_op.cpp
+        builder/loop_kernel.cpp
+        builder/halide_generators.cpp
         pass/halide_subgraph_extraction.cpp
     )
 endif()
......

new file: src/ngraph/runtime/cpu/builder/halide_generators.cpp
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "halide_generators.hpp"
#include "ngraph/op/abs.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/divide.hpp"
#include "ngraph/op/maximum.hpp"
#include "ngraph/op/minimum.hpp"
#include "ngraph/op/multiply.hpp"
#include "ngraph/op/negative.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/subtract.hpp"
using namespace ngraph;
using namespace std;
#define TI(x) std::type_index(typeid(x))
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace halide
            {
                const std::unordered_map<std::type_index,
                                         std::function<Halide::Func(std::vector<Halide::Func>)>>&
                    get_halide_generators()
                {
                    const static std::unordered_map<
                        std::type_index,
                        std::function<Halide::Func(std::vector<Halide::Func>)>>
                        generators{{TI(ngraph::op::Add),
                                    [](std::vector<Halide::Func> in) {
                                        Halide::Var x;
                                        Halide::Func func;
                                        func(x) = in[0](x) + in[1](x);
                                        return func;
                                    }},
                                   {TI(ngraph::op::Multiply),
                                    [](std::vector<Halide::Func> in) {
                                        Halide::Var x;
                                        Halide::Func func;
                                        func(x) = in[0](x) * in[1](x);
                                        return func;
                                    }},
                                   {TI(ngraph::op::Negative),
                                    [](std::vector<Halide::Func> in) {
                                        Halide::Var x;
                                        Halide::Func func;
                                        func(x) = -in[0](x);
                                        return func;
                                    }},
                                   {TI(ngraph::op::Abs),
                                    [](std::vector<Halide::Func> in) {
                                        Halide::Var x;
                                        Halide::Func func;
                                        func(x) = Halide::abs(in[0](x));
                                        return func;
                                    }},
                                   {TI(ngraph::op::Divide),
                                    [](std::vector<Halide::Func> in) {
                                        Halide::Var x;
                                        Halide::Func func;
                                        func(x) = in[0](x) / in[1](x);
                                        return func;
                                    }},
                                   {TI(ngraph::op::Maximum),
                                    [](std::vector<Halide::Func> in) {
                                        Halide::Var x;
                                        Halide::Func func;
                                        // Element-wise binary max over both inputs
                                        // (max against 0 would duplicate Relu).
                                        func(x) = Halide::max(in[0](x), in[1](x));
                                        return func;
                                    }},
                                   {TI(ngraph::op::Minimum),
                                    [](std::vector<Halide::Func> in) {
                                        Halide::Var x;
                                        Halide::Func func;
                                        // Element-wise binary min over both inputs.
                                        func(x) = Halide::min(in[0](x), in[1](x));
                                        return func;
                                    }},
                                   {TI(ngraph::op::Relu), [](std::vector<Halide::Func> in) {
                                        Halide::Var x;
                                        Halide::Func func;
                                        func(x) = Halide::max(in[0](x), 0);
                                        return func;
                                    }}};

                    return generators;
                }
            }
        }
    }
}
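For orientation, here is a minimal, hypothetical sketch of how one of these generators could be looked up and applied outside the builder. The wrapper function, buffer sizes, and fill values below are illustrative only, not part of the commit; the Halide calls used (ImageParam, set, realize) are the same ones the builder relies on.

// Hypothetical standalone usage of get_halide_generators() (a sketch, not shipped code).
#include <Halide.h>
#include <typeindex>
#include <typeinfo>
#include <vector>
#include "halide_generators.hpp"
#include "ngraph/op/add.hpp"

Halide::Buffer<float> add_example()
{
    const auto& gens = ngraph::runtime::cpu::halide::get_halide_generators();

    // Two 1-D float inputs, mirroring the ImageParams the builder creates
    // for subgraph parameters (ImageParam converts implicitly to Func).
    Halide::ImageParam a(Halide::Float(32), 1, "a");
    Halide::ImageParam b(Halide::Float(32), 1, "b");
    std::vector<Halide::Func> inputs{a, b};

    // Look up the Add generator by dynamic type and apply it to the inputs.
    Halide::Func sum = gens.at(std::type_index(typeid(ngraph::op::Add)))(inputs);

    Halide::Buffer<float> a_buf(8), b_buf(8);
    for (int i = 0; i < 8; ++i)
    {
        a_buf(i) = float(i);
        b_buf(i) = 1.0f;
    }
    a.set(a_buf);
    b.set(b_buf);
    return sum.realize({8}); // element-wise a + b over 8 lanes
}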
new file: src/ngraph/runtime/cpu/builder/halide_generators.hpp

//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <Halide.h>
#include <HalideBuffer.h>
#include <functional>
#include <memory>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include <vector>
#include "ngraph/node.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace halide
            {
                const std::unordered_map<std::type_index,
                                         std::function<Halide::Func(std::vector<Halide::Func>)>>&
                    get_halide_generators();
            }
        }
    }
}
src/ngraph/runtime/cpu/builder/halide_op.cpp
@@ -26,6 +26,7 @@
 #include "ngraph/op/multiply.hpp"
 #include "ngraph/op/relu.hpp"
+#include "halide_generators.hpp"
 #include "ngraph/runtime/cpu/cpu_builder.hpp"
 #include "ngraph/runtime/cpu/op/halide_op.hpp"
@@ -40,38 +41,14 @@ namespace ngraph
 {
     namespace cpu
     {
-        namespace halide
-        {
-            static const std::unordered_map<std::type_index,
-                                            std::function<Halide::Func(vector<Halide::Func>)>>
-                generators{{TI(ngraph::op::Add),
-                            [](vector<Halide::Func> in) {
-                                Halide::Var x;
-                                Halide::Func func;
-                                func(x) = in[0](x) + in[1](x);
-                                return func;
-                            }},
-                           {TI(ngraph::op::Multiply),
-                            [](vector<Halide::Func> in) {
-                                Halide::Var x;
-                                Halide::Func func;
-                                func(x) = in[0](x) * in[1](x);
-                                return func;
-                            }},
-                           {TI(ngraph::op::Relu), [](vector<Halide::Func> in) {
-                                Halide::Var x;
-                                Halide::Func func;
-                                func(x) = Halide::max(in[0](x), 0);
-                                return func;
-                            }}};
-        }
         template <>
         void Builder::BUILDER_DECL(ngraph::runtime::cpu::op::HalideOp)
         {
             const ngraph::runtime::cpu::op::HalideOp* hs =
                 static_cast<const ngraph::runtime::cpu::op::HalideOp*>(node);
+            const auto& generators = ngraph::runtime::cpu::halide::get_halide_generators();
             auto& halide_functions = external_function->get_halide_functions();
             auto& subgraph_params = external_function->get_subgraph_params();
             auto& subgraph_param_sizes = external_function->get_subgraph_param_sizes();
@@ -79,7 +56,7 @@ namespace ngraph
             for (const auto& op : hs->get_ops())
             {
-                if (!halide::generators.count(TI(*op)))
+                if (!generators.count(TI(*op)))
                 {
                     throw ngraph_error("Invalid op in halide subgraph");
                 }
@@ -102,7 +79,7 @@ namespace ngraph
                     }
                 }
                 halide_functions[op->get_output_tensor_ptr()->get_name()] =
-                    halide::generators.at(TI(*op))(inputs);
+                    generators.at(TI(*op))(inputs);
             }
             auto out_tensor_name = hs->get_ops().back()->get_output_tensor_ptr()->get_name();
......
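The refactor above replaces a file-local static table with the shared get_halide_generators() accessor. A minimal sketch of that function-local-static pattern, using stand-in types rather than the real generator map:

// Sketch of the accessor idiom adopted here; plain C++11, toy key/value types.
#include <string>
#include <unordered_map>

const std::unordered_map<int, std::string>& get_table()
{
    // Built once, on first call; every caller (and every translation unit
    // linking against this definition) shares the same instance.
    static const std::unordered_map<int, std::string> table{{1, "one"}, {2, "two"}};
    return table;
}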
new file: src/ngraph/runtime/cpu/builder/loop_kernel.cpp

//*****************************************************************************
// Copyright 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <Halide.h>
#include <HalideBuffer.h>
#include <functional>
#include <set>
#include <string>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include "ngraph/op/abs.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/divide.hpp"
#include "ngraph/op/maximum.hpp"
#include "ngraph/op/minimum.hpp"
#include "ngraph/op/multiply.hpp"
#include "ngraph/op/negative.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/subtract.hpp"
#include "halide_generators.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
using namespace std;
using namespace ngraph;
#define TI(x) type_index(typeid(x))
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            template <>
            void Builder::BUILDER_DECL(ngraph::runtime::cpu::op::LoopKernel)
            {
                const ngraph::runtime::cpu::op::LoopKernel* hs =
                    static_cast<const ngraph::runtime::cpu::op::LoopKernel*>(node);
                const auto& generators = ngraph::runtime::cpu::halide::get_halide_generators();
                auto& halide_functions = external_function->get_halide_functions();
                auto& subgraph_params = external_function->get_subgraph_params();
                auto& subgraph_param_sizes = external_function->get_subgraph_param_sizes();
                auto& subgraph_param_ptrs = external_function->get_subgraph_param_ptrs();

                std::set<std::string> param_names;
                for (const auto& op : hs->get_node_list())
                {
                    if (!generators.count(TI(*op)))
                    {
                        throw ngraph_error("Invalid op in halide subgraph");
                    }
                    vector<Halide::Func> inputs;
                    for (const auto& input : op->get_inputs())
                    {
                        auto tensor_name = input.get_output().get_tensor_ptr()->get_name();
                        if (halide_functions.count(tensor_name))
                        {
                            // The input is produced by another op inside the subgraph.
                            inputs.emplace_back(halide_functions[tensor_name]);
                        }
                        else
                        {
                            if (param_names.count(tensor_name) == 0)
                            {
                                // First use of an external tensor: register it as a
                                // Halide ImageParam and record its size and data pointer.
                                param_names.insert(tensor_name);
                                subgraph_params[tensor_name] =
                                    Halide::ImageParam(Halide::Float(32), 1, tensor_name);
                                subgraph_param_sizes[tensor_name] =
                                    shape_size(input.get_output().get_tensor_ptr()->get_shape());
                                subgraph_param_ptrs.emplace(
                                    tensor_name, external_function->get_tensor_data(tensor_name));
                                inputs.emplace_back(subgraph_params[tensor_name]);
                            }
                            else
                            {
                                inputs.emplace_back(subgraph_params[tensor_name]);
                            }
                        }
                    }
                    // TODO: this needs to be extended to support multi-output ops
                    // inside a subgraph
                    if (op->get_outputs().size() > 1)
                    {
                        throw ngraph_error("no multi-output ops in a LoopKernel");
                    }
                    halide_functions[op->get_output_tensor_ptr()->get_name()] =
                        generators.at(TI(*op))(inputs);
                }

                auto& functors = external_function->get_functors();
                std::vector<std::tuple<void*&, size_t>> buffers_data;
                std::vector<Halide::Expr> results;

                auto output_nodes = hs->get_kernel_outputs();
                Halide::Var x;
                for (size_t i = 0; i < output_nodes.size(); i++)
                {
                    auto result_func =
                        halide_functions[output_nodes.at(i)->get_output_tensor_ptr()->get_name()];
                    // "+ 0" wraps the Func call in a fresh Expr for the Tuple below.
                    results.push_back((result_func(x) + 0));
                    auto& out_tensor = external_function->get_tensor_data(out[i].get_name());
                    buffers_data.push_back(
                        std::tuple<void*&, size_t>(out_tensor, out[i].get_size()));
                }

                // Fuse all outputs into one Func so a single realize() computes them all.
                Halide::Func terminal_func;
                terminal_func(x) = Halide::Tuple(results);

                CPUKernelFunctor functor = [&, terminal_func, buffers_data, param_names](
                    CPURuntimeContext* ctx, CPUExecutionContext* ectx) mutable {
                    // Bind the current parameter pointers to their ImageParams.
                    for (auto& param : param_names)
                    {
                        Halide::Buffer<float> param_buffer(
                            static_cast<float*>(subgraph_param_ptrs.at(param).get()),
                            subgraph_param_sizes.at(param));
                        subgraph_params[param].set(param_buffer);
                    }
                    // Wrap the output tensors and realize all results in one pass.
                    std::vector<Halide::Buffer<>> buffers;
                    for (auto tuple : buffers_data)
                    {
                        buffers.push_back(Halide::Buffer<float>(
                            static_cast<float*>(std::get<0>(tuple)), std::get<1>(tuple)));
                    }
                    Halide::Realization r(buffers);
                    terminal_func.realize(r);
                };
                functors.emplace_back(functor);
            }
        }
    }
}
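The builder realizes several outputs in one pass by packing them into a Halide::Tuple. A standalone sketch of that pattern follows; it uses only the Halide calls seen above (Tuple, Realization, realize), with toy expressions and buffer sizes as assumptions:

// Sketch of the multi-output Tuple pattern, independent of nGraph.
#include <Halide.h>
#include <vector>

int main()
{
    Halide::Var x;
    Halide::Func a, b, joined;
    a(x) = Halide::cast<float>(x);        // first output expression
    b(x) = Halide::cast<float>(x) * 2.0f; // second output expression

    // One Func whose value is a Tuple: a single realize() call then
    // fills one buffer per Tuple element, as the LoopKernel functor does.
    joined(x) = Halide::Tuple(a(x), b(x));

    Halide::Buffer<float> out_a(4), out_b(4);
    std::vector<Halide::Buffer<>> buffers = {out_a, out_b};
    Halide::Realization r(buffers);
    joined.realize(r);
    return 0;
}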
src/ngraph/runtime/cpu/cpu_builder.cpp
@@ -99,6 +99,7 @@
 #include "ngraph/runtime/cpu/kernel/tanh.hpp"
 #include "ngraph/runtime/cpu/op/convert_layout.hpp"
 #include "ngraph/runtime/cpu/op/halide_op.hpp"
+#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
 #include "ngraph/type/element_type.hpp"
 #include "ngraph/util.hpp"
@@ -372,6 +373,8 @@ namespace ngraph
                 {TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop},
                 {TI(ngraph::runtime::cpu::op::ConvertLayout),
                  &runtime::cpu::Builder::build<ngraph::runtime::cpu::op::ConvertLayout>},
+                {TI(ngraph::runtime::cpu::op::LoopKernel),
+                 &runtime::cpu::Builder::build<ngraph::runtime::cpu::op::LoopKernel>},
                 {TI(ngraph::runtime::cpu::op::HalideOp),
                  &runtime::cpu::Builder::build<ngraph::runtime::cpu::op::HalideOp>}};
......
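Registration above is a map from std::type_index to a Builder member, so supporting a new op is a one-entry change. A generic sketch of the same dispatch technique, with toy node types standing in for the real Builder:

// Dispatch-by-type_index sketch mirroring the {TI(op), &Builder::build<op>} table.
#include <functional>
#include <iostream>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>

struct Node { virtual ~Node() = default; };
struct AddNode : Node {};
struct ReluNode : Node {};

using Handler = std::function<void(const Node&)>;

static const std::unordered_map<std::type_index, Handler> dispatch{
    {std::type_index(typeid(AddNode)), [](const Node&) { std::cout << "build add\n"; }},
    {std::type_index(typeid(ReluNode)), [](const Node&) { std::cout << "build relu\n"; }}};

void build(const Node& n)
{
    // typeid(n) resolves to the dynamic type; at() throws for unregistered ops.
    dispatch.at(std::type_index(typeid(n)))(n);
}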
test/cpu_fusion.cpp
@@ -1496,34 +1496,69 @@ TEST(cpu_fusion, backwards_maxpool_with_indices_n4_c1_hw4_2x2_max)
     ASSERT_TRUE(read_vector<float>(output) == expected);
 }

-#if 0
-TEST(cpu_fusion, loop_kernel_one_input_one_output)
+#if defined(NGRAPH_HALIDE)
+TEST(cpu_fusion, loop_kernel_one_input_one_output_halide)
 {
     Shape shapeA{2, 2};
-    auto A = make_shared<op::Parameter>(element::i32, shapeA);
-    auto neg_a = make_shared<op::Negative>(A);
+    auto A = make_shared<op::Parameter>(element::f32, shapeA);
+    auto relu_a = make_shared<op::Relu>(A);
+    auto relu_relu_a = make_shared<op::Relu>(relu_a);
     auto lk = make_shared<runtime::cpu::op::LoopKernel>(
-        NodeVector{neg_a}, NodeVector{neg_a}, NodeVector{A});
+        NodeVector{relu_a, relu_relu_a}, NodeVector{relu_relu_a}, NodeVector{A});
     auto f = make_shared<Function>(NodeVector{lk}, ParameterVector{A});

     auto backend = runtime::Backend::create("CPU");
-    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::i32, shapeA);
-    shared_ptr<runtime::Tensor> result = backend->create_tensor(element::i32, shapeA);
+    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shapeA);

-    vector<int> dataA{1, 4, 1, 4};
+    vector<float> dataA{-1, 4, -1, 4};
     copy_data(a, dataA);
-    vector<int> expected{-1, -4, -1, -4};
+    vector<float> expected{0, 4, 0, 4};

     backend->call_with_validate(f, {result}, {a});

-    EXPECT_EQ(read_vector<int>(result), expected);
+    EXPECT_TRUE(test::all_close(read_vector<float>(result), expected));
 }
-TEST(cpu_fusion, loop_kernel_embedded_graph)
+TEST(cpu_fusion, loop_kernel_two_input_two_output_halide)
 {
     Shape shapeA{2, 2};
-    auto A = make_shared<op::Parameter>(element::i32, shapeA);
-    auto B = make_shared<op::Parameter>(element::i32, shapeA);
+    auto A = make_shared<op::Parameter>(element::f32, shapeA);
+    auto B = make_shared<op::Parameter>(element::f32, shapeA);
+    auto relu_a = make_shared<op::Relu>(A);
+    auto add_ab = make_shared<op::Add>(relu_a, B);
+    auto lk = make_shared<runtime::cpu::op::LoopKernel>(
+        NodeVector{relu_a, add_ab}, NodeVector{relu_a, add_ab}, NodeVector{A, B});
+    auto goe1 = make_shared<op::GetOutputElement>(lk, 0);
+    auto goe2 = make_shared<op::GetOutputElement>(lk, 1);
+    auto f = make_shared<Function>(NodeVector{goe1, goe2}, ParameterVector{A, B});
+
+    auto backend = runtime::Backend::create("CPU");
+    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> result_relu = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> result_add = backend->create_tensor(element::f32, shapeA);
+
+    vector<float> dataA{-1, 4, -1, 4};
+    vector<float> dataB{0, 4, 0, 4};
+    copy_data(a, dataA);
+    copy_data(b, dataB);
+    vector<float> expected_relu{0, 4, 0, 4};
+    // relu(dataA) + dataB = {0, 8, 0, 8}
+    vector<float> expected_add{0, 8, 0, 8};
+
+    backend->call_with_validate(f, {result_relu, result_add}, {a, b});
+
+    EXPECT_TRUE(test::all_close(read_vector<float>(result_relu), expected_relu));
+    EXPECT_TRUE(test::all_close(read_vector<float>(result_add), expected_add));
+}
+
+TEST(cpu_fusion, loop_kernel_embedded_graph_halide)
+{
+    Shape shapeA{2, 2};
+    auto A = make_shared<op::Parameter>(element::f32, shapeA);
+    auto B = make_shared<op::Parameter>(element::f32, shapeA);
     auto neg_a = make_shared<op::Negative>(A);
     auto neg_b = make_shared<op::Negative>(B);
     auto add = neg_a + neg_b;
@@ -1532,52 +1567,52 @@ TEST(cpu_fusion, loop_kernel_embedded_graph)
     auto f = make_shared<Function>(NodeVector{lk}, ParameterVector{A, B});

     auto backend = runtime::Backend::create("CPU");
-    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::i32, shapeA);
-    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::i32, shapeA);
-    shared_ptr<runtime::Tensor> result = backend->create_tensor(element::i32, shapeA);
+    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shapeA);

-    vector<int> dataA{1, 4, 1, 4};
+    vector<float> dataA{1, 4, 1, 4};
     copy_data(a, dataA);
-    vector<int> dataB{1, 2, 3, 4};
+    vector<float> dataB{1, 2, 3, 4};
     copy_data(b, dataB);
-    vector<int> expected{-2, -6, -4, -8};
+    vector<float> expected{-2, -6, -4, -8};

     backend->call_with_validate(f, {result}, {a, b});

-    EXPECT_EQ(read_vector<int>(result), expected);
+    EXPECT_EQ(read_vector<float>(result), expected);
 }

-TEST(cpu_fusion, loop_kernel_two_inputs_one_output)
+TEST(cpu_fusion, loop_kernel_two_inputs_one_output_halide)
 {
     Shape shapeA{2, 2};
-    auto A = make_shared<op::Parameter>(element::i32, shapeA);
-    auto B = make_shared<op::Parameter>(element::i32, shapeA);
+    auto A = make_shared<op::Parameter>(element::f32, shapeA);
+    auto B = make_shared<op::Parameter>(element::f32, shapeA);
     auto add = A + B;
     auto lk = make_shared<runtime::cpu::op::LoopKernel>(
         NodeVector{add}, NodeVector{add}, NodeVector{A, B});
     auto f = make_shared<Function>(NodeVector{lk}, ParameterVector{A, B});

     auto backend = runtime::Backend::create("CPU");
-    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::i32, shapeA);
-    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::i32, shapeA);
-    shared_ptr<runtime::Tensor> result = backend->create_tensor(element::i32, shapeA);
+    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shapeA);

-    vector<int> dataA{1, 4, 1, 4};
+    vector<float> dataA{1, 4, 1, 4};
     copy_data(a, dataA);
-    vector<int> dataB{1, 2, 3, 4};
+    vector<float> dataB{1, 2, 3, 4};
     copy_data(b, dataB);
-    vector<int> expected{2, 6, 4, 8};
+    vector<float> expected{2, 6, 4, 8};

     backend->call_with_validate(f, {result}, {a, b});

-    EXPECT_EQ(read_vector<int>(result), expected);
+    EXPECT_EQ(read_vector<float>(result), expected);
 }

-TEST(cpu_fusion, loop_kernel_multiple_outputs)
+TEST(cpu_fusion, loop_kernel_multiple_outputs_halide)
 {
     Shape shapeA{2, 2};
-    auto A = make_shared<op::Parameter>(element::i32, shapeA);
-    auto B = make_shared<op::Parameter>(element::i32, shapeA);
-    auto C = make_shared<op::Parameter>(element::i32, shapeA);
-    auto D = make_shared<op::Parameter>(element::i32, shapeA);
+    auto A = make_shared<op::Parameter>(element::f32, shapeA);
+    auto B = make_shared<op::Parameter>(element::f32, shapeA);
+    auto C = make_shared<op::Parameter>(element::f32, shapeA);
+    auto D = make_shared<op::Parameter>(element::f32, shapeA);
     auto neg_a = make_shared<op::Negative>(A);
     auto neg_b = make_shared<op::Negative>(B);
@@ -1601,18 +1636,18 @@ TEST(cpu_fusion, loop_kernel_multiple_outputs)
     auto backend = runtime::Backend::create("CPU");

-    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::i32, shapeA);
-    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::i32, shapeA);
-    shared_ptr<runtime::Tensor> c = backend->create_tensor(element::i32, shapeA);
-    shared_ptr<runtime::Tensor> d = backend->create_tensor(element::i32, shapeA);
-    shared_ptr<runtime::Tensor> r1 = backend->create_tensor(element::i32, shapeA);
-    shared_ptr<runtime::Tensor> r2 = backend->create_tensor(element::i32, shapeA);
-    shared_ptr<runtime::Tensor> r3 = backend->create_tensor(element::i32, shapeA);
-
-    vector<int> dataA{1, 4, 1, 4};
-    vector<int> dataB{3, 3, 3, 9};
-    vector<int> dataC{1, 2, 3, 4};
-    vector<int> dataD{-2, 2, -1, 1};
+    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> d = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> r1 = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> r2 = backend->create_tensor(element::f32, shapeA);
+    shared_ptr<runtime::Tensor> r3 = backend->create_tensor(element::f32, shapeA);
+
+    vector<float> dataA{1, 4, 1, 4};
+    vector<float> dataB{3, 3, 3, 9};
+    vector<float> dataC{1, 2, 3, 4};
+    vector<float> dataD{-2, 2, -1, 1};
     copy_data(a, dataA);
     copy_data(b, dataB);
     copy_data(c, dataC);
@@ -1620,12 +1655,12 @@ TEST(cpu_fusion, loop_kernel_multiple_outputs)
     backend->call_with_validate(f, {r1, r2, r3}, {a, b, c, d});

-    vector<int> expected1{5, 11, 5, 17};
-    vector<int> expected2{2, 7, 5, 14};
-    vector<int> expected3{-3, -3, -3, -9};
-    EXPECT_EQ(read_vector<int>(r1), expected1);
-    EXPECT_EQ(read_vector<int>(r2), expected2);
-    EXPECT_EQ(read_vector<int>(r3), expected3);
+    vector<float> expected1{5, 11, 5, 17};
+    vector<float> expected2{2, 7, 5, 14};
+    vector<float> expected3{-3, -3, -3, -9};
+    EXPECT_EQ(read_vector<float>(r1), expected1);
+    EXPECT_EQ(read_vector<float>(r2), expected2);
+    EXPECT_EQ(read_vector<float>(r3), expected3);
 }

 TEST(cpu_fusion, loop_kernel_copy_with_new_args)
......