Commit 5f914429 authored Jul 23, 2019 by nmostafa
Move CompiledKernel op under mlir_backend lib. Add compiler to op. Track compilation status
parent 607445a4
Showing 12 changed files with 64 additions and 468 deletions
src/contrib/mlir/CMakeLists.txt                                +2   -0
src/contrib/mlir/compiled_kernel.cpp                           +3   -1
src/contrib/mlir/compiled_kernel.hpp                           +26  -2
src/contrib/mlir/compiler.cpp                                  +17  -14
src/contrib/mlir/compiler.hpp                                  +9   -5
src/contrib/mlir/pass/mlir_subgraph_extraction.cpp             +1   -1
src/ngraph/CMakeLists.txt                                      +0   -2
src/ngraph/runtime/cpu/builder/mlir_cpu_compiled_kernel.cpp    +4   -8
src/ngraph/runtime/cpu/cpu_builder.cpp                         +1   -1
src/ngraph/runtime/cpu/cpu_external_function.cpp               +1   -1
src/ngraph/serializer.cpp                                      +0   -1
test/cpu_fusion.cpp                                            +0   -432

src/contrib/mlir/CMakeLists.txt
@@ -23,6 +23,8 @@ set(SRC
     memory_manager.cpp
     pass/mlir_subgraph_extraction.cpp
     pass/mlir_subgraph_extraction.hpp
+    compiled_kernel.cpp
+    compiled_kernel.hpp
 )
 
 if (NGRAPH_MLIR_ENABLE)

src/ngraph/op/experimental/compiled_kernel.cpp → src/contrib/mlir/compiled_kernel.cpp
@@ -14,7 +14,7 @@
 // limitations under the License.
 //*****************************************************************************
 
-#include "ngraph/op/experimental/compiled_kernel.hpp"
+#include "compiled_kernel.hpp"
 #include "ngraph/graph_util.hpp"
 #include "ngraph/log.hpp"
@@ -67,6 +67,8 @@ ngraph::op::CompiledKernel::CompiledKernel(const NodeVector& node_list,
     : Op("CompiledKernel", check_single_output_args({args}))
     , m_node_list(node_list)
     , m_output_nodes(outputs)
+    , m_mlir_compiler(this)
+    , m_is_compiled(false)
 {
     constructor_validate_and_infer_types();
     set_output_size(m_output_nodes.size());

src/ngraph/op/experimental/compiled_kernel.hpp → src/contrib/mlir/compiled_kernel.hpp
@@ -18,6 +18,7 @@
 #include "ngraph/op/op.hpp"
 #include "ngraph/util.hpp"
+#include "contrib/mlir/compiler.hpp"
 
 namespace ngraph
 {
@@ -38,11 +39,34 @@ namespace ngraph
             virtual std::shared_ptr<Node>
                 copy_with_new_args(const NodeVector& new_args) const override;
 
             const NodeVector& get_node_list() const { return m_node_list; }
             const NodeVector& get_kernel_outputs() const { return m_output_nodes; }
 
+            /// Compiles the sub-graph associated with this CompiledKernel
+            void compile()
+            {
+                if (m_is_compiled)
+                {
+                    return;
+                }
+                m_mlir_compiler.compile();
+                m_is_compiled = true;
+            }
+
+            /// Runs the sub-graph
+            void run(std::vector<void*>& ptr_args)
+            {
+                NGRAPH_CHECK(m_is_compiled, "CompiledKernel node not compiled yet");
+                m_mlir_compiler.set_args(&ptr_args);
+                m_mlir_compiler.run();
+            }
+
+            bool is_compiled() const { return m_is_compiled; }
+
         private:
             NodeVector m_node_list;
             NodeVector m_output_nodes;
+            ngraph::runtime::ngmlir::MLIRCompiler m_mlir_compiler;
+            bool m_is_compiled;
         };
     }
 }
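
Note: with this change a CompiledKernel owns its MLIRCompiler and a cached compilation flag, so a caller compiles the sub-graph once and may then invoke it repeatedly with different argument pointers. A minimal usage sketch under that reading; the kernel pointer and buffer names below are illustrative, not part of this commit:

    // ck is a CompiledKernel* extracted from the graph
    ck->compile();                      // no-op after the first call (guarded by m_is_compiled)

    std::vector<void*> ptr_args;        // raw buffer pointers: inputs first, then outputs
    ptr_args.push_back(input_buffer);
    ptr_args.push_back(output_buffer);

    ck->run(ptr_args);                  // checks is_compiled(), binds the args, executes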

src/contrib/mlir/compiler.cpp
@@ -29,7 +29,7 @@
 #include "ngraph/op/concat.hpp"
 #include "ngraph/op/divide.hpp"
 #include "ngraph/op/dot.hpp"
-#include "ngraph/op/experimental/compiled_kernel.hpp"
+#include "compiled_kernel.hpp"
 #include "ngraph/op/greater.hpp"
 #include "ngraph/op/less.hpp"
 #include "ngraph/op/maximum.hpp"
@@ -69,16 +69,6 @@ using namespace ngraph::runtime::ngmlir;
 #define COMPILE_OP_DECL(op_name)                                                                   \
     create_op<op_name>(MLIRCompiler & compiler, const ngraph::Node* ng_node)
 
-MLIRCompiler::MLIRCompiler(const ngraph::op::CompiledKernel* compiled_kernel,
-                           const std::vector<void*>& external_tensors)
-    : m_compiled_kernel(compiled_kernel)
-    , m_external_tensors(external_tensors)
-{
-    NGRAPH_CHECK((m_compiled_kernel->get_arguments().size() +
-                  m_compiled_kernel->get_kernel_outputs().size()) == external_tensors.size(),
-                 "Number of arguments and outputs doesn't match number of tensors");
-}
-
 void MLIRCompiler::init_mlir()
 {
     // Mutex to safely initialize MLIR.
@@ -96,11 +86,24 @@ void MLIRCompiler::init_mlir()
     }
 }
 
-void MLIRCompiler::compile_and_run()
+void MLIRCompiler::set_args(std::vector<void*>* external_tensors)
+{
+    NGRAPH_CHECK(m_compiled_kernel, "No compiled kernel set for compiler");
+    NGRAPH_CHECK((m_compiled_kernel->get_arguments().size() +
+                  m_compiled_kernel->get_kernel_outputs().size()) == external_tensors->size(),
+                 "Number of arguments and outputs doesn't match number of tensors");
+
+    m_external_tensors = external_tensors;
+}
+
+void MLIRCompiler::compile()
 {
     build_ng_dialect_module();
     lower_ng_dialect();
     optimize();
+}
+
+void MLIRCompiler::run()
+{
     bind_arguments();
     execute();
     cleanup();
@@ -471,13 +474,13 @@ void MLIRCompiler::bind_arguments()
     NGRAPH_CHECK(expected_arguments.size(), "Arguments can't be created");
     m_invoke_args = std::move(expected_arguments);
 
-    NGRAPH_CHECK(m_invoke_args.size() == m_external_tensors.size(),
+    NGRAPH_CHECK(m_invoke_args.size() == m_external_tensors->size(),
                  "Number of external tensors doesn't match number of function arguments");
 
     // Assign external tensor pointers to invocation arguments.
     for (size_t i = 0, num_args = m_invoke_args.size(); i < num_args; ++i)
     {
-        ((mlir::StaticFloatMemRef*)m_invoke_args[i])->data = (float*)m_external_tensors[i];
+        ((mlir::StaticFloatMemRef*)m_invoke_args[i])->data = (float*)(*m_external_tensors)[i];
     }
 
     // Add pointer to memory manager

src/contrib/mlir/compiler.hpp
@@ -60,11 +60,15 @@ namespace ngraph
                 using TensorList = std::vector<descriptor::Tensor*>;
                 using TypeList = llvm::SmallVector<mlir::Type, 4>;
 
-                MLIRCompiler(const ngraph::op::CompiledKernel* compiled_kernel,
-                             const std::vector<void*>& external_tensors);
+                MLIRCompiler(const ngraph::op::CompiledKernel* compiled_kernel)
+                    : m_compiled_kernel(compiled_kernel)
+                {}
 
-                /// Compiles and runs a subgraph in MLIR.
-                void compile_and_run();
+                /// Set runtime tensor arguments for the sub-graph
+                void set_args(std::vector<void*>* external_tensors);
+
+                /// Compiles a subgraph with MLIR
+                void compile();
+
+                /// Executes a pre-compiled subgraph
+                void run();
 
                 /// Returns the memory manager used by this sub-graph compiler.
                 MLIRMemMgr& get_mem_mgr() { return m_mem_mgr; }
@@ -134,7 +138,7 @@ namespace ngraph
                 const ngraph::op::CompiledKernel* m_compiled_kernel;
 
                 // Pointers to externally allocated memory for sub-graph's input and output tensors.
-                const std::vector<void*>& m_external_tensors;
+                std::vector<void*>* m_external_tensors;
 
                 // Arguments for the MLIR function generated for the nGraph sub-graph.
                 llvm::SmallVector<void*, 8> m_invoke_args;
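
For comparison, a sketch of how the split API replaces the old single entry point. The standalone compiler instance here is only for illustration; in this commit the MLIRCompiler is owned by the CompiledKernel op itself:

    // before: construct with the tensors and do everything in one shot per invocation
    //   MLIRCompiler compiler(kernel, tensors);
    //   compiler.compile_and_run();

    // after: compile once, then bind tensors and execute for each invocation
    ngraph::runtime::ngmlir::MLIRCompiler compiler(kernel);
    compiler.compile();

    std::vector<void*> tensors;   // filled with input/output buffer pointers
    compiler.set_args(&tensors);
    compiler.run();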

src/contrib/mlir/pass/mlir_subgraph_extraction.cpp
@@ -24,7 +24,7 @@
 #include "ngraph/op/concat.hpp"
 #include "ngraph/op/divide.hpp"
 #include "ngraph/op/dot.hpp"
-#include "ngraph/op/experimental/compiled_kernel.hpp"
+#include "contrib/mlir/compiled_kernel.hpp"
 #include "ngraph/op/get_output_element.hpp"
 #include "ngraph/op/greater.hpp"
 #include "ngraph/op/less.hpp"

src/ngraph/CMakeLists.txt
@@ -174,8 +174,6 @@ set (SRC
     op/experimental/quantized_dot.hpp
     op/experimental/quantized_dot_bias.cpp
     op/experimental/quantized_dot_bias.hpp
-    op/experimental/compiled_kernel.cpp
-    op/experimental/compiled_kernel.hpp
     op/experimental/transpose.cpp
     op/experimental/transpose.hpp
     op/experimental/layers/ctc_greedy_decoder.cpp

src/ngraph/runtime/cpu/builder/mlir_cpu_compiled_kernel.cpp
@@ -17,7 +17,7 @@
 #include "ngraph/runtime/cpu/cpu_builder.hpp"
 
 #include "contrib/mlir/compiler.hpp"
-#include "ngraph/op/experimental/compiled_kernel.hpp"
+#include "contrib/mlir/compiled_kernel.hpp"
 #include "ngraph/runtime/cpu/cpu_runtime_context.hpp"
 
 using namespace ngraph;
@@ -65,14 +65,10 @@ namespace ngraph
                 {
                     ptr_args.push_back(ctx->buffer_data[buffer_index]);
                 }
 
                 // Compile nodes within the CompiledKernel op.
-                auto* compiled_kernel = static_cast<const CompiledKernel*>(node);
-
-                MLIRCompiler mlir_compiler(compiled_kernel, ptr_args);
-                // TODO: Decouple 'compile' and 'run' APIs. We want to be able to run the same
-                // jitted code on different arguments.
-                mlir_compiler.compile_and_run();
+                CompiledKernel* compiled_kernel =
+                    static_cast<CompiledKernel*>(const_cast<Node*>(node));
+                compiled_kernel->compile();
+                compiled_kernel->run(ptr_args);
             };
 
             functors.emplace_back(functor);
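
In effect, the CPU builder's functor now pays the MLIR JIT cost only once: compile() early-returns after m_is_compiled is set, while run() re-binds whatever buffer pointers the runtime context supplies on that call. A hedged sketch of the resulting behaviour, reusing the backend/test API that appears later in this diff (tensor names are illustrative):

    auto backend = ngraph::runtime::Backend::create("CPU");
    auto handle = backend->compile(f);            // builds the functor; MLIR JIT is deferred
    handle->call_with_validate({result}, {a});    // first call: compile() + run()
    handle->call_with_validate({result}, {a});    // later calls: compile() is a no-op, only run()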

src/ngraph/runtime/cpu/cpu_builder.cpp
@@ -38,7 +38,7 @@
 #include "ngraph/op/divide.hpp"
 #include "ngraph/op/equal.hpp"
 #include "ngraph/op/exp.hpp"
-#include "ngraph/op/experimental/compiled_kernel.hpp"
+#include "contrib/mlir/compiled_kernel.hpp"
 #include "ngraph/op/floor.hpp"
 #include "ngraph/op/get_output_element.hpp"
 #include "ngraph/op/greater.hpp"

src/ngraph/runtime/cpu/cpu_external_function.cpp
@@ -73,7 +73,7 @@
 #include "ngraph/op/erf.hpp"
 #include "ngraph/op/exp.hpp"
 #include "ngraph/op/experimental/batch_mat_mul.hpp"
-#include "ngraph/op/experimental/compiled_kernel.hpp"
+#include "contrib/mlir/compiled_kernel.hpp"
 #include "ngraph/op/experimental/generate_mask.hpp"
 #include "ngraph/op/experimental/quantized_avg_pool.hpp"
 #include "ngraph/op/experimental/quantized_concat.hpp"

src/ngraph/serializer.cpp
@@ -52,7 +52,6 @@
 #include "ngraph/op/erf.hpp"
 #include "ngraph/op/exp.hpp"
 #include "ngraph/op/experimental/batch_mat_mul.hpp"
-#include "ngraph/op/experimental/compiled_kernel.hpp"
 #include "ngraph/op/experimental/dyn_broadcast.hpp"
 #include "ngraph/op/experimental/dyn_pad.hpp"
 #include "ngraph/op/experimental/dyn_replace_slice.hpp"

test/cpu_fusion.cpp
@@ -30,7 +30,6 @@
 #include "ngraph/op/batch_norm.hpp"
 #include "ngraph/op/concat.hpp"
 #include "ngraph/op/dequantize.hpp"
-#include "ngraph/op/experimental/compiled_kernel.hpp"
 #include "ngraph/op/experimental/generate_mask.hpp"
 #include "ngraph/op/experimental/quantized_concat.hpp"
 #include "ngraph/op/experimental/quantized_conv_bias.hpp"
@@ -1543,241 +1542,6 @@ TEST(cpu_fusion, backwards_maxpool_with_indices_n4_c1_hw4_2x2_max)
    EXPECT_TRUE(test::all_close_f(read_vector<float>(output), expected, MIN_FLOAT_TOLERANCE_BITS));
}

#if defined(NGRAPH_HALIDE)
TEST(cpu_fusion, compiled_kernel_one_input_one_output_halide)
{
    Shape shapeA{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shapeA);
    auto relu_a = make_shared<op::Relu>(A);
    auto relu_relu_a = make_shared<op::Relu>(relu_a);
    auto ck = make_shared<op::CompiledKernel>(
        NodeVector{relu_a, relu_relu_a}, NodeVector{relu_relu_a}, NodeVector{A});
    auto f = make_shared<Function>(NodeVector{ck}, ParameterVector{A});

    auto backend = runtime::Backend::create("CPU");
    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shapeA);

    vector<float> dataA{-1, 4, -1, 4};
    copy_data(a, dataA);
    vector<float> expected{0, 4, 0, 4};
    auto handle = backend->compile(f);
    handle->call_with_validate({result}, {a});
    EXPECT_TRUE(test::all_close(read_vector<float>(result), expected));
}

TEST(cpu_fusion, compiled_kernel_two_input_two_output_halide)
{
    Shape shapeA{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shapeA);
    auto B = make_shared<op::Parameter>(element::f32, shapeA);
    auto relu_a = make_shared<op::Relu>(A);
    auto add_ab = make_shared<op::Add>(relu_a, B);

    auto ck = make_shared<op::CompiledKernel>(
        NodeVector{relu_a, add_ab}, NodeVector{relu_a, add_ab}, NodeVector{A, B});

    auto goe1 = make_shared<op::GetOutputElement>(ck, 0);
    auto goe2 = make_shared<op::GetOutputElement>(ck, 1);
    auto f = make_shared<Function>(NodeVector{goe1, goe2}, ParameterVector{A, B});

    auto backend = runtime::Backend::create("CPU");
    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> result_relu = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> result_add = backend->create_tensor(element::f32, shapeA);

    vector<float> dataA{-1, 4, -1, 4};
    vector<float> dataB{0, 4, 0, 4};
    copy_data(a, dataA);
    copy_data(b, dataB);
    vector<float> expected_relu{0, 4, 0, 4};
    vector<float> expected_add{4, 4, 4, 4};
    auto handle = backend->compile(f);
    handle->call_with_validate({result_relu, result_add}, {a, b});
    EXPECT_TRUE(test::all_close(read_vector<float>(result_relu), expected_relu));
}

TEST(cpu_fusion, compiled_kernel_embedded_graph_halide)
{
    Shape shapeA{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shapeA);
    auto B = make_shared<op::Parameter>(element::f32, shapeA);
    auto neg_a = make_shared<op::Negative>(A);
    auto neg_b = make_shared<op::Negative>(B);
    auto add = neg_a + neg_b;
    auto ck = make_shared<op::CompiledKernel>(
        NodeVector{add}, NodeVector{add}, NodeVector{neg_a, neg_b});
    auto f = make_shared<Function>(NodeVector{ck}, ParameterVector{A, B});

    auto backend = runtime::Backend::create("CPU");
    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shapeA);

    vector<float> dataA{1, 4, 1, 4};
    copy_data(a, dataA);
    vector<float> dataB{1, 2, 3, 4};
    copy_data(b, dataB);
    vector<float> expected{-2, -6, -4, -8};
    auto handle = backend->compile(f);
    handle->call_with_validate({result}, {a, b});
    EXPECT_TRUE(test::all_close_f(read_vector<float>(result), expected, MIN_FLOAT_TOLERANCE_BITS));
}

TEST(cpu_fusion, compiled_kernel_two_inputs_one_output_halide)
{
    Shape shapeA{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shapeA);
    auto B = make_shared<op::Parameter>(element::f32, shapeA);
    auto add = A + B;
    auto ck = make_shared<op::CompiledKernel>(
        NodeVector{add}, NodeVector{add}, NodeVector{A, B});
    auto f = make_shared<Function>(NodeVector{ck}, ParameterVector{A, B});

    auto backend = runtime::Backend::create("CPU");
    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shapeA);

    vector<float> dataA{1, 4, 1, 4};
    copy_data(a, dataA);
    vector<float> dataB{1, 2, 3, 4};
    copy_data(b, dataB);
    vector<float> expected{2, 6, 4, 8};
    auto handle = backend->compile(f);
    handle->call_with_validate({result}, {a, b});
    EXPECT_TRUE(test::all_close_f(read_vector<float>(result), expected, MIN_FLOAT_TOLERANCE_BITS));
}

TEST(cpu_fusion, compiled_kernel_multiple_outputs_halide)
{
    Shape shapeA{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shapeA);
    auto B = make_shared<op::Parameter>(element::f32, shapeA);
    auto C = make_shared<op::Parameter>(element::f32, shapeA);
    auto D = make_shared<op::Parameter>(element::f32, shapeA);

    auto neg_a = make_shared<op::Negative>(A);
    auto neg_b = make_shared<op::Negative>(B);
    auto add_ab = neg_a + neg_b;
    auto add_cd = C + B;
    auto add_cd_abs = make_shared<op::Abs>(add_cd);
    auto add_ab_abs = make_shared<op::Abs>(add_ab);
    auto add_aab = add_ab_abs + A;
    auto add_cdd = add_cd_abs + D;

    auto ck = make_shared<op::CompiledKernel>(
        NodeVector{neg_a, neg_b, add_ab, add_cd, add_cd_abs, add_ab_abs, add_aab, add_cdd},
        NodeVector{add_aab, add_cdd, neg_b},
        NodeVector{A, B, C, D});

    auto add_aab_goe = std::make_shared<op::GetOutputElement>(ck, 0);
    auto add_cdd_goe = std::make_shared<op::GetOutputElement>(ck, 1);
    auto neg_b_goe = std::make_shared<op::GetOutputElement>(ck, 2);

    auto f = make_shared<Function>(NodeVector{add_aab_goe, add_cdd_goe, neg_b_goe},
                                   ParameterVector{A, B, C, D});

    auto backend = runtime::Backend::create("CPU");

    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> d = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> r1 = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> r2 = backend->create_tensor(element::f32, shapeA);
    shared_ptr<runtime::Tensor> r3 = backend->create_tensor(element::f32, shapeA);

    vector<float> dataA{1, 4, 1, 4};
    vector<float> dataB{3, 3, 3, 9};
    vector<float> dataC{1, 2, 3, 4};
    vector<float> dataD{-2, 2, -1, 1};
    copy_data(a, dataA);
    copy_data(b, dataB);
    copy_data(c, dataC);
    copy_data(d, dataD);

    auto handle = backend->compile(f);
    handle->call_with_validate({r1, r2, r3}, {a, b, c, d});

    vector<float> expected1{5, 11, 5, 17};
    vector<float> expected2{2, 7, 5, 14};
    vector<float> expected3{-3, -3, -3, -9};
    EXPECT_TRUE(test::all_close_f(read_vector<float>(r1), expected1, MIN_FLOAT_TOLERANCE_BITS));
    EXPECT_TRUE(test::all_close_f(read_vector<float>(r2), expected2, MIN_FLOAT_TOLERANCE_BITS));
    EXPECT_TRUE(test::all_close_f(read_vector<float>(r3), expected3, MIN_FLOAT_TOLERANCE_BITS));
}

TEST(cpu_fusion, compiled_kernel_copy_with_new_args)
{
    Shape shapeA{2, 2};
    auto A = make_shared<op::Parameter>(element::i32, shapeA);
    auto B = make_shared<op::Parameter>(element::i32, shapeA);
    auto C = make_shared<op::Parameter>(element::i32, shapeA);
    auto D = make_shared<op::Parameter>(element::i32, shapeA);

    auto neg_a = make_shared<op::Negative>(A);
    auto neg_b = make_shared<op::Negative>(B);
    auto add_ab = neg_a + neg_b;
    auto add_cd = C + B;
    auto add_cd_abs = make_shared<op::Abs>(add_cd);
    auto add_ab_abs = make_shared<op::Abs>(add_ab);
    auto add_aab = add_ab_abs + A;
    auto add_cdd = add_cd_abs + D;

    auto ck = make_shared<op::CompiledKernel>(
        NodeVector{neg_a, neg_b, add_ab, add_cd, add_cd_abs, add_ab_abs, add_aab, add_cdd},
        NodeVector{add_aab, add_cdd, neg_b},
        NodeVector{A, B, C, D});

    auto add_aab_goe = std::make_shared<op::GetOutputElement>(ck, 0);
    auto add_cdd_goe = std::make_shared<op::GetOutputElement>(ck, 1);
    auto neg_b_goe = std::make_shared<op::GetOutputElement>(ck, 2);

    auto f = make_shared<Function>(NodeVector{add_aab_goe, add_cdd_goe, neg_b_goe},
                                   ParameterVector{A, B, C, D});
    auto copy_f = clone_function(*f);

    auto backend = runtime::Backend::create("CPU");

    shared_ptr<runtime::Tensor> a = backend->create_tensor(element::i32, shapeA);
    shared_ptr<runtime::Tensor> b = backend->create_tensor(element::i32, shapeA);
    shared_ptr<runtime::Tensor> c = backend->create_tensor(element::i32, shapeA);
    shared_ptr<runtime::Tensor> d = backend->create_tensor(element::i32, shapeA);
    shared_ptr<runtime::Tensor> r1 = backend->create_tensor(element::i32, shapeA);
    shared_ptr<runtime::Tensor> r2 = backend->create_tensor(element::i32, shapeA);
    shared_ptr<runtime::Tensor> r3 = backend->create_tensor(element::i32, shapeA);
    shared_ptr<runtime::Tensor> copy_r1 = backend->create_tensor(element::i32, shapeA);
    shared_ptr<runtime::Tensor> copy_r2 = backend->create_tensor(element::i32, shapeA);
    shared_ptr<runtime::Tensor> copy_r3 = backend->create_tensor(element::i32, shapeA);

    vector<int> dataA{1, 4, 1, 4};
    vector<int> dataB{3, 3, 3, 9};
    vector<int> dataC{1, 2, 3, 4};
    vector<int> dataD{-2, 2, -1, 1};
    copy_data(a, dataA);
    copy_data(b, dataB);
    copy_data(c, dataC);
    copy_data(d, dataD);

    auto handle = backend->compile(f);
    handle->call_with_validate({r1, r2, r3}, {a, b, c, d});

    auto h1 = backend->compile(copy_f);
    h1->call_with_validate({copy_r1, copy_r2, copy_r3}, {a, b, c, d});

    EXPECT_EQ(read_vector<int>(r1), read_vector<int>(copy_r1));
    EXPECT_EQ(read_vector<int>(r2), read_vector<int>(copy_r2));
    EXPECT_EQ(read_vector<int>(r3), read_vector<int>(copy_r3));
}
#endif

static std::shared_ptr<ngraph::Function> make_forward_function()
{
    Shape shape_a{10, 3, 28, 28};
@@ -2296,202 +2060,6 @@ TEST(cpu_fusion, rnn_fprop_1_lstm_cell)
    EXPECT_TRUE(test::all_close(expected_ct, read_vector<float>(result_ct)));
}

#if 0
TEST(cpu_fusion, compiled_kernel_fusion_multiple_groups_pruned)
{
auto make_function = []() -> std::shared_ptr<Function> {
Shape shape{};
auto a = make_shared<op::Parameter>(element::f32, shape);
auto b = make_shared<op::Parameter>(element::f32, shape);
auto c = make_shared<op::Parameter>(element::f32, shape);
auto add_ab = a + b;
auto add_abs = std::make_shared<op::Abs>(add_ab);
auto abs_neg = std::make_shared<op::Negative>(add_abs);
auto sub_c_neg = c - abs_neg;
auto d = make_shared<op::Parameter>(element::f32, shape);
auto d_abs = std::make_shared<op::Abs>(d);
auto add_d = d_abs + add_ab;
auto neg_d = std::make_shared<op::Negative>(add_d);
auto mul_cd = neg_d * sub_c_neg;
auto f =
std::make_shared<Function>(ngraph::NodeVector{mul_cd}, ParameterVector{a, b, c, d});
return f;
};
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUCompiledKernelFusion>(3);
auto cpu_f = make_function();
auto int_f = make_function();
pass_manager.run_passes(cpu_f);
test::Uniform<float> rng(-100.0f, 100.0f);
vector<vector<float>> args;
size_t ckn = count_ops_of_type<op::CompiledKernel>(cpu_f);
ASSERT_GT(ckn, 0);
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
}
}
TEST(cpu_fusion, compiled_kernel_fusion_bounded_relu)
{
auto make_function = []() -> std::shared_ptr<Function> {
Shape shape{};
auto a = make_shared<op::Parameter>(element::f32, shape);
auto relu = make_shared<op::Relu>(a);
auto upper_bound =
op::Constant::create<float>(element::f32, shape, std::vector<float>{6.0f});
auto minn = make_shared<op::Minimum>(relu, upper_bound);
auto absn = make_shared<op::Abs>(minn);
auto negn = std::make_shared<op::Negative>(absn);
auto f = std::make_shared<Function>(ngraph::NodeVector{negn}, ParameterVector{a});
return f;
};
pass::Manager pass_manager;
pass_manager.register_pass<pass::VisualizeTree>("before_relu_fusion.png");
pass_manager.register_pass<runtime::cpu::pass::CPUCompiledKernelFusion>(3);
pass_manager.register_pass<pass::VisualizeTree>("after_relu_fusion.png");
auto cpu_f = make_function();
auto int_f = make_function();
pass_manager.run_passes(cpu_f);
test::Uniform<float> rng(-100.0f, 100.0f);
vector<vector<float>> args;
size_t ckn = count_ops_of_type<op::CompiledKernel>(cpu_f);
ASSERT_GT(ckn, 0);
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
}
}
TEST(cpu_fusion, compiled_kernel_fusion_multiple_groups)
{
auto make_function = []() -> std::shared_ptr<Function> {
Shape shape{};
auto a = make_shared<op::Parameter>(element::f32, shape);
auto b = make_shared<op::Parameter>(element::f32, shape);
auto c = make_shared<op::Parameter>(element::f32, shape);
auto add_ab = a + b;
auto add_abs = std::make_shared<op::Abs>(add_ab);
auto abs_neg = std::make_shared<op::Negative>(add_abs);
auto sub_c_neg = c - abs_neg;
auto d = make_shared<op::Parameter>(element::f32, shape);
auto d_abs = std::make_shared<op::Abs>(d);
auto add_d = d_abs + add_ab;
auto neg_d = std::make_shared<op::Negative>(add_d);
auto mul_cd = neg_d * sub_c_neg;
auto f =
std::make_shared<Function>(ngraph::NodeVector{mul_cd}, ParameterVector{a, b, c, d});
return f;
};
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUCompiledKernelFusion>(2);
auto cpu_f = make_function();
auto int_f = make_function();
pass_manager.run_passes(cpu_f);
test::Uniform<float> rng(-100.0f, 100.0f);
vector<vector<float>> args;
size_t ckn = count_ops_of_type<op::CompiledKernel>(cpu_f);
ASSERT_GT(ckn, 0);
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
}
}
TEST(cpu_fusion, compiled_kernel_fusion_one_group)
{
auto make_function = []() -> std::shared_ptr<Function> {
Shape shape{};
auto a = make_shared<op::Parameter>(element::f32, shape);
auto b = make_shared<op::Parameter>(element::f32, shape);
auto c = make_shared<op::Parameter>(element::f32, shape);
auto add_ab = a + b;
auto add_abs = std::make_shared<op::Abs>(add_ab);
auto abs_neg = std::make_shared<op::Negative>(add_abs);
auto sub_c_neg = c - abs_neg;
auto d = make_shared<op::Parameter>(element::f32, shape);
auto add_d = sub_c_neg + d;
auto abs_add_d = std::make_shared<op::Abs>(add_d);
auto e = make_shared<op::Parameter>(element::f32, shape);
auto add_e = e + abs_add_d;
auto neg_e = std::make_shared<op::Negative>(add_e);
auto f = std::make_shared<Function>(ngraph::NodeVector{neg_e},
ParameterVector{a, b, c, d, e});
return f;
};
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUCompiledKernelFusion>(2);
auto cpu_f = make_function();
auto int_f = make_function();
pass_manager.run_passes(cpu_f);
test::Uniform<float> rng(-100.0f, 100.0f);
vector<vector<float>> args;
size_t ckn = count_ops_of_type<op::CompiledKernel>(cpu_f);
ASSERT_GT(ckn, 0);
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
}
}
#endif
void sigmoid_multiply_fusion_forward_compute(runtime::Backend* backend,
                                             const ParameterVector& input_params,
                                             const vector<vector<float>>& input_data,