Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
ngraph
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ngraph
Commits
da7a15f8
Commit
da7a15f8
authored
Jul 16, 2019
by
Robert Kimball
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
separate benchmark and pipelined benchmark
parent
8b768fee
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
359 additions
and
243 deletions
+359
-243
CMakeLists.txt
src/tools/nbench/CMakeLists.txt
+2
-0
benchmark.cpp
src/tools/nbench/benchmark.cpp
+4
-230
benchmark.hpp
src/tools/nbench/benchmark.hpp
+0
-12
benchmark_pipelined.cpp
src/tools/nbench/benchmark_pipelined.cpp
+142
-0
benchmark_pipelined.hpp
src/tools/nbench/benchmark_pipelined.hpp
+33
-0
benchmark_utils.cpp
src/tools/nbench/benchmark_utils.cpp
+116
-0
benchmark_utils.hpp
src/tools/nbench/benchmark_utils.hpp
+60
-0
nbench.cpp
src/tools/nbench/nbench.cpp
+2
-1
No files found.
src/tools/nbench/CMakeLists.txt
View file @
da7a15f8
...
@@ -17,6 +17,8 @@
...
@@ -17,6 +17,8 @@
set
(
SRC
set
(
SRC
nbench.cpp
nbench.cpp
benchmark.cpp
benchmark.cpp
benchmark_pipelined.cpp
benchmark_utils.cpp
)
)
add_executable
(
nbench
${
SRC
}
)
add_executable
(
nbench
${
SRC
}
)
...
...
src/tools/nbench/benchmark.cpp
View file @
da7a15f8
...
@@ -14,11 +14,6 @@
...
@@ -14,11 +14,6 @@
// limitations under the License.
// limitations under the License.
//*****************************************************************************
//*****************************************************************************
#include <random>
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark.hpp"
#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/backend.hpp"
...
@@ -26,118 +21,11 @@
...
@@ -26,118 +21,11 @@
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
#include "ngraph/util.hpp"
#include "benchmark_utils.hpp"
using
namespace
std
;
using
namespace
std
;
using
namespace
ngraph
;
using
namespace
ngraph
;
// File-local pseudo-random engine used by the tensor initializers in this
// file. It is value-initialized, so it starts from the library's default seed.
static default_random_engine s_random_engine{};
/// Switches on the flush-to-zero and denormals-are-zero modes through the SSE
/// control macros. Only x86-64 builds do anything; on every other target the
/// body is empty.
void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
    // Randomly generated benchmark data can contain denormals; treating them
    // as zero avoids the performance impact they would otherwise have.
    // NOTE(review): _MM_SET_DENORMALS_ZERO_MODE is normally provided by
    // <pmmintrin.h>; confirm it is reachable through the includes of this file.
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
}
/// Fills every element of `tensor` with an integer drawn uniformly from the
/// closed range [min, max] using the file-local random engine.
///
/// \tparam T     Integer element type used for the staging buffer.
/// \param tensor Destination tensor; its element count decides how many
///               values are generated.
/// \param min    Smallest value that may be generated.
/// \param max    Largest value that may be generated.
template <typename T>
void init_int_tensor(shared_ptr<runtime::Tensor> tensor, T min, T max)
{
    const size_t element_count = tensor->get_element_count();
    uniform_int_distribution<T> distribution(min, max);

    // Generate into a host-side buffer first, then hand it over in one write.
    vector<T> values(element_count);
    for (size_t i = 0; i < element_count; ++i)
    {
        values[i] = distribution(s_random_engine);
    }
    tensor->write(values.data(), values.size() * sizeof(T));
}
/// `char` specialization of init_int_tensor.
///
/// Values are drawn from a 16-bit distribution and narrowed to `char`, because
/// std::uniform_int_distribution is not specified for character types.
template <>
void init_int_tensor<char>(shared_ptr<runtime::Tensor> tensor, char min, char max)
{
    const size_t element_count = tensor->get_element_count();
    const short lo = static_cast<short>(min);
    const short hi = static_cast<short>(max);
    uniform_int_distribution<int16_t> distribution(lo, hi);

    vector<char> values(element_count);
    for (size_t i = 0; i < element_count; ++i)
    {
        values[i] = static_cast<char>(distribution(s_random_engine));
    }
    tensor->write(values.data(), values.size() * sizeof(char));
}
/// `int8_t` specialization of init_int_tensor.
///
/// Values are drawn from a 16-bit distribution and narrowed to `int8_t`,
/// because std::uniform_int_distribution is not specified for 8-bit types.
template <>
void init_int_tensor<int8_t>(shared_ptr<runtime::Tensor> tensor, int8_t min, int8_t max)
{
    const size_t element_count = tensor->get_element_count();
    const short lo = static_cast<short>(min);
    const short hi = static_cast<short>(max);
    uniform_int_distribution<int16_t> distribution(lo, hi);

    vector<int8_t> values(element_count);
    for (size_t i = 0; i < element_count; ++i)
    {
        values[i] = static_cast<int8_t>(distribution(s_random_engine));
    }
    tensor->write(values.data(), values.size() * sizeof(int8_t));
}
/// `uint8_t` specialization of init_int_tensor.
///
/// Values are drawn from a 16-bit distribution and narrowed to `uint8_t`,
/// because std::uniform_int_distribution is not specified for 8-bit types.
template <>
void init_int_tensor<uint8_t>(shared_ptr<runtime::Tensor> tensor, uint8_t min, uint8_t max)
{
    const size_t element_count = tensor->get_element_count();
    const short lo = static_cast<short>(min);
    const short hi = static_cast<short>(max);
    uniform_int_distribution<int16_t> distribution(lo, hi);

    vector<uint8_t> values(element_count);
    for (size_t i = 0; i < element_count; ++i)
    {
        values[i] = static_cast<uint8_t>(distribution(s_random_engine));
    }
    tensor->write(values.data(), values.size() * sizeof(uint8_t));
}
/// Fills every element of `tensor` with a floating-point value produced by a
/// uniform_real_distribution constructed from (min, max), using the file-local
/// random engine.
///
/// \tparam T     Floating-point element type used for the staging buffer.
/// \param tensor Destination tensor; its element count decides how many
///               values are generated.
/// \param min    Lower bound handed to the distribution.
/// \param max    Upper bound handed to the distribution.
template <typename T>
void init_real_tensor(shared_ptr<runtime::Tensor> tensor, T min, T max)
{
    const size_t element_count = tensor->get_element_count();
    uniform_real_distribution<T> distribution(min, max);

    // Generate into a host-side buffer first, then hand it over in one write.
    vector<T> values(element_count);
    for (size_t i = 0; i < element_count; ++i)
    {
        values[i] = distribution(s_random_engine);
    }
    tensor->write(values.data(), values.size() * sizeof(T));
}
/// Fills `tensor` with random data chosen according to its element type.
///
/// boolean, i32, i64 and all unsigned integer types receive integers in
/// [0, 1]; i8 and i16 receive integers in [-1, 1]; f32 and f64 receive reals
/// generated from the bounds (-1, 1).
///
/// \param tensor Tensor to initialize.
/// \throws runtime_error with the message "unsupported type" for undefined,
///         dynamic, bf16, f16 and any enum value without its own case.
static void random_init(shared_ptr<runtime::Tensor> tensor)
{
    element::Type et = tensor->get_element_type();
// Promote the switch warnings to errors (except on GCC 4.8) so that an
// element type without a case label is caught at build time.
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
    switch (et.get_type_enum())
    {
    case element::Type_t::boolean: init_int_tensor<char>(tensor, 0, 1); break;
    case element::Type_t::f32: init_real_tensor<float>(tensor, -1, 1); break;
    case element::Type_t::f64: init_real_tensor<double>(tensor, -1, 1); break;
    case element::Type_t::i8: init_int_tensor<int8_t>(tensor, -1, 1); break;
    case element::Type_t::i16: init_int_tensor<int16_t>(tensor, -1, 1); break;
    case element::Type_t::i32: init_int_tensor<int32_t>(tensor, 0, 1); break;
    case element::Type_t::i64: init_int_tensor<int64_t>(tensor, 0, 1); break;
    case element::Type_t::u8: init_int_tensor<uint8_t>(tensor, 0, 1); break;
    case element::Type_t::u16: init_int_tensor<uint16_t>(tensor, 0, 1); break;
    case element::Type_t::u32: init_int_tensor<uint32_t>(tensor, 0, 1); break;
    case element::Type_t::u64: init_int_tensor<uint64_t>(tensor, 0, 1); break;
    // No random initializer exists for these element types.
    case element::Type_t::undefined:
    case element::Type_t::dynamic:
    case element::Type_t::bf16:
    case element::Type_t::f16:
    default: throw runtime_error("unsupported type");
    }
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic pop
#endif
}
vector
<
runtime
::
PerformanceCounter
>
run_benchmark
(
shared_ptr
<
Function
>
f
,
vector
<
runtime
::
PerformanceCounter
>
run_benchmark
(
shared_ptr
<
Function
>
f
,
const
string
&
backend_name
,
const
string
&
backend_name
,
size_t
iterations
,
size_t
iterations
,
...
@@ -148,7 +36,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
...
@@ -148,7 +36,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
stopwatch
timer
;
stopwatch
timer
;
timer
.
start
();
timer
.
start
();
auto
backend
=
runtime
::
Backend
::
create
(
backend_name
);
auto
backend
=
runtime
::
Backend
::
create
(
backend_name
);
auto
compiled_fun
c
=
backend
->
compile
(
f
,
timing_detail
);
auto
exe
c
=
backend
->
compile
(
f
,
timing_detail
);
timer
.
stop
();
timer
.
stop
();
cout
.
imbue
(
locale
(
""
));
cout
.
imbue
(
locale
(
""
));
cout
<<
"compile time: "
<<
timer
.
get_milliseconds
()
<<
"ms"
<<
endl
;
cout
<<
"compile time: "
<<
timer
.
get_milliseconds
()
<<
"ms"
<<
endl
;
...
@@ -209,7 +97,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
...
@@ -209,7 +97,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
}
}
}
}
}
}
compiled_fun
c
->
call
(
results
,
args
);
exe
c
->
call
(
results
,
args
);
if
(
copy_data
)
if
(
copy_data
)
{
{
for
(
size_t
result_index
=
0
;
result_index
<
results
.
size
();
result_index
++
)
for
(
size_t
result_index
=
0
;
result_index
<
results
.
size
();
result_index
++
)
...
@@ -225,120 +113,6 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
...
@@ -225,120 +113,6 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
float
time
=
t1
.
get_milliseconds
();
float
time
=
t1
.
get_milliseconds
();
cout
<<
time
/
iterations
<<
"ms per iteration"
<<
endl
;
cout
<<
time
/
iterations
<<
"ms per iteration"
<<
endl
;
vector
<
runtime
::
PerformanceCounter
>
perf_data
=
compiled_func
->
get_performance_data
();
vector
<
runtime
::
PerformanceCounter
>
perf_data
=
exec
->
get_performance_data
();
return
perf_data
;
}
vector
<
runtime
::
PerformanceCounter
>
run_benchmark_double_buffered
(
shared_ptr
<
Function
>
f
,
const
string
&
backend_name
,
size_t
iterations
,
bool
timing_detail
,
int
warmup_iterations
,
bool
copy_data
)
{
stopwatch
timer
;
timer
.
start
();
auto
backend
=
runtime
::
Backend
::
create
(
backend_name
);
auto
compiled_func
=
backend
->
compile
(
f
,
timing_detail
);
timer
.
stop
();
cout
.
imbue
(
locale
(
""
));
cout
<<
"compile time: "
<<
timer
.
get_milliseconds
()
<<
"ms"
<<
endl
;
set_denormals_flush_to_zero
();
array
<
vector
<
shared_ptr
<
runtime
::
HostTensor
>>
,
2
>
args_data_set
;
array
<
vector
<
shared_ptr
<
runtime
::
Tensor
>>
,
2
>
args_set
;
array
<
vector
<
shared_ptr
<
runtime
::
HostTensor
>>
,
2
>
results_data_set
;
array
<
vector
<
shared_ptr
<
runtime
::
Tensor
>>
,
2
>
results_set
;
for
(
size_t
i
=
0
;
i
<
2
;
i
++
)
{
vector
<
shared_ptr
<
runtime
::
HostTensor
>>
args_data
;
vector
<
shared_ptr
<
runtime
::
Tensor
>>
args
;
for
(
shared_ptr
<
op
::
Parameter
>
param
:
f
->
get_parameters
())
{
auto
tensor
=
backend
->
create_tensor
(
param
->
get_element_type
(),
param
->
get_shape
());
auto
tensor_data
=
make_shared
<
runtime
::
HostTensor
>
(
param
->
get_element_type
(),
param
->
get_shape
());
random_init
(
tensor_data
);
tensor
->
write
(
tensor_data
->
get_data_ptr
(),
tensor_data
->
get_element_count
()
*
tensor_data
->
get_element_type
().
size
());
args
.
push_back
(
tensor
);
args_data
.
push_back
(
tensor_data
);
}
args_set
[
i
]
=
args
;
args_data_set
[
i
]
=
args_data
;
vector
<
shared_ptr
<
runtime
::
Tensor
>>
results
;
vector
<
shared_ptr
<
runtime
::
HostTensor
>>
results_data
;
for
(
shared_ptr
<
Node
>
out
:
f
->
get_results
())
{
auto
result
=
backend
->
create_tensor
(
out
->
get_element_type
(),
out
->
get_shape
());
auto
result_data
=
make_shared
<
runtime
::
HostTensor
>
(
out
->
get_element_type
(),
out
->
get_shape
());
results
.
push_back
(
result
);
results_data
.
push_back
(
result_data
);
}
results_set
[
i
]
=
results
;
results_data_set
[
i
]
=
results_data
;
}
stopwatch
t1
;
// Before we start we write the first iteration's data
size_t
buffer_number
=
0
;
auto
args
=
args_set
[
buffer_number
];
auto
args_data
=
args_data_set
[
buffer_number
];
for
(
size_t
arg_index
=
0
;
arg_index
<
args
.
size
();
arg_index
++
)
{
const
shared_ptr
<
runtime
::
Tensor
>&
arg
=
args
[
arg_index
];
const
shared_ptr
<
runtime
::
HostTensor
>&
data
=
args_data
[
arg_index
];
arg
->
begin_write
(
data
->
get_data_ptr
(),
data
->
get_element_count
()
*
data
->
get_element_type
().
size
(),
buffer_number
);
}
const
vector
<
shared_ptr
<
runtime
::
Tensor
>>&
results
=
results_set
[
buffer_number
];
const
vector
<
shared_ptr
<
runtime
::
HostTensor
>>&
results_data
=
results_data_set
[
buffer_number
];
for
(
size_t
i
=
0
;
i
<
iterations
+
warmup_iterations
;
i
++
)
{
if
(
i
==
warmup_iterations
)
{
t1
.
start
();
}
future
<
void
>
exec_future
=
compiled_func
->
begin_execute
(
results
,
args
);
if
(
i
>
0
)
{
for
(
size_t
result_index
=
0
;
result_index
<
results
.
size
();
result_index
++
)
{
const
shared_ptr
<
runtime
::
HostTensor
>&
data
=
results_data
[
result_index
];
const
shared_ptr
<
runtime
::
Tensor
>&
result
=
results
[
result_index
];
result
->
begin_read
(
data
->
get_data_ptr
(),
data
->
get_element_count
()
*
data
->
get_element_type
().
size
(),
(
buffer_number
-
1
)
&
1
);
}
}
buffer_number
=
(
buffer_number
+
1
)
&
1
;
for
(
size_t
arg_index
=
0
;
arg_index
<
args
.
size
();
arg_index
++
)
{
const
shared_ptr
<
runtime
::
Tensor
>&
arg
=
args
[
arg_index
];
const
shared_ptr
<
runtime
::
HostTensor
>&
data
=
args_data
[
arg_index
];
arg
->
begin_write
(
data
->
get_data_ptr
(),
data
->
get_element_count
()
*
data
->
get_element_type
().
size
(),
buffer_number
);
}
exec_future
.
get
();
}
for
(
size_t
result_index
=
0
;
result_index
<
results
.
size
();
result_index
++
)
{
const
shared_ptr
<
runtime
::
HostTensor
>&
data
=
results_data
[
result_index
];
const
shared_ptr
<
runtime
::
Tensor
>&
result
=
results
[
result_index
];
result
->
begin_read
(
data
->
get_data_ptr
(),
data
->
get_element_count
()
*
data
->
get_element_type
().
size
(),
(
buffer_number
-
1
)
&
1
);
}
t1
.
stop
();
float
time
=
t1
.
get_milliseconds
();
cout
<<
time
/
iterations
<<
"ms per iteration"
<<
endl
;
vector
<
runtime
::
PerformanceCounter
>
perf_data
=
compiled_func
->
get_performance_data
();
return
perf_data
;
return
perf_data
;
}
}
src/tools/nbench/benchmark.hpp
View file @
da7a15f8
...
@@ -24,21 +24,9 @@
...
@@ -24,21 +24,9 @@
#include "ngraph/function.hpp"
#include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp"
#include "ngraph/runtime/performance_counter.hpp"
/// performance test utilities

/// Builds a multimap from a size_t key to a string from the given performance
/// counters.
/// NOTE(review): the definition is not visible here; the meaning of the key
/// and value should be confirmed against the implementation.
///
/// \param perf_data Performance counters to aggregate.
std::multimap<size_t, std::string>
    aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data);
std
::
vector
<
ngraph
::
runtime
::
PerformanceCounter
>
run_benchmark
(
std
::
shared_ptr
<
ngraph
::
Function
>
f
,
std
::
vector
<
ngraph
::
runtime
::
PerformanceCounter
>
run_benchmark
(
std
::
shared_ptr
<
ngraph
::
Function
>
f
,
const
std
::
string
&
backend_name
,
const
std
::
string
&
backend_name
,
size_t
iterations
,
size_t
iterations
,
bool
timing_detail
,
bool
timing_detail
,
int
warmup_iterations
,
int
warmup_iterations
,
bool
copy_data
);
bool
copy_data
);
/// Benchmarks `f` on the named backend while alternating between two sets of
/// argument/result buffers.
///
/// \param f                 Function to compile and execute.
/// \param backend_name      Name handed to the backend factory.
/// \param iterations        Number of timed iterations.
/// \param timing_detail     Forwarded to the backend when compiling `f`.
/// \param warmup_iterations Iterations executed before the timer is started.
/// \param copy_data         NOTE(review): not referenced by the visible
///                          definition of this function; confirm intent.
/// \return Performance counters reported by the compiled function.
std::vector<ngraph::runtime::PerformanceCounter>
    run_benchmark_double_buffered(std::shared_ptr<ngraph::Function> f,
                                  const std::string& backend_name,
                                  size_t iterations,
                                  bool timing_detail,
                                  int warmup_iterations,
                                  bool copy_data);
src/tools/nbench/benchmark_pipelined.cpp
0 → 100644
View file @
da7a15f8
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
#include "benchmark_utils.hpp"
using
namespace
std
;
using
namespace
ngraph
;
vector
<
runtime
::
PerformanceCounter
>
run_benchmark_pipelined
(
shared_ptr
<
Function
>
f
,
const
string
&
backend_name
,
size_t
iterations
,
bool
timing_detail
,
int
warmup_iterations
,
bool
copy_data
)
{
constexpr
size_t
pipeline_depth
=
2
;
stopwatch
timer
;
timer
.
start
();
auto
backend
=
runtime
::
Backend
::
create
(
backend_name
);
auto
exec
=
backend
->
compile
(
f
,
timing_detail
);
timer
.
stop
();
cout
.
imbue
(
locale
(
""
));
cout
<<
"compile time: "
<<
timer
.
get_milliseconds
()
<<
"ms"
<<
endl
;
set_denormals_flush_to_zero
();
// Create random input data for all input tensors
array
<
vector
<
shared_ptr
<
runtime
::
HostTensor
>>
,
pipeline_depth
>
parameters_data_set
;
array
<
vector
<
shared_ptr
<
runtime
::
HostTensor
>>
,
pipeline_depth
>
results_data_set
;
for
(
size_t
i
=
0
;
i
<
pipeline_depth
;
i
++
)
{
vector
<
shared_ptr
<
runtime
::
HostTensor
>>
parameters_data
;
for
(
shared_ptr
<
op
::
Parameter
>
param
:
f
->
get_parameters
())
{
auto
tensor_data
=
make_shared
<
runtime
::
HostTensor
>
(
param
->
get_element_type
(),
param
->
get_shape
());
random_init
(
tensor_data
);
parameters_data
.
push_back
(
tensor_data
);
}
parameters_data_set
[
i
]
=
parameters_data
;
}
// Create input tensors for all Parameters
array
<
vector
<
shared_ptr
<
runtime
::
Tensor
>>
,
pipeline_depth
>
input_tensors_array
;
size_t
input_index
=
0
;
for
(
shared_ptr
<
op
::
Parameter
>
param
:
f
->
get_parameters
())
{
auto
input_tensors
=
exec
->
create_input_tensor
(
input_index
++
,
pipeline_depth
);
for
(
size_t
i
=
0
;
i
<
pipeline_depth
;
i
++
)
{
input_tensors_array
[
i
].
push_back
(
input_tensors
[
i
]);
}
}
// // Create output tensors for all Results
// array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> output_tensors_array;
// for (shared_ptr<Node> out : f->get_results())
// {
// auto output_tensors = backend->create_tensor(out->get_element_type(), out->get_shape());
// output_tensors_array[i] = output_tensors;
// }
stopwatch
t1
;
// // Before we start we write the first iteration's data
// size_t buffer_number = 0;
// auto args = input_tensors_array[buffer_number];
// auto args_data = parameters_data_set[buffer_number];
// for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
// {
// const shared_ptr<runtime::Tensor>& arg = args[arg_index];
// const shared_ptr<runtime::HostTensor>& data = args_data[arg_index];
// arg->begin_write(data->get_data_ptr(),
// data->get_element_count() * data->get_element_type().size(),
// buffer_number);
// }
// const vector<shared_ptr<runtime::Tensor>>& results = output_tensors[buffer_number];
// const vector<shared_ptr<runtime::HostTensor>>& results_data = results_data_set[buffer_number];
// for (size_t i = 0; i < iterations + warmup_iterations; i++)
// {
// if (i == warmup_iterations)
// {
// t1.start();
// }
// future<void> exec_future = exec->begin_execute(results, args);
// if (i > 0)
// {
// for (size_t result_index = 0; result_index < results.size(); result_index++)
// {
// const shared_ptr<runtime::HostTensor>& data = results_data[result_index];
// const shared_ptr<runtime::Tensor>& result = results[result_index];
// result->begin_read(data->get_data_ptr(),
// data->get_element_count() * data->get_element_type().size(),
// (buffer_number - 1) & 1);
// }
// }
// buffer_number = (buffer_number + 1) & 1;
// for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
// {
// const shared_ptr<runtime::Tensor>& arg = args[arg_index];
// const shared_ptr<runtime::HostTensor>& data = args_data[arg_index];
// arg->begin_write(data->get_data_ptr(),
// data->get_element_count() * data->get_element_type().size(),
// buffer_number);
// }
// exec_future.get();
// }
// for (size_t result_index = 0; result_index < results.size(); result_index++)
// {
// const shared_ptr<runtime::HostTensor>& data = results_data[result_index];
// const shared_ptr<runtime::Tensor>& result = results[result_index];
// result->begin_read(data->get_data_ptr(),
// data->get_element_count() * data->get_element_type().size(),
// (buffer_number - 1) & 1);
// }
// t1.stop();
// float time = t1.get_milliseconds();
// cout << time / iterations << "ms per iteration" << endl;
vector
<
runtime
::
PerformanceCounter
>
perf_data
=
exec
->
get_performance_data
();
return
perf_data
;
}
src/tools/nbench/benchmark_pipelined.hpp
0 → 100644
View file @
da7a15f8
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp"
/// Pipelined variant of the benchmark driver.
///
/// \param f                 Function to compile.
/// \param backend_name      Name of the backend to create.
/// \param iterations        Number of timed iterations requested.
/// \param timing_detail     Forwarded to the backend when compiling `f`.
/// \param warmup_iterations Number of warm-up iterations requested.
/// \param copy_data         Whether data should be copied.
///                          NOTE(review): the visible definition does not read
///                          `iterations`, `warmup_iterations` or `copy_data`
///                          yet; confirm once the timed loop is enabled.
/// \return Performance counters reported by the compiled executable.
std::vector<ngraph::runtime::PerformanceCounter>
    run_benchmark_pipelined(std::shared_ptr<ngraph::Function> f,
                            const std::string& backend_name,
                            size_t iterations,
                            bool timing_detail,
                            int warmup_iterations,
                            bool copy_data);
src/tools/nbench/benchmark_utils.cpp
0 → 100644
View file @
da7a15f8
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using
namespace
std
;
using
namespace
ngraph
;
/// `char` specialization of init_int_tensor.
///
/// Values are drawn from a 16-bit distribution and narrowed to `char`, because
/// std::uniform_int_distribution is not specified for character types.
template <>
void init_int_tensor<char>(shared_ptr<runtime::Tensor> tensor, char min, char max)
{
    const size_t element_count = tensor->get_element_count();
    const short lo = static_cast<short>(min);
    const short hi = static_cast<short>(max);
    uniform_int_distribution<int16_t> distribution(lo, hi);

    vector<char> values(element_count);
    for (size_t i = 0; i < element_count; ++i)
    {
        values[i] = static_cast<char>(distribution(get_random_engine()));
    }
    tensor->write(values.data(), values.size() * sizeof(char));
}
/// `int8_t` specialization of init_int_tensor.
///
/// Values are drawn from a 16-bit distribution and narrowed to `int8_t`,
/// because std::uniform_int_distribution is not specified for 8-bit types.
template <>
void init_int_tensor<int8_t>(shared_ptr<runtime::Tensor> tensor, int8_t min, int8_t max)
{
    const size_t element_count = tensor->get_element_count();
    const short lo = static_cast<short>(min);
    const short hi = static_cast<short>(max);
    uniform_int_distribution<int16_t> distribution(lo, hi);

    vector<int8_t> values(element_count);
    for (size_t i = 0; i < element_count; ++i)
    {
        values[i] = static_cast<int8_t>(distribution(get_random_engine()));
    }
    tensor->write(values.data(), values.size() * sizeof(int8_t));
}
/// `uint8_t` specialization of init_int_tensor.
///
/// Values are drawn from a 16-bit distribution and narrowed to `uint8_t`,
/// because std::uniform_int_distribution is not specified for 8-bit types.
template <>
void init_int_tensor<uint8_t>(shared_ptr<runtime::Tensor> tensor, uint8_t min, uint8_t max)
{
    const size_t element_count = tensor->get_element_count();
    const short lo = static_cast<short>(min);
    const short hi = static_cast<short>(max);
    uniform_int_distribution<int16_t> distribution(lo, hi);

    vector<uint8_t> values(element_count);
    for (size_t i = 0; i < element_count; ++i)
    {
        values[i] = static_cast<uint8_t>(distribution(get_random_engine()));
    }
    tensor->write(values.data(), values.size() * sizeof(uint8_t));
}
/// Switches on the flush-to-zero and denormals-are-zero modes through the SSE
/// control macros. Only x86-64 builds do anything; on every other target the
/// body is empty.
void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
    // Randomly generated benchmark data can contain denormals; treating them
    // as zero avoids the performance impact they would otherwise have.
    // NOTE(review): _MM_SET_DENORMALS_ZERO_MODE is normally provided by
    // <pmmintrin.h>; confirm it is reachable through the includes of this file.
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
}
/// Fills `tensor` with random data chosen according to its element type.
///
/// boolean, i32, i64 and all unsigned integer types receive integers in
/// [0, 1]; i8 and i16 receive integers in [-1, 1]; f32 and f64 receive reals
/// generated from the bounds (-1, 1).
///
/// \param tensor Tensor to initialize.
/// \throws runtime_error with the message "unsupported type" for undefined,
///         dynamic, bf16, f16 and any enum value without its own case.
void random_init(shared_ptr<runtime::Tensor> tensor)
{
    element::Type et = tensor->get_element_type();
// Promote the switch warnings to errors (except on GCC 4.8) so that an
// element type without a case label is caught at build time.
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
    switch (et.get_type_enum())
    {
    case element::Type_t::boolean: init_int_tensor<char>(tensor, 0, 1); break;
    case element::Type_t::f32: init_real_tensor<float>(tensor, -1, 1); break;
    case element::Type_t::f64: init_real_tensor<double>(tensor, -1, 1); break;
    case element::Type_t::i8: init_int_tensor<int8_t>(tensor, -1, 1); break;
    case element::Type_t::i16: init_int_tensor<int16_t>(tensor, -1, 1); break;
    case element::Type_t::i32: init_int_tensor<int32_t>(tensor, 0, 1); break;
    case element::Type_t::i64: init_int_tensor<int64_t>(tensor, 0, 1); break;
    case element::Type_t::u8: init_int_tensor<uint8_t>(tensor, 0, 1); break;
    case element::Type_t::u16: init_int_tensor<uint16_t>(tensor, 0, 1); break;
    case element::Type_t::u32: init_int_tensor<uint32_t>(tensor, 0, 1); break;
    case element::Type_t::u64: init_int_tensor<uint64_t>(tensor, 0, 1); break;
    // No random initializer exists for these element types.
    case element::Type_t::undefined:
    case element::Type_t::dynamic:
    case element::Type_t::bf16:
    case element::Type_t::f16:
    default: throw runtime_error("unsupported type");
    }
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic pop
#endif
}
/// Returns a reference to the single random engine shared by the benchmark
/// helpers. The engine is a function-local static, so it is created on the
/// first call and every call returns the same object.
default_random_engine& get_random_engine()
{
    static std::default_random_engine engine;
    return engine;
}
src/tools/nbench/benchmark_utils.hpp
0 → 100644
View file @
da7a15f8
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <random>
#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using
namespace
std
;
using
namespace
ngraph
;
/// Enables flush-to-zero / denormals-are-zero handling on x86-64 builds and
/// does nothing on other targets.
void set_denormals_flush_to_zero();

/// Fills `tensor` with random values selected by its element type; throws
/// runtime_error("unsupported type") for element types without an initializer.
void random_init(std::shared_ptr<runtime::Tensor> tensor);

/// Returns the random engine shared by the tensor initializers.
std::default_random_engine& get_random_engine();
/// Fills every element of `tensor` with an integer drawn uniformly from the
/// closed range [min, max] using the shared random engine.
///
/// \tparam T     Integer element type used for the staging buffer.
/// \param tensor Destination tensor; its element count decides how many
///               values are generated.
/// \param min    Smallest value that may be generated.
/// \param max    Largest value that may be generated.
template <typename T>
void init_int_tensor(std::shared_ptr<runtime::Tensor> tensor, T min, T max)
{
    const size_t element_count = tensor->get_element_count();
    std::uniform_int_distribution<T> distribution(min, max);

    // Generate into a host-side buffer first, then hand it over in one write.
    std::vector<T> values(element_count);
    for (size_t i = 0; i < element_count; ++i)
    {
        values[i] = distribution(get_random_engine());
    }
    tensor->write(values.data(), values.size() * sizeof(T));
}
/// Fills every element of `tensor` with a floating-point value produced by a
/// uniform_real_distribution constructed from (min, max), using the shared
/// random engine.
///
/// \tparam T     Floating-point element type used for the staging buffer.
/// \param tensor Destination tensor; its element count decides how many
///               values are generated.
/// \param min    Lower bound handed to the distribution.
/// \param max    Upper bound handed to the distribution.
template <typename T>
void init_real_tensor(std::shared_ptr<runtime::Tensor> tensor, T min, T max)
{
    const size_t element_count = tensor->get_element_count();
    std::uniform_real_distribution<T> distribution(min, max);

    // Generate into a host-side buffer first, then hand it over in one write.
    std::vector<T> values(element_count);
    for (size_t i = 0; i < element_count; ++i)
    {
        values[i] = distribution(get_random_engine());
    }
    tensor->write(values.data(), values.size() * sizeof(T));
}
src/tools/nbench/nbench.cpp
View file @
da7a15f8
...
@@ -24,6 +24,7 @@
...
@@ -24,6 +24,7 @@
#include <iomanip>
#include <iomanip>
#include "benchmark.hpp"
#include "benchmark.hpp"
#include "benchmark_pipelined.hpp"
#include "ngraph/distributed.hpp"
#include "ngraph/distributed.hpp"
#include "ngraph/except.hpp"
#include "ngraph/except.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/file_util.hpp"
...
@@ -429,7 +430,7 @@ OPTIONS
...
@@ -429,7 +430,7 @@ OPTIONS
vector
<
runtime
::
PerformanceCounter
>
perf_data
;
vector
<
runtime
::
PerformanceCounter
>
perf_data
;
if
(
double_buffer
)
if
(
double_buffer
)
{
{
perf_data
=
run_benchmark_
double_buffer
ed
(
perf_data
=
run_benchmark_
pipelin
ed
(
f
,
backend
,
iterations
,
timing_detail
,
warmup_iterations
,
copy_data
);
f
,
backend
,
iterations
,
timing_detail
,
warmup_iterations
,
copy_data
);
}
}
else
else
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment