Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
ngraph
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ngraph
Commits
567bc822
Commit
567bc822
authored
Jul 05, 2019
by
Robert Kimball
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
nbench async option
parent
4b84262c
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
189 additions
and
33 deletions
+189
-33
backend.cpp
src/ngraph/runtime/backend.cpp
+11
-4
benchmark.cpp
src/tools/nbench/benchmark.cpp
+154
-28
benchmark.hpp
src/tools/nbench/benchmark.hpp
+8
-0
nbench.cpp
src/tools/nbench/nbench.cpp
+16
-1
No files found.
src/ngraph/runtime/backend.cpp
View file @
567bc822
...
...
@@ -196,23 +196,30 @@ void runtime::Backend::async_thread_stop()
}
}
static
void
local_thread_entry
(
shared_ptr
<
runtime
::
Backend
::
AsyncEvent
>
event
)
{
event
->
get_executable
()
->
call
(
event
->
get_outputs
(),
event
->
get_inputs
());
event
->
signal_result
();
};
void
runtime
::
Backend
::
async_thread_process
(
const
shared_ptr
<
AsyncEvent
>&
event
)
{
switch
(
event
->
get_type
())
{
case
AsyncEvent
:
:
Type
::
READ
:
event
->
get_tensor
()
->
read
(
event
->
get_data
(),
0
,
event
->
get_size_in_bytes
());
event
->
get_tensor
()
->
read
(
event
->
get_data
(),
event
->
get_size_in_bytes
());
event
->
signal_result
();
break
;
case
AsyncEvent
:
:
Type
::
WRITE
:
event
->
get_tensor
()
->
write
(
event
->
get_data
(),
0
,
event
->
get_size_in_bytes
());
event
->
get_tensor
()
->
write
(
event
->
get_data
(),
event
->
get_size_in_bytes
());
event
->
signal_result
();
break
;
case
AsyncEvent
:
:
Type
::
EXECUTE
:
event
->
get_executable
()
->
call
(
event
->
get_outputs
(),
event
->
get_inputs
());
event
->
signal_result
();
{
std
::
thread
(
local_thread_entry
,
event
).
detach
();
break
;
}
}
}
void
runtime
::
Backend
::
async_thread_entry
()
...
...
src/tools/nbench/benchmark.cpp
View file @
567bc822
...
...
@@ -42,88 +42,100 @@ void set_denormals_flush_to_zero()
}
template
<
typename
T
>
void
init_int_t
v
(
shared_ptr
<
runtime
::
Tensor
>
tv
,
T
min
,
T
max
)
void
init_int_t
ensor
(
shared_ptr
<
runtime
::
Tensor
>
tensor
,
T
min
,
T
max
)
{
size_t
size
=
t
v
->
get_element_count
();
size_t
size
=
t
ensor
->
get_element_count
();
uniform_int_distribution
<
T
>
dist
(
min
,
max
);
vector
<
T
>
vec
(
size
);
for
(
T
&
element
:
vec
)
{
element
=
dist
(
s_random_engine
);
}
t
v
->
write
(
vec
.
data
(),
vec
.
size
()
*
sizeof
(
T
));
t
ensor
->
write
(
vec
.
data
(),
vec
.
size
()
*
sizeof
(
T
));
}
template
<>
void
init_int_t
v
<
char
>
(
shared_ptr
<
runtime
::
Tensor
>
tv
,
char
min
,
char
max
)
void
init_int_t
ensor
<
char
>
(
shared_ptr
<
runtime
::
Tensor
>
tensor
,
char
min
,
char
max
)
{
size_t
size
=
t
v
->
get_element_count
();
size_t
size
=
t
ensor
->
get_element_count
();
uniform_int_distribution
<
int16_t
>
dist
(
static_cast
<
short
>
(
min
),
static_cast
<
short
>
(
max
));
vector
<
char
>
vec
(
size
);
for
(
char
&
element
:
vec
)
{
element
=
static_cast
<
char
>
(
dist
(
s_random_engine
));
}
t
v
->
write
(
vec
.
data
(),
vec
.
size
()
*
sizeof
(
char
));
t
ensor
->
write
(
vec
.
data
(),
vec
.
size
()
*
sizeof
(
char
));
}
template
<>
void
init_int_t
v
<
int8_t
>
(
shared_ptr
<
runtime
::
Tensor
>
tv
,
int8_t
min
,
int8_t
max
)
void
init_int_t
ensor
<
int8_t
>
(
shared_ptr
<
runtime
::
Tensor
>
tensor
,
int8_t
min
,
int8_t
max
)
{
size_t
size
=
t
v
->
get_element_count
();
size_t
size
=
t
ensor
->
get_element_count
();
uniform_int_distribution
<
int16_t
>
dist
(
static_cast
<
short
>
(
min
),
static_cast
<
short
>
(
max
));
vector
<
int8_t
>
vec
(
size
);
for
(
int8_t
&
element
:
vec
)
{
element
=
static_cast
<
int8_t
>
(
dist
(
s_random_engine
));
}
t
v
->
write
(
vec
.
data
(),
vec
.
size
()
*
sizeof
(
int8_t
));
t
ensor
->
write
(
vec
.
data
(),
vec
.
size
()
*
sizeof
(
int8_t
));
}
template
<>
void
init_int_t
v
<
uint8_t
>
(
shared_ptr
<
runtime
::
Tensor
>
tv
,
uint8_t
min
,
uint8_t
max
)
void
init_int_t
ensor
<
uint8_t
>
(
shared_ptr
<
runtime
::
Tensor
>
tensor
,
uint8_t
min
,
uint8_t
max
)
{
size_t
size
=
t
v
->
get_element_count
();
size_t
size
=
t
ensor
->
get_element_count
();
uniform_int_distribution
<
int16_t
>
dist
(
static_cast
<
short
>
(
min
),
static_cast
<
short
>
(
max
));
vector
<
uint8_t
>
vec
(
size
);
for
(
uint8_t
&
element
:
vec
)
{
element
=
static_cast
<
uint8_t
>
(
dist
(
s_random_engine
));
}
t
v
->
write
(
vec
.
data
(),
vec
.
size
()
*
sizeof
(
uint8_t
));
t
ensor
->
write
(
vec
.
data
(),
vec
.
size
()
*
sizeof
(
uint8_t
));
}
template
<
typename
T
>
void
init_real_t
v
(
shared_ptr
<
runtime
::
Tensor
>
tv
,
T
min
,
T
max
)
void
init_real_t
ensor
(
shared_ptr
<
runtime
::
Tensor
>
tensor
,
T
min
,
T
max
)
{
size_t
size
=
t
v
->
get_element_count
();
size_t
size
=
t
ensor
->
get_element_count
();
uniform_real_distribution
<
T
>
dist
(
min
,
max
);
vector
<
T
>
vec
(
size
);
for
(
T
&
element
:
vec
)
{
element
=
dist
(
s_random_engine
);
}
t
v
->
write
(
vec
.
data
(),
vec
.
size
()
*
sizeof
(
T
));
t
ensor
->
write
(
vec
.
data
(),
vec
.
size
()
*
sizeof
(
T
));
}
static
void
random_init
(
shared_ptr
<
runtime
::
Tensor
>
t
v
)
static
void
random_init
(
shared_ptr
<
runtime
::
Tensor
>
t
ensor
)
{
element
::
Type
et
=
tv
->
get_element_type
();
element
::
Type
et
=
tensor
->
get_element_type
();
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
switch
(
et
.
get_type_enum
())
{
case
element
:
:
Type_t
::
boolean
:
init_int_tv
<
char
>
(
tv
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
f32
:
init_real_tv
<
float
>
(
tv
,
-
1
,
1
);
break
;
case
element
:
:
Type_t
::
f64
:
init_real_tv
<
double
>
(
tv
,
-
1
,
1
);
break
;
case
element
:
:
Type_t
::
i8
:
init_int_tv
<
int8_t
>
(
tv
,
-
1
,
1
);
break
;
case
element
:
:
Type_t
::
i16
:
init_int_tv
<
int16_t
>
(
tv
,
-
1
,
1
);
break
;
case
element
:
:
Type_t
::
i32
:
init_int_tv
<
int32_t
>
(
tv
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
i64
:
init_int_tv
<
int64_t
>
(
tv
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
u8
:
init_int_tv
<
uint8_t
>
(
tv
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
u16
:
init_int_tv
<
uint16_t
>
(
tv
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
u32
:
init_int_tv
<
uint32_t
>
(
tv
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
u64
:
init_int_tv
<
uint64_t
>
(
tv
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
boolean
:
init_int_tensor
<
char
>
(
tensor
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
f32
:
init_real_tensor
<
float
>
(
tensor
,
-
1
,
1
);
break
;
case
element
:
:
Type_t
::
f64
:
init_real_tensor
<
double
>
(
tensor
,
-
1
,
1
);
break
;
case
element
:
:
Type_t
::
i8
:
init_int_tensor
<
int8_t
>
(
tensor
,
-
1
,
1
);
break
;
case
element
:
:
Type_t
::
i16
:
init_int_tensor
<
int16_t
>
(
tensor
,
-
1
,
1
);
break
;
case
element
:
:
Type_t
::
i32
:
init_int_tensor
<
int32_t
>
(
tensor
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
i64
:
init_int_tensor
<
int64_t
>
(
tensor
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
u8
:
init_int_tensor
<
uint8_t
>
(
tensor
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
u16
:
init_int_tensor
<
uint16_t
>
(
tensor
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
u32
:
init_int_tensor
<
uint32_t
>
(
tensor
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
u64
:
init_int_tensor
<
uint64_t
>
(
tensor
,
0
,
1
);
break
;
case
element
:
:
Type_t
::
undefined
:
case
element
:
:
Type_t
::
dynamic
:
case
element
:
:
Type_t
::
bf16
:
case
element
:
:
Type_t
::
f16
:
default
:
throw
runtime_error
(
"unsupported type"
);
}
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic pop
#endif
}
vector
<
runtime
::
PerformanceCounter
>
run_benchmark
(
shared_ptr
<
Function
>
f
,
...
...
@@ -216,3 +228,117 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
vector
<
runtime
::
PerformanceCounter
>
perf_data
=
compiled_func
->
get_performance_data
();
return
perf_data
;
}
vector
<
runtime
::
PerformanceCounter
>
run_benchmark_double_buffered
(
shared_ptr
<
Function
>
f
,
const
string
&
backend_name
,
size_t
iterations
,
bool
timing_detail
,
int
warmup_iterations
,
bool
copy_data
)
{
stopwatch
timer
;
timer
.
start
();
auto
backend
=
runtime
::
Backend
::
create
(
backend_name
);
auto
compiled_func
=
backend
->
compile
(
f
,
timing_detail
);
timer
.
stop
();
cout
.
imbue
(
locale
(
""
));
cout
<<
"compile time: "
<<
timer
.
get_milliseconds
()
<<
"ms"
<<
endl
;
set_denormals_flush_to_zero
();
array
<
vector
<
shared_ptr
<
runtime
::
HostTensor
>>
,
2
>
args_data_set
;
array
<
vector
<
shared_ptr
<
runtime
::
Tensor
>>
,
2
>
args_set
;
array
<
vector
<
shared_ptr
<
runtime
::
HostTensor
>>
,
2
>
results_data_set
;
array
<
vector
<
shared_ptr
<
runtime
::
Tensor
>>
,
2
>
results_set
;
for
(
size_t
i
=
0
;
i
<
2
;
i
++
)
{
vector
<
shared_ptr
<
runtime
::
HostTensor
>>
args_data
;
vector
<
shared_ptr
<
runtime
::
Tensor
>>
args
;
for
(
shared_ptr
<
op
::
Parameter
>
param
:
f
->
get_parameters
())
{
auto
tensor
=
backend
->
create_tensor
(
param
->
get_element_type
(),
param
->
get_shape
());
auto
tensor_data
=
make_shared
<
runtime
::
HostTensor
>
(
param
->
get_element_type
(),
param
->
get_shape
());
random_init
(
tensor_data
);
tensor
->
write
(
tensor_data
->
get_data_ptr
(),
tensor_data
->
get_element_count
()
*
tensor_data
->
get_element_type
().
size
());
args
.
push_back
(
tensor
);
args_data
.
push_back
(
tensor_data
);
}
args_set
[
i
]
=
args
;
args_data_set
[
i
]
=
args_data
;
vector
<
shared_ptr
<
runtime
::
Tensor
>>
results
;
vector
<
shared_ptr
<
runtime
::
HostTensor
>>
results_data
;
for
(
shared_ptr
<
Node
>
out
:
f
->
get_results
())
{
auto
result
=
backend
->
create_tensor
(
out
->
get_element_type
(),
out
->
get_shape
());
auto
result_data
=
make_shared
<
runtime
::
HostTensor
>
(
out
->
get_element_type
(),
out
->
get_shape
());
results
.
push_back
(
result
);
results_data
.
push_back
(
result_data
);
}
results_set
[
i
]
=
results
;
results_data_set
[
i
]
=
results_data
;
}
stopwatch
t1
;
// Before we start we write the first iteration's data
size_t
buffer_number
=
0
;
auto
args
=
args_set
[
buffer_number
];
auto
args_data
=
args_data_set
[
buffer_number
];
for
(
size_t
arg_index
=
0
;
arg_index
<
args
.
size
();
arg_index
++
)
{
const
shared_ptr
<
runtime
::
Tensor
>&
arg
=
args
[
arg_index
];
const
shared_ptr
<
runtime
::
HostTensor
>&
data
=
args_data
[
arg_index
];
arg
->
begin_write
(
data
->
get_data_ptr
(),
data
->
get_element_count
()
*
data
->
get_element_type
().
size
(),
buffer_number
);
}
const
vector
<
shared_ptr
<
runtime
::
Tensor
>>&
results
=
results_set
[
buffer_number
];
const
vector
<
shared_ptr
<
runtime
::
HostTensor
>>&
results_data
=
results_data_set
[
buffer_number
];
for
(
size_t
i
=
0
;
i
<
iterations
+
warmup_iterations
;
i
++
)
{
if
(
i
==
warmup_iterations
)
{
t1
.
start
();
}
future
<
void
>
exec_future
=
compiled_func
->
begin_execute
(
results
,
args
);
if
(
i
>
0
)
{
for
(
size_t
result_index
=
0
;
result_index
<
results
.
size
();
result_index
++
)
{
const
shared_ptr
<
runtime
::
HostTensor
>&
data
=
results_data
[
result_index
];
const
shared_ptr
<
runtime
::
Tensor
>&
result
=
results
[
result_index
];
result
->
begin_read
(
data
->
get_data_ptr
(),
data
->
get_element_count
()
*
data
->
get_element_type
().
size
(),
(
buffer_number
-
1
)
&
1
);
}
}
buffer_number
=
(
buffer_number
+
1
)
&
1
;
for
(
size_t
arg_index
=
0
;
arg_index
<
args
.
size
();
arg_index
++
)
{
const
shared_ptr
<
runtime
::
Tensor
>&
arg
=
args
[
arg_index
];
const
shared_ptr
<
runtime
::
HostTensor
>&
data
=
args_data
[
arg_index
];
arg
->
begin_write
(
data
->
get_data_ptr
(),
data
->
get_element_count
()
*
data
->
get_element_type
().
size
(),
buffer_number
);
}
exec_future
.
get
();
}
for
(
size_t
result_index
=
0
;
result_index
<
results
.
size
();
result_index
++
)
{
const
shared_ptr
<
runtime
::
HostTensor
>&
data
=
results_data
[
result_index
];
const
shared_ptr
<
runtime
::
Tensor
>&
result
=
results
[
result_index
];
result
->
begin_read
(
data
->
get_data_ptr
(),
data
->
get_element_count
()
*
data
->
get_element_type
().
size
(),
(
buffer_number
-
1
)
&
1
);
}
t1
.
stop
();
float
time
=
t1
.
get_milliseconds
();
cout
<<
time
/
iterations
<<
"ms per iteration"
<<
endl
;
vector
<
runtime
::
PerformanceCounter
>
perf_data
=
compiled_func
->
get_performance_data
();
return
perf_data
;
}
src/tools/nbench/benchmark.hpp
View file @
567bc822
...
...
@@ -34,3 +34,11 @@ std::vector<ngraph::runtime::PerformanceCounter> run_benchmark(std::shared_ptr<n
bool
timing_detail
,
int
warmup_iterations
,
bool
copy_data
);
std
::
vector
<
ngraph
::
runtime
::
PerformanceCounter
>
run_benchmark_double_buffered
(
std
::
shared_ptr
<
ngraph
::
Function
>
f
,
const
std
::
string
&
backend_name
,
size_t
iterations
,
bool
timing_detail
,
int
warmup_iterations
,
bool
copy_data
);
src/tools/nbench/nbench.cpp
View file @
567bc822
...
...
@@ -181,6 +181,7 @@ int main(int argc, char** argv)
int
warmup_iterations
=
1
;
bool
copy_data
=
true
;
bool
dot_file
=
false
;
bool
double_buffer
=
false
;
for
(
size_t
i
=
1
;
i
<
argc
;
i
++
)
{
...
...
@@ -229,6 +230,10 @@ int main(int argc, char** argv)
{
directory
=
argv
[
++
i
];
}
else
if
(
arg
==
"--double_buffer"
)
{
double_buffer
=
true
;
}
else
if
(
arg
==
"-w"
||
arg
==
"--warmup_iterations"
)
{
try
...
...
@@ -283,6 +288,7 @@ OPTIONS
-w|--warmup_iterations Number of warm-up iterations
--no_copy_data Disable copy of input/result data every iteration
--dot Generate Graphviz dot file
--double_buffer Double buffer inputs and outputs
)###"
;
return
1
;
}
...
...
@@ -420,8 +426,17 @@ OPTIONS
{
cout
<<
"
\n
---- Benchmark ----
\n
"
;
shared_ptr
<
Function
>
f
=
deserialize
(
model
);
auto
perf_data
=
run_benchmark
(
vector
<
runtime
::
PerformanceCounter
>
perf_data
;
if
(
double_buffer
)
{
perf_data
=
run_benchmark_double_buffered
(
f
,
backend
,
iterations
,
timing_detail
,
warmup_iterations
,
copy_data
);
}
else
{
perf_data
=
run_benchmark
(
f
,
backend
,
iterations
,
timing_detail
,
warmup_iterations
,
copy_data
);
}
auto
perf_shape
=
to_perf_shape
(
f
,
perf_data
);
aggregate_perf_data
.
insert
(
aggregate_perf_data
.
end
(),
perf_shape
.
begin
(),
perf_shape
.
end
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment