submodule / ngraph / Commits / 013c2381

Commit 013c2381
Authored 6 years ago by Sergey Shalnov; committed 6 years ago by Robert Kimball

IntelGPU backend: Separate backend and executable classes (#2447)

Parent: 65141c5f

Showing 7 changed files with 428 additions and 364 deletions (+428 / -364):
src/ngraph/runtime/intelgpu/CMakeLists.txt                    +1    -0
src/ngraph/runtime/intelgpu/intelgpu_backend.cpp              +3    -326
src/ngraph/runtime/intelgpu/intelgpu_backend.hpp              +0    -38
src/ngraph/runtime/intelgpu/intelgpu_executable.cpp           +335  -0
src/ngraph/runtime/intelgpu/intelgpu_executable.hpp           +68   -0
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.cpp    +19   -0
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp    +2    -0
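The substance of the change: IntelGPUBackend now does compilation only, while the new IntelGPUExecutable class owns execution and profiling, completing ngraph's compile()/call() split for this backend. A minimal usage sketch of the resulting API, with assumptions flagged (the "INTELGPU" factory string follows ngraph's backend-registration convention but is not shown in this diff; building the Function and the tensors is elided):

#include <memory>
#include <vector>

#include "ngraph/runtime/backend.hpp"

// Sketch only: how a caller exercises the split this commit introduces.
void run_once(const std::shared_ptr<ngraph::Function>& func,
              const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>& outputs,
              const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>& inputs)
{
    // "INTELGPU" is an assumed registration name, not confirmed by this diff.
    auto backend = ngraph::runtime::Backend::create("INTELGPU");
    auto exec = backend->compile(func); // IntelGPUBackend: build the cldnn::network
    exec->call(outputs, inputs);        // IntelGPUExecutable: run it and profile
}

Note from the diff that compile() measures its own time and memory and passes both into the IntelGPUExecutable constructor, so the first call() can report compile-time statistics alongside its own.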
src/ngraph/runtime/intelgpu/CMakeLists.txt

@@ -16,6 +16,7 @@
 set(SRC
     intelgpu_backend.cpp
+    intelgpu_executable.cpp
     intelgpu_tensor_view.cpp
     intelgpu_layout.cpp
     intelgpu_op_batchnorm.cpp
src/ngraph/runtime/intelgpu/intelgpu_backend.cpp

@@ -15,8 +15,6 @@
 //*****************************************************************************
 
 #include <iomanip>
-#include <sys/resource.h>
-#include <sys/time.h>
 
 #include <CPP/activation.hpp>
 #include <CPP/activation_grad.hpp>

@@ -37,9 +35,7 @@
 #include <CPP/mutable_data.hpp>
 #include <CPP/permute.hpp>
 #include <CPP/pooling.hpp>
-#include <CPP/reorder.hpp>
 #include <CPP/reshape.hpp>
-#include <CPP/scale.hpp>
 #include <CPP/select.hpp>
 #include <CPP/softmax.hpp>
 #include <CPP/topology.hpp>

@@ -51,6 +47,7 @@
 #include "ngraph/pass/nop_elimination.hpp"
 #include "ngraph/pass/reshape_elimination.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_backend.hpp"
+#include "ngraph/runtime/intelgpu/intelgpu_executable.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"

@@ -141,21 +138,6 @@ static void arguments_check(const shared_ptr<Node>& op, size_t input, size_t out
     }
 }
-
-static void memory_size_check(
-    size_t memory_size, const shared_ptr<Node>& node, const string& function_name)
-{
-    const size_t tensor_size = shape_size(node->get_shape()) * node->get_element_type().size();
-
-    if (memory_size != tensor_size)
-    {
-        ostringstream os;
-        os << "IntelGPU backend failed memory check. In \"" << function_name << "\" with Node \""
-           << node->get_name() << "\" and " << node->get_shape() << " mismatched memory sizes "
-           << tensor_size << " and " << memory_size;
-        throw invalid_argument(os.str());
-    }
-}
-
 static const string& get_input_name(const shared_ptr<Node>& op, size_t num = 0)
 {
     return op->get_inputs().at(num).get_tensor().get_name();

@@ -312,22 +294,6 @@ extern "C" void delete_backend(runtime::Backend* backend)
     delete backend;
 }
-
-static size_t get_max_memory_rss()
-{
-    size_t result = 0;
-    struct rusage usage;
-
-    if (getrusage(RUSAGE_SELF, &usage) == 0)
-    {
-        result = usage.ru_maxrss;
-        // the value is in kilobytes,
-        // scale the result to return bytes
-        result *= 1000;
-    }
-
-    return result;
-}
-
 runtime::intelgpu::IntelGPUBackend::IntelGPUBackend()
 {
     bool profiling = false;

@@ -433,7 +399,7 @@ shared_ptr<runtime::Executable>
     if (m_profile_enable)
     {
-        consumed_memory = get_max_memory_rss();
+        consumed_memory = runtime::intelgpu::get_max_memory_rss();
         timer_compile.start();
     }

@@ -1831,7 +1797,7 @@ shared_ptr<runtime::Executable>
     {
         timer_compile.stop();
         compilation_time = timer_compile.get_milliseconds();
-        consumed_memory = get_max_memory_rss() - consumed_memory;
+        consumed_memory = runtime::intelgpu::get_max_memory_rss() - consumed_memory;
     }
 
     rc = make_shared<IntelGPUExecutable>(func,

@@ -1849,102 +1815,6 @@ shared_ptr<runtime::Executable>
     return rc;
 }
 [-96 lines: the IntelGPUExecutable constructor and IntelGPUExecutable::call() are
  deleted here; they reappear in the new intelgpu_executable.cpp below, unchanged
  except that get_max_memory_rss() is called as runtime::intelgpu::get_max_memory_rss()]
 void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Executable> exec)
 {
     for (auto it = cldnn_networks.begin(); it != cldnn_networks.end(); ++it)

@@ -1957,199 +1827,6 @@ void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Exe
     }
 }
 [-193 lines: convert_cldnn_names(), IntelGPUExecutable::get_performance_data(),
  get_node_by_name() and IntelGPUExecutable::print_call_performance() are deleted
  here; they reappear unchanged in the new intelgpu_executable.cpp below]
 bool runtime::intelgpu::IntelGPUBackend::is_supported_property(const Property prop) const
 {
     if (prop == Property::memory_attach)
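The two one-line hunks above are the visible seam of the move: compile() keeps its time-and-memory bracket, but get_max_memory_rss() is no longer a file-local static and must be reached through its new home in runtime::intelgpu (declared in intelgpu_op_custom_kernels.hpp at the end of this commit). A condensed sketch of that bracket (reconstructed from the hunks with the m_profile_enable guard elided; measure_compile is a hypothetical stand-in for the body of compile()):

#include <utility>

#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp" // declares get_max_memory_rss()
#include "ngraph/util.hpp"                                        // ngraph::stopwatch

// Condensed from the @@ -433 and @@ -1831 hunks; not a verbatim excerpt.
template <typename BuildNetworkFn>
std::pair<double, double> measure_compile(BuildNetworkFn build_network)
{
    double consumed_memory = ngraph::runtime::intelgpu::get_max_memory_rss();
    ngraph::stopwatch timer_compile;
    timer_compile.start();

    build_network(); // translate the ngraph::Function into a cldnn::network

    timer_compile.stop();
    const double compilation_time = timer_compile.get_milliseconds();
    consumed_memory = ngraph::runtime::intelgpu::get_max_memory_rss() - consumed_memory;

    // Both figures are handed to the IntelGPUExecutable constructor, which
    // reports them from print_call_performance() on the first call().
    return {compilation_time, consumed_memory};
}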
src/ngraph/runtime/intelgpu/intelgpu_backend.hpp

@@ -20,7 +20,6 @@
 #include <memory>
 
 #include <CPP/engine.hpp>
-#include <CPP/network.hpp>
 
 #include "ngraph/runtime/backend.hpp"

@@ -31,7 +30,6 @@ namespace ngraph
         namespace intelgpu
         {
             class IntelGPUBackend;
-            class IntelGPUExecutable;
         }
     }
 }

@@ -67,39 +65,3 @@ private:
     bool m_disable_backend_optimizations = false;
     std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
 };
 [-36 lines: the IntelGPUExecutable class declaration is deleted here; it moves
  unchanged to the new header intelgpu_executable.hpp below]
src/ngraph/runtime/intelgpu/intelgpu_executable.cpp (new file, 0 → 100644)

//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#include <iomanip>

#include "ngraph/runtime/intelgpu/intelgpu_executable.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp"
#include "ngraph/util.hpp"

using namespace std;
using namespace ngraph;

static void memory_size_check(
    size_t memory_size, const shared_ptr<Node>& node, const string& function_name)
{
    const size_t tensor_size = shape_size(node->get_shape()) * node->get_element_type().size();

    if (memory_size != tensor_size)
    {
        ostringstream os;
        os << "IntelGPU backend failed memory check. In \"" << function_name << "\" with Node \""
           << node->get_name() << "\" and " << node->get_shape() << " mismatched memory sizes "
           << tensor_size << " and " << memory_size;
        throw invalid_argument(os.str());
    }
}

static const string& get_input_name(const shared_ptr<Node>& op, size_t num = 0)
{
    return op->get_inputs().at(num).get_tensor().get_name();
}

// The cldnn::network contains names like "generic_layer_0_Parameter_254_0".
// This function should return "Parameter_254" from the example above.
static string convert_cldnn_names(shared_ptr<Function> func, const string& cldnn_name)
{
    const string key("_");
    string result;

    const size_t last_key = cldnn_name.rfind(key);
    const size_t pre_last_key = cldnn_name.rfind(key, last_key - 1);
    const size_t pre_pre_last_key = cldnn_name.rfind(key, pre_last_key - 1);

    if (pre_pre_last_key == std::string::npos)
    {
        result = cldnn_name.substr(0, last_key);
    }
    else
    {
        result = cldnn_name.substr(pre_pre_last_key + 1, last_key - pre_pre_last_key - 1);
    }

    return result;
}

runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> func,
                                                          shared_ptr<cldnn::network> network,
                                                          bool enable_timing,
                                                          bool enable_profile,
                                                          double compilation_time,
                                                          double consumed_memory,
                                                          size_t profile_lines_limit_count)
{
    m_function = func;
    m_cldnn_network = network;
    m_performance_counters_enabled = enable_timing;
    m_profile_enable = enable_profile;
    m_compilation_time = compilation_time;
    m_consumed_memory = consumed_memory;
    m_profile_lines_limit_count = profile_lines_limit_count;

    set_parameters_and_results(*func);
}

bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
                                                 const vector<shared_ptr<runtime::Tensor>>& inputs)
{
    double mem_call_consumed = 0.0f;
    stopwatch timer_call;

    if (m_cldnn_network == nullptr)
    {
        throw runtime_error("compile() must be called before call().");
    }

    if (m_profile_enable)
    {
        mem_call_consumed = runtime::intelgpu::get_max_memory_rss();
        timer_call.start();
    }

    // Process input parameters. Correctness of parameters was validated by validate_call.
    // Since there is no direct correlation between Function::m_parameters and inputs,
    // we match them by index position in the vectors.
    for (size_t i = 0; i < inputs.size(); i++)
    {
        shared_ptr<runtime::intelgpu::IntelGPUTensorView> tv =
            static_pointer_cast<runtime::intelgpu::IntelGPUTensorView>(inputs[i]);
        const ParameterVector& input_params = get_parameters();
        const string& tensor_name = input_params[i]->get_output_tensor().get_name();
        m_cldnn_network->set_input_data(tensor_name, *tv->get_data_ptr());
    }

    // Execute network
    map<cldnn::primitive_id, cldnn::network_output> result = m_cldnn_network->execute();

    // Process output parameters. Correctness of parameters was validated by validate_call.
    // Since there is no direct correlation between Function::m_results and outputs,
    // we match them by index position in the vectors.
    for (size_t i = 0; i < m_function->get_output_size(); i++)
    {
        const shared_ptr<Node>& dst_node = m_function->get_output_op(i);
        const size_t dst_shape_size = shape_size(dst_node->get_shape());

        // We should not touch destination memory if it does not exist
        if (!dst_shape_size)
        {
            continue;
        }

        shared_ptr<runtime::intelgpu::IntelGPUTensorView> ngraph_res =
            static_pointer_cast<runtime::intelgpu::IntelGPUTensorView>(outputs[i]);
        const string& tensor_name = get_input_name(dst_node);

        auto result_memory = result.at(tensor_name).get_memory().pointer<char>();
        memory_size_check(result_memory.size(), dst_node, m_function->get_name());
        ngraph_res->write(result_memory.data(), 0, result_memory.size());
    }

    if (m_profile_enable)
    {
        timer_call.stop();
        mem_call_consumed = runtime::intelgpu::get_max_memory_rss() - mem_call_consumed;

        print_call_performance(m_cldnn_network,
                               m_function,
                               m_compilation_time,
                               timer_call.get_milliseconds(),
                               m_consumed_memory,
                               mem_call_consumed,
                               runtime::intelgpu::get_max_memory_rss());

        // Output compile time only once
        m_compilation_time = 0.0;
        m_consumed_memory = 0.0;
    }

    return true;
}

vector<runtime::PerformanceCounter>
    runtime::intelgpu::IntelGPUExecutable::get_performance_data() const
{
    vector<runtime::PerformanceCounter> rc;
    if (m_cldnn_network != nullptr && m_performance_counters_enabled)
    {
        const map<cldnn::primitive_id, cldnn::event>& primitives =
            m_cldnn_network->get_executed_primitives();
        for (const auto& p : primitives)
        {
            // Generate the primitive name that matches the name in Function
            const string primitive_name = convert_cldnn_names(m_function, p.first);
            size_t usec = 0;
            for (const auto& q : p.second.get_profiling_info())
            {
                if (q.name == string("executing"))
                {
                    usec += chrono::duration_cast<
                                chrono::duration<size_t, chrono::milliseconds::period>>(
                                q.value->value())
                                .count();
                }
            }
            const runtime::PerformanceCounter perf_counter(primitive_name.c_str(), usec, 1);
            rc.push_back(perf_counter);
        }
    }
    return rc;
}

static Node* get_node_by_name(const shared_ptr<Function> func, const string& name)
{
    for (shared_ptr<Node> node : func->get_ops())
    {
        if (node->get_name() == name)
        {
            return node.get();
        }
    }

    return nullptr;
}

void runtime::intelgpu::IntelGPUExecutable::print_call_performance(
    const shared_ptr<cldnn::network> network,
    const shared_ptr<Function> func,
    double time_compile,
    double time_call,
    double mem_compilation_consumed,
    double mem_call_consumed,
    double mem_current) const
{
    struct data_item
    {
        string item_name;
        map<string, double> item_times;
    };
    const string& func_name = func->get_name();
    const map<cldnn::primitive_id, cldnn::event>& primitives = network->get_executed_primitives();
    size_t limit_count = m_profile_lines_limit_count;
    multimap<double, data_item> data;
    map<string, double> total_interval_times;
    double total_executing_time = 0;
    size_t total_items_count = 0;
    size_t max_item_name_size = 0;

    ios_base::fmtflags saved_stream_flags(cout.flags()); // Save stream flags to restore them later

    if (m_profile_lines_limit_count > 0)
    {
        // Extract profiling statistics, calculate the summary and sort
        for (auto& prim : primitives)
        {
            double executing_time = 0;
            data_item item;
            item.item_name = prim.first;
            max_item_name_size = max(max_item_name_size, prim.first.size());

            for (auto& prof_info : prim.second.get_profiling_info())
            {
                const string& interval_name = prof_info.name;
                double interval =
                    chrono::duration_cast<chrono::duration<double, chrono::milliseconds::period>>(
                        prof_info.value->value())
                        .count();

                item.item_times[interval_name] = interval;

                // Get the key time to sort by
                if (interval_name == "executing")
                {
                    executing_time += interval;
                }

                // Accumulate total time for each interval
                if (total_interval_times.find(interval_name) == total_interval_times.end())
                {
                    total_interval_times[interval_name] = interval;
                }
                else
                {
                    total_interval_times[interval_name] += interval;
                }
            }
            data.emplace(executing_time, item);
            total_executing_time += executing_time;
            ++total_items_count;
        }

        // Print statistics for each primitive in the cldnn::network
        for (auto it = data.rbegin(); (it != data.rend()) && (limit_count > 0); ++it, --limit_count)
        {
            const string ngraph_node_name = convert_cldnn_names(func, it->second.item_name);
            const Node* ngraph_node = get_node_by_name(func, ngraph_node_name);

            cout << func_name << delim << setw(max_item_name_size) << it->second.item_name << delim
                 << "time(ms)" << delim << scientific << setprecision(2) << it->first;
            for (auto item : it->second.item_times)
            {
                cout << delim << item.first << "(ms)" << delim << item.second;
            }
            cout << delim << ngraph_node_name;

            if (ngraph_node) // it might be initialized by nullptr
            {
                // print all input shapes for the Node
                size_t arg_idx = 0;
                for (const descriptor::Input& op_input : ngraph_node->get_inputs())
                {
                    cout << delim << op_input.get_element_type().c_type_string() << " input"
                         << arg_idx << vector_to_string(op_input.get_shape());
                    ++arg_idx;
                }

                // print all output shapes for the Node
                arg_idx = 0;
                for (const descriptor::Output& op_output : ngraph_node->get_outputs())
                {
                    cout << delim << op_output.get_element_type().c_type_string() << " output"
                         << arg_idx << vector_to_string(op_output.get_shape());
                    ++arg_idx;
                }
            }
            cout << "\n";
        }

        // Print bottom line summary
        const string total_items_count_string = "Total(cldnn " + to_string(total_items_count) +
                                                ", ngraph " + to_string(func->get_ops().size()) +
                                                ")";
        cout << func_name << delim << setw(max_item_name_size) << total_items_count_string << delim
             << "time(ms)" << delim << scientific << setprecision(2) << total_executing_time;
        for (auto item_times : total_interval_times)
        {
            cout << delim << item_times.first << "(ms)" << delim << item_times.second;
        }
        cout << "\n";
    }

    // Print time and memory consumed by the ::call function
    cout << func_name << delim << " Backend compilation(ms)" << delim << time_compile << delim
         << "call(ms)" << delim << time_call << delim << "memory consumption compile(B)" << delim
         << mem_compilation_consumed << delim << "call(B)" << delim << mem_call_consumed << delim
         << "RSS(B)" << delim << mem_current << endl;

    cout.flags(saved_stream_flags); // Restore stream configuration to leave it in its original state
}
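A note on the name conversion above: the worked example in the comment ("generic_layer_0_Parameter_254_0" → "Parameter_254") is worth tracing, since the three rfind calls are easy to misread. A self-contained sketch re-implementing just that extraction (a hypothetical standalone copy for illustration; extract_node_name is not a name from the commit):

#include <cassert>
#include <iostream>
#include <string>

// Keep the substring between the third-from-last and the last underscore,
// or everything before the last underscore if there are fewer than three.
static std::string extract_node_name(const std::string& cldnn_name)
{
    const size_t last = cldnn_name.rfind('_');
    const size_t pre_last = cldnn_name.rfind('_', last - 1);
    const size_t pre_pre_last = cldnn_name.rfind('_', pre_last - 1);

    if (pre_pre_last == std::string::npos)
    {
        return cldnn_name.substr(0, last); // "Parameter_254_0" -> "Parameter_254"
    }
    return cldnn_name.substr(pre_pre_last + 1, last - pre_pre_last - 1);
}

int main()
{
    // The case described by the comment in the diff:
    assert(extract_node_name("generic_layer_0_Parameter_254_0") == "Parameter_254");
    // The short (npos) branch:
    assert(extract_node_name("Parameter_254_0") == "Parameter_254");
    std::cout << "ok\n";
}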
src/ngraph/runtime/intelgpu/intelgpu_executable.hpp (new file, 0 → 100644)

//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#pragma once

#include <CPP/network.hpp>

#include "ngraph/runtime/tensor.hpp"

namespace ngraph
{
    namespace runtime
    {
        namespace intelgpu
        {
            class IntelGPUExecutable;
        }
    }
}

class ngraph::runtime::intelgpu::IntelGPUExecutable : public runtime::Executable
{
public:
    IntelGPUExecutable(std::shared_ptr<Function> func,
                       std::shared_ptr<cldnn::network> network,
                       bool enable_timing,
                       bool enable_profile,
                       double compilation_time,
                       double consumed_memory,
                       size_t profile_lines_limit_count);

    bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
              const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;

    std::vector<PerformanceCounter> get_performance_data() const override;

private:
    std::shared_ptr<Function> m_function;
    std::shared_ptr<cldnn::network> m_cldnn_network = nullptr;
    bool m_performance_counters_enabled = false;
    bool m_profile_enable = false;
    double m_compilation_time = 0.0;
    double m_consumed_memory = 0.0;
    long m_profile_lines_limit_count = 10;
    std::string delim = std::string(":");

    // Statistic related things
    void print_call_performance(const std::shared_ptr<cldnn::network> network,
                                const std::shared_ptr<Function> func,
                                double time_compile,
                                double time_call,
                                double mem_compilation_consumed,
                                double mem_call_consumed,
                                double mem_current) const;
};
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.cpp

@@ -14,6 +14,9 @@
 // limitations under the License.
 //*****************************************************************************
+#include <sys/resource.h>
+#include <sys/time.h>
+
 #include <CPP/concatenation.hpp>
 #include <CPP/custom_gpu_primitive.hpp>
 #include <CPP/reshape.hpp>

@@ -1515,3 +1518,19 @@ void runtime::intelgpu::do_reshape_operation(cldnn::topology& topology,
                                   {1});
     topology.add(op_reshape);
 }
+
+size_t runtime::intelgpu::get_max_memory_rss()
+{
+    size_t result = 0;
+    struct rusage usage;
+
+    if (getrusage(RUSAGE_SELF, &usage) == 0)
+    {
+        result = usage.ru_maxrss;
+        // the value is in kilobytes,
+        // scale the result to return bytes
+        result *= 1000;
+    }
+
+    return result;
+}
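For reference, a minimal standalone version of the RSS probe added above (assuming Linux/POSIX; on Linux getrusage reports ru_maxrss in kilobytes, which is why the helper scales it, though the factor of 1000 rather than 1024 is an approximation):

#include <sys/resource.h>

#include <cstdio>

int main()
{
    struct rusage usage;
    if (getrusage(RUSAGE_SELF, &usage) == 0)
    {
        // On Linux, ru_maxrss is the peak resident set size in kilobytes
        // (macOS reports bytes instead, so portable code must special-case it).
        std::printf("peak RSS: %ld kB\n", usage.ru_maxrss);
    }
    return 0;
}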
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp

@@ -33,6 +33,8 @@ namespace ngraph
     {
         namespace intelgpu
         {
+            size_t get_max_memory_rss();
+
             void do_pad_operation(cldnn::topology& topology,
                                   const std::string& input_name,
                                   const Shape& input_shape,