Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
ngraph
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ngraph
Commits
013c2381
Commit
013c2381
authored
Feb 14, 2019
by
Sergey Shalnov
Committed by
Robert Kimball
Feb 14, 2019
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
IntelGPU backend: Separate backend and executable classes (#2447)
parent
65141c5f
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
428 additions
and
364 deletions
+428
-364
CMakeLists.txt
src/ngraph/runtime/intelgpu/CMakeLists.txt
+1
-0
intelgpu_backend.cpp
src/ngraph/runtime/intelgpu/intelgpu_backend.cpp
+3
-326
intelgpu_backend.hpp
src/ngraph/runtime/intelgpu/intelgpu_backend.hpp
+0
-38
intelgpu_executable.cpp
src/ngraph/runtime/intelgpu/intelgpu_executable.cpp
+335
-0
intelgpu_executable.hpp
src/ngraph/runtime/intelgpu/intelgpu_executable.hpp
+68
-0
intelgpu_op_custom_kernels.cpp
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.cpp
+19
-0
intelgpu_op_custom_kernels.hpp
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp
+2
-0
No files found.
src/ngraph/runtime/intelgpu/CMakeLists.txt
View file @
013c2381
...
...
@@ -16,6 +16,7 @@
set
(
SRC
intelgpu_backend.cpp
intelgpu_executable.cpp
intelgpu_tensor_view.cpp
intelgpu_layout.cpp
intelgpu_op_batchnorm.cpp
...
...
src/ngraph/runtime/intelgpu/intelgpu_backend.cpp
View file @
013c2381
...
...
@@ -15,8 +15,6 @@
//*****************************************************************************
#include <iomanip>
#include <sys/resource.h>
#include <sys/time.h>
#include <CPP/activation.hpp>
#include <CPP/activation_grad.hpp>
...
...
@@ -37,9 +35,7 @@
#include <CPP/mutable_data.hpp>
#include <CPP/permute.hpp>
#include <CPP/pooling.hpp>
#include <CPP/reorder.hpp>
#include <CPP/reshape.hpp>
#include <CPP/scale.hpp>
#include <CPP/select.hpp>
#include <CPP/softmax.hpp>
#include <CPP/topology.hpp>
...
...
@@ -51,6 +47,7 @@
#include "ngraph/pass/nop_elimination.hpp"
#include "ngraph/pass/reshape_elimination.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_backend.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_executable.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
...
...
@@ -141,21 +138,6 @@ static void arguments_check(const shared_ptr<Node>& op, size_t input, size_t out
}
}
static
void
memory_size_check
(
size_t
memory_size
,
const
shared_ptr
<
Node
>&
node
,
const
string
&
function_name
)
{
const
size_t
tensor_size
=
shape_size
(
node
->
get_shape
())
*
node
->
get_element_type
().
size
();
if
(
memory_size
!=
tensor_size
)
{
ostringstream
os
;
os
<<
"IntelGPU backend failed memory check. In
\"
"
<<
function_name
<<
"
\"
with Node
\"
"
<<
node
->
get_name
()
<<
"
\"
and "
<<
node
->
get_shape
()
<<
" mismatched memory sizes "
<<
tensor_size
<<
" and "
<<
memory_size
;
throw
invalid_argument
(
os
.
str
());
}
}
static
const
string
&
get_input_name
(
const
shared_ptr
<
Node
>&
op
,
size_t
num
=
0
)
{
return
op
->
get_inputs
().
at
(
num
).
get_tensor
().
get_name
();
...
...
@@ -312,22 +294,6 @@ extern "C" void delete_backend(runtime::Backend* backend)
delete
backend
;
}
// Returns the peak resident set size of the current process in bytes,
// or 0 when getrusage() fails.
static size_t get_max_memory_rss()
{
    struct rusage usage;
    if (getrusage(RUSAGE_SELF, &usage) != 0)
    {
        return 0;
    }

    // ru_maxrss is reported in kilobytes, i.e. units of 1024 bytes,
    // so the conversion factor to bytes is 1024 (not 1000).
    return static_cast<size_t>(usage.ru_maxrss) * 1024;
}
runtime
::
intelgpu
::
IntelGPUBackend
::
IntelGPUBackend
()
{
bool
profiling
=
false
;
...
...
@@ -433,7 +399,7 @@ shared_ptr<runtime::Executable>
if
(
m_profile_enable
)
{
consumed_memory
=
get_max_memory_rss
();
consumed_memory
=
runtime
::
intelgpu
::
get_max_memory_rss
();
timer_compile
.
start
();
}
...
...
@@ -1831,7 +1797,7 @@ shared_ptr<runtime::Executable>
{
timer_compile
.
stop
();
compilation_time
=
timer_compile
.
get_milliseconds
();
consumed_memory
=
get_max_memory_rss
()
-
consumed_memory
;
consumed_memory
=
runtime
::
intelgpu
::
get_max_memory_rss
()
-
consumed_memory
;
}
rc
=
make_shared
<
IntelGPUExecutable
>
(
func
,
...
...
@@ -1849,102 +1815,6 @@ shared_ptr<runtime::Executable>
return
rc
;
}
// Builds an executable around an already compiled cldnn network.
//
// func                      - nGraph function the network was compiled from
// network                   - compiled cldnn network to execute
// enable_timing             - enables get_performance_data() output
// enable_profile            - enables the profiling report printed by call()
// compilation_time          - compilation time forwarded to the profiling report
// consumed_memory           - memory consumed by compilation, forwarded to the report
// profile_lines_limit_count - maximum number of per-primitive report lines
runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> func,
                                                          shared_ptr<cldnn::network> network,
                                                          bool enable_timing,
                                                          bool enable_profile,
                                                          double compilation_time,
                                                          double consumed_memory,
                                                          size_t profile_lines_limit_count)
    : m_function(func)
    , m_cldnn_network(network)
    , m_performance_counters_enabled(enable_timing)
    , m_profile_enable(enable_profile)
    , m_compilation_time(compilation_time)
    , m_consumed_memory(consumed_memory)
    , m_profile_lines_limit_count(profile_lines_limit_count)
{
    // Register the function's parameters and results with the Executable base
    set_parameters_and_results(*func);
}
bool
runtime
::
intelgpu
::
IntelGPUExecutable
::
call
(
const
vector
<
shared_ptr
<
runtime
::
Tensor
>>&
outputs
,
const
vector
<
shared_ptr
<
runtime
::
Tensor
>>&
inputs
)
{
double
mem_call_consumed
=
0.0
f
;
stopwatch
timer_call
;
if
(
m_cldnn_network
==
nullptr
)
{
throw
runtime_error
(
"compile() must be called before call()."
);
}
if
(
m_profile_enable
)
{
mem_call_consumed
=
get_max_memory_rss
();
timer_call
.
start
();
}
// Process input parameters. Correctness of parameters was validated by validate_call.
// Since we have no correlation between Function::m_parameters and inputs, there is
// we try to match them by index number in vectors.
for
(
size_t
i
=
0
;
i
<
inputs
.
size
();
i
++
)
{
shared_ptr
<
runtime
::
intelgpu
::
IntelGPUTensorView
>
tv
=
static_pointer_cast
<
runtime
::
intelgpu
::
IntelGPUTensorView
>
(
inputs
[
i
]);
const
ParameterVector
&
input_params
=
get_parameters
();
const
string
&
tensor_name
=
input_params
[
i
]
->
get_output_tensor
().
get_name
();
m_cldnn_network
->
set_input_data
(
tensor_name
,
*
tv
->
get_data_ptr
());
}
// Execute network
map
<
cldnn
::
primitive_id
,
cldnn
::
network_output
>
result
=
m_cldnn_network
->
execute
();
// Process output parameters. Correctness of parameters was validated by validate_call.
// Since we have no correlation between Function::m_results and outputs, there is
// we try to match them by index number in vectors.
for
(
size_t
i
=
0
;
i
<
m_function
->
get_output_size
();
i
++
)
{
const
shared_ptr
<
Node
>&
dst_node
=
m_function
->
get_output_op
(
i
);
const
size_t
dst_shape_size
=
shape_size
(
dst_node
->
get_shape
());
// We should not touch destination memory if it is not existed
if
(
!
dst_shape_size
)
{
continue
;
}
shared_ptr
<
runtime
::
intelgpu
::
IntelGPUTensorView
>
ngraph_res
=
static_pointer_cast
<
runtime
::
intelgpu
::
IntelGPUTensorView
>
(
outputs
[
i
]);
const
string
&
tensor_name
=
get_input_name
(
dst_node
);
auto
result_memory
=
result
.
at
(
tensor_name
).
get_memory
().
pointer
<
char
>
();
memory_size_check
(
result_memory
.
size
(),
dst_node
,
m_function
->
get_name
());
ngraph_res
->
write
(
result_memory
.
data
(),
0
,
result_memory
.
size
());
}
if
(
m_profile_enable
)
{
timer_call
.
stop
();
mem_call_consumed
=
get_max_memory_rss
()
-
mem_call_consumed
;
print_call_performance
(
m_cldnn_network
,
m_function
,
m_compilation_time
,
timer_call
.
get_milliseconds
(),
m_consumed_memory
,
mem_call_consumed
,
get_max_memory_rss
());
// Output compile time only once
m_compilation_time
=
0.0
;
m_consumed_memory
=
0.0
;
}
return
true
;
}
void
runtime
::
intelgpu
::
IntelGPUBackend
::
remove_compiled_function
(
shared_ptr
<
Executable
>
exec
)
{
for
(
auto
it
=
cldnn_networks
.
begin
();
it
!=
cldnn_networks
.
end
();
++
it
)
...
...
@@ -1957,199 +1827,6 @@ void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Exe
}
}
// The cldnn::network contains something like "generic_layer_0_Parameter_254_0" names
// This function should return "Parameter_254" from the example above
//
// Rules, counting '_' separators from the end of cldnn_name:
//   - no separator at all        : the name is returned unchanged
//   - fewer than three separators: everything before the last separator
//   - three or more separators   : the text between the third-last and the last one
//
// func is not used by the conversion; it is kept for interface compatibility.
static string convert_cldnn_names(shared_ptr<Function> func, const string& cldnn_name)
{
    const string key("_");

    // Finds the last key strictly before position pos. Guarding pos avoids the
    // unsigned wrap-around of "pos - 1", which would restart the search from
    // the end of the string and find the same separator again.
    const auto rfind_before = [&cldnn_name, &key](size_t pos) -> size_t {
        if (pos == std::string::npos || pos == 0)
        {
            return std::string::npos;
        }
        return cldnn_name.rfind(key, pos - 1);
    };

    const size_t last_key = cldnn_name.rfind(key);
    if (last_key == std::string::npos)
    {
        return cldnn_name;
    }

    const size_t pre_last_key = rfind_before(last_key);
    const size_t pre_pre_last_key = rfind_before(pre_last_key);

    if (pre_pre_last_key == std::string::npos)
    {
        return cldnn_name.substr(0, last_key);
    }

    return cldnn_name.substr(pre_pre_last_key + 1, last_key - pre_pre_last_key - 1);
}
// Collects one PerformanceCounter per executed cldnn primitive.
//
// The counter name is the primitive name converted by convert_cldnn_names();
// the counter value is the sum of the primitive's "executing" profiling
// intervals in microseconds, with a call count of 1.
// Returns an empty vector when no network is attached or timing is disabled.
vector<runtime::PerformanceCounter>
    runtime::intelgpu::IntelGPUExecutable::get_performance_data() const
{
    vector<runtime::PerformanceCounter> rc;

    if (m_cldnn_network != nullptr && m_performance_counters_enabled)
    {
        const map<cldnn::primitive_id, cldnn::event>& primitives =
            m_cldnn_network->get_executed_primitives();
        for (const auto& p : primitives)
        {
            // Let's generate the primitive name that matches to the name in Function
            const string primitive_name = convert_cldnn_names(m_function, p.first);
            size_t usec = 0;
            for (const auto& q : p.second.get_profiling_info())
            {
                if (q.name == string("executing"))
                {
                    // The accumulator holds microseconds; casting to a millisecond
                    // period here would truncate every short interval to zero.
                    usec += chrono::duration_cast<
                                chrono::duration<size_t, chrono::microseconds::period>>(
                                q.value->value())
                                .count();
                }
            }
            const runtime::PerformanceCounter perf_counter(primitive_name.c_str(), usec, 1);
            rc.push_back(perf_counter);
        }
    }
    return rc;
}
// Looks up the first op of func whose name equals `name`.
// Returns a non-owning pointer to that node, or nullptr when nothing matches.
static Node* get_node_by_name(const shared_ptr<Function> func, const string& name)
{
    Node* match = nullptr;

    for (const shared_ptr<Node>& candidate : func->get_ops())
    {
        if (candidate->get_name() == name)
        {
            match = candidate.get();
            break;
        }
    }

    return match;
}
void
runtime
::
intelgpu
::
IntelGPUExecutable
::
print_call_performance
(
const
shared_ptr
<
cldnn
::
network
>
network
,
const
shared_ptr
<
Function
>
func
,
double
time_compile
,
double
time_call
,
double
mem_compilation_consumed
,
double
mem_call_consumed
,
double
mem_current
)
const
{
struct
data_item
{
string
item_name
;
map
<
string
,
double
>
item_times
;
};
const
string
&
func_name
=
func
->
get_name
();
const
map
<
cldnn
::
primitive_id
,
cldnn
::
event
>&
primitives
=
network
->
get_executed_primitives
();
size_t
limit_count
=
m_profile_lines_limit_count
;
multimap
<
double
,
data_item
>
data
;
map
<
string
,
double
>
total_interval_times
;
double
total_executing_time
=
0
;
size_t
total_items_count
=
0
;
size_t
max_item_name_size
=
0
;
ios_base
::
fmtflags
saved_stream_flags
(
cout
.
flags
());
// Save stream flags to restore them later
if
(
m_profile_lines_limit_count
>
0
)
{
// Extract profiling statistic, calculate summary and sort
for
(
auto
&
prim
:
primitives
)
{
double
executing_time
=
0
;
data_item
item
;
item
.
item_name
=
prim
.
first
;
max_item_name_size
=
max
(
max_item_name_size
,
prim
.
first
.
size
());
for
(
auto
&
prof_info
:
prim
.
second
.
get_profiling_info
())
{
const
string
&
interval_name
=
prof_info
.
name
;
double
interval
=
chrono
::
duration_cast
<
chrono
::
duration
<
double
,
chrono
::
milliseconds
::
period
>>
(
prof_info
.
value
->
value
())
.
count
();
item
.
item_times
[
interval_name
]
=
interval
;
// Get the Key time to sort by
if
(
interval_name
==
"executing"
)
{
executing_time
+=
interval
;
}
// Accumulate total time for each interval
if
(
total_interval_times
.
find
(
interval_name
)
==
total_interval_times
.
end
())
{
total_interval_times
[
interval_name
]
=
interval
;
}
else
{
total_interval_times
[
interval_name
]
+=
interval
;
}
}
data
.
emplace
(
executing_time
,
item
);
total_executing_time
+=
executing_time
;
++
total_items_count
;
}
// Print statistic for each primitive in the cldnn::network
for
(
auto
it
=
data
.
rbegin
();
(
it
!=
data
.
rend
())
&&
(
limit_count
>
0
);
++
it
,
--
limit_count
)
{
const
string
ngraph_node_name
=
convert_cldnn_names
(
func
,
it
->
second
.
item_name
);
const
Node
*
ngraph_node
=
get_node_by_name
(
func
,
ngraph_node_name
);
cout
<<
func_name
<<
delim
<<
setw
(
max_item_name_size
)
<<
it
->
second
.
item_name
<<
delim
<<
"time(ms)"
<<
delim
<<
scientific
<<
setprecision
(
2
)
<<
it
->
first
;
for
(
auto
item
:
it
->
second
.
item_times
)
{
cout
<<
delim
<<
item
.
first
<<
"(ms)"
<<
delim
<<
item
.
second
;
}
cout
<<
delim
<<
ngraph_node_name
;
if
(
ngraph_node
)
// it might be initialized by nullptr
{
// print all input shapes for the Node
size_t
arg_idx
=
0
;
for
(
const
descriptor
::
Input
&
op_input
:
ngraph_node
->
get_inputs
())
{
cout
<<
delim
<<
op_input
.
get_element_type
().
c_type_string
()
<<
" input"
<<
arg_idx
<<
vector_to_string
(
op_input
.
get_shape
());
++
arg_idx
;
}
// print all output shapes for the Node
arg_idx
=
0
;
for
(
const
descriptor
::
Output
&
op_output
:
ngraph_node
->
get_outputs
())
{
cout
<<
delim
<<
op_output
.
get_element_type
().
c_type_string
()
<<
" output"
<<
arg_idx
<<
vector_to_string
(
op_output
.
get_shape
());
++
arg_idx
;
}
}
cout
<<
"
\n
"
;
}
// Print bottom line summary
const
string
total_items_count_string
=
"Total(cldnn "
+
to_string
(
total_items_count
)
+
", ngraph "
+
to_string
(
func
->
get_ops
().
size
())
+
")"
;
cout
<<
func_name
<<
delim
<<
setw
(
max_item_name_size
)
<<
total_items_count_string
<<
delim
<<
"time(ms)"
<<
delim
<<
scientific
<<
setprecision
(
2
)
<<
total_executing_time
;
for
(
auto
item_times
:
total_interval_times
)
{
cout
<<
delim
<<
item_times
.
first
<<
"(ms)"
<<
delim
<<
item_times
.
second
;
}
cout
<<
"
\n
"
;
}
// Print time and memory consumed in ::call function
cout
<<
func_name
<<
delim
<<
" Backend compilation(ms)"
<<
delim
<<
time_compile
<<
delim
<<
"call(ms)"
<<
delim
<<
time_call
<<
delim
<<
"memory consumption compile(B)"
<<
delim
<<
mem_compilation_consumed
<<
delim
<<
"call(B)"
<<
delim
<<
mem_call_consumed
<<
delim
<<
"RSS(B)"
<<
delim
<<
mem_current
<<
endl
;
cout
.
flags
(
saved_stream_flags
);
// Restore stream configuration to leave it in original state
}
bool
runtime
::
intelgpu
::
IntelGPUBackend
::
is_supported_property
(
const
Property
prop
)
const
{
if
(
prop
==
Property
::
memory_attach
)
...
...
src/ngraph/runtime/intelgpu/intelgpu_backend.hpp
View file @
013c2381
...
...
@@ -20,7 +20,6 @@
#include <memory>
#include <CPP/engine.hpp>
#include <CPP/network.hpp>
#include "ngraph/runtime/backend.hpp"
...
...
@@ -31,7 +30,6 @@ namespace ngraph
namespace
intelgpu
{
class
IntelGPUBackend
;
class
IntelGPUExecutable
;
}
}
}
...
...
@@ -67,39 +65,3 @@ private:
bool
m_disable_backend_optimizations
=
false
;
std
::
string
m_cldnn_dump_dir
=
std
::
string
(
"intelgpu_codegen"
);
};
/// Executable of the IntelGPU backend: wraps a compiled cldnn::network together
/// with the nGraph Function it was built from and runs it on request.
class ngraph::runtime::intelgpu::IntelGPUExecutable : public runtime::Executable
{
public:
    /// \param func Function the network was compiled from
    /// \param network Compiled cldnn network to execute
    /// \param enable_timing Enables get_performance_data() output
    /// \param enable_profile Enables the profiling report printed by call()
    /// \param compilation_time Compilation time forwarded to the profiling report
    /// \param consumed_memory Memory consumed by compilation, forwarded to the report
    /// \param profile_lines_limit_count Maximum number of per-primitive report lines
    IntelGPUExecutable(std::shared_ptr<Function> func,
                       std::shared_ptr<cldnn::network> network,
                       bool enable_timing,
                       bool enable_profile,
                       double compilation_time,
                       double consumed_memory,
                       size_t profile_lines_limit_count);

    /// Executes the network; inputs and outputs are matched by position.
    bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
              const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;

    /// Returns one counter per executed cldnn primitive when timing is enabled.
    std::vector<PerformanceCounter> get_performance_data() const override;

private:
    std::shared_ptr<Function> m_function;
    std::shared_ptr<cldnn::network> m_cldnn_network{nullptr};
    bool m_performance_counters_enabled{false};
    bool m_profile_enable{false};
    double m_compilation_time{0.0};        // reported with the first profiled call only
    double m_consumed_memory{0.0};         // reported with the first profiled call only
    long m_profile_lines_limit_count{10};  // per-primitive line limit of the report
    std::string delim{":"};                // field separator of the profiling report

    // Statistic related things
    void print_call_performance(const std::shared_ptr<cldnn::network> network,
                                const std::shared_ptr<Function> func,
                                double time_compile,
                                double time_call,
                                double mem_compilation_consumed,
                                double mem_call_consumed,
                                double mem_current) const;
};
src/ngraph/runtime/intelgpu/intelgpu_executable.cpp
0 → 100644
View file @
013c2381
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <iomanip>
#include "ngraph/runtime/intelgpu/intelgpu_executable.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp"
#include "ngraph/util.hpp"
using
namespace
std
;
using
namespace
ngraph
;
static
void
memory_size_check
(
size_t
memory_size
,
const
shared_ptr
<
Node
>&
node
,
const
string
&
function_name
)
{
const
size_t
tensor_size
=
shape_size
(
node
->
get_shape
())
*
node
->
get_element_type
().
size
();
if
(
memory_size
!=
tensor_size
)
{
ostringstream
os
;
os
<<
"IntelGPU backend failed memory check. In
\"
"
<<
function_name
<<
"
\"
with Node
\"
"
<<
node
->
get_name
()
<<
"
\"
and "
<<
node
->
get_shape
()
<<
" mismatched memory sizes "
<<
tensor_size
<<
" and "
<<
memory_size
;
throw
invalid_argument
(
os
.
str
());
}
}
// Returns the name of the tensor connected to input number `num` of `op`
// (the first input by default). The input index is range-checked by at().
static const string& get_input_name(const shared_ptr<Node>& op, size_t num = 0)
{
    auto&& op_inputs = op->get_inputs();
    return op_inputs.at(num).get_tensor().get_name();
}
// The cldnn::network contains something like "generic_layer_0_Parameter_254_0" names
// This function should return "Parameter_254" from the example above
//
// Rules, counting '_' separators from the end of cldnn_name:
//   - no separator at all        : the name is returned unchanged
//   - fewer than three separators: everything before the last separator
//   - three or more separators   : the text between the third-last and the last one
//
// func is not used by the conversion; it is kept for interface compatibility.
static string convert_cldnn_names(shared_ptr<Function> func, const string& cldnn_name)
{
    const string key("_");

    // Finds the last key strictly before position pos. Guarding pos avoids the
    // unsigned wrap-around of "pos - 1", which would restart the search from
    // the end of the string and find the same separator again.
    const auto rfind_before = [&cldnn_name, &key](size_t pos) -> size_t {
        if (pos == std::string::npos || pos == 0)
        {
            return std::string::npos;
        }
        return cldnn_name.rfind(key, pos - 1);
    };

    const size_t last_key = cldnn_name.rfind(key);
    if (last_key == std::string::npos)
    {
        return cldnn_name;
    }

    const size_t pre_last_key = rfind_before(last_key);
    const size_t pre_pre_last_key = rfind_before(pre_last_key);

    if (pre_pre_last_key == std::string::npos)
    {
        return cldnn_name.substr(0, last_key);
    }

    return cldnn_name.substr(pre_pre_last_key + 1, last_key - pre_pre_last_key - 1);
}
// Builds an executable around an already compiled cldnn network.
//
// func                      - nGraph function the network was compiled from
// network                   - compiled cldnn network to execute
// enable_timing             - enables get_performance_data() output
// enable_profile            - enables the profiling report printed by call()
// compilation_time          - compilation time forwarded to the profiling report
// consumed_memory           - memory consumed by compilation, forwarded to the report
// profile_lines_limit_count - maximum number of per-primitive report lines
runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> func,
                                                          shared_ptr<cldnn::network> network,
                                                          bool enable_timing,
                                                          bool enable_profile,
                                                          double compilation_time,
                                                          double consumed_memory,
                                                          size_t profile_lines_limit_count)
    : m_function(func)
    , m_cldnn_network(network)
    , m_performance_counters_enabled(enable_timing)
    , m_profile_enable(enable_profile)
    , m_compilation_time(compilation_time)
    , m_consumed_memory(consumed_memory)
    , m_profile_lines_limit_count(profile_lines_limit_count)
{
    // Register the function's parameters and results with the Executable base
    set_parameters_and_results(*func);
}
bool
runtime
::
intelgpu
::
IntelGPUExecutable
::
call
(
const
vector
<
shared_ptr
<
runtime
::
Tensor
>>&
outputs
,
const
vector
<
shared_ptr
<
runtime
::
Tensor
>>&
inputs
)
{
double
mem_call_consumed
=
0.0
f
;
stopwatch
timer_call
;
if
(
m_cldnn_network
==
nullptr
)
{
throw
runtime_error
(
"compile() must be called before call()."
);
}
if
(
m_profile_enable
)
{
mem_call_consumed
=
runtime
::
intelgpu
::
get_max_memory_rss
();
timer_call
.
start
();
}
// Process input parameters. Correctness of parameters was validated by validate_call.
// Since we have no correlation between Function::m_parameters and inputs, there is
// we try to match them by index number in vectors.
for
(
size_t
i
=
0
;
i
<
inputs
.
size
();
i
++
)
{
shared_ptr
<
runtime
::
intelgpu
::
IntelGPUTensorView
>
tv
=
static_pointer_cast
<
runtime
::
intelgpu
::
IntelGPUTensorView
>
(
inputs
[
i
]);
const
ParameterVector
&
input_params
=
get_parameters
();
const
string
&
tensor_name
=
input_params
[
i
]
->
get_output_tensor
().
get_name
();
m_cldnn_network
->
set_input_data
(
tensor_name
,
*
tv
->
get_data_ptr
());
}
// Execute network
map
<
cldnn
::
primitive_id
,
cldnn
::
network_output
>
result
=
m_cldnn_network
->
execute
();
// Process output parameters. Correctness of parameters was validated by validate_call.
// Since we have no correlation between Function::m_results and outputs, there is
// we try to match them by index number in vectors.
for
(
size_t
i
=
0
;
i
<
m_function
->
get_output_size
();
i
++
)
{
const
shared_ptr
<
Node
>&
dst_node
=
m_function
->
get_output_op
(
i
);
const
size_t
dst_shape_size
=
shape_size
(
dst_node
->
get_shape
());
// We should not touch destination memory if it is not existed
if
(
!
dst_shape_size
)
{
continue
;
}
shared_ptr
<
runtime
::
intelgpu
::
IntelGPUTensorView
>
ngraph_res
=
static_pointer_cast
<
runtime
::
intelgpu
::
IntelGPUTensorView
>
(
outputs
[
i
]);
const
string
&
tensor_name
=
get_input_name
(
dst_node
);
auto
result_memory
=
result
.
at
(
tensor_name
).
get_memory
().
pointer
<
char
>
();
memory_size_check
(
result_memory
.
size
(),
dst_node
,
m_function
->
get_name
());
ngraph_res
->
write
(
result_memory
.
data
(),
0
,
result_memory
.
size
());
}
if
(
m_profile_enable
)
{
timer_call
.
stop
();
mem_call_consumed
=
runtime
::
intelgpu
::
get_max_memory_rss
()
-
mem_call_consumed
;
print_call_performance
(
m_cldnn_network
,
m_function
,
m_compilation_time
,
timer_call
.
get_milliseconds
(),
m_consumed_memory
,
mem_call_consumed
,
runtime
::
intelgpu
::
get_max_memory_rss
());
// Output compile time only once
m_compilation_time
=
0.0
;
m_consumed_memory
=
0.0
;
}
return
true
;
}
// Collects one PerformanceCounter per executed cldnn primitive.
//
// The counter name is the primitive name converted by convert_cldnn_names();
// the counter value is the sum of the primitive's "executing" profiling
// intervals in microseconds, with a call count of 1.
// Returns an empty vector when no network is attached or timing is disabled.
vector<runtime::PerformanceCounter>
    runtime::intelgpu::IntelGPUExecutable::get_performance_data() const
{
    vector<runtime::PerformanceCounter> rc;

    if (m_cldnn_network != nullptr && m_performance_counters_enabled)
    {
        const map<cldnn::primitive_id, cldnn::event>& primitives =
            m_cldnn_network->get_executed_primitives();
        for (const auto& p : primitives)
        {
            // Let's generate the primitive name that matches to the name in Function
            const string primitive_name = convert_cldnn_names(m_function, p.first);
            size_t usec = 0;
            for (const auto& q : p.second.get_profiling_info())
            {
                if (q.name == string("executing"))
                {
                    // The accumulator holds microseconds; casting to a millisecond
                    // period here would truncate every short interval to zero.
                    usec += chrono::duration_cast<
                                chrono::duration<size_t, chrono::microseconds::period>>(
                                q.value->value())
                                .count();
                }
            }
            const runtime::PerformanceCounter perf_counter(primitive_name.c_str(), usec, 1);
            rc.push_back(perf_counter);
        }
    }
    return rc;
}
// Looks up the first op of func whose name equals `name`.
// Returns a non-owning pointer to that node, or nullptr when nothing matches.
static Node* get_node_by_name(const shared_ptr<Function> func, const string& name)
{
    Node* match = nullptr;

    for (const shared_ptr<Node>& candidate : func->get_ops())
    {
        if (candidate->get_name() == name)
        {
            match = candidate.get();
            break;
        }
    }

    return match;
}
void
runtime
::
intelgpu
::
IntelGPUExecutable
::
print_call_performance
(
const
shared_ptr
<
cldnn
::
network
>
network
,
const
shared_ptr
<
Function
>
func
,
double
time_compile
,
double
time_call
,
double
mem_compilation_consumed
,
double
mem_call_consumed
,
double
mem_current
)
const
{
struct
data_item
{
string
item_name
;
map
<
string
,
double
>
item_times
;
};
const
string
&
func_name
=
func
->
get_name
();
const
map
<
cldnn
::
primitive_id
,
cldnn
::
event
>&
primitives
=
network
->
get_executed_primitives
();
size_t
limit_count
=
m_profile_lines_limit_count
;
multimap
<
double
,
data_item
>
data
;
map
<
string
,
double
>
total_interval_times
;
double
total_executing_time
=
0
;
size_t
total_items_count
=
0
;
size_t
max_item_name_size
=
0
;
ios_base
::
fmtflags
saved_stream_flags
(
cout
.
flags
());
// Save stream flags to restore them later
if
(
m_profile_lines_limit_count
>
0
)
{
// Extract profiling statistic, calculate summary and sort
for
(
auto
&
prim
:
primitives
)
{
double
executing_time
=
0
;
data_item
item
;
item
.
item_name
=
prim
.
first
;
max_item_name_size
=
max
(
max_item_name_size
,
prim
.
first
.
size
());
for
(
auto
&
prof_info
:
prim
.
second
.
get_profiling_info
())
{
const
string
&
interval_name
=
prof_info
.
name
;
double
interval
=
chrono
::
duration_cast
<
chrono
::
duration
<
double
,
chrono
::
milliseconds
::
period
>>
(
prof_info
.
value
->
value
())
.
count
();
item
.
item_times
[
interval_name
]
=
interval
;
// Get the Key time to sort by
if
(
interval_name
==
"executing"
)
{
executing_time
+=
interval
;
}
// Accumulate total time for each interval
if
(
total_interval_times
.
find
(
interval_name
)
==
total_interval_times
.
end
())
{
total_interval_times
[
interval_name
]
=
interval
;
}
else
{
total_interval_times
[
interval_name
]
+=
interval
;
}
}
data
.
emplace
(
executing_time
,
item
);
total_executing_time
+=
executing_time
;
++
total_items_count
;
}
// Print statistic for each primitive in the cldnn::network
for
(
auto
it
=
data
.
rbegin
();
(
it
!=
data
.
rend
())
&&
(
limit_count
>
0
);
++
it
,
--
limit_count
)
{
const
string
ngraph_node_name
=
convert_cldnn_names
(
func
,
it
->
second
.
item_name
);
const
Node
*
ngraph_node
=
get_node_by_name
(
func
,
ngraph_node_name
);
cout
<<
func_name
<<
delim
<<
setw
(
max_item_name_size
)
<<
it
->
second
.
item_name
<<
delim
<<
"time(ms)"
<<
delim
<<
scientific
<<
setprecision
(
2
)
<<
it
->
first
;
for
(
auto
item
:
it
->
second
.
item_times
)
{
cout
<<
delim
<<
item
.
first
<<
"(ms)"
<<
delim
<<
item
.
second
;
}
cout
<<
delim
<<
ngraph_node_name
;
if
(
ngraph_node
)
// it might be initialized by nullptr
{
// print all input shapes for the Node
size_t
arg_idx
=
0
;
for
(
const
descriptor
::
Input
&
op_input
:
ngraph_node
->
get_inputs
())
{
cout
<<
delim
<<
op_input
.
get_element_type
().
c_type_string
()
<<
" input"
<<
arg_idx
<<
vector_to_string
(
op_input
.
get_shape
());
++
arg_idx
;
}
// print all output shapes for the Node
arg_idx
=
0
;
for
(
const
descriptor
::
Output
&
op_output
:
ngraph_node
->
get_outputs
())
{
cout
<<
delim
<<
op_output
.
get_element_type
().
c_type_string
()
<<
" output"
<<
arg_idx
<<
vector_to_string
(
op_output
.
get_shape
());
++
arg_idx
;
}
}
cout
<<
"
\n
"
;
}
// Print bottom line summary
const
string
total_items_count_string
=
"Total(cldnn "
+
to_string
(
total_items_count
)
+
", ngraph "
+
to_string
(
func
->
get_ops
().
size
())
+
")"
;
cout
<<
func_name
<<
delim
<<
setw
(
max_item_name_size
)
<<
total_items_count_string
<<
delim
<<
"time(ms)"
<<
delim
<<
scientific
<<
setprecision
(
2
)
<<
total_executing_time
;
for
(
auto
item_times
:
total_interval_times
)
{
cout
<<
delim
<<
item_times
.
first
<<
"(ms)"
<<
delim
<<
item_times
.
second
;
}
cout
<<
"
\n
"
;
}
// Print time and memory consumed in ::call function
cout
<<
func_name
<<
delim
<<
" Backend compilation(ms)"
<<
delim
<<
time_compile
<<
delim
<<
"call(ms)"
<<
delim
<<
time_call
<<
delim
<<
"memory consumption compile(B)"
<<
delim
<<
mem_compilation_consumed
<<
delim
<<
"call(B)"
<<
delim
<<
mem_call_consumed
<<
delim
<<
"RSS(B)"
<<
delim
<<
mem_current
<<
endl
;
cout
.
flags
(
saved_stream_flags
);
// Restore stream configuration to leave it in original state
}
src/ngraph/runtime/intelgpu/intelgpu_executable.hpp
0 → 100644
View file @
013c2381
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <CPP/network.hpp>
#include "ngraph/runtime/tensor.hpp"
namespace
ngraph
{
namespace
runtime
{
namespace
intelgpu
{
class
IntelGPUExecutable
;
}
}
}
class
ngraph
::
runtime
::
intelgpu
::
IntelGPUExecutable
:
public
runtime
::
Executable
{
public
:
IntelGPUExecutable
(
std
::
shared_ptr
<
Function
>
func
,
std
::
shared_ptr
<
cldnn
::
network
>
network
,
bool
enable_timing
,
bool
enable_profile
,
double
compilation_time
,
double
consumed_memory
,
size_t
profile_lines_limit_count
);
bool
call
(
const
std
::
vector
<
std
::
shared_ptr
<
runtime
::
Tensor
>>&
outputs
,
const
std
::
vector
<
std
::
shared_ptr
<
runtime
::
Tensor
>>&
inputs
)
override
;
std
::
vector
<
PerformanceCounter
>
get_performance_data
()
const
override
;
private
:
std
::
shared_ptr
<
Function
>
m_function
;
std
::
shared_ptr
<
cldnn
::
network
>
m_cldnn_network
=
nullptr
;
bool
m_performance_counters_enabled
=
false
;
bool
m_profile_enable
=
false
;
double
m_compilation_time
=
0.0
;
double
m_consumed_memory
=
0.0
;
long
m_profile_lines_limit_count
=
10
;
std
::
string
delim
=
std
::
string
(
":"
);
// Statistic related things
void
print_call_performance
(
const
std
::
shared_ptr
<
cldnn
::
network
>
network
,
const
std
::
shared_ptr
<
Function
>
func
,
double
time_compile
,
double
time_call
,
double
mem_compilation_consumed
,
double
mem_call_consumed
,
double
mem_current
)
const
;
};
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.cpp
View file @
013c2381
...
...
@@ -14,6 +14,9 @@
// limitations under the License.
//*****************************************************************************
#include <sys/resource.h>
#include <sys/time.h>
#include <CPP/concatenation.hpp>
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/reshape.hpp>
...
...
@@ -1515,3 +1518,19 @@ void runtime::intelgpu::do_reshape_operation(cldnn::topology& topology,
{
1
});
topology
.
add
(
op_reshape
);
}
// Returns the peak resident set size of the current process in bytes,
// or 0 when getrusage() fails.
size_t runtime::intelgpu::get_max_memory_rss()
{
    struct rusage usage;
    if (getrusage(RUSAGE_SELF, &usage) != 0)
    {
        return 0;
    }

    // ru_maxrss is reported in kilobytes, i.e. units of 1024 bytes,
    // so the conversion factor to bytes is 1024 (not 1000).
    return static_cast<size_t>(usage.ru_maxrss) * 1024;
}
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp
View file @
013c2381
...
...
@@ -33,6 +33,8 @@ namespace ngraph
{
namespace
intelgpu
{
size_t
get_max_memory_rss
();
void
do_pad_operation
(
cldnn
::
topology
&
topology
,
const
std
::
string
&
input_name
,
const
Shape
&
input_shape
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment