ngraph / Commits / 38ba5c12

Commit 38ba5c12, authored Jul 27, 2018 by Fenglei; committed by Scott Cyphers, Jul 27, 2018
gpu concat optimize (#1259)
* optimize concat
* compile success
* multi inputs
* clang format
Parent: deacf29a
Showing 5 changed files with 122 additions and 77 deletions
src/ngraph/runtime/gpu/cuda_emitter.cpp             +94  -0
src/ngraph/runtime/gpu/cuda_emitter.hpp             +5   -0
src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp  +5   -5
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp +0   -40
src/ngraph/runtime/gpu/gpu_emitter.cpp              +18  -32
src/ngraph/runtime/gpu/cuda_emitter.cpp
@@ -69,6 +69,100 @@ runtime::gpu::CUDAEmitter::CUDAEmitter(runtime::gpu::GPUPrimitiveEmitter* emitte
     m_ctx = ctx;
 }
 
+size_t runtime::gpu::CUDAEmitter::build_concat(const std::vector<std::string>& dtypes,
+                                               std::vector<GPUShape> input_shapes,
+                                               size_t concat_axis,
+                                               GPUShape output_shape)
+{
+    std::stringstream kernel_name;
+    size_t input_size = input_shapes.size();
+    kernel_name << "concat_" << join(dtypes, "_") << "_r_" << input_size;
+
+    std::stringstream hash;
+    hash << kernel_name.str() << "_o_" << join(output_shape, "_") << "_a_" << concat_axis;
+    for (size_t i = 0; i < input_size; i++)
+    {
+        hash << "_i_" << join(input_shapes[i], "_");
+    }
+
+    // For backwards compatability we currently use two unordered maps
+    // 1. one looks up the compiled cuda kernel (CudaFunctionPool)
+    // 2. the other looks to see if this kernel is already in the primitive list
+
+    // check if the requested kernel is already an inserted primitive
+    size_t primitive_index = m_primitive_emitter->lookup(hash.str());
+    if (primitive_index != std::numeric_limits<size_t>::max())
+    {
+        return primitive_index;
+    }
+
+    // check if the kernel has already been compiled. if so, create
+    // a launch primitive for it based on the input tensor shape
+    // but do not recompile the kernel. otherwise, do it all:
+    // recompile the kernel and then create the primitive
+    auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
+    if (compiled_kernel == nullptr)
+    {
+        codegen::CodeWriter writer;
+        CudaKernelBuilder::add_pod_typedefs(writer);
+        CudaKernelBuilder::get_concat_op(writer, kernel_name.str(), dtypes, input_shapes.size());
+        compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
+    }
+
+    std::vector<uint32_t> block_strides(input_size, 1);
+    uint32_t block_size = 0;
+    for (size_t i = 0; i < input_size; i++)
+    {
+        auto arg_rank = input_shapes[i].size();
+        for (size_t j = concat_axis; j < arg_rank; j++)
+        {
+            block_strides[i] *= input_shapes[i][j];
+        }
+        block_size += block_strides[i];
+    }
+
+    uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
+    //TODO: currently we set it to 64, will add tuning method later
+    uint32_t block_size_x = 64;
+    uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
+
+    // get an allocator for transient per kernel gpu memory
+    GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
+    size_t idx_block_strides =
+        allocator.reserve_argspace(block_strides.data(), block_strides.size() * sizeof(uint32_t));
+
+    // create the launch primitive
+    std::unique_ptr<gpu::primitive> kernel_launch(
+        new gpu::primitive{[=](void** inputs, void** outputs) mutable {
+            void* param_block_strides =
+                runtime::gpu::invoke_memory_primitive(m_ctx, idx_block_strides);
+            std::vector<void*> args_list;
+            for (size_t i = 0; i < input_size; i++)
+            {
+                args_list.push_back(&inputs[i]);
+            }
+            args_list.push_back(&outputs[0]);
+            args_list.push_back(&param_block_strides);
+            args_list.push_back(&block_size);
+            args_list.push_back(&nthreads);
+
+            CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
+                                          aligned_grid_size_x,
+                                          1,
+                                          1, // grid dim
+                                          block_size_x,
+                                          1,
+                                          1, // block dim
+                                          0,
+                                          NULL, // shared mem and stream
+                                          args_list.data(),
+                                          0)); // arguments
+            debug_sync();
+        }});
+
+    primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
+    m_primitive_emitter->cache(hash.str(), primitive_index);
+    return primitive_index;
+}
+
 size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dtypes,
                                             GPUShape input_shape,
                                             GPUShape output_shape,
...
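The indexing scheme behind the new primitive is easiest to see with concrete numbers. The sketch below is not part of the commit; the shapes, data, and the main() harness are illustrative only. It mirrors the block-stride arithmetic from build_concat and the per-thread selection the generated kernel performs, but runs it on the CPU: concatenating shapes {2, 3} and {2, 4} along axis 1 gives block_strides {3, 4}, block_size 7, and each flat output index splits into block_id = tid / block_size and block_idx = tid % block_size.

// Standalone illustration (not from the repository): CPU version of the
// block-stride indexing used by the GPU concat primitive.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    // Two inputs of shape {2, 3} and {2, 4}, concatenated along axis 1 -> output {2, 7}.
    std::vector<std::vector<uint32_t>> input_shapes = {{2, 3}, {2, 4}};
    size_t concat_axis = 1;

    // block_strides[i]: product of input i's dimensions from concat_axis onward.
    // block_size: their sum, i.e. the width of one concatenated "row".
    std::vector<uint32_t> block_strides(input_shapes.size(), 1);
    uint32_t block_size = 0;
    for (size_t i = 0; i < input_shapes.size(); i++)
    {
        for (size_t j = concat_axis; j < input_shapes[i].size(); j++)
        {
            block_strides[i] *= input_shapes[i][j];
        }
        block_size += block_strides[i]; // here: 3 + 4 = 7
    }

    std::vector<float> in0 = {0, 1, 2, 3, 4, 5};               // shape {2, 3}
    std::vector<float> in1 = {10, 11, 12, 13, 14, 15, 16, 17}; // shape {2, 4}
    std::vector<const float*> inputs = {in0.data(), in1.data()};

    // One "thread" per output element, matching the launch with
    // nthreads = shape_size(output_shape).
    uint32_t nthreads = 2 * block_size;
    for (uint32_t tid = 0; tid < nthreads; tid++)
    {
        uint32_t block_id = tid / block_size;  // index along the axes before concat_axis
        uint32_t block_idx = tid % block_size; // offset within the concatenated row
        for (size_t i = 0; i < inputs.size(); i++)
        {
            if (block_idx < block_strides[i])
            {
                std::printf("out[%u] = %g\n",
                            static_cast<unsigned>(tid),
                            inputs[i][block_id * block_strides[i] + block_idx]);
                break;
            }
            block_idx -= block_strides[i];
        }
    }
    return 0;
}

With these shapes the first seven output elements come from row 0 of in0 followed by row 0 of in1, then the next seven from row 1 of each, which is exactly axis-1 concatenation.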
src/ngraph/runtime/gpu/cuda_emitter.hpp
@@ -135,6 +135,11 @@ namespace ngraph
                                  GPUShape filter_dilation,
                                  GPUShape output_shape);
 
+            size_t build_concat(const std::vector<std::string>& dtypes,
+                                std::vector<GPUShape> input_shapes,
+                                size_t concat_axis,
+                                GPUShape output_shape);
+
             void debug_sync();
             void sync();
...
src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp
@@ -252,17 +252,17 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
         writer << data_types[i] << "* in" << i << ", ";
     }
-    writer << data_types[num_inputs] << "* out, size_t* block_strides, size_t block_size, size_t n)\n";
+    writer << data_types[num_inputs] << "* out, uint32_t* block_strides, uint32_t block_size, uint32_t n)\n";
     writer.block_begin();
     {
-        writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
+        writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
         writer << "if(tid < n)\n";
         writer.block_begin();
         {
             writer << "out[tid] = 1;\n";
-            writer << "size_t output_idx = tid;\n";
-            writer << "size_t block_id = tid / block_size;\n";
-            writer << "size_t block_idx = tid % block_size;\n";
+            writer << "uint32_t output_idx = tid;\n";
+            writer << "uint32_t block_id = tid / block_size;\n";
+            writer << "uint32_t block_idx = tid % block_size;\n";
             writer << "bool processed = false;\n";
             for (size_t i = 0; i < num_inputs; i++)
             {
...
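The hunk above only switches the generated index types from size_t to uint32_t; the body of the per-input loop that get_concat_op emits lies outside the hunk. For orientation, the following is a hedged sketch of what the generated source might look like for two float inputs: the kernel name follows the kernel_name construction in build_concat, and the signature, tid computation, if(tid < n), out[tid] = 1, and index setup are taken from the emitted strings shown above, but the per-input branches are an assumption rather than code copied from the repository.

// Hedged reconstruction of a generated concat kernel for two float inputs
// (the per-input selection branches are assumed, not taken from the repo).
typedef unsigned int uint32_t; // add_pod_typedefs presumably emits typedefs like this
extern "C" __global__ void concat_float_float_float_r_2(
    float* in0, float* in1, float* out,
    uint32_t* block_strides, uint32_t block_size, uint32_t n)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        out[tid] = 1;
        uint32_t output_idx = tid;
        uint32_t block_id = tid / block_size;  // index along the axes before concat_axis
        uint32_t block_idx = tid % block_size; // offset inside the concatenated block
        bool processed = false;
        // One section per input: pick the input whose block covers block_idx.
        if (!processed)
        {
            if (block_idx < block_strides[0])
            {
                out[output_idx] = in0[block_id * block_strides[0] + block_idx];
                processed = true;
            }
            else
            {
                block_idx -= block_strides[0];
            }
        }
        if (!processed)
        {
            if (block_idx < block_strides[1])
            {
                out[output_idx] = in1[block_id * block_strides[1] + block_idx];
                processed = true;
            }
        }
    }
}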
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp
@@ -54,46 +54,6 @@ namespace ngraph
                                 CUdeviceptr reverse_axes,
                                 size_t rank,
                                 size_t count);
 
-            template <typename... Inputs>
-            void emit_concat_op(const std::string& name,
-                                const std::vector<std::string>& data_types,
-                                GPURuntimeContext* ctx,
-                                size_t count,
-                                size_t block_size,
-                                CUdeviceptr block_strides,
-                                CUdeviceptr out,
-                                Inputs&&... inputs)
-            {
-                std::string type_signature = "_" + join(data_types, "_");
-                std::replace(type_signature.begin(), type_signature.end(), ' ', '_');
-                auto compiled_kernel = ctx->compiled_kernel_pool->get(name + type_signature);
-                if (compiled_kernel == nullptr)
-                {
-                    codegen::CodeWriter writer;
-                    CudaKernelBuilder::add_pod_typedefs(writer);
-                    CudaKernelBuilder::get_concat_op(
-                        writer, name + type_signature, data_types, sizeof...(inputs));
-                    std::string kernel = writer.get_code();
-                    compiled_kernel = ctx->compiled_kernel_pool->set(name + type_signature, kernel);
-                }
-
-                void* args_list[] = {&inputs..., &out, &block_strides, &block_size, &count};
-                CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
-                                              count,
-                                              1,
-                                              1, // grid dim
-                                              1,
-                                              1,
-                                              1, // block dim
-                                              0,
-                                              NULL, // shared mem and stream
-                                              args_list,
-                                              0)); // arguments
-                CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
-            }
         }
     }
 }
src/ngraph/runtime/gpu/gpu_emitter.cpp
@@ -824,46 +824,32 @@ namespace ngraph
                auto concat = static_cast<const ngraph::op::Concat*>(node);
                auto axis = concat->get_concatenation_axis();
 
-               std::vector<size_t> block_strides(args.size(), 1);
-               size_t block_size = 0;
-               for (size_t i = 0; i < args.size(); i++)
+               std::vector<std::string> dtypes;
+               std::vector<GPUShape> input_shapes;
+               for (auto arg : args)
                {
-                   auto arg_shape = args[i].get_shape();
-                   auto arg_rank = arg_shape.size();
-                   for (size_t j = axis; j < arg_rank; j++)
-                   {
-                       block_strides[i] *= arg_shape[j];
-                   }
-                   block_size += block_strides[i];
+                   dtypes.push_back(arg.get_type());
+                   input_shapes.push_back(arg.get_shape());
                }
+               dtypes.push_back(out[0].get_type());
 
                writer.block_begin();
                {
-                   writer << "int count = " << out[0].get_size() << ";\n";
-                   writer << "int num_inputs = " << args.size() << ";\n";
-                   GPUAllocator allocator =
-                       external_function->get_primitive_emitter()->get_memory_allocator();
-                   size_t idx_block_strides = allocator.reserve_argspace(
-                       block_strides.data(), block_strides.size() * sizeof(size_t));
-                   writer << "void* block_strides_d = runtime::gpu::invoke_memory_primitive(ctx, "
-                          << idx_block_strides << ");\n";
-                   writer << "ngraph::runtime::gpu::emit_concat_op(\"" << node->description() << "\""
-                          << ", std::vector<std::string>{";
-                   for (size_t i = 0; i < args.size(); i++)
-                   {
-                       writer << "\"" << args[i].get_type() << "\", ";
-                   }
-                   writer << "\"" << out[0].get_type() << "\"}" << ", ctx" << ", count" << ", "
-                          << block_size << ", CUdeviceptr(block_strides_d)" << ", CUdeviceptr("
-                          << out[0].get_name() << ")";
-                   for (size_t i = 0; i < args.size(); i++)
+                   auto& cuda_emitter =
+                       external_function->get_primitive_emitter()->get_cuda_emitter();
+                   auto index = cuda_emitter->build_concat(
+                       dtypes, input_shapes, axis, out[0].get_shape());
+
+                   writer << "gpu::invoke_primitive(ctx, " << index << ", ";
+                   writer << "std::vector<void*>{" << args[0].get_name();
+                   for (size_t i = 1; i < args.size(); i++)
                    {
-                       writer << ", CUdeviceptr(" << args[i].get_name() << ")";
+                       writer << ", " << args[i].get_name();
                    }
+                   writer << "}.data(), ";
+                   writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
                    writer << ");\n";
                }
                writer.block_end();
...
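With this rewrite the emitter no longer writes the launch logic into the generated code; it builds (and caches) the concat primitive at compile time and only emits a call to invoke it. Assuming tensor names arg0, arg1, out0 and a cached primitive index of 3 (all illustrative placeholders, not taken from the repository), the emitted C++ for a two-input concat would read roughly:

// Illustrative output of the rewritten Concat case (names and index are placeholders)
{
    gpu::invoke_primitive(ctx,
                          3,
                          std::vector<void*>{arg0, arg1}.data(),
                          std::vector<void*>{out0}.data());
}

Compared with the removed emit_concat_op path, the kernel launch configuration and the block-stride argument now live inside the compiled primitive, so the generated code no longer needs the per-op CUdeviceptr and block_strides_d plumbing that the old emitter wrote out.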