submodule / ngraph / Commits / a86a9050

Commit a86a9050, authored Mar 14, 2018 by Fenglei, committed Mar 14, 2018 by Robert Kimball.

gpu add onehot op (#638)

* add onehot op
* refactor broadcast and onehot op

Parent: 9d89ffb9

Showing 7 changed files with 164 additions and 41 deletions (+164 / -41):
src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp    +48   -0
src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp     +8   -0
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp   +50  -25
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp   +16   -2
src/ngraph/runtime/gpu/gpu_emitter.cpp                +42   -5
test/autodiff.in.cpp                                   +0   -2
test/backend_test.in.cpp                               +0   -7
src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp

@@ -56,6 +56,54 @@ void runtime::gpu::CudaKernelBuilder::get_elementwise_op(
     return;
 }
 
+void runtime::gpu::CudaKernelBuilder::get_broadcast_op(
+    codegen::CodeWriter& writer,
+    const std::string& name,
+    const std::array<std::string, 2>& data_types)
+{
+    writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
+           << data_types[1] << "* out, size_t m, size_t k, size_t n)\n";
+    writer << "{\n";
+    writer.indent++;
+    {
+        writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
+        writer << "if (tid < n)\n";
+        writer << "{\n";
+        writer.indent++;
+        {
+            writer << "size_t idx = tid / (m * k) * m + tid % m;\n";
+            writer << "out[tid] = in[idx];\n";
+        }
+        writer.indent--;
+        writer << "}\n";
+    }
+    writer.indent--;
+    writer << "}\n";
+}
+
+void runtime::gpu::CudaKernelBuilder::get_onehot_op(
+    codegen::CodeWriter& writer,
+    const std::string& name,
+    const std::array<std::string, 2>& data_types)
+{
+    writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
+           << data_types[1] << "* out, size_t m, size_t k, size_t n)\n";
+    writer << "{\n";
+    writer.indent++;
+    {
+        writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
+        writer << "if (tid < n)\n";
+        writer << "{\n";
+        writer.indent++;
+        {
+            writer << "size_t idx = (tid / m) * m * k + (m * in[tid]) + tid % m;\n";
+            writer << "out[idx] = 1;\n";
+        }
+        writer.indent--;
+        writer << "}\n";
+    }
+    writer.indent--;
+    writer << "}\n";
+}
+
 void runtime::gpu::CudaKernelBuilder::get_device_helper(
     codegen::CodeWriter& writer,
     const std::string& name,
     ...
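Both generated kernels share the (m, k, n) convention: m is repeat_size (the contiguous block being repeated), k is repeat_times (the extent of the broadcast/one-hot axis), and n is the thread count. As a concrete illustration, for data_types {"int32_t", "float"} the writer calls above would emit source along these lines (expanded by hand from the writer calls; not text from the commit):

    extern "C" __global__ void cuda_onehot_int32_t_float(
        int32_t* in, float* out, size_t m, size_t k, size_t n)
    {
        size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n)
        {
            size_t idx = (tid / m) * m * k + (m * in[tid]) + tid % m;
            out[idx] = 1;
        }
    }

The same index arithmetic can be checked on the host. The sketch below mirrors the two kernels one-to-one; broadcast_ref and onehot_ref are hypothetical helpers for illustration, not part of this commit:

    #include <cstddef>
    #include <vector>

    // Mirrors the broadcast kernel: out[tid] = in[tid / (m * k) * m + tid % m].
    // in holds outer * m elements; out holds outer * k * m elements.
    std::vector<float> broadcast_ref(const std::vector<float>& in, size_t m, size_t k)
    {
        std::vector<float> out(in.size() * k);
        for (size_t tid = 0; tid < out.size(); tid++)
        {
            out[tid] = in[tid / (m * k) * m + tid % m];
        }
        return out;
    }

    // Mirrors the one-hot kernel: one thread per *input* element, writing a 1 at
    // (tid / m) * m * k + m * in[tid] + tid % m into a zero-filled output
    // (the commit zero-fills with runtime::gpu::cuda_memset before launching).
    std::vector<float> onehot_ref(const std::vector<int>& in, size_t m, size_t k)
    {
        std::vector<float> out(in.size() * k, 0.0f);
        for (size_t tid = 0; tid < in.size(); tid++)
        {
            out[(tid / m) * m * k + m * in[tid] + tid % m] = 1.0f;
        }
        return out;
    }

For example, onehot_ref({2, 1, 0}, 1, 3) places ones at flat offsets 2, 4, and 6 of a row-major {3, 3} result: rows (0,0,1), (0,1,0), (1,0,0).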
src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp

@@ -39,6 +39,14 @@ namespace ngraph
                 const std::array<std::string, 2>& data_types,
                 const size_t& num_inputs);
 
+            static void get_broadcast_op(codegen::CodeWriter& writer,
+                                         const std::string& name,
+                                         const std::array<std::string, 2>& data_types);
+
+            static void get_onehot_op(codegen::CodeWriter& writer,
+                                      const std::string& name,
+                                      const std::array<std::string, 2>& data_types);
+
             static void get_device_helper(codegen::CodeWriter& writer,
                                           const std::string& name,
                                           const std::string& math_kernel,
                                           ...
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp

@@ -21,40 +21,65 @@
 #include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
 
 using namespace ngraph;
 using namespace ngraph::runtime::gpu;
 
-void runtime::gpu::emit_broadcast(
-    void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
+void runtime::gpu::emit_broadcast(std::string name,
+                                  CUdeviceptr in,
+                                  CUdeviceptr out,
+                                  std::array<std::string, 2> data_types,
+                                  size_t repeat_size,
+                                  size_t repeat_times,
+                                  size_t count)
 {
-    std::string name = "broadcast";
+    std::string name_signature = name + "_" + data_types[0] + "_" + data_types[1];
+    std::replace(name_signature.begin(), name_signature.end(), ' ', '_');
     // Create an instance of nvrtcProgram with the code string.
-    if (CudaFunctionPool::instance().get(name) == nullptr)
+    if (CudaFunctionPool::instance().get(name_signature) == nullptr)
     {
-        std::string kernel;
-        std::string data_type("float");
-
-        kernel = R"(
-extern "C" __global__
-void cuda_)" + name + "(" + data_type + "* in, " + data_type +
-                 "* out, size_t m, size_t k, size_t n)\n" + R"(
-{
-    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if(tid < n)
-    {
-        size_t idx = tid / (m * k) * m + tid % m;
-        out[tid] = in[idx];
-    }
-})";
-        CudaFunctionPool::instance().set(name, kernel);
+        codegen::CodeWriter writer;
+        CudaKernelBuilder::add_pod_typedefs(writer);
+        CudaKernelBuilder::get_broadcast_op(writer, name_signature, data_types);
+        std::string kernel = writer.get_code();
+        CudaFunctionPool::instance().set(name_signature, kernel);
     }
 
-    //convert runtime ptr to driver api ptr
-    CUdeviceptr d_ptr_in, d_ptr_out;
-    d_ptr_in = CUdeviceptr(in);
-    d_ptr_out = CUdeviceptr(out);
-
-    void* args_list[] = {&d_ptr_in, &d_ptr_out, &repeat_size, &repeat_times, &count};
-    CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
+    void* args_list[] = {&in, &out, &repeat_size, &repeat_times, &count};
+    CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name_signature).get(),
+                                  static_cast<unsigned int>(count),
+                                  1,
+                                  1, // grid dim
+                                  1,
+                                  1,
+                                  1, // block dim
+                                  0,
+                                  NULL, // shared mem and stream
+                                  args_list,
+                                  0)); // arguments
+    CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
+}
+
+void runtime::gpu::emit_onehot(std::string name,
+                               CUdeviceptr in,
+                               CUdeviceptr out,
+                               std::array<std::string, 2> data_types,
+                               size_t repeat_size,
+                               size_t repeat_times,
+                               size_t count)
+{
+    std::string name_signature = name + "_" + data_types[0] + "_" + data_types[1];
+    std::replace(name_signature.begin(), name_signature.end(), ' ', '_');
+    // Create an instance of nvrtcProgram with the code string.
+    if (CudaFunctionPool::instance().get(name_signature) == nullptr)
+    {
+        codegen::CodeWriter writer;
+        CudaKernelBuilder::add_pod_typedefs(writer);
+        CudaKernelBuilder::get_onehot_op(writer, name_signature, data_types);
+        std::string kernel = writer.get_code();
+        CudaFunctionPool::instance().set(name_signature, kernel);
+    }
+
+    void* args_list[] = {&in, &out, &repeat_size, &repeat_times, &count};
+    CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name_signature).get(),
                                   static_cast<unsigned int>(count),
                                   1,
                                   1, // grid dim
 ...
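Since the generated source now depends on the element types, the cache key in CudaFunctionPool must include them; the std::replace call presumably exists so that multi-word type names such as "unsigned int" still yield a valid "cuda_..." identifier. A minimal sketch of the same keying scheme, with a plain std::map standing in for CudaFunctionPool (all names here are illustrative, not the real class):

    #include <algorithm>
    #include <array>
    #include <map>
    #include <string>

    // signature -> generated kernel source (stand-in for CudaFunctionPool)
    static std::map<std::string, std::string> kernel_cache;

    std::string make_signature(const std::string& name,
                               const std::array<std::string, 2>& data_types)
    {
        std::string sig = name + "_" + data_types[0] + "_" + data_types[1];
        // "unsigned int" would otherwise produce an invalid kernel identifier.
        std::replace(sig.begin(), sig.end(), ' ', '_');
        return sig;
    }

    const std::string& get_or_generate(const std::string& name,
                                       const std::array<std::string, 2>& data_types)
    {
        const std::string sig = make_signature(name, data_types);
        auto it = kernel_cache.find(sig);
        if (it == kernel_cache.end())
        {
            // First use of this op/type combination: generate once and cache.
            it = kernel_cache.emplace(sig, "/* writer.get_code() result */").first;
        }
        return it->second;
    }

The launch configuration itself is unchanged by the refactor: count blocks of one thread each, followed by a blocking cuCtxSynchronize after every call.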
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp

@@ -34,8 +34,21 @@ namespace ngraph
             template <typename T>
             struct CudaOpMap;
 
-            void emit_broadcast(
-                void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count);
+            void emit_broadcast(std::string name,
+                                CUdeviceptr in,
+                                CUdeviceptr out,
+                                std::array<std::string, 2> data_types,
+                                size_t repeat_size,
+                                size_t repeat_times,
+                                size_t count);
+
+            void emit_onehot(std::string name,
+                             CUdeviceptr in,
+                             CUdeviceptr out,
+                             std::array<std::string, 2> data_types,
+                             size_t repeat_size,
+                             size_t repeat_times,
+                             size_t count);
 
             template <typename T, typename... Inputs>
             void emit_elementwise_op(std::string name,
                                      ...

@@ -45,6 +58,7 @@ namespace ngraph
                                      Inputs&&... inputs)
             {
                 std::string type_signature = "_" + data_types[0] + "_" + data_types[1];
+                std::replace(type_signature.begin(), type_signature.end(), ' ', '_');
                 if (CudaFunctionPool::instance().get(name + type_signature) == nullptr)
                 {
                     codegen::CodeWriter writer;
                     ...
src/ngraph/runtime/gpu/gpu_emitter.cpp

@@ -123,10 +123,10 @@ namespace ngraph
                        << n->description() << ">(\"" << n->description() << "\""
                        << ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type()
                        << "\"}" << ", count"
-                       << ", (CUdeviceptr) " << out[0].get_name();
+                       << ", CUdeviceptr(" << out[0].get_name() << ")";
                 for (size_t i = 0; i < args.size(); i++)
                 {
-                    writer << ", (CUdeviceptr) " << args[i].get_name();
+                    writer << ", CUdeviceptr(" << args[i].get_name() << ")";
                 }
                 writer << ");\n";
                 writer.indent--;
 ...

@@ -489,9 +489,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
             writer << "{ // " << node->get_name() << "\n";
             writer.indent++;
-            writer << "runtime::gpu::emit_broadcast(" << args[0].get_name() << ", "
-                   << out[0].get_name() << ", " << repeat_size << ", " << repeat_times << ", "
-                   << out[0].get_size() << ");\n";
+            writer << "runtime::gpu::emit_broadcast(\"" << node->description()
+                   << "\", CUdeviceptr(" << args[0].get_name() << "), CUdeviceptr("
+                   << out[0].get_name() << ")"
+                   << ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
+                   << ", " << repeat_size << ", " << repeat_times << ", " << out[0].get_size()
+                   << ");\n";
             writer.indent--;
             writer << "}\n";
         }
 ...

@@ -619,6 +624,38 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
             writer << "}\n";
         }
 
+        template <>
+        void GPU_Emitter::EMITTER_DECL(ngraph::op::OneHot)
+        {
+            if (out[0].get_size() == 0)
+            {
+                return;
+            }
+            auto onehot = static_cast<const ngraph::op::OneHot*>(node);
+            auto arg_shape = args[0].get_shape();
+            auto result_shape = out[0].get_shape();
+            size_t idx = onehot->get_one_hot_axis();
+            size_t repeat_times = result_shape[idx];
+            size_t repeat_size = 1;
+            for (size_t i = idx + 1; i < result_shape.size(); i++)
+            {
+                repeat_size *= result_shape[i];
+            }
+
+            writer << "{ // " << node->get_name() << "\n";
+            writer.indent++;
+            writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
+                   << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
+            writer << "runtime::gpu::emit_onehot(\"" << node->description()
+                   << "\", CUdeviceptr(" << args[0].get_name() << "), CUdeviceptr("
+                   << out[0].get_name() << ")"
+                   << ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
+                   << ", " << repeat_size << ", " << repeat_times << ", " << args[0].get_size()
+                   << ");\n";
+            writer.indent--;
+            writer << "}\n";
+        }
+
         template <>
         void GPU_Emitter::EMITTER_DECL(ngraph::op::Sqrt)
         {
             ...
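The emitter reduces an arbitrary one-hot axis to the kernel's two parameters: repeat_times is the extent of the one-hot axis in the result shape, and repeat_size is the product of the dimensions to its right. A hypothetical standalone check of that arithmetic, using the one_hot_vector_1 shapes from the tests below (input Shape{8}, result Shape{8, 3}, axis 1):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    int main()
    {
        std::vector<size_t> result_shape{8, 3};
        size_t axis = 1;                          // one_hot_axis
        size_t repeat_times = result_shape[axis]; // 3 classes
        size_t repeat_size = 1;                   // product of dims right of the axis
        for (size_t i = axis + 1; i < result_shape.size(); i++)
        {
            repeat_size *= result_shape[i];       // axis is innermost, so this stays 1
        }
        assert(repeat_times == 3 && repeat_size == 1);
        // With m = repeat_size = 1 and k = repeat_times = 3, the kernel index
        // (tid / m) * m * k + m * in[tid] + tid % m simplifies to 3 * tid + in[tid]:
        // row tid, column in[tid] of the row-major {8, 3} result.
        return 0;
    }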
test/autodiff.in.cpp

@@ -909,7 +909,6 @@ TEST(${BACKEND_NAME}, backwards_log)
 TEST(${BACKEND_NAME}, backwards_maximum)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     auto manager = runtime::Manager::get("${BACKEND_NAME}");
     auto backend = manager->allocate_backend();
 ...

@@ -930,7 +929,6 @@ TEST(${BACKEND_NAME}, backwards_maximum)
 TEST(${BACKEND_NAME}, backwards_minimum)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     auto manager = runtime::Manager::get("${BACKEND_NAME}");
     auto backend = manager->allocate_backend();
 ...
test/backend_test.in.cpp

@@ -3764,7 +3764,6 @@ TEST(${BACKEND_NAME}, replace_slice_vector)
 TEST(${BACKEND_NAME}, one_hot_scalar_2_in_3)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     Shape shape_a{};
     auto A = make_shared<op::Parameter>(element::i32, shape_a);
     Shape shape_r{3};
 ...

@@ -3787,7 +3786,6 @@ TEST(${BACKEND_NAME}, one_hot_scalar_2_in_3)
 TEST(${BACKEND_NAME}, one_hot_scalar_1_in_3)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     Shape shape_a{};
     auto A = make_shared<op::Parameter>(element::i32, shape_a);
     Shape shape_r{3};
 ...

@@ -3810,7 +3808,6 @@ TEST(${BACKEND_NAME}, one_hot_scalar_1_in_3)
 TEST(${BACKEND_NAME}, one_hot_scalar_0_in_3)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     Shape shape_a{};
     auto A = make_shared<op::Parameter>(element::i32, shape_a);
     Shape shape_r{3};
 ...

@@ -3899,7 +3896,6 @@ TEST(${BACKEND_NAME}, one_hot_scalar_oob_in_3)
 TEST(${BACKEND_NAME}, one_hot_vector_0)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     Shape shape_a{8};
     auto A = make_shared<op::Parameter>(element::i32, shape_a);
     Shape shape_r{3, 8};
 ...

@@ -3924,7 +3920,6 @@ TEST(${BACKEND_NAME}, one_hot_vector_0)
 TEST(${BACKEND_NAME}, one_hot_vector_1)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     Shape shape_a{8};
     auto A = make_shared<op::Parameter>(element::i32, shape_a);
     Shape shape_r{8, 3};
 ...

@@ -4015,7 +4010,6 @@ TEST(${BACKEND_NAME}, one_hot_vector_1_far_oob)
 TEST(${BACKEND_NAME}, one_hot_matrix_0)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     Shape shape_a{3, 3};
     auto A = make_shared<op::Parameter>(element::i32, shape_a);
     Shape shape_r{3, 3, 3};
 ...

@@ -4046,7 +4040,6 @@ TEST(${BACKEND_NAME}, one_hot_matrix_0)
 TEST(${BACKEND_NAME}, one_hot_vector_1_fp)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     Shape shape_a{8};
     auto A = make_shared<op::Parameter>(element::f32, shape_a);
     Shape shape_r{8, 3};
 ...