Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
N
ngraph
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
ngraph
Commits
b5467550
Unverified
Commit
b5467550
authored
Mar 13, 2018
by
Chris Sullivan
Committed by
GitHub
Mar 13, 2018
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Updated gpu cpp files with consistent use of namespaces (cosmetic) (#629)
* Updated namespace use in cpp files.
parent
a32fdab5
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
708 additions
and
779 deletions
+708
-779
gpu_cuda_context_manager.cpp
src/ngraph/runtime/gpu/gpu_cuda_context_manager.cpp
+12
-19
gpu_cuda_function_builder.cpp
src/ngraph/runtime/gpu/gpu_cuda_function_builder.cpp
+29
-36
gpu_cuda_function_pool.cpp
src/ngraph/runtime/gpu/gpu_cuda_function_pool.cpp
+23
-32
gpu_cuda_kernel_builder.cpp
src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp
+55
-62
gpu_cuda_kernel_emitters.cpp
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
+31
-38
gpu_external_function.cpp
src/ngraph/runtime/gpu/gpu_external_function.cpp
+558
-592
No files found.
src/ngraph/runtime/gpu/gpu_cuda_context_manager.cpp
View file @
b5467550
...
...
@@ -19,25 +19,18 @@
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
namespace
ngraph
using
namespace
ngraph
;
runtime
::
gpu
::
CudaContextManager
&
runtime
::
gpu
::
CudaContextManager
::
instance
()
{
namespace
runtime
{
namespace
gpu
{
CudaContextManager
&
CudaContextManager
::
instance
()
{
static
CudaContextManager
manager
;
return
manager
;
}
static
CudaContextManager
manager
;
return
manager
;
}
CudaContextManager
::
CudaContextManager
()
{
CUDA_SAFE_CALL
(
cuInit
(
0
));
CUDA_SAFE_CALL
(
cuDeviceGet
(
&
m_device
,
0
));
CUDA_SAFE_CALL
(
cuCtxCreate
(
&
m_context
,
0
,
m_device
));
m_context_ptr
=
std
::
make_shared
<
CUcontext
>
(
m_context
);
}
}
}
runtime
::
gpu
::
CudaContextManager
::
CudaContextManager
()
{
CUDA_SAFE_CALL
(
cuInit
(
0
));
CUDA_SAFE_CALL
(
cuDeviceGet
(
&
m_device
,
0
));
CUDA_SAFE_CALL
(
cuCtxCreate
(
&
m_context
,
0
,
m_device
));
m_context_ptr
=
std
::
make_shared
<
CUcontext
>
(
m_context
);
}
src/ngraph/runtime/gpu/gpu_cuda_function_builder.cpp
View file @
b5467550
...
...
@@ -20,46 +20,39 @@
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace
ngraph
using
namespace
ngraph
;
std
::
shared_ptr
<
CUfunction
>
runtime
::
gpu
::
CudaFunctionBuilder
::
get
(
const
std
::
string
&
name
,
const
std
::
string
&
kernel
,
int
number_of_options
,
const
char
**
options
)
{
namespace
runtime
{
namespace
gpu
{
std
::
shared_ptr
<
CUfunction
>
CudaFunctionBuilder
::
get
(
const
std
::
string
&
name
,
const
std
::
string
&
kernel
,
int
number_of_options
,
const
char
**
options
)
{
nvrtcProgram
prog
;
NVRTC_SAFE_CALL
(
nvrtcCreateProgram
(
&
prog
,
kernel
.
c_str
(),
"op.cu"
,
0
,
// numHeaders
NULL
,
// headers
NULL
));
// includeNames
nvrtcProgram
prog
;
NVRTC_SAFE_CALL
(
nvrtcCreateProgram
(
&
prog
,
kernel
.
c_str
(),
"op.cu"
,
0
,
// numHeaders
NULL
,
// headers
NULL
));
// includeNames
nvrtcResult
compile_result
=
nvrtcCompileProgram
(
prog
,
number_of_options
,
options
);
nvrtcResult
compile_result
=
nvrtcCompileProgram
(
prog
,
number_of_options
,
options
);
if
(
compile_result
!=
NVRTC_SUCCESS
)
{
throw
std
::
runtime_error
(
"compile error:
\n
"
+
kernel
+
"
\n
options"
);
}
if
(
compile_result
!=
NVRTC_SUCCESS
)
{
throw
std
::
runtime_error
(
"compile error:
\n
"
+
kernel
+
"
\n
options"
);
}
size_t
ptx_size
;
NVRTC_SAFE_CALL
(
nvrtcGetPTXSize
(
prog
,
&
ptx_size
));
char
*
ptx
=
new
char
[
ptx_size
];
NVRTC_SAFE_CALL
(
nvrtcGetPTX
(
prog
,
size_t
ptx_size
;
NVRTC_SAFE_CALL
(
nvrtcGetPTXSize
(
prog
,
&
ptx_size
));
char
*
ptx
=
new
char
[
ptx_size
];
NVRTC_SAFE_CALL
(
nvrtcGetPTX
(
prog
,
ptx
));
// Load the generated PTX and get a handle to the parent kernel.
NVRTC_SAFE_CALL
(
nvrtcDestroyProgram
(
&
prog
));
// Destroy the program.
NVRTC_SAFE_CALL
(
nvrtcDestroyProgram
(
&
prog
));
// Destroy the program.
CUmodule
module
;
CUfunction
function
;
CUDA_SAFE_CALL
(
cuModuleLoadDataEx
(
&
module
,
ptx
,
0
,
0
,
0
));
CUDA_SAFE_CALL
(
cuModuleGetFunction
(
&
function
,
module
,
name
.
c_str
()));
return
std
::
make_shared
<
CUfunction
>
(
function
);
}
}
}
CUmodule
module
;
CUfunction
function
;
CUDA_SAFE_CALL
(
cuModuleLoadDataEx
(
&
module
,
ptx
,
0
,
0
,
0
));
CUDA_SAFE_CALL
(
cuModuleGetFunction
(
&
function
,
module
,
name
.
c_str
()));
return
std
::
make_shared
<
CUfunction
>
(
function
);
}
src/ngraph/runtime/gpu/gpu_cuda_function_pool.cpp
View file @
b5467550
...
...
@@ -26,40 +26,31 @@
static
const
std
::
string
s_output_dir
=
"gpu_codegen"
;
namespace
ngraph
using
namespace
ngraph
;
runtime
::
gpu
::
CudaFunctionPool
&
runtime
::
gpu
::
CudaFunctionPool
::
instance
()
{
namespace
runtime
{
namespace
gpu
{
CudaFunctionPool
&
CudaFunctionPool
::
instance
()
{
static
CudaFunctionPool
pool
;
return
pool
;
}
static
CudaFunctionPool
pool
;
return
pool
;
}
void
CudaFunctionPool
::
set
(
const
std
::
string
&
name
,
const
std
::
string
&
kernel
)
{
const
char
*
opts
[]
=
{
"--gpu-architecture=compute_35"
,
"--relocatable-device-code=true"
};
std
::
string
filename
=
file_util
::
path_join
(
s_output_dir
,
"cuda_kernel_"
+
name
+
"_codegen.cu"
);
std
::
ofstream
out
(
filename
);
out
<<
kernel
;
out
.
close
();
m_function_map
.
insert
(
{
name
,
CudaFunctionBuilder
::
get
(
"cuda_"
+
name
,
kernel
,
2
,
opts
)});
}
void
runtime
::
gpu
::
CudaFunctionPool
::
set
(
const
std
::
string
&
name
,
const
std
::
string
&
kernel
)
{
const
char
*
opts
[]
=
{
"--gpu-architecture=compute_35"
,
"--relocatable-device-code=true"
};
std
::
string
filename
=
file_util
::
path_join
(
s_output_dir
,
"cuda_kernel_"
+
name
+
"_codegen.cu"
);
std
::
ofstream
out
(
filename
);
out
<<
kernel
;
out
.
close
();
m_function_map
.
insert
({
name
,
CudaFunctionBuilder
::
get
(
"cuda_"
+
name
,
kernel
,
2
,
opts
)});
}
std
::
shared_ptr
<
CUfunction
>
CudaFunctionPool
::
get
(
const
std
::
string
&
name
)
{
auto
it
=
m_function_map
.
find
(
name
);
if
(
it
!=
m_function_map
.
end
())
{
return
(
*
it
).
second
;
}
return
nullptr
;
}
}
std
::
shared_ptr
<
CUfunction
>
runtime
::
gpu
::
CudaFunctionPool
::
get
(
const
std
::
string
&
name
)
{
auto
it
=
m_function_map
.
find
(
name
);
if
(
it
!=
m_function_map
.
end
())
{
return
(
*
it
).
second
;
}
return
nullptr
;
}
src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp
View file @
b5467550
...
...
@@ -16,74 +16,67 @@
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/codegen/code_writer.hpp"
namespace
ngraph
using
namespace
ngraph
;
void
runtime
::
gpu
::
CudaKernelBuilder
::
get_elementwise_op
(
codegen
::
CodeWriter
&
writer
,
const
std
::
string
&
name
,
const
std
::
string
&
data_type
,
const
std
::
string
&
op
,
const
size_t
&
num_inputs
)
{
namespace
runtime
writer
<<
"extern
\"
C
\"
__global__ void cuda_"
<<
name
<<
"("
;
for
(
size_t
i
=
0
;
i
<
num_inputs
;
i
++
)
{
writer
<<
data_type
<<
"* in"
<<
i
<<
", "
;
}
writer
<<
data_type
<<
"* out,"
<<
"size_t n)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
{
namespace
gpu
writer
<<
"size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
\n
"
;
writer
<<
"if (tid < n)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
{
void
CudaKernelBuilder
::
get_elementwise_op
(
codegen
::
CodeWriter
&
writer
,
const
std
::
string
&
name
,
const
std
::
string
&
data_type
,
const
std
::
string
&
op
,
const
size_t
&
num_inputs
)
writer
<<
"out[tid] = "
<<
op
<<
"("
;
for
(
size_t
i
=
0
;
i
<
num_inputs
-
1
;
i
++
)
{
writer
<<
"extern
\"
C
\"
__global__ void cuda_"
<<
name
<<
"("
;
for
(
size_t
i
=
0
;
i
<
num_inputs
;
i
++
)
{
writer
<<
data_type
<<
"* in"
<<
i
<<
", "
;
}
writer
<<
data_type
<<
"* out,"
<<
"size_t n)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
{
writer
<<
"size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
\n
"
;
writer
<<
"if (tid < n)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
{
writer
<<
"out[tid] = "
<<
op
<<
"("
;
for
(
size_t
i
=
0
;
i
<
num_inputs
-
1
;
i
++
)
{
writer
<<
"in"
<<
i
<<
"[tid], "
;
}
writer
<<
"in"
<<
num_inputs
-
1
<<
"[tid]);
\n
"
;
}
writer
.
indent
--
;
writer
<<
"}
\n
"
;
}
writer
.
indent
--
;
writer
<<
"}
\n
"
;
return
;
writer
<<
"in"
<<
i
<<
"[tid], "
;
}
writer
<<
"in"
<<
num_inputs
-
1
<<
"[tid]);
\n
"
;
}
writer
.
indent
--
;
writer
<<
"}
\n
"
;
}
writer
.
indent
--
;
writer
<<
"}
\n
"
;
void
CudaKernelBuilder
::
get_device_helper
(
codegen
::
CodeWriter
&
writer
,
const
std
::
string
&
name
,
const
std
::
string
&
data_type
,
const
std
::
string
&
math_kernel
,
const
size_t
&
num_inputs
)
{
if
(
math_kernel
.
size
())
{
writer
<<
"__device__ "
<<
data_type
<<
" "
<<
name
<<
"("
;
for
(
size_t
i
=
0
;
i
<
num_inputs
-
1
;
i
++
)
{
writer
<<
data_type
<<
" x"
<<
i
<<
", "
;
}
writer
<<
data_type
<<
" x"
<<
num_inputs
-
1
;
writer
<<
")
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
{
writer
<<
"return "
+
math_kernel
<<
";
\n
"
;
}
writer
.
indent
--
;
writer
<<
"}
\n
"
;
}
return
;
}
return
;
}
void
runtime
::
gpu
::
CudaKernelBuilder
::
get_device_helper
(
codegen
::
CodeWriter
&
writer
,
const
std
::
string
&
name
,
const
std
::
string
&
data_type
,
const
std
::
string
&
math_kernel
,
const
size_t
&
num_inputs
)
{
if
(
math_kernel
.
size
())
{
writer
<<
"__device__ "
<<
data_type
<<
" "
<<
name
<<
"("
;
for
(
size_t
i
=
0
;
i
<
num_inputs
-
1
;
i
++
)
{
writer
<<
data_type
<<
" x"
<<
i
<<
", "
;
}
writer
<<
data_type
<<
" x"
<<
num_inputs
-
1
;
writer
<<
")
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
{
writer
<<
"return "
+
math_kernel
<<
";
\n
"
;
}
writer
.
indent
--
;
writer
<<
"}
\n
"
;
}
return
;
}
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
View file @
b5467550
...
...
@@ -20,26 +20,22 @@
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
namespace
ngraph
using
namespace
ngraph
;
void
runtime
::
gpu
::
emit_broadcast
(
void
*
in
,
void
*
out
,
size_t
repeat_size
,
size_t
repeat_times
,
size_t
count
)
{
namespace
runtime
std
::
string
name
=
"broadcast"
;
// Create an instance of nvrtcProgram with the code string.
if
(
CudaFunctionPool
::
instance
().
get
(
name
)
==
nullptr
)
{
namespace
gpu
{
void
emit_broadcast
(
void
*
in
,
void
*
out
,
size_t
repeat_size
,
size_t
repeat_times
,
size_t
count
)
{
std
::
string
name
=
"broadcast"
;
// Create an instance of nvrtcProgram with the code string.
if
(
CudaFunctionPool
::
instance
().
get
(
name
)
==
nullptr
)
{
std
::
string
kernel
;
std
::
string
data_type
(
"float"
);
std
::
string
kernel
;
std
::
string
data_type
(
"float"
);
kernel
=
R"(
kernel
=
R"(
extern "C" __global__
void cuda_)"
+
name
+
"("
+
data_type
+
"* in, "
+
data_type
+
"* out, size_t m, size_t k, size_t n)
\n
"
+
R"(
void cuda_)"
+
name
+
"("
+
data_type
+
"* in, "
+
data_type
+
"* out, size_t m, size_t k, size_t n)
\n
"
+
R"(
{
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < n)
...
...
@@ -48,28 +44,25 @@ void cuda_)" + name + "(" + data_type +
out[tid] = in[idx];
}
})"
;
CudaFunctionPool
::
instance
().
set
(
name
,
kernel
);
}
CudaFunctionPool
::
instance
().
set
(
name
,
kernel
);
}
//convert runtime ptr to driver api ptr
CUdeviceptr
d_ptr_in
,
d_ptr_out
;
d_ptr_in
=
CUdeviceptr
(
in
);
d_ptr_out
=
CUdeviceptr
(
out
);
//convert runtime ptr to driver api ptr
CUdeviceptr
d_ptr_in
,
d_ptr_out
;
d_ptr_in
=
CUdeviceptr
(
in
);
d_ptr_out
=
CUdeviceptr
(
out
);
void
*
args_list
[]
=
{
&
d_ptr_in
,
&
d_ptr_out
,
&
repeat_size
,
&
repeat_times
,
&
count
};
CUDA_SAFE_CALL
(
cuLaunchKernel
(
*
CudaFunctionPool
::
instance
().
get
(
name
).
get
(),
static_cast
<
unsigned
int
>
(
count
),
1
,
1
,
// grid dim
1
,
1
,
1
,
// block dim
0
,
NULL
,
// shared mem and stream
args_list
,
0
));
// arguments
CUDA_SAFE_CALL
(
cuCtxSynchronize
());
// Retrieve and print output.
}
}
}
void
*
args_list
[]
=
{
&
d_ptr_in
,
&
d_ptr_out
,
&
repeat_size
,
&
repeat_times
,
&
count
};
CUDA_SAFE_CALL
(
cuLaunchKernel
(
*
CudaFunctionPool
::
instance
().
get
(
name
).
get
(),
static_cast
<
unsigned
int
>
(
count
),
1
,
1
,
// grid dim
1
,
1
,
1
,
// block dim
0
,
NULL
,
// shared mem and stream
args_list
,
0
));
// arguments
CUDA_SAFE_CALL
(
cuCtxSynchronize
());
// Retrieve and print output.
}
src/ngraph/runtime/gpu/gpu_external_function.cpp
View file @
b5467550
...
...
@@ -114,6 +114,7 @@
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
using
namespace
std
;
using
namespace
ngraph
;
static
const
string
s_output_dir
=
"gpu_codegen"
;
...
...
@@ -159,119 +160,113 @@ static StaticInitializers s_static_initializers;
#define TI(x) type_index(typeid(x))
namespace
ngraph
static
const
runtime
::
gpu
::
OpMap
dispatcher
{
{
TI
(
ngraph
::
op
::
Add
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Add
>
},
{
TI
(
ngraph
::
op
::
Dot
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Dot
>
},
{
TI
(
ngraph
::
op
::
Multiply
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Multiply
>
},
{
TI
(
ngraph
::
op
::
Parameter
),
&
runtime
::
gpu
::
GPU_Emitter
::
nop
},
{
TI
(
ngraph
::
op
::
Abs
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Concat
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Concat
>
},
{
TI
(
ngraph
::
op
::
Divide
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Equal
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Equal
>
},
{
TI
(
ngraph
::
op
::
GetOutputElement
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
GetOutputElement
>
},
{
TI
(
ngraph
::
op
::
Greater
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Greater
>
},
{
TI
(
ngraph
::
op
::
GreaterEq
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
GreaterEq
>
},
{
TI
(
ngraph
::
op
::
Less
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Less
>
},
{
TI
(
ngraph
::
op
::
LessEq
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
LessEq
>
},
{
TI
(
ngraph
::
op
::
Log
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Maximum
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Maximum
>
},
{
TI
(
ngraph
::
op
::
Minimum
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Minimum
>
},
{
TI
(
ngraph
::
op
::
Negative
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Negative
>
},
{
TI
(
ngraph
::
op
::
NotEqual
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
NotEqual
>
},
{
TI
(
ngraph
::
op
::
Power
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Select
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Select
>
},
{
TI
(
ngraph
::
op
::
Subtract
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Broadcast
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Broadcast
>
},
{
TI
(
ngraph
::
op
::
Convert
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Convert
>
},
{
TI
(
ngraph
::
op
::
Constant
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Constant
>
},
{
TI
(
ngraph
::
op
::
Reshape
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Reshape
>
},
{
TI
(
ngraph
::
op
::
FunctionCall
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
FunctionCall
>
},
{
TI
(
ngraph
::
op
::
Reduce
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Reduce
>
},
{
TI
(
ngraph
::
op
::
Sign
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Slice
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Slice
>
},
{
TI
(
ngraph
::
op
::
Sum
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Sum
>
},
{
TI
(
ngraph
::
op
::
Exp
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Sin
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Sinh
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Cos
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Cosh
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Tan
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Tanh
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Asin
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Acos
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Atan
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
ReplaceSlice
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
ReplaceSlice
>
},
{
TI
(
ngraph
::
op
::
OneHot
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
OneHot
>
},
{
TI
(
ngraph
::
op
::
Floor
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Ceiling
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Sqrt
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Sqrt
>
},
{
TI
(
ngraph
::
op
::
Convolution
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Convolution
>
},
{
TI
(
ngraph
::
op
::
ConvolutionBackpropFilters
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
ConvolutionBackpropFilters
>
},
{
TI
(
ngraph
::
op
::
ConvolutionBackpropData
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
ConvolutionBackpropData
>
},
{
TI
(
ngraph
::
op
::
Not
),
&
runtime
::
gpu
::
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
MaxPool
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
MaxPool
>
},
{
TI
(
ngraph
::
op
::
Reverse
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Reverse
>
},
{
TI
(
ngraph
::
op
::
Result
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Result
>
},
{
TI
(
ngraph
::
op
::
ReduceWindow
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
ReduceWindow
>
},
{
TI
(
ngraph
::
op
::
SelectAndScatter
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
SelectAndScatter
>
},
{
TI
(
ngraph
::
op
::
AvgPool
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
AvgPool
>
},
{
TI
(
ngraph
::
op
::
AvgPoolBackprop
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
AvgPoolBackprop
>
},
{
TI
(
ngraph
::
op
::
Pad
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Pad
>
},
{
TI
(
ngraph
::
op
::
BatchNorm
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
BatchNorm
>
},
{
TI
(
ngraph
::
op
::
BatchNormBackprop
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
BatchNormBackprop
>
},
{
TI
(
ngraph
::
op
::
MaxPoolBackprop
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
MaxPoolBackprop
>
},
{
TI
(
ngraph
::
op
::
Product
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Product
>
},
{
TI
(
ngraph
::
op
::
Max
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Max
>
},
{
TI
(
ngraph
::
op
::
Min
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Min
>
},
{
TI
(
ngraph
::
op
::
Relu
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Relu
>
},
{
TI
(
ngraph
::
op
::
ReluBackprop
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
ReluBackprop
>
},
{
TI
(
ngraph
::
op
::
Softmax
),
&
runtime
::
gpu
::
GPU_Emitter
::
emit
<
ngraph
::
op
::
Softmax
>
},
};
runtime
::
gpu
::
GPU_ExternalFunction
::
GPU_ExternalFunction
(
const
shared_ptr
<
ngraph
::
Function
>&
function
,
bool
release_function
)
:
ngraph
::
runtime
::
ExternalFunction
(
function
,
release_function
)
,
m_compiled_function
(
nullptr
)
,
m_emit_timing
(
std
::
getenv
(
"NGRAPH_GPU_EMIT_TIMING"
)
!=
nullptr
)
{
namespace
runtime
{
namespace
gpu
{
static
const
OpMap
dispatcher
{
{
TI
(
ngraph
::
op
::
Add
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Add
>
},
{
TI
(
ngraph
::
op
::
Dot
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Dot
>
},
{
TI
(
ngraph
::
op
::
Multiply
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Multiply
>
},
{
TI
(
ngraph
::
op
::
Parameter
),
&
GPU_Emitter
::
nop
},
{
TI
(
ngraph
::
op
::
Abs
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Concat
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Concat
>
},
{
TI
(
ngraph
::
op
::
Divide
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Equal
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Equal
>
},
{
TI
(
ngraph
::
op
::
GetOutputElement
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
GetOutputElement
>
},
{
TI
(
ngraph
::
op
::
Greater
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Greater
>
},
{
TI
(
ngraph
::
op
::
GreaterEq
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
GreaterEq
>
},
{
TI
(
ngraph
::
op
::
Less
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Less
>
},
{
TI
(
ngraph
::
op
::
LessEq
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
LessEq
>
},
{
TI
(
ngraph
::
op
::
Log
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Maximum
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Maximum
>
},
{
TI
(
ngraph
::
op
::
Minimum
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Minimum
>
},
{
TI
(
ngraph
::
op
::
Negative
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Negative
>
},
{
TI
(
ngraph
::
op
::
NotEqual
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
NotEqual
>
},
{
TI
(
ngraph
::
op
::
Power
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Select
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Select
>
},
{
TI
(
ngraph
::
op
::
Subtract
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Broadcast
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Broadcast
>
},
{
TI
(
ngraph
::
op
::
Convert
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Convert
>
},
{
TI
(
ngraph
::
op
::
Constant
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Constant
>
},
{
TI
(
ngraph
::
op
::
Reshape
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Reshape
>
},
{
TI
(
ngraph
::
op
::
FunctionCall
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
FunctionCall
>
},
{
TI
(
ngraph
::
op
::
Reduce
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Reduce
>
},
{
TI
(
ngraph
::
op
::
Sign
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Slice
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Slice
>
},
{
TI
(
ngraph
::
op
::
Sum
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Sum
>
},
{
TI
(
ngraph
::
op
::
Exp
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Sin
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Sinh
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Cos
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Cosh
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Tan
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Tanh
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Asin
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Acos
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Atan
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
ReplaceSlice
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
ReplaceSlice
>
},
{
TI
(
ngraph
::
op
::
OneHot
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
OneHot
>
},
{
TI
(
ngraph
::
op
::
Floor
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Ceiling
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
Sqrt
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Sqrt
>
},
{
TI
(
ngraph
::
op
::
Convolution
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Convolution
>
},
{
TI
(
ngraph
::
op
::
ConvolutionBackpropFilters
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
ConvolutionBackpropFilters
>
},
{
TI
(
ngraph
::
op
::
ConvolutionBackpropData
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
ConvolutionBackpropData
>
},
{
TI
(
ngraph
::
op
::
Not
),
&
GPU_Emitter
::
EmitElementwise
},
{
TI
(
ngraph
::
op
::
MaxPool
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
MaxPool
>
},
{
TI
(
ngraph
::
op
::
Reverse
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Reverse
>
},
{
TI
(
ngraph
::
op
::
Result
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Result
>
},
{
TI
(
ngraph
::
op
::
ReduceWindow
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
ReduceWindow
>
},
{
TI
(
ngraph
::
op
::
SelectAndScatter
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
SelectAndScatter
>
},
{
TI
(
ngraph
::
op
::
AvgPool
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
AvgPool
>
},
{
TI
(
ngraph
::
op
::
AvgPoolBackprop
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
AvgPoolBackprop
>
},
{
TI
(
ngraph
::
op
::
Pad
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Pad
>
},
{
TI
(
ngraph
::
op
::
BatchNorm
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
BatchNorm
>
},
{
TI
(
ngraph
::
op
::
BatchNormBackprop
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
BatchNormBackprop
>
},
{
TI
(
ngraph
::
op
::
MaxPoolBackprop
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
MaxPoolBackprop
>
},
{
TI
(
ngraph
::
op
::
Product
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Product
>
},
{
TI
(
ngraph
::
op
::
Max
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Max
>
},
{
TI
(
ngraph
::
op
::
Min
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Min
>
},
{
TI
(
ngraph
::
op
::
Relu
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Relu
>
},
{
TI
(
ngraph
::
op
::
ReluBackprop
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
ReluBackprop
>
},
{
TI
(
ngraph
::
op
::
Softmax
),
&
GPU_Emitter
::
emit
<
ngraph
::
op
::
Softmax
>
},
};
GPU_ExternalFunction
::
GPU_ExternalFunction
(
const
shared_ptr
<
ngraph
::
Function
>&
function
,
bool
release_function
)
:
ngraph
::
runtime
::
ExternalFunction
(
function
,
release_function
)
,
m_compiled_function
(
nullptr
)
,
m_emit_timing
(
std
::
getenv
(
"NGRAPH_GPU_EMIT_TIMING"
)
!=
nullptr
)
{
}
}
void
GPU_ExternalFunction
::
compile
()
{
if
(
m_is_compiled
)
{
return
;
}
void
runtime
::
gpu
::
GPU_ExternalFunction
::
compile
()
{
if
(
m_is_compiled
)
{
return
;
}
string
function_name
=
m_function
->
get_name
();
string
dump_filename
=
file_util
::
path_join
(
s_output_dir
,
function_name
+
"_ops.txt"
);
string
function_name
=
m_function
->
get_name
();
string
dump_filename
=
file_util
::
path_join
(
s_output_dir
,
function_name
+
"_ops.txt"
);
pass
::
Manager
pass_manager
;
// pass_manager.register_pass<pass::TopologicalSort>();
// For now, just make everyone row-major.
pass_manager
.
register_pass
<
pass
::
AssignLayout
<
descriptor
::
layout
::
DenseTensorViewLayout
>>
();
pass_manager
.
register_pass
<
pass
::
Liveness
>
();
pass_manager
.
register_pass
<
pass
::
MemoryLayout
>
(
64
);
pass_manager
.
register_pass
<
pass
::
DumpSorted
>
(
dump_filename
);
pass_manager
.
run_passes
(
m_function
);
pass
::
Manager
pass_manager
;
// pass_manager.register_pass<pass::TopologicalSort>();
// For now, just make everyone row-major.
pass_manager
.
register_pass
<
pass
::
AssignLayout
<
descriptor
::
layout
::
DenseTensorViewLayout
>>
();
pass_manager
.
register_pass
<
pass
::
Liveness
>
();
pass_manager
.
register_pass
<
pass
::
MemoryLayout
>
(
64
);
pass_manager
.
register_pass
<
pass
::
DumpSorted
>
(
dump_filename
);
pass_manager
.
run_passes
(
m_function
);
codegen
::
CodeWriter
writer
;
codegen
::
CodeWriter
writer
;
writer
+=
R"(// Generated by the NGraph GPU backend
writer
+=
R"(// Generated by the NGraph GPU backend
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
...
...
@@ -297,541 +292,512 @@ namespace ngraph
#include "ngraph/util.hpp"
)"
;
string
pch_header_source
=
writer
.
get_code
();
string
pch_header_source
=
writer
.
get_code
();
writer
+=
R"(
writer
+=
R"(
using namespace ngraph;
using namespace std;
)"
;
if
(
m_emit_timing
)
{
writer
<<
"// Declare debug timers
\n
"
;
vector
<
string
>
names
;
for
(
shared_ptr
<
Function
>
current_function
:
pass_manager
.
get_state
().
get_functions
())
{
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
{
if
(
!
node
->
is_parameter
()
&&
!
node
->
is_constant
())
{
names
.
push_back
(
node
->
get_name
());
}
}
}
for
(
const
string
&
s
:
names
)
{
writer
<<
"ngraph::stopwatch timer_"
<<
s
<<
";
\n
"
;
}
writer
<<
"extern
\"
C
\"
size_t get_debug_timer_count() { return "
<<
names
.
size
()
<<
"; }
\n
"
;
writer
<<
"extern
\"
C
\"
const char* get_debug_timer_name(size_t index)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
writer
<<
"const char* rc;
\n
"
;
writer
<<
"switch(index)
\n
"
;
writer
<<
"{
\n
"
;
for
(
size_t
i
=
0
;
i
<
names
.
size
();
i
++
)
{
writer
<<
"case "
<<
i
<<
": rc =
\"
"
<<
names
[
i
]
<<
"
\"
; break;
\n
"
;
}
writer
<<
"default: rc =
\"\"
;
\n
"
;
writer
<<
"}
\n
"
;
writer
<<
"return rc;
\n
"
;
writer
.
indent
--
;
writer
<<
"}
\n
"
;
writer
<<
"extern
\"
C
\"
const size_t get_debug_timer_microseconds(size_t index)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
writer
<<
"size_t rc;
\n
"
;
writer
<<
"switch(index)
\n
"
;
writer
<<
"{
\n
"
;
for
(
size_t
i
=
0
;
i
<
names
.
size
();
i
++
)
{
writer
<<
"case "
<<
i
<<
": rc = timer_"
<<
names
[
i
]
<<
".get_total_microseconds(); break;
\n
"
;
}
writer
<<
"default: rc = 0;
\n
"
;
writer
<<
"}
\n
"
;
writer
<<
"return rc;
\n
"
;
writer
.
indent
--
;
writer
<<
"}
\n
"
;
writer
<<
"extern
\"
C
\"
const size_t get_debug_timer_call_count(size_t index)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
writer
<<
"size_t rc;
\n
"
;
writer
<<
"switch(index)
\n
"
;
writer
<<
"{
\n
"
;
for
(
size_t
i
=
0
;
i
<
names
.
size
();
i
++
)
{
writer
<<
"case "
<<
i
<<
": rc = timer_"
<<
names
[
i
]
<<
".get_call_count(); break;
\n
"
;
}
writer
<<
"default: rc = 0;
\n
"
;
writer
<<
"}
\n
"
;
writer
<<
"return rc;
\n
"
;
writer
.
indent
--
;
writer
<<
"}
\n
"
;
writer
<<
"
\n
"
;
}
// // The "dso_handle" symbol is required by __cxa_atexit()
// // which is enabled because the JIT uses it as the default mechanism
// // to register cleanup handlers. We use it, and not atexit(), because
// // atexit() happens too late, when the JIT is no longer alive
writer
<<
"void *__dso_handle = 0;
\n\n
"
;
writer
<<
"// Declare all constants
\n
"
;
for
(
shared_ptr
<
Function
>
current_function
:
pass_manager
.
get_state
().
get_functions
())
if
(
m_emit_timing
)
{
writer
<<
"// Declare debug timers
\n
"
;
vector
<
string
>
names
;
for
(
shared_ptr
<
Function
>
current_function
:
pass_manager
.
get_state
().
get_functions
())
{
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
{
if
(
!
node
->
is_parameter
()
&&
!
node
->
is_constant
())
{
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
{
const
op
::
Constant
*
c
=
dynamic_cast
<
ngraph
::
op
::
Constant
*>
(
node
.
get
());
if
(
c
)
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
node
->
get_outputs
()[
0
].
get_tensor_view
();
auto
c_value_strings
=
c
->
get_value_strings
();
writer
<<
"static "
<<
tv
->
get_tensor
().
get_element_type
().
c_type_string
()
<<
" "
<<
tv
->
get_tensor
().
get_name
()
<<
"_cpu["
<<
c_value_strings
.
size
()
<<
"] =
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
writer
<<
emit_string_array
(
c_value_strings
,
100
-
writer
.
indent
*
4
);
writer
.
indent
--
;
writer
<<
"
\n
};
\n\n
"
;
writer
<<
"static "
<<
tv
->
get_tensor
().
get_element_type
().
c_type_string
()
<<
" *"
<<
tv
->
get_tensor
().
get_name
()
<<
";
\n
"
;
m_variable_name_map
[
tv
->
get_tensor
().
get_name
()]
=
tv
->
get_tensor
().
get_name
();
}
}
names
.
push_back
(
node
->
get_name
());
}
}
}
for
(
const
string
&
s
:
names
)
{
writer
<<
"ngraph::stopwatch timer_"
<<
s
<<
";
\n
"
;
}
writer
<<
"extern
\"
C
\"
size_t get_debug_timer_count() { return "
<<
names
.
size
()
<<
"; }
\n
"
;
writer
<<
"extern
\"
C
\"
const char* get_debug_timer_name(size_t index)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
writer
<<
"const char* rc;
\n
"
;
writer
<<
"switch(index)
\n
"
;
writer
<<
"{
\n
"
;
for
(
size_t
i
=
0
;
i
<
names
.
size
();
i
++
)
{
writer
<<
"case "
<<
i
<<
": rc =
\"
"
<<
names
[
i
]
<<
"
\"
; break;
\n
"
;
}
writer
<<
"default: rc =
\"\"
;
\n
"
;
writer
<<
"}
\n
"
;
writer
<<
"return rc;
\n
"
;
writer
.
indent
--
;
writer
<<
"}
\n
"
;
writer
<<
"extern
\"
C
\"
const size_t get_debug_timer_microseconds(size_t index)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
writer
<<
"size_t rc;
\n
"
;
writer
<<
"switch(index)
\n
"
;
writer
<<
"{
\n
"
;
for
(
size_t
i
=
0
;
i
<
names
.
size
();
i
++
)
{
writer
<<
"case "
<<
i
<<
": rc = timer_"
<<
names
[
i
]
<<
".get_total_microseconds(); break;
\n
"
;
}
writer
<<
"default: rc = 0;
\n
"
;
writer
<<
"}
\n
"
;
writer
<<
"return rc;
\n
"
;
writer
.
indent
--
;
writer
<<
"}
\n
"
;
writer
<<
"extern
\"
C
\"
const size_t get_debug_timer_call_count(size_t index)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
writer
<<
"size_t rc;
\n
"
;
writer
<<
"switch(index)
\n
"
;
writer
<<
"{
\n
"
;
for
(
size_t
i
=
0
;
i
<
names
.
size
();
i
++
)
{
writer
<<
"case "
<<
i
<<
": rc = timer_"
<<
names
[
i
]
<<
".get_call_count(); break;
\n
"
;
}
writer
<<
"default: rc = 0;
\n
"
;
writer
<<
"}
\n
"
;
writer
<<
"return rc;
\n
"
;
writer
.
indent
--
;
writer
<<
"}
\n
"
;
writer
<<
"
\n
"
;
}
// // The "dso_handle" symbol is required by __cxa_atexit()
// // which is enabled because the JIT uses it as the default mechanism
// // to register cleanup handlers. We use it, and not atexit(), because
// // atexit() happens too late, when the JIT is no longer alive
writer
<<
"void *__dso_handle = 0;
\n\n
"
;
writer
<<
"// Declare all constants
\n
"
;
for
(
shared_ptr
<
Function
>
current_function
:
pass_manager
.
get_state
().
get_functions
())
{
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
{
const
op
::
Constant
*
c
=
dynamic_cast
<
ngraph
::
op
::
Constant
*>
(
node
.
get
());
if
(
c
)
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
node
->
get_outputs
()[
0
].
get_tensor_view
();
auto
c_value_strings
=
c
->
get_value_strings
();
writer
<<
"static "
<<
tv
->
get_tensor
().
get_element_type
().
c_type_string
()
<<
" "
<<
tv
->
get_tensor
().
get_name
()
<<
"_cpu["
<<
c_value_strings
.
size
()
<<
"] =
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
writer
<<
emit_string_array
(
c_value_strings
,
100
-
writer
.
indent
*
4
);
writer
.
indent
--
;
writer
<<
"
\n
};
\n\n
"
;
writer
<<
"static "
<<
tv
->
get_tensor
().
get_element_type
().
c_type_string
()
<<
" *"
<<
tv
->
get_tensor
().
get_name
()
<<
";
\n
"
;
m_variable_name_map
[
tv
->
get_tensor
().
get_name
()]
=
tv
->
get_tensor
().
get_name
();
}
}
}
writer
<<
"// Declare all functions
\n
"
;
for
(
shared_ptr
<
Function
>
f
:
pass_manager
.
get_state
().
get_functions
())
{
writer
<<
"extern
\"
C
\"
void "
<<
f
->
get_name
()
<<
"(void** inputs, void** outputs, "
"cublasHandle_t& cublas_handle, "
"cudnnHandle_t& cudnn_handle);
\n
"
;
}
writer
<<
"// Declare all functions
\n
"
;
for
(
shared_ptr
<
Function
>
f
:
pass_manager
.
get_state
().
get_functions
())
{
writer
<<
"extern
\"
C
\"
void "
<<
f
->
get_name
()
<<
"(void** inputs, void** outputs, "
"cublasHandle_t& cublas_handle, "
"cudnnHandle_t& cudnn_handle);
\n
"
;
}
writer
<<
"
\n
"
;
writer
<<
"
\n
"
;
unordered_map
<
Node
*
,
string
>
match_functions
;
for
(
shared_ptr
<
Function
>
current_function
:
pass_manager
.
get_state
().
get_functions
())
unordered_map
<
Node
*
,
string
>
match_functions
;
for
(
shared_ptr
<
Function
>
current_function
:
pass_manager
.
get_state
().
get_functions
())
{
set
<
string
>
output_names
;
for
(
shared_ptr
<
Node
>
op
:
current_function
->
get_results
())
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
op
->
get_output_tensor_view
();
output_names
.
insert
(
tv
->
get_tensor
().
get_name
());
}
const
list
<
shared_ptr
<
Node
>>&
tmp
=
current_function
->
get_ordered_ops
();
if
(
tmp
.
size
()
<
2
)
{
// Since we are comparing ops there must be at least two ops to proceed.
continue
;
}
vector
<
shared_ptr
<
Node
>>
op_list
{
tmp
.
begin
(),
tmp
.
end
()};
for
(
size_t
i
=
0
;
i
<
op_list
.
size
()
-
1
;
i
++
)
{
if
(
op_list
[
i
]
->
is_constant
()
||
op_list
[
i
]
->
is_parameter
())
{
continue
;
}
if
(
contains_key
(
match_functions
,
op_list
[
i
].
get
()))
{
continue
;
}
string
match_function_name
;
if
(
!
match_function_name
.
empty
())
{
writer
<<
"static void "
<<
match_function_name
<<
"("
;
writer
.
indent
++
;
// Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto
&
n
=
*
op_list
[
i
];
auto
handler
=
dispatcher
.
find
(
type_index
(
typeid
(
n
)));
vector
<
GPU_TensorViewWrapper
>
in
;
size_t
arg_index
=
0
;
set
<
string
>
arg_names
;
for
(
const
descriptor
::
Input
&
input
:
n
.
get_inputs
())
{
set
<
string
>
output_names
;
for
(
shared_ptr
<
Node
>
op
:
current_function
->
get_results
())
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
op
->
get_output_tensor_view
();
output_names
.
insert
(
tv
->
get_tensor
().
get_name
());
}
const
list
<
shared_ptr
<
Node
>>&
tmp
=
current_function
->
get_ordered_ops
();
if
(
tmp
.
size
()
<
2
)
{
// Since we are comparing ops there must be at least two ops to proceed.
continue
;
}
vector
<
shared_ptr
<
Node
>>
op_list
{
tmp
.
begin
(),
tmp
.
end
()};
for
(
size_t
i
=
0
;
i
<
op_list
.
size
()
-
1
;
i
++
)
const
descriptor
::
Output
&
output
=
input
.
get_output
();
shared_ptr
<
descriptor
::
TensorView
>
tv
=
output
.
get_tensor_view
();
GPU_TensorViewWrapper
tvw
{
tv
,
"_arg"
+
to_string
(
arg_index
)};
if
(
!
contains
(
arg_names
,
tvw
.
get_name
()))
{
if
(
op_list
[
i
]
->
is_constant
()
||
op_list
[
i
]
->
is_parameter
())
arg_names
.
insert
(
tvw
.
get_name
());
if
(
arg_index
++
>
0
)
{
continue
;
}
if
(
contains_key
(
match_functions
,
op_list
[
i
].
get
()))
{
continue
;
}
string
match_function_name
;
if
(
!
match_function_name
.
empty
())
{
writer
<<
"static void "
<<
match_function_name
<<
"("
;
writer
.
indent
++
;
// Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto
&
n
=
*
op_list
[
i
];
auto
handler
=
dispatcher
.
find
(
type_index
(
typeid
(
n
)));
vector
<
GPU_TensorViewWrapper
>
in
;
size_t
arg_index
=
0
;
set
<
string
>
arg_names
;
for
(
const
descriptor
::
Input
&
input
:
n
.
get_inputs
())
{
const
descriptor
::
Output
&
output
=
input
.
get_output
();
shared_ptr
<
descriptor
::
TensorView
>
tv
=
output
.
get_tensor_view
();
GPU_TensorViewWrapper
tvw
{
tv
,
"_arg"
+
to_string
(
arg_index
)};
if
(
!
contains
(
arg_names
,
tvw
.
get_name
()))
{
arg_names
.
insert
(
tvw
.
get_name
());
if
(
arg_index
++
>
0
)
{
writer
<<
","
;
}
writer
<<
"
\n
"
;
writer
<<
tvw
.
get_type
()
<<
"* "
<<
tvw
.
get_name
();
}
in
.
push_back
(
tvw
);
}
vector
<
GPU_TensorViewWrapper
>
out
;
for
(
const
descriptor
::
Output
&
output
:
n
.
get_outputs
())
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
output
.
get_tensor_view
();
GPU_TensorViewWrapper
tvw
{
tv
,
"_out"
+
to_string
(
arg_index
)};
if
(
arg_index
++
>
0
)
{
writer
<<
","
;
}
writer
<<
"
\n
"
;
writer
<<
tvw
.
get_type
()
<<
"* "
<<
tvw
.
get_name
();
out
.
push_back
(
tvw
);
}
writer
.
indent
--
;
writer
<<
"
\n
)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
handler
->
second
(
this
,
writer
,
&
n
,
in
,
out
);
writer
.
indent
--
;
writer
<<
"}
\n
"
;
writer
<<
","
;
}
writer
<<
"
\n
"
;
writer
<<
tvw
.
get_type
()
<<
"* "
<<
tvw
.
get_name
();
}
in
.
push_back
(
tvw
);
}
for
(
shared_ptr
<
Function
>
current_function
:
pass_manager
.
get_state
().
get_functions
())
vector
<
GPU_TensorViewWrapper
>
out
;
for
(
const
descriptor
::
Output
&
output
:
n
.
get_outputs
())
{
set
<
string
>
output_names
;
for
(
shared_ptr
<
Node
>
op
:
current_function
->
get_results
())
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
op
->
get_output_tensor_view
();
output_names
.
insert
(
tv
->
get_tensor
().
get_name
());
}
set
<
descriptor
::
TensorView
*>
constants
;
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
shared_ptr
<
descriptor
::
TensorView
>
tv
=
output
.
get_tensor_view
();
GPU_TensorViewWrapper
tvw
{
tv
,
"_out"
+
to_string
(
arg_index
)};
if
(
arg_index
++
>
0
)
{
if
(
dynamic_cast
<
ngraph
::
op
::
Constant
*>
(
node
.
get
()))
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
node
->
get_outputs
()[
0
].
get_tensor_view
();
constants
.
insert
(
tv
.
get
());
}
writer
<<
","
;
}
writer
<<
"
\n
"
;
writer
<<
tvw
.
get_type
()
<<
"* "
<<
tvw
.
get_name
();
out
.
push_back
(
tvw
);
}
writer
.
indent
--
;
writer
<<
"
\n
)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
handler
->
second
(
this
,
writer
,
&
n
,
in
,
out
);
writer
.
indent
--
;
writer
<<
"}
\n
"
;
}
}
}
writer
<<
"extern
\"
C
\"
void "
<<
current_function
->
get_name
();
writer
<<
"(void** inputs, void** outputs, cublasHandle_t& cublas_handle, "
"cudnnHandle_t& "
"cudnn_handle)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
for
(
shared_ptr
<
Function
>
current_function
:
pass_manager
.
get_state
().
get_functions
())
{
set
<
string
>
output_names
;
for
(
shared_ptr
<
Node
>
op
:
current_function
->
get_results
())
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
op
->
get_output_tensor_view
();
output_names
.
insert
(
tv
->
get_tensor
().
get_name
());
}
set
<
descriptor
::
TensorView
*>
constants
;
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
{
if
(
dynamic_cast
<
ngraph
::
op
::
Constant
*>
(
node
.
get
()))
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
node
->
get_outputs
()[
0
].
get_tensor_view
();
constants
.
insert
(
tv
.
get
());
}
}
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
{
const
op
::
Constant
*
c
=
dynamic_cast
<
op
::
Constant
*>
(
node
.
get
());
if
(
c
)
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
node
->
get_outputs
()[
0
].
get_tensor_view
();
writer
<<
"if("
<<
tv
->
get_tensor
().
get_name
()
<<
" == NULL)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
writer
<<
"runtime::gpu::cuda_memcpyHtD("
<<
tv
->
get_tensor
().
get_name
()
<<
", "
<<
tv
->
get_tensor
().
get_name
()
<<
"_cpu, "
<<
tv
->
get_tensor
().
size
()
<<
");
\n
"
;
writer
.
indent
--
;
writer
<<
"}
\n
"
;
}
}
writer
<<
"extern
\"
C
\"
void "
<<
current_function
->
get_name
();
writer
<<
"(void** inputs, void** outputs, cublasHandle_t& cublas_handle, "
"cudnnHandle_t& "
"cudnn_handle)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
bool
temporaries_used
=
false
;
size_t
worst_case_tmp_size
=
0
;
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
{
if
(
node
->
liveness_new_list
.
size
()
>
0
)
{
temporaries_used
=
true
;
for
(
descriptor
::
Tensor
*
tensor
:
node
->
liveness_new_list
)
{
worst_case_tmp_size
+=
tensor
->
size
();
}
}
}
if
(
temporaries_used
)
{
size_t
temp_pool_size
=
current_function
->
get_temporary_pool_size
();
writer
<<
"// Allocate the memory pool
\n
"
;
// TODO memory pool malloc.
writer
<<
"void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
<<
temp_pool_size
<<
");
\n
"
;
// Add temporaries to the variable name map
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
{
for
(
descriptor
::
Tensor
*
tensor
:
node
->
liveness_new_list
)
{
stringstream
ss
;
ss
<<
"(("
<<
tensor
->
get_element_type
().
c_type_string
()
<<
"*)((char *)pool_base_ptr + "
<<
tensor
->
get_pool_offset
()
<<
"))"
;
m_variable_name_map
[
tensor
->
get_name
()]
=
ss
.
str
();
}
}
}
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
{
const
op
::
Constant
*
c
=
dynamic_cast
<
op
::
Constant
*>
(
node
.
get
());
if
(
c
)
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
node
->
get_outputs
()[
0
].
get_tensor_view
();
writer
<<
"if("
<<
tv
->
get_tensor
().
get_name
()
<<
" == NULL)
\n
"
;
writer
<<
"{
\n
"
;
writer
.
indent
++
;
writer
<<
"runtime::gpu::cuda_memcpyHtD("
<<
tv
->
get_tensor
().
get_name
()
<<
", "
<<
tv
->
get_tensor
().
get_name
()
<<
"_cpu, "
<<
tv
->
get_tensor
().
size
()
<<
");
\n
"
;
writer
.
indent
--
;
writer
<<
"}
\n
"
;
}
}
// Add inputs to the variable name map
size_t
arg_index
=
0
;
for
(
shared_ptr
<
ngraph
::
op
::
Parameter
>
param
:
current_function
->
get_parameters
())
{
for
(
size_t
i
=
0
;
i
<
param
->
get_output_size
();
++
i
)
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
param
->
get_output_tensor_view
(
i
);
const
element
::
Type
&
et
=
tv
->
get_tensor_view_type
()
->
get_element_type
();
string
type
=
et
.
c_type_string
();
stringstream
ss
;
ss
<<
"(("
<<
type
<<
"*)(inputs["
<<
arg_index
<<
"]))"
;
m_variable_name_map
[
tv
->
get_tensor
().
get_name
()]
=
ss
.
str
();
arg_index
++
;
}
}
bool
temporaries_used
=
false
;
size_t
worst_case_tmp_size
=
0
;
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
{
if
(
node
->
liveness_new_list
.
size
()
>
0
)
{
temporaries_used
=
true
;
for
(
descriptor
::
Tensor
*
tensor
:
node
->
liveness_new_list
)
{
worst_case_tmp_size
+=
tensor
->
size
();
}
}
}
if
(
temporaries_used
)
{
size_t
temp_pool_size
=
current_function
->
get_temporary_pool_size
();
writer
<<
"// Allocate the memory pool
\n
"
;
// TODO memory pool malloc.
writer
<<
"void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
<<
temp_pool_size
<<
");
\n
"
;
// Add temporaries to the variable name map
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
{
for
(
descriptor
::
Tensor
*
tensor
:
node
->
liveness_new_list
)
{
stringstream
ss
;
ss
<<
"(("
<<
tensor
->
get_element_type
().
c_type_string
()
<<
"*)((char *)pool_base_ptr + "
<<
tensor
->
get_pool_offset
()
<<
"))"
;
m_variable_name_map
[
tensor
->
get_name
()]
=
ss
.
str
();
}
}
}
// create output alias map
size_t
output_index
=
0
;
unordered_map
<
descriptor
::
TensorView
*
,
vector
<
size_t
>>
output_alias_map
;
vector
<
size_t
>
aliases
;
for
(
size_t
i
=
0
;
i
<
current_function
->
get_output_size
();
++
i
)
{
shared_ptr
<
Node
>
op
=
current_function
->
get_output_op
(
i
);
shared_ptr
<
descriptor
::
TensorView
>
otv
=
op
->
get_output_tensor_view
();
vector
<
size_t
>&
al
=
output_alias_map
[
otv
.
get
()];
al
.
push_back
(
output_index
);
if
(
al
.
size
()
>
1
)
{
aliases
.
push_back
(
output_index
);
}
output_index
++
;
}
// Add inputs to the variable name map
size_t
arg_index
=
0
;
for
(
shared_ptr
<
ngraph
::
op
::
Parameter
>
param
:
current_function
->
get_parameters
())
{
for
(
size_t
i
=
0
;
i
<
param
->
get_output_size
();
++
i
)
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
param
->
get_output_tensor_view
(
i
);
const
element
::
Type
&
et
=
tv
->
get_tensor_view_type
()
->
get_element_type
();
string
type
=
et
.
c_type_string
();
stringstream
ss
;
ss
<<
"(("
<<
type
<<
"*)(inputs["
<<
arg_index
<<
"]))"
;
m_variable_name_map
[
tv
->
get_tensor
().
get_name
()]
=
ss
.
str
();
arg_index
++
;
}
}
// Add outputs to the variable name map
output_index
=
0
;
for
(
size_t
i
=
0
;
i
<
current_function
->
get_output_size
();
++
i
)
{
shared_ptr
<
Node
>
op
=
current_function
->
get_output_op
(
i
);
shared_ptr
<
descriptor
::
TensorView
>
tv
=
op
->
get_output_tensor_view
();
const
element
::
Type
&
et
=
tv
->
get_tensor_view_type
()
->
get_element_type
();
bool
parameter_as_output
=
false
;
for
(
shared_ptr
<
ngraph
::
op
::
Parameter
>
param
:
current_function
->
get_parameters
())
{
for
(
const
descriptor
::
Output
&
pout
:
param
->
get_outputs
())
{
shared_ptr
<
descriptor
::
TensorView
>
ptv
=
pout
.
get_tensor_view
();
if
(
tv
==
ptv
)
{
parameter_as_output
=
true
;
writer
<<
"ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
<<
et
.
c_type_string
()
<<
"*>(outputs["
<<
output_index
<<
"]), "
<<
m_variable_name_map
[
ptv
->
get_tensor
().
get_name
()]
<<
", "
<<
ptv
->
get_tensor
().
size
()
<<
");
\n
"
;
break
;
}
}
}
if
(
!
parameter_as_output
&&
!
contains
(
aliases
,
output_index
))
{
if
(
contains
(
constants
,
tv
.
get
()))
{
writer
<<
"ngraph::runtime::gpu::cuda_memcpyHtD(outputs["
<<
output_index
<<
"], "
<<
tv
->
get_tensor
().
get_name
()
<<
", "
<<
tv
->
get_tensor
().
size
()
<<
");
\n
"
;
}
else
{
string
type
=
et
.
c_type_string
();
stringstream
ss
;
ss
<<
"(("
<<
type
<<
"*)(outputs["
<<
output_index
<<
"]))"
;
m_variable_name_map
[
tv
->
get_tensor
().
get_name
()]
=
ss
.
str
();
}
}
output_index
++
;
}
// create output alias map
size_t
output_index
=
0
;
unordered_map
<
descriptor
::
TensorView
*
,
vector
<
size_t
>>
output_alias_map
;
vector
<
size_t
>
aliases
;
for
(
size_t
i
=
0
;
i
<
current_function
->
get_output_size
();
++
i
)
{
shared_ptr
<
Node
>
op
=
current_function
->
get_output_op
(
i
);
shared_ptr
<
descriptor
::
TensorView
>
otv
=
op
->
get_output_tensor_view
();
vector
<
size_t
>&
al
=
output_alias_map
[
otv
.
get
()];
al
.
push_back
(
output_index
);
if
(
al
.
size
()
>
1
)
{
aliases
.
push_back
(
output_index
);
}
output_index
++
;
}
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
// Add outputs to the variable name map
output_index
=
0
;
for
(
size_t
i
=
0
;
i
<
current_function
->
get_output_size
();
++
i
)
{
shared_ptr
<
Node
>
op
=
current_function
->
get_output_op
(
i
);
shared_ptr
<
descriptor
::
TensorView
>
tv
=
op
->
get_output_tensor_view
();
const
element
::
Type
&
et
=
tv
->
get_tensor_view_type
()
->
get_element_type
();
bool
parameter_as_output
=
false
;
for
(
shared_ptr
<
ngraph
::
op
::
Parameter
>
param
:
current_function
->
get_parameters
())
{
for
(
const
descriptor
::
Output
&
pout
:
param
->
get_outputs
())
{
shared_ptr
<
descriptor
::
TensorView
>
ptv
=
pout
.
get_tensor_view
();
if
(
tv
==
ptv
)
{
auto
&
n
=
*
node
;
// Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto
handler
=
dispatcher
.
find
(
type_index
(
typeid
(
n
)));
if
(
handler
==
dispatcher
.
end
())
{
throw
ngraph_error
(
"Unhandled op during code generation : "
+
node
->
description
());
}
vector
<
GPU_TensorViewWrapper
>
in
;
for
(
const
descriptor
::
Input
&
input
:
node
->
get_inputs
())
{
const
descriptor
::
Output
&
output
=
input
.
get_output
();
shared_ptr
<
descriptor
::
TensorView
>
tv
=
output
.
get_tensor_view
();
in
.
push_back
(
GPU_TensorViewWrapper
(
tv
,
m_variable_name_map
[
tv
->
get_tensor
().
get_name
()]));
}
vector
<
GPU_TensorViewWrapper
>
out
;
for
(
const
descriptor
::
Output
&
output
:
node
->
get_outputs
())
{
shared_ptr
<
descriptor
::
TensorView
>
tv
=
output
.
get_tensor_view
();
out
.
push_back
(
GPU_TensorViewWrapper
(
tv
,
m_variable_name_map
[
tv
->
get_tensor
().
get_name
()]));
}
// Emit operation prologue
if
(
!
node
->
is_parameter
()
&&
!
node
->
is_constant
())
{
if
(
m_emit_timing
)
{
emit_debug_function_entry
(
writer
,
node
.
get
(),
in
,
out
);
}
}
// Emit operation body
string
func_name
;
auto
it
=
match_functions
.
find
(
node
.
get
());
if
(
it
!=
match_functions
.
end
())
{
func_name
=
it
->
second
;
}
if
(
func_name
.
empty
())
{
handler
->
second
(
this
,
writer
,
node
.
get
(),
in
,
out
);
}
else
{
vector
<
string
>
names
;
for
(
const
GPU_TensorViewWrapper
&
tv
:
in
)
{
names
.
push_back
(
tv
.
get_name
());
}
for
(
const
GPU_TensorViewWrapper
&
tv
:
out
)
{
names
.
push_back
(
tv
.
get_name
());
}
writer
<<
func_name
<<
"("
<<
join
(
names
)
<<
");
\n
"
;
}
// Emit operation epilogue
if
(
!
node
->
is_parameter
()
&&
!
node
->
is_constant
())
{
if
(
m_emit_timing
)
{
emit_debug_function_exit
(
writer
,
node
.
get
(),
in
,
out
);
}
}
parameter_as_output
=
true
;
writer
<<
"ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
<<
et
.
c_type_string
()
<<
"*>(outputs["
<<
output_index
<<
"]), "
<<
m_variable_name_map
[
ptv
->
get_tensor
().
get_name
()]
<<
", "
<<
ptv
->
get_tensor
().
size
()
<<
");
\n
"
;
break
;
}
writer
.
indent
--
;
// End generated function
writer
+=
"}
\n\n
"
;
}
// TODO: Cleanup and make this a utility function
file_util
::
make_directory
(
s_output_dir
);
string
filename
=
file_util
::
path_join
(
s_output_dir
,
function_name
+
"_codegen.cpp"
);
ofstream
out
(
filename
);
string
code
=
writer
.
get_code
();
out
<<
code
;
out
.
close
();
m_compiler
.
reset
(
new
codegen
::
Compiler
());
m_execution_engine
.
reset
(
new
codegen
::
ExecutionEngine
());
m_compiler
->
set_precompiled_header_source
(
pch_header_source
);
auto
codegen_module
=
m_compiler
->
compile
(
code
);
if
(
codegen_module
==
nullptr
)
}
if
(
!
parameter_as_output
&&
!
contains
(
aliases
,
output_index
))
{
if
(
contains
(
constants
,
tv
.
get
()))
{
throw
runtime_error
(
"function failed to compile"
);
writer
<<
"ngraph::runtime::gpu::cuda_memcpyHtD(outputs["
<<
output_index
<<
"], "
<<
tv
->
get_tensor
().
get_name
()
<<
", "
<<
tv
->
get_tensor
().
size
()
<<
");
\n
"
;
}
m_execution_engine
->
add_module
(
codegen_module
);
m_execution_engine
->
finalize
();
m_compiled_function
=
m_execution_engine
->
find_function
<
EntryPoint_t
>
(
function_name
);
assert
(
m_compiled_function
);
m_is_compiled
=
true
;
if
(
m_release_function
)
else
{
release_function
();
string
type
=
et
.
c_type_string
();
stringstream
ss
;
ss
<<
"(("
<<
type
<<
"*)(outputs["
<<
output_index
<<
"]))"
;
m_variable_name_map
[
tv
->
get_tensor
().
get_name
()]
=
ss
.
str
();
}
}
output_index
++
;
}
void
GPU_ExternalFunction
::
handle_output_alias
(
codegen
::
CodeWriter
&
writer
,
const
Node
&
node
,
const
unordered_map
<
descriptor
::
TensorView
*
,
vector
<
size_t
>>&
output_alias_map
)
for
(
shared_ptr
<
Node
>
node
:
current_function
->
get_ordered_ops
())
{
auto
&
n
=
*
node
;
// Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto
handler
=
dispatcher
.
find
(
type_index
(
typeid
(
n
)));
if
(
handler
==
dispatcher
.
end
())
{
throw
ngraph_error
(
"Unhandled op during code generation : "
+
node
->
description
());
}
vector
<
GPU_TensorViewWrapper
>
in
;
for
(
const
descriptor
::
Input
&
input
:
node
->
get_inputs
())
{
const
descriptor
::
Output
&
output
=
input
.
get_output
();
shared_ptr
<
descriptor
::
TensorView
>
tv
=
output
.
get_tensor_view
();
in
.
push_back
(
GPU_TensorViewWrapper
(
tv
,
m_variable_name_map
[
tv
->
get_tensor
().
get_name
()]));
}
vector
<
GPU_TensorViewWrapper
>
out
;
for
(
const
descriptor
::
Output
&
output
:
node
->
get_outputs
())
{
for
(
const
descriptor
::
Output
&
output
:
node
.
get_outputs
())
shared_ptr
<
descriptor
::
TensorView
>
tv
=
output
.
get_tensor_view
();
out
.
push_back
(
GPU_TensorViewWrapper
(
tv
,
m_variable_name_map
[
tv
->
get_tensor
().
get_name
()]));
}
// Emit operation prologue
if
(
!
node
->
is_parameter
()
&&
!
node
->
is_constant
())
{
if
(
m_emit_timing
)
{
shared_ptr
<
descriptor
::
TensorView
>
otv
=
output
.
get_tensor_view
();
auto
it
=
output_alias_map
.
find
(
otv
.
get
());
if
(
it
!=
output_alias_map
.
end
())
{
const
vector
<
size_t
>&
outputs
=
it
->
second
;
if
(
outputs
.
size
()
>
1
)
{
writer
<<
"{ // handle output alias for previous op
\n
"
;
writer
.
indent
++
;
for
(
size_t
i
=
1
;
i
<
outputs
.
size
();
i
++
)
{
writer
<<
"ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
"outputs["
<<
outputs
[
i
]
<<
"]), static_cast<void*>(outputs["
<<
outputs
[
0
]
<<
"]), "
<<
otv
->
get_tensor
().
size
()
<<
");
\n
"
;
}
writer
.
indent
--
;
writer
<<
"}
\n
"
;
}
}
emit_debug_function_entry
(
writer
,
node
.
get
(),
in
,
out
);
}
}
shared_ptr
<
ngraph
::
runtime
::
CallFrame
>
GPU_ExternalFunction
::
make_call_frame
()
// Emit operation body
string
func_name
;
auto
it
=
match_functions
.
find
(
node
.
get
());
if
(
it
!=
match_functions
.
end
())
{
func_name
=
it
->
second
;
}
if
(
func_name
.
empty
())
{
if
(
!
m_is_compiled
)
handler
->
second
(
this
,
writer
,
node
.
get
(),
in
,
out
);
}
else
{
vector
<
string
>
names
;
for
(
const
GPU_TensorViewWrapper
&
tv
:
in
)
{
compile
(
);
names
.
push_back
(
tv
.
get_name
()
);
}
return
make_shared
<
GPU_CallFrame
>
(
shared_from_this
(),
m_compiled_function
);
for
(
const
GPU_TensorViewWrapper
&
tv
:
out
)
{
names
.
push_back
(
tv
.
get_name
());
}
writer
<<
func_name
<<
"("
<<
join
(
names
)
<<
");
\n
"
;
}
void
GPU_ExternalFunction
::
emit_debug_function_entry
(
codegen
::
CodeWriter
&
writer
,
Node
*
node
,
const
std
::
vector
<
GPU_TensorViewWrapper
>&
in
,
const
std
::
vector
<
GPU_TensorViewWrapper
>&
out
)
// Emit operation epilogue
if
(
!
node
->
is_parameter
()
&&
!
node
->
is_constant
())
{
writer
<<
"timer_"
<<
node
->
get_name
()
<<
".start();
\n
"
;
if
(
m_emit_timing
)
{
emit_debug_function_exit
(
writer
,
node
.
get
(),
in
,
out
);
}
}
}
writer
.
indent
--
;
// End generated function
writer
+=
"}
\n\n
"
;
}
// TODO: Cleanup and make this a utility function
void
GPU_ExternalFunction
::
emit_debug_function_exit
(
codegen
::
CodeWriter
&
writer
,
Node
*
node
,
const
std
::
vector
<
GPU_TensorViewWrapper
>&
in
,
const
std
::
vector
<
GPU_TensorViewWrapper
>&
out
)
file_util
::
make_directory
(
s_output_dir
);
string
filename
=
file_util
::
path_join
(
s_output_dir
,
function_name
+
"_codegen.cpp"
);
ofstream
out
(
filename
);
string
code
=
writer
.
get_code
();
out
<<
code
;
out
.
close
();
m_compiler
.
reset
(
new
codegen
::
Compiler
());
m_execution_engine
.
reset
(
new
codegen
::
ExecutionEngine
());
m_compiler
->
set_precompiled_header_source
(
pch_header_source
);
auto
codegen_module
=
m_compiler
->
compile
(
code
);
if
(
codegen_module
==
nullptr
)
{
throw
runtime_error
(
"function failed to compile"
);
}
m_execution_engine
->
add_module
(
codegen_module
);
m_execution_engine
->
finalize
();
m_compiled_function
=
m_execution_engine
->
find_function
<
EntryPoint_t
>
(
function_name
);
assert
(
m_compiled_function
);
m_is_compiled
=
true
;
if
(
m_release_function
)
{
release_function
();
}
}
void
runtime
::
gpu
::
GPU_ExternalFunction
::
handle_output_alias
(
codegen
::
CodeWriter
&
writer
,
const
Node
&
node
,
const
unordered_map
<
descriptor
::
TensorView
*
,
vector
<
size_t
>>&
output_alias_map
)
{
for
(
const
descriptor
::
Output
&
output
:
node
.
get_outputs
())
{
shared_ptr
<
descriptor
::
TensorView
>
otv
=
output
.
get_tensor_view
();
auto
it
=
output_alias_map
.
find
(
otv
.
get
());
if
(
it
!=
output_alias_map
.
end
())
{
const
vector
<
size_t
>&
outputs
=
it
->
second
;
if
(
outputs
.
size
()
>
1
)
{
writer
<<
"timer_"
<<
node
->
get_name
()
<<
".stop();
\n
"
;
writer
<<
"{ // handle output alias for previous op
\n
"
;
writer
.
indent
++
;
for
(
size_t
i
=
1
;
i
<
outputs
.
size
();
i
++
)
{
writer
<<
"ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
"outputs["
<<
outputs
[
i
]
<<
"]), static_cast<void*>(outputs["
<<
outputs
[
0
]
<<
"]), "
<<
otv
->
get_tensor
().
size
()
<<
");
\n
"
;
}
writer
.
indent
--
;
writer
<<
"}
\n
"
;
}
}
}
}
shared_ptr
<
ngraph
::
runtime
::
CallFrame
>
runtime
::
gpu
::
GPU_ExternalFunction
::
make_call_frame
()
{
if
(
!
m_is_compiled
)
{
compile
();
}
return
make_shared
<
GPU_CallFrame
>
(
shared_from_this
(),
m_compiled_function
);
}
void
runtime
::
gpu
::
GPU_ExternalFunction
::
emit_debug_function_entry
(
codegen
::
CodeWriter
&
writer
,
Node
*
node
,
const
std
::
vector
<
GPU_TensorViewWrapper
>&
in
,
const
std
::
vector
<
GPU_TensorViewWrapper
>&
out
)
{
writer
<<
"timer_"
<<
node
->
get_name
()
<<
".start();
\n
"
;
}
void
runtime
::
gpu
::
GPU_ExternalFunction
::
emit_debug_function_exit
(
codegen
::
CodeWriter
&
writer
,
Node
*
node
,
const
std
::
vector
<
GPU_TensorViewWrapper
>&
in
,
const
std
::
vector
<
GPU_TensorViewWrapper
>&
out
)
{
writer
<<
"timer_"
<<
node
->
get_name
()
<<
".stop();
\n
"
;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment