opencv / Commits

Commit 438e456c authored Nov 20, 2017 by Alexander Alekhin
Merge pull request #10113 from wzw-intel:fusion
Parents: 75b980ac 45d11dde
Showing 6 changed files with 181 additions and 17 deletions:
modules/dnn/src/dnn.cpp                                    +75  -2
modules/dnn/src/layers/convolution_layer.cpp               +39  -4
modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp   +0  -0
modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp                 +7  -2
modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp       +48  -9
modules/dnn/src/opencl/conv_layer_spatial.cl               +12  -0
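Taken together, the changes teach the OpenCL target to fuse a convolution with a following eltwise sum and a ReLU/ChannelsPReLU/Power activation, executing all of them in one kernel. A hypothetical usage sketch (model file names are placeholders; the fusion itself is internal and triggers during the forward pass):

```cpp
#include <opencv2/dnn.hpp>

int main()
{
    using namespace cv::dnn;
    // Placeholder model files; any net containing conv -> eltwise -> ReLU
    // (e.g. ResNet-style blocks) exercises the new fusion path.
    Net net = readNetFromCaffe("model.prototxt", "model.caffemodel");
    net.setPreferableTarget(DNN_TARGET_OPENCL); // fusion applies only on this target
    // ... setInput(...) and net.forward() as usual; fusion happens internally.
    return 0;
}
```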
modules/dnn/src/dnn.cpp
```diff
@@ -1233,12 +1233,13 @@ struct Net::Impl
                     }
                 }
-                // For now, OpenCL target only support fusion with activation of ReLU/ChannelsPReLU
+                // For now, OpenCL target only support fusion with activation of ReLU/ChannelsPReLU/Power
                 if ( preferableTarget != DNN_TARGET_OPENCL ||
                      (preferableTarget == DNN_TARGET_OPENCL &&
                       nextData &&
                       (!nextData->type.compare("ReLU") ||
-                       !nextData->type.compare("ChannelsPReLU"))) )
+                       !nextData->type.compare("ChannelsPReLU") ||
+                       !nextData->type.compare("Power"))) )
                 {
                     Ptr<ActivationLayer> nextActivLayer;
```
```diff
@@ -1253,6 +1254,78 @@ struct Net::Impl
                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
                         activData->skipFlags[DNN_BACKEND_DEFAULT] = true;
                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
+
+                        if ( preferableTarget == DNN_TARGET_OPENCL )
+                        {
+                            nextData = &layers[activData->consumers[0].lid];
+                            lpNext = LayerPin(activData->consumers[0].lid, 0);
+                        }
                     }
                 }
+
+                // fuse convolution layer followed by eltwise + relu
+                if ( preferableTarget == DNN_TARGET_OPENCL )
+                {
+                    Ptr<EltwiseLayer> nextEltwiseLayer;
+                    if( nextData )
+                        nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
+
+                    if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 )
+                    {
+                        LayerData *eltwiseData = nextData;
+                        // go down from the second input and find the first non-skipped layer.
+                        LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[1].lid];
+                        while (downLayerData->skipFlags[DNN_BACKEND_DEFAULT])
+                        {
+                            downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
+                        }
+
+                        // second input layer is current layer.
+                        if ( ld.id == downLayerData->id )
+                        {
+                            // go down from the first input and find the first non-skipped layer
+                            downLayerData = &layers[eltwiseData->inputBlobsId[0].lid];
+                            while (downLayerData->skipFlags[DNN_BACKEND_DEFAULT])
+                            {
+                                if ( !downLayerData->type.compare("Eltwise") )
+                                    downLayerData = &layers[downLayerData->inputBlobsId[1].lid];
+                                else
+                                    downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
+                            }
+
+                            Ptr<ConvolutionLayer> convLayer;
+                            if( downLayerData )
+                                convLayer = downLayerData->layerInstance.dynamicCast<ConvolutionLayer>();
+
+                            // first input layer is convolution layer
+                            if( !convLayer.empty() )
+                            {
+                                // fuse eltwise + activation layer
+                                LayerData *firstConvLayerData = downLayerData;
+                                {
+                                    nextData = &layers[eltwiseData->consumers[0].lid];
+                                    lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
+                                    Ptr<ActivationLayer> nextActivLayer;
+                                    if( nextData )
+                                        nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
+
+                                    if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
+                                        (!nextData->type.compare("ReLU") ||
+                                         !nextData->type.compare("ChannelsPReLU") ||
+                                         !nextData->type.compare("Power")) &&
+                                        currLayer->setActivation(nextActivLayer) )
+                                    {
+                                        CV_Assert(firstConvLayerData->outputBlobs.size() == 1 &&
+                                                  ld.inputBlobs.size() == 1);
+                                        ld.inputBlobs.push_back(&firstConvLayerData->outputBlobs[0]);
+                                        printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
+                                        printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
+                                        eltwiseData->skipFlags[DNN_BACKEND_DEFAULT] = true;
+                                        nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
+                                        ld.outputBlobs = layers[lpNext.lid].outputBlobs;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
```
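A side note on the idiom used throughout this hunk: `std::string::compare` returns 0 on an exact match, so `!nextData->type.compare("ReLU")` reads as "type equals ReLU". A minimal illustration:

```cpp
#include <cassert>
#include <string>

int main()
{
    std::string type = "ReLU";
    assert(!type.compare("ReLU"));         // equal strings -> compare() == 0
    assert(type.compare("ChannelsPReLU")); // different -> nonzero
    return 0;
}
```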
modules/dnn/src/layers/convolution_layer.cpp
```diff
@@ -142,6 +142,9 @@ public:
     }
 };

+#define IS_POWER_LAYER(layer) \
+    (!layer.empty() && !layer->type.compare("Power"))
+
 //TODO: simultaneously convolution and bias addition for cache optimization
 class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
 {
@@ -161,6 +164,7 @@ public:
     bool newWeightAndBias;
     bool newActiv;
     ocl4dnnFusedActiv_t activType;
+    float power;
 #endif

     ConvolutionLayerImpl()
     {
@@ -169,6 +173,7 @@ public:
         newWeightAndBias = false;
         newActiv = false;
         activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+        power = 0.f;
 #endif
     }
```
```diff
@@ -225,6 +230,22 @@ public:
 #ifdef HAVE_OPENCL
         newActiv = true;
         activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+
+        if (preferableTarget == DNN_TARGET_OPENCL)
+        {
+            Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
+            if (!activ_power.empty())
+            {
+                if (activ_power->scale != 1.f || activ_power->shift != 0.f)
+                    newWeightAndBias = true;
+
+                if (activ_power->scale != 1.f)
+                    weightsMat.release();
+
+                power = activ_power->power;
+                activType = OCL4DNN_CONV_FUSED_ACTIV_POWER;
+            }
+        }
 #endif
         return !activ.empty();
     }
```
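For context (not part of the diff): OpenCV's Power layer computes f(x) = (shift + scale·x)^power, which is why a non-unit scale or non-zero shift forces the weights and bias to be rebuilt (`newWeightAndBias`), while the exponent alone can be deferred to the fused OpenCL kernel. A one-line reference rendering of that semantics:

```cpp
#include <cmath>

// Reference semantics of the dnn Power layer for a single value.
inline float powerLayer(float x, float scale, float shift, float power)
{
    return std::pow(shift + scale * x, power);
}
```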
```diff
@@ -727,11 +748,12 @@ public:
                     biasvec[k] = biasMat.at<float>(k);
             }

-            if( !bnorm.empty() || !scaleLayer.empty() )
+            if( !bnorm.empty() || !scaleLayer.empty() || IS_POWER_LAYER(activ) )
             {
                 Mat scale, shift, scale2, shift2;
                 const float *scaleptr = 0, *shiftptr = 0;
                 const float *scaleptr2 = 0, *shiftptr2 = 0;
+                float a = 1.f, b = 0.f;

                 if( !bnorm.empty() )
                 {
@@ -758,7 +780,14 @@ public:
                     }
                 }

-                if (shiftptr || shiftptr2)
+                if (IS_POWER_LAYER(activ))
+                {
+                    Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
+                    a = activ_power->scale;
+                    b = activ_power->shift;
+                }
+
+                if (shiftptr || shiftptr2 || b != 0.f)
                     fusedBias = true;

                 for( int i = 0; i < outCn; i++ )
@@ -771,9 +800,9 @@ public:
                         int j, wcols = weightsMat.cols;

                         for( j = 0; j < wcols; j++ )
-                            w_i[j] *= (s1*s2);
+                            w_i[j] *= (s1*s2*a);

-                        biasvec[i] = biasvec[i]*(s1*s2) + (delta1*s2 + delta2);
+                        biasvec[i] = biasvec[i]*(s1*s2*a) + (delta1*s2*a + delta2*a + b);
                     }
                 }

                 biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
```
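The rewritten update folds the Power layer's inner affine part (scale `a`, shift `b`) into the existing batch-norm/scale folding; the exponent itself is applied later in the kernel. A small self-check of the algebra, with made-up constants:

```cpp
#include <cassert>
#include <cmath>

int main()
{
    // Made-up constants: s1/delta1 from batch norm, s2/delta2 from the scale
    // layer, a/b from the Power layer's scale and shift.
    const float conv = 2.5f, s1 = 1.1f, delta1 = 0.2f, s2 = 0.9f, delta2 = -0.3f;
    const float a = 1.5f, b = 0.25f;

    // Applying a*x + b after the old folded form ...
    float chained = a * (conv * (s1 * s2) + (delta1 * s2 + delta2)) + b;
    // ... equals the new folded weight scale and bias from this hunk.
    float folded = conv * (s1 * s2 * a) + (delta1 * s2 * a + delta2 * a + b);

    assert(std::fabs(chained - folded) < 1e-6f);
    return 0;
}
```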
```diff
@@ -827,10 +856,15 @@ public:
                     CV_Assert(!reluslope.empty());
                     convolutionOp->setActivPReLU(true, reluslope);
                 }
+                else if ( activType == OCL4DNN_CONV_FUSED_ACTIV_POWER )
+                {
+                    convolutionOp->setActivPower(true, power);
+                }
                 else
                 {
                     convolutionOp->setActivReLU(false, 0);
                     convolutionOp->setActivPReLU(false, reluslope);
+                    convolutionOp->setActivPower(false, 1.f);
                 }
                 newActiv = false;
             }
@@ -840,6 +874,7 @@ public:
             int batch_size = inpMat.size[0];

             return convolutionOp->Forward(inpMat,
+                                          inputs.size() == 2 ? inputs[1] : UMat(),
                                           umat_blobs[0],
                                           (hasBias() || fusedBias) ? umat_blobs[1] : UMat(),
                                           outMat,
```
modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp
This diff is collapsed.
modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
```diff
@@ -77,6 +77,7 @@ typedef enum {
     OCL4DNN_CONV_FUSED_ACTIV_NONE = 0,
     OCL4DNN_CONV_FUSED_ACTIV_RELU = 1,
     OCL4DNN_CONV_FUSED_ACTIV_PRELU = 2,
+    OCL4DNN_CONV_FUSED_ACTIV_POWER = 3
 } ocl4dnnFusedActiv_t;

 template<typename Dtype>
@@ -86,11 +87,13 @@ class OCL4DNNConvSpatial
     explicit OCL4DNNConvSpatial(OCL4DNNConvConfig config);
     ~OCL4DNNConvSpatial();
     bool Forward(const UMat& bottom_data,
+                 const UMat& bottom_data2,
                  const UMat& weight,
                  const UMat& bias,
                  UMat& top_data, int32_t batch_size);
     void setActivReLU(bool fuse_activ, float slope);
     void setActivPReLU(bool fuse_activ, std::vector<float> &slope);
+    void setActivPower(bool fuse_activ, float power);
     void setBias(bool bias_term);

  private:
@@ -252,8 +255,8 @@ class OCL4DNNConvSpatial
                      int lx, int ly, int lz,
                      bool swizzle, bool nullLocal);
     void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
-    void setFusionDefine(ocl4dnnFusedActiv_t fused_activ);
-    void setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx);
+    void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
+    void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);

     int32_t group_;
     bool bias_term_;
@@ -305,6 +308,8 @@ class OCL4DNNConvSpatial
     float negative_slope_;
     UMat negative_slope_umat_;
     ocl4dnnFusedActiv_t fused_activ_;
+    float power_;
+    bool fused_eltwise_;
 };

 typedef enum {
```
modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
```diff
@@ -79,6 +79,8 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
     group_ = config.group;
     fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+    fused_eltwise_ = false;
+    power_ = 1.f;
     negative_slope_ = 0;
     prev_kernel_type_ = -1;
     tuned_ = false;
@@ -141,8 +143,11 @@ OCL4DNNConvSpatial<Dtype>::~OCL4DNNConvSpatial()
 }

 template<typename Dtype>
-void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ)
+void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise)
 {
+    if (fused_eltwise)
+        addDef("FUSED_CONV_ELTWISE", 1);
+
     switch (fused_activ) {
         case OCL4DNN_CONV_FUSED_ACTIV_RELU:
             addDef("FUSED_CONV_RELU", 1);
@@ -150,6 +155,9 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ)
         case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
             addDef("FUSED_CONV_PRELU", 1);
             break;
+        case OCL4DNN_CONV_FUSED_ACTIV_POWER:
+            addDef("FUSED_CONV_POWER", 1);
+            break;
         default:
             ;
     }
@@ -157,8 +165,11 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ)
 }

 template<typename Dtype>
-void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx)
+void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
 {
+    if (fused_eltwise)
+        kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
+
     switch (fused_activ) {
         case OCL4DNN_CONV_FUSED_ACTIV_RELU:
             kernel.set(argIdx++, (float)negative_slope_);
@@ -166,6 +177,9 @@ void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx)
         case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
             kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
             break;
+        case OCL4DNN_CONV_FUSED_ACTIV_POWER:
+            kernel.set(argIdx++, (float)power_);
+            break;
         default:
             ;
     }
```
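Because OpenCL kernel arguments are positional, the host-side order in `setFusionArg` must mirror the macro order in the kernel signatures further down: the `ELTWISE_DATA_ARG` slot comes first, the `NEGATIVE_SLOPE_ARG` slot second. A toy stand-in (all names here are illustrative, not OpenCV API) that demonstrates the contract:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Toy stand-in for ocl::Kernel: arguments are purely positional, so the
// eltwise buffer must be set before the activation parameter.
struct ToyKernel
{
    std::vector<std::string> args;
    void set(unsigned idx, const std::string& v)
    {
        if (args.size() <= idx) args.resize(idx + 1);
        args[idx] = v;
    }
};

int main()
{
    ToyKernel kernel;
    unsigned argIdx = 0;
    bool fused_eltwise = true;
    if (fused_eltwise)
        kernel.set(argIdx++, "eltwise_data");  // ELTWISE_DATA_ARG slot
    kernel.set(argIdx++, "power");             // NEGATIVE_SLOPE_ARG slot (POWER case)
    assert(kernel.args[0] == "eltwise_data" && kernel.args[1] == "power");
    return 0;
}
```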
```diff
@@ -255,7 +269,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
         addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
         addDef("APPLY_BIAS", bias_term_);
-        setFusionDefine(fused_activ_);
+        setFusionDefine(fused_activ_, fused_eltwise_);

         src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
     }
@@ -277,7 +291,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         addDef("APPLY_BIAS", bias_term_);
         addDef("OUTPUT_Z", M_);
         addDef("ZPAR", 1);
-        setFusionDefine(fused_activ_);
+        setFusionDefine(fused_activ_, fused_eltwise_);

         src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
     }
@@ -314,7 +328,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         addDef("TILE_N_LAST", M_ % 32);
         addDef("TILE_N_LAST_DIV8", (M_ % 32) / 8);
         addDef("APPLY_BIAS", bias_term_);
-        setFusionDefine(fused_activ_);
+        setFusionDefine(fused_activ_, fused_eltwise_);
         src_ = ocl::dnn::conv_layer_spatial_oclsrc;
     }
 }
@@ -370,14 +384,37 @@ void OCL4DNNConvSpatial<Dtype>::setActivPReLU(bool fuse_activ, std::vector<float
         fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
 }

 template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::setActivPower(bool fuse_activ, float power)
+{
+    if ( fuse_activ )
+    {
+        fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_POWER;
+        power_ = power;
+    }
+    else
+        fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+}
+
+template<typename Dtype>
 bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
+                                        const UMat& bottom2,
                                         const UMat& weight,
                                         const UMat& bias,
                                         UMat& top,
                                         int32_t numImages)
 {
     num_ = numImages;
+    if (!bottom2.empty())
+    {
+        fused_eltwise_ = true;
+        bottom_data2_ = bottom2;
+    }
+    else
+    {
+        fused_eltwise_ = false;
+    }

     prepareKernel(bottom, top, weight, bias, numImages);
     if (bestKernelConfig.empty())
         return false;
```
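The new `Forward` overload uses an empty `UMat` as the "no eltwise operand" signal, which is how the call site in convolution_layer.cpp (`inputs.size() == 2 ? inputs[1] : UMat()`) toggles `fused_eltwise_`. A minimal sketch of that convention:

```cpp
#include <opencv2/core.hpp>

// Mirrors the fused_eltwise_ = !bottom2.empty() logic above.
static bool wantsEltwise(const cv::UMat& bottom2)
{
    return !bottom2.empty();
}

int main()
{
    cv::UMat none;                                   // no second operand
    cv::UMat some(1, 8, CV_32F, cv::Scalar::all(0)); // pretend eltwise operand
    return (!wantsEltwise(none) && wantsEltwise(some)) ? 0 : 1;
}
```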
```diff
@@ -428,7 +465,8 @@ void OCL4DNNConvSpatial<Dtype>::generateKey()
                << "p" << pad_w_ << "x" << pad_h_ << "_"
                << "num" << num_ << "_"
                << "M" << M_ << "_"
-               << "activ" << fused_activ_;
+               << "activ" << fused_activ_ << "_"
+               << "eltwise" << fused_eltwise_;

     key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
@@ -678,7 +716,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
         return false;

     cl_uint argIdx = 0;
-    setFusionArg(fused_activ_, kernel, argIdx);
+    setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);

     UMat img_buffer;
     if (image_offset)
@@ -771,7 +809,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
         return false;

     cl_uint argIdx = 0;
-    setFusionArg(fused_activ_, kernel, argIdx);
+    setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);

     UMat img_buffer;
     if (image_offset)
@@ -888,7 +926,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
         return false;

     cl_uint argIdx = 0;
-    setFusionArg(fused_activ_, kernel, argIdx);
+    setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
     kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
     kernel.set(argIdx++, image_offset);
     kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
@@ -1491,6 +1529,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
     if (loadCachedConfig()) // check in-memory cache
         return;
     if (loadTunedConfig()) // check external storage
         return;
```
modules/dnn/src/opencl/conv_layer_spatial.cl
```diff
@@ -52,12 +52,21 @@
 #elif defined(FUSED_CONV_PRELU)
 #define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope[c])))
 #define NEGATIVE_SLOPE_ARG __global const Dtype *negative_slope,
+#elif defined(FUSED_CONV_POWER)
+#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, power)
+#define NEGATIVE_SLOPE_ARG Dtype power,
 #else
 #define ACTIVATION_RELU_FUNCTION(x, c) (x)
 #define NEGATIVE_SLOPE_ARG
 #endif

+#ifdef FUSED_CONV_ELTWISE
+#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(eltwise_data[(_offset_)] + (_data_), _channel_);} while(0)
+#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
+#else
 #define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_data_, _channel_);} while(0)
+#define ELTWISE_DATA_ARG
+#endif

 #define __CAT(x, y) x##y
```
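To see how the pieces compose: with `FUSED_CONV_ELTWISE` and `FUSED_CONV_POWER` both defined, the store macro amounts to adding the eltwise operand to the convolution sum and then applying `pow`, all inside the convolution kernel. A plain C++ rendering of that expansion (names follow the .cl source; types simplified to float):

```cpp
#include <cmath>

// What ACTIVATION_FUNCTION(dst, off, data, c) expands to under
// FUSED_CONV_ELTWISE + FUSED_CONV_POWER, written out as a function.
inline void fusedStore(float* dst, int off, float data,
                       const float* eltwise_data, float power)
{
    dst[off] = std::pow(eltwise_data[off] + data, power);
}
```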
```diff
@@ -99,6 +108,7 @@
 #ifdef KERNEL_BASIC

 __kernel void ConvolveBasic(
+    ELTWISE_DATA_ARG
     NEGATIVE_SLOPE_ARG
     __global Dtype* image_data,
     int image_offset,
@@ -193,6 +203,7 @@ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
 #endif
 __kernel void
 convolve_simd(
+    ELTWISE_DATA_ARG
     NEGATIVE_SLOPE_ARG
     __global Dtype* inputs_base,
     filter_qualifier Dtype* weights_base,
@@ -413,6 +424,7 @@ typedef struct float0 { float s0; } float0; //never used but makes compiler happy
 #define ROW_PITCH input_width
 #define GEMM_LIKE_KERNEL_ARGS \
+    ELTWISE_DATA_ARG \
     NEGATIVE_SLOPE_ARG \
     const __global Dtype *src0, \
     const __global Dtype *src1, \
```