Commit 181b448c
authored Dec 22, 2017 by Li Peng
add one more convolution kernel tuning candidate
Signed-off-by: Li Peng <peng.li@intel.com>

parent 1bc1f3d3
Showing 2 changed files with 236 additions and 11 deletions
modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp (+1 -0)
modules/dnn/src/opencl/conv_layer_spatial.cl (+235 -11)

modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
...
...
@@ -1432,6 +1432,7 @@ void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerPar
generate_gemmlike_tuneritems(tunerItems, 1, 8, 32);
generate_gemmlike_tuneritems(tunerItems, 2, 8, 32);
generate_gemmlike_tuneritems(tunerItems, 1, 16, 32);
generate_gemmlike_tuneritems(tunerItems, 2, 16, 32);
// idlf kernel
for (int simd_size = 8; simd_size <= 16; simd_size += 8)
...
...
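For context: each generate_gemmlike_tuneritems call above registers one GEMM-like tiling candidate, identified by the three integer parameters passed after tunerItems, which the ocl4dnn autotuner later compiles, times, and ranks for the convolution shape at hand; this commit adds the (2, 16, 32) variant. A minimal host-side sketch of such a candidate list follows; the struct and function names are hypothetical illustrations, and only the parameter triples come from the diff itself.

#include <vector>

// Hypothetical stand-in for a GEMM-like tuning candidate: a work-item tile of
// blockM output locations, accumulated blockK elements at a time over blockN filters.
struct GemmLikeCandidate {
    int blockM;
    int blockK;
    int blockN;
};

// Sketch mirroring the candidate registrations in the hunk above,
// including the newly added (2, 16, 32) entry.
static std::vector<GemmLikeCandidate> enumerateGemmLikeCandidates()
{
    return {
        {1, 8, 32},
        {2, 8, 32},
        {1, 16, 32},
        {2, 16, 32},  // candidate introduced by this commit
    };
}

At tuning time each candidate would typically be compiled and benchmarked, with the fastest configuration cached for reuse on that layer shape.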
modules/dnn/src/opencl/conv_layer_spatial.cl
...
...
@@ -384,7 +384,6 @@ convolve_simd(
#elif defined KERNEL_GEMM_LIKE
#if APPLY_BIAS
// Dtype bias[4];
#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i)
#else
#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0)
...
...
@@ -446,9 +445,7 @@ typedef struct float0 { float s0; } float0; //never used but makes compiler happ
#define TILE_K KERNEL_WIDTH
#define TILE_N 32
#ifndef __BEIGNET__
__attribute__((intel_reqd_sub_group_size(8)))
#endif
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
const int group_x = get_group_id(0);
...
...
@@ -608,6 +605,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
Dtype4 *bias_vec;
bias_vec = (Dtype4*)bias;
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
if (group_x > 0xFFFFFFFEul) {
dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
}
#else
const Dtype bias[4] = {0, 0, 0, 0};
#endif
if (global_y * TILE_M < output_width * output_height )
{
...
...
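A note on the new guard added after each bias block read in this file: a work-group id cannot realistically exceed 0xFFFFFFFE, so the `if (group_x > 0xFFFFFFFEul)` branch is effectively never taken; it appears to exist only to create a visible use of the bias values, presumably as a workaround so the device compiler does not drop or mis-optimize the sub-group block read. A tiny C++ sketch of that never-taken-branch idiom follows; the names are invented and this reading of the intent is an assumption, not something stated in the commit.

#include <cstdint>

// Assumed idiom: a branch whose condition cannot hold for valid inputs,
// kept only so the compiler treats 'bias' as observed and preserves the load.
void read_bias_with_artificial_use(const float* biases_base, uint64_t group_x, float* dst)
{
    float bias[2] = { biases_base[0], biases_base[1] };  // the load we want to keep

    if (group_x > 0xFFFFFFFEull) {   // never true for a realistic group id
        dst[0] = bias[0] + bias[1];  // artificial use of the loaded values
    }
}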
@@ -768,6 +770,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
Dtype4 *bias_vec;
bias_vec = (Dtype4*)bias;
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
if (group_x > 0xFFFFFFFEul) {
dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
}
#else
const Dtype bias[4] = {0, 0, 0, 0};
#endif
if (global_y * TILE_M < output_width * output_height )
...
...
@@ -813,9 +820,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
#define TILE_K KERNEL_WIDTH
#define TILE_N 32
#ifndef __BEIGNET__
__attribute__((intel_reqd_sub_group_size(8)))
#endif
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
const int group_x = get_group_id(0);
...
...
@@ -1012,6 +1017,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
Dtype4 *bias_vec;
bias_vec = (Dtype4*)bias;
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
if (group_x > 0xFFFFFFFEul) {
dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
}
#else
const Dtype bias[4] = {0, 0, 0, 0};
#endif
if( global_y * TILE_M < output_width * output_height )
...
...
@@ -1221,6 +1231,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
Dtype4 *bias_vec;
bias_vec = (Dtype4*)bias;
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
if (group_x > 0xFFFFFFFEul) {
dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
}
#else
const Dtype bias[4] = {0, 0, 0, 0};
#endif
if( global_y * TILE_M < output_width * output_height )
{
...
...
@@ -1334,9 +1349,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
#define TILE_K KERNEL_WIDTH
#define TILE_N 32
#ifndef __BEIGNET__
__attribute__((intel_reqd_sub_group_size(16)))
#endif
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
const int group_x = get_group_id(0);
...
...
@@ -1396,18 +1409,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
// and KERNEL_WIDTH/2 rows of interleaved filter.
int patch_depth = 0;
#ifndef __BEIGNET__
__attribute__((opencl_unroll_hint(1)))
#endif
do
{
int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
curr_y = saved_y;
#endif
#ifndef __BEIGNET__
__attribute__((opencl_unroll_hint(1)))
#endif
do
{
// Load atile and btile.
...
...
@@ -1495,11 +1504,226 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
Dtype2 *bias_vec;
bias_vec = (Dtype2*)bias;
*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
if (group_x > 0xFFFFFFFEul) {
dst[0] = bias[0] + bias[1];
}
#else
const Dtype bias[2] = {0, 0};
#endif
INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
}
#endif
#ifdef GEMM_LIKE_CONV_32_2_SIMD16
//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_2_SIMD16
//
// Convolution: each workitem computes 1 patch x 32 filters worth of output
// data.
#define TILE_M 2
#define TILE_K KERNEL_WIDTH
#define TILE_N 32
__attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int global_x = get_global_id(0);
const int global_y = get_global_id(1);
const int global_z = get_global_id(2);
int interleaved_y;
int kernel_y;
int kernel_idx;
#define DOT_PRODUCT_16( _result, _rowA, colB ) \
{ \
_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
_result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \
_result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \
_result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \
_result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \
_result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \
_result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \
_result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \
_result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \
}
typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
// True for all threads if filter_width is multiple of TILE_N
// else, true for all but right-most column of threads.
{
// Result ctile (*dst) is M rows x N columns
// LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
Dtype16 blockC00 = 0.f;
Dtype16 blockC10 = 0.f;
Dtype16 blockC01 = 0.f;
Dtype16 blockC11 = 0.f;
// Src0 (patch input) is directly used as atile.
// Each work item points to the start of a different patch.
// atile is M rows x K columns.
int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
int saved_y0 = curr_y0;
int saved_y1 = curr_y1;
#endif
const __global Dtype *src0_read0 = src0
+ aligned_input_size * global_z // batch offset
+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset
+ curr_x0 - INPUT_PAD_W; // x offset
const __global Dtype *src0_read1 = src0
+ aligned_input_size * global_z // batch offset
+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset
+ curr_x1 - INPUT_PAD_W; // x offset
// Src1 (filter) is directly used as btile.
// It starts at the top of src1 and walks down.
// btile is K rows x N columns.
const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);
// Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
// and KERNEL_WIDTH/2 rows of interleaved filter.
int patch_depth = 0;
do
{
int patch_row = 0;
do
{
// Load atile and btile.
// Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity.
// The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non
// interleaved row is padded with zero to ensure same size as interleaved rows. This
// interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the
// kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
// (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
// (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ...
// (0, 2) (8, 2) (16, 2) (24, 2) ... ...
// ...
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
Dtype* pblockA00 = (Dtype*)(&blockA00);
Dtype* pblockA01 = (Dtype*)(&blockA01);
#else
Dtype_t blockA00;
Dtype* pblockA00 = (Dtype*)(&blockA00);
int pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read0[pos * DILATION_X];
else
pblockA00[pos] = 0;
})
curr_y0 += DILATION_Y;
Dtype_t blockA01;
Dtype* pblockA01 = (Dtype*)(&blockA01);
pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA01[pos] = src0_read1[pos * DILATION_X];
else
pblockA01[pos] = 0;
})
curr_y1 += DILATION_Y;
src0_read0 += (ROW_PITCH * DILATION_Y);
src0_read1 += (ROW_PITCH * DILATION_Y);
#endif
Dtype blockB00[KERNEL_WIDTH*2];
Dtype4* p4BlockB00 = (Dtype4*)blockB00;
Dtype2* p2BlockB00 = (Dtype2*)blockB00;
Dtype* pBlockB00 = (Dtype* )blockB00;
interleaved_y = 0;
LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
{
p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
src1_read += WIDTH1 * 2;
} )
if ( kernel_width_is_odd )
{
p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
src1_read += WIDTH1 * 2;
}
// Perform MADs
kernel_idx = 0;
interleaved_y = 0;
LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
{
kernel_y = interleaved_y * 2;
DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] );
DOT_PRODUCT_16( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] );
DOT_PRODUCT_16( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
} )
if ( kernel_width_is_odd )
{
kernel_y = interleaved_y * 2;
DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
}
}
//while( ++patch_row < 1 ); //debug
while( ++patch_row < KERNEL_HEIGHT );
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
curr_y0 = saved_y0;
curr_y1 = saved_y1;
#endif
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
}
//while ( ++patch_depth < 1 );  //debug
while ( ++patch_depth < INPUT_DEPTH );
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
int out0_offset = global_z * out_pitch_z                                       // batch offset
                + ( group_x * TILE_N ) * out_pitch_y                           // channel offset
                + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X  // y offset
                + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                 // x offset
int out1_offset = global_z * out_pitch_z                                       // batch offset
                + ( group_x * TILE_N ) * out_pitch_y                           // channel offset
                + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X  // y offset
                + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                 // x offset
#if APPLY_BIAS
Dtype bias[2];
Dtype2 *bias_vec;
bias_vec = (Dtype2*)bias;
*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
if (group_x > 0xFFFFFFFEul) {
dst[0] = bias[0] + bias[1];
}
#else
const Dtype bias[2] = {0, 0};
#endif
INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0);
INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1);
}
}
#endif
#elif defined KERNEL_DWCONV
__kernel void DWCONV(
...
...
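To make the DOT_PRODUCT_16 macro in the newly added SIMD16 kernel easier to follow: each invocation takes one input value per lane (_rowA) and one filter value per lane (colB), broadcasts colB from each of the 16 sub-group lanes in turn, and multiply-accumulates into the matching component of a 16-wide accumulator (blockC00 and friends). A scalar C++ model of a single step, with the sub-group replaced by a plain 16-element array, is sketched below for illustration only.

#include <array>

constexpr int kSimd = 16;

// Scalar model of one DOT_PRODUCT_16 step for a single lane.
// rowA : this lane's input value (_rowA in the macro)
// colB : the value each of the 16 lanes holds, i.e. what
//        sub_group_broadcast(colB, i) would return for i = 0..15
// acc  : this lane's 16-wide accumulator (one of blockC00/01/10/11)
void dot_product_16(float rowA,
                    const std::array<float, kSimd>& colB,
                    std::array<float, kSimd>& acc)
{
    for (int i = 0; i < kSimd; ++i)
        acc[i] += rowA * colB[i];  // mad(rowA, sub_group_broadcast(colB, i), acc[i])
}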