Commit 4c5a8682 authored by Li Peng's avatar Li Peng

Fix gemmlike convolution input reading

use vload3 for half3 or float3 input vector reading,
also check read position to see if it exceed input width
Signed-off-by: 's avatarLi Peng <peng.li@intel.com>
parent 7fe07279
...@@ -502,15 +502,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -502,15 +502,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
#if KERNEL_WIDTH == 3
Dtype_t blockA00 = vload3(0, src0_read);
Dtype* pblockA00 = (Dtype*)(&blockA00);
#else
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
Dtype* pblockA00 = (Dtype*)(&blockA00); Dtype* pblockA00 = (Dtype*)(&blockA00);
#endif
#else #else
Dtype_t blockA00; Dtype_t blockA00;
Dtype* pblockA00 = (Dtype*)(&blockA00); Dtype* pblockA00 = (Dtype*)(&blockA00);
int pos = 0; int pos = 0;
LOOP(KERNEL_WIDTH, pos, LOOP(KERNEL_WIDTH, pos,
{ {
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) if (curr_y >= INPUT_PAD_H &&
curr_y < input_height + INPUT_PAD_H &&
curr_x + pos * DILATION_X >= INPUT_PAD_W &&
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read[pos * DILATION_X]; pblockA00[pos] = src0_read[pos * DILATION_X];
else else
pblockA00[pos] = 0; pblockA00[pos] = 0;
...@@ -564,7 +572,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -564,7 +572,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
//while( ++patch_row < 1 ); //debug //while( ++patch_row < 1 ); //debug
while( ++patch_row < KERNEL_HEIGHT ); while( ++patch_row < KERNEL_HEIGHT );
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch // reset to start of next slice of patch
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
} }
//while ( ++patch_depth < 1 ); //debug //while ( ++patch_depth < 1 ); //debug
while ( ++patch_depth < INPUT_DEPTH ); while ( ++patch_depth < INPUT_DEPTH );
...@@ -653,7 +662,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -653,7 +662,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int pos = 0; int pos = 0;
LOOP(KERNEL_WIDTH, pos, LOOP(KERNEL_WIDTH, pos,
{ {
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) if (curr_y >= INPUT_PAD_H &&
curr_y < input_height + INPUT_PAD_H &&
curr_x + pos * DILATION_X >= INPUT_PAD_W &&
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read[pos * DILATION_X]; pblockA00[pos] = src0_read[pos * DILATION_X];
else else
pblockA00[pos] = 0; pblockA00[pos] = 0;
...@@ -730,7 +742,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -730,7 +742,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
//while( ++patch_row < 1 ); //debug //while( ++patch_row < 1 ); //debug
while( ++patch_row < KERNEL_HEIGHT ); while( ++patch_row < KERNEL_HEIGHT );
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch // reset to start of next slice of patch
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
} }
//while ( ++patch_depth < 1 ); //debug //while ( ++patch_depth < 1 ); //debug
while ( ++patch_depth < INPUT_DEPTH ); while ( ++patch_depth < INPUT_DEPTH );
...@@ -883,17 +896,38 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -883,17 +896,38 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// ... // ...
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; #if KERNEL_WIDTH == 3
Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH;
Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH;
Dtype* pblockA00 = (Dtype*)(&blockA00); Dtype* pblockA00 = (Dtype*)(&blockA00);
Dtype* pblockA01 = (Dtype*)(&blockA01); Dtype* pblockA01 = (Dtype*)(&blockA01);
#else
Dtype_t blockA00 = { (Dtype)0.f };
Dtype_t blockA01 = { (Dtype)0.f };
Dtype* pblockA00 = (Dtype*)(&blockA00);
Dtype* pblockA01 = (Dtype*)(&blockA01);
int pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_x0 + pos < input_width)
pblockA00[pos] = src0_read0[pos];
if (curr_x1 + pos < input_width)
pblockA01[pos] = src0_read1[pos];
})
src0_read0 += ROW_PITCH;
src0_read1 += ROW_PITCH;
#endif
#else #else
Dtype_t blockA00; Dtype_t blockA00;
Dtype* pblockA00 = (Dtype*)(&blockA00); Dtype* pblockA00 = (Dtype*)(&blockA00);
int pos = 0; int pos = 0;
LOOP(KERNEL_WIDTH, pos, LOOP(KERNEL_WIDTH, pos,
{ {
if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) if (curr_y0 >= INPUT_PAD_H &&
curr_y0 < input_height + INPUT_PAD_H &&
curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read0[pos * DILATION_X]; pblockA00[pos] = src0_read0[pos * DILATION_X];
else else
pblockA00[pos] = 0; pblockA00[pos] = 0;
...@@ -904,7 +938,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -904,7 +938,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
pos = 0; pos = 0;
LOOP(KERNEL_WIDTH, pos, LOOP(KERNEL_WIDTH, pos,
{ {
if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) if (curr_y1 >= INPUT_PAD_H &&
curr_y1 < input_height + INPUT_PAD_H &&
curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA01[pos] = src0_read1[pos * DILATION_X]; pblockA01[pos] = src0_read1[pos * DILATION_X];
else else
pblockA01[pos] = 0; pblockA01[pos] = 0;
...@@ -972,7 +1009,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -972,7 +1009,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
curr_y0 = saved_y0; curr_y0 = saved_y0;
curr_y1 = saved_y1; curr_y1 = saved_y1;
#endif #endif
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch // reset to start of next slice of patch
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
} }
//while ( ++patch_depth < 1 ); //debug //while ( ++patch_depth < 1 ); //debug
...@@ -1084,7 +1122,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -1084,7 +1122,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int pos = 0; int pos = 0;
LOOP(KERNEL_WIDTH, pos, LOOP(KERNEL_WIDTH, pos,
{ {
if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) if (curr_y0 >= INPUT_PAD_H &&
curr_y0 < input_height + INPUT_PAD_H &&
curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read0[pos * DILATION_X]; pblockA00[pos] = src0_read0[pos * DILATION_X];
else else
pblockA00[pos] = 0; pblockA00[pos] = 0;
...@@ -1095,7 +1136,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -1095,7 +1136,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
pos = 0; pos = 0;
LOOP(KERNEL_WIDTH, pos, LOOP(KERNEL_WIDTH, pos,
{ {
if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) if (curr_y1 >= INPUT_PAD_H &&
curr_y1 < input_height + INPUT_PAD_H &&
curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA01[pos] = src0_read1[pos * DILATION_X]; pblockA01[pos] = src0_read1[pos * DILATION_X];
else else
pblockA01[pos] = 0; pblockA01[pos] = 0;
...@@ -1185,7 +1229,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -1185,7 +1229,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
curr_y0 = saved_y0; curr_y0 = saved_y0;
curr_y1 = saved_y1; curr_y1 = saved_y1;
#endif #endif
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch // reset to start of next slice of patch
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
} }
//while ( ++patch_depth < 1 ); //debug //while ( ++patch_depth < 1 ); //debug
...@@ -1409,15 +1454,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -1409,15 +1454,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
#if KERNEL_WIDTH == 3
Dtype_t blockA00 = vload3(0, src0_read);
Dtype* pblockA00 = (Dtype*)(&blockA00);
#else
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
Dtype* pblockA00 = (Dtype*)(&blockA00); Dtype* pblockA00 = (Dtype*)(&blockA00);
#endif
#else #else
Dtype_t blockA00; Dtype_t blockA00;
Dtype* pblockA00 = (Dtype*)(&blockA00); Dtype* pblockA00 = (Dtype*)(&blockA00);
int pos = 0; int pos = 0;
LOOP(KERNEL_WIDTH, pos, LOOP(KERNEL_WIDTH, pos,
{ {
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) if (curr_y >= INPUT_PAD_H &&
curr_y < input_height + INPUT_PAD_H &&
curr_x + pos * DILATION_X >= INPUT_PAD_W &&
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read[pos * DILATION_X]; pblockA00[pos] = src0_read[pos * DILATION_X];
else else
pblockA00[pos] = 0; pblockA00[pos] = 0;
...@@ -1463,7 +1516,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -1463,7 +1516,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
//while( ++patch_row < 1 ); //debug //while( ++patch_row < 1 ); //debug
while( ++patch_row < KERNEL_HEIGHT ); while( ++patch_row < KERNEL_HEIGHT );
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch // reset to start of next slice of patch
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
} }
//while ( ++patch_depth < 1 ); //debug //while ( ++patch_depth < 1 ); //debug
while ( ++patch_depth < INPUT_DEPTH ); while ( ++patch_depth < INPUT_DEPTH );
...@@ -1600,7 +1654,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -1600,7 +1654,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int pos = 0; int pos = 0;
LOOP(KERNEL_WIDTH, pos, LOOP(KERNEL_WIDTH, pos,
{ {
if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) if (curr_y0 >= INPUT_PAD_H &&
curr_y0 < input_height + INPUT_PAD_H &&
curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read0[pos * DILATION_X]; pblockA00[pos] = src0_read0[pos * DILATION_X];
else else
pblockA00[pos] = 0; pblockA00[pos] = 0;
...@@ -1611,7 +1668,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -1611,7 +1668,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
pos = 0; pos = 0;
LOOP(KERNEL_WIDTH, pos, LOOP(KERNEL_WIDTH, pos,
{ {
if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) if (curr_y1 >= INPUT_PAD_H &&
curr_y1 < input_height + INPUT_PAD_H &&
curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA01[pos] = src0_read1[pos * DILATION_X]; pblockA01[pos] = src0_read1[pos * DILATION_X];
else else
pblockA01[pos] = 0; pblockA01[pos] = 0;
...@@ -1667,7 +1727,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) ...@@ -1667,7 +1727,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
curr_y0 = saved_y0; curr_y0 = saved_y0;
curr_y1 = saved_y1; curr_y1 = saved_y1;
#endif #endif
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch // reset to start of next slice of patch
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
} }
//while ( ++patch_depth < 1 ); //debug //while ( ++patch_depth < 1 ); //debug
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment