Commit 4c5a8682 authored by Li Peng's avatar Li Peng

Fix gemmlike convolution input reading

use vload3 for half3 or float3 input vector reading,
also check read position to see if it exceed input width
Signed-off-by: 's avatarLi Peng <peng.li@intel.com>
parent 7fe07279
......@@ -467,7 +467,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int saved_y = curr_y;
#endif
const __global Dtype *src0_read = src0
+ aligned_input_size * global_z // batch offset
+ aligned_input_size * global_z // batch offset
+ (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset
+ (curr_x - INPUT_PAD_W); // x offset
......@@ -502,15 +502,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
#if KERNEL_WIDTH == 3
Dtype_t blockA00 = vload3(0, src0_read);
Dtype* pblockA00 = (Dtype*)(&blockA00);
#else
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
Dtype* pblockA00 = (Dtype*)(&blockA00);
#endif
#else
Dtype_t blockA00;
Dtype* pblockA00 = (Dtype*)(&blockA00);
int pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
if (curr_y >= INPUT_PAD_H &&
curr_y < input_height + INPUT_PAD_H &&
curr_x + pos * DILATION_X >= INPUT_PAD_W &&
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read[pos * DILATION_X];
else
pblockA00[pos] = 0;
......@@ -564,17 +572,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
//while( ++patch_row < 1 ); //debug
while( ++patch_row < KERNEL_HEIGHT );
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch
// reset to start of next slice of patch
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
}
//while ( ++patch_depth < 1 ); //debug
while ( ++patch_depth < INPUT_DEPTH );
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
int out_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
int out_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset
__global Dtype *out = dst + out_offset;
#if APPLY_BIAS
......@@ -621,7 +630,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int saved_y = curr_y;
#endif
const __global Dtype *src0_read = src0
+ aligned_input_size * global_z // batch offset
+ aligned_input_size * global_z // batch offset
+ (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset
+ (curr_x - INPUT_PAD_W); // x offset
......@@ -653,7 +662,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
if (curr_y >= INPUT_PAD_H &&
curr_y < input_height + INPUT_PAD_H &&
curr_x + pos * DILATION_X >= INPUT_PAD_W &&
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read[pos * DILATION_X];
else
pblockA00[pos] = 0;
......@@ -730,17 +742,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
//while( ++patch_row < 1 ); //debug
while( ++patch_row < KERNEL_HEIGHT );
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
// reset to start of next slice of patch
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
}
//while ( ++patch_depth < 1 ); //debug
while ( ++patch_depth < INPUT_DEPTH );
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
int out_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
int out_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset
__global Dtype *out = dst + out_offset;
#if APPLY_BIAS
Dtype bias[4];
......@@ -849,11 +862,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int saved_y1 = curr_y1;
#endif
const __global Dtype *src0_read0 = src0
+ aligned_input_size * global_z // batch offset
+ aligned_input_size * global_z // batch offset
+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset
+ curr_x0 - INPUT_PAD_W; // x offset
const __global Dtype *src0_read1 = src0
+ aligned_input_size * global_z // batch offset
+ aligned_input_size * global_z // batch offset
+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset
+ curr_x1 - INPUT_PAD_W; // x offset
......@@ -883,17 +896,38 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// ...
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
#if KERNEL_WIDTH == 3
Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH;
Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH;
Dtype* pblockA00 = (Dtype*)(&blockA00);
Dtype* pblockA01 = (Dtype*)(&blockA01);
#else
Dtype_t blockA00 = { (Dtype)0.f };
Dtype_t blockA01 = { (Dtype)0.f };
Dtype* pblockA00 = (Dtype*)(&blockA00);
Dtype* pblockA01 = (Dtype*)(&blockA01);
int pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_x0 + pos < input_width)
pblockA00[pos] = src0_read0[pos];
if (curr_x1 + pos < input_width)
pblockA01[pos] = src0_read1[pos];
})
src0_read0 += ROW_PITCH;
src0_read1 += ROW_PITCH;
#endif
#else
Dtype_t blockA00;
Dtype* pblockA00 = (Dtype*)(&blockA00);
int pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
if (curr_y0 >= INPUT_PAD_H &&
curr_y0 < input_height + INPUT_PAD_H &&
curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read0[pos * DILATION_X];
else
pblockA00[pos] = 0;
......@@ -904,7 +938,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
if (curr_y1 >= INPUT_PAD_H &&
curr_y1 < input_height + INPUT_PAD_H &&
curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA01[pos] = src0_read1[pos * DILATION_X];
else
pblockA01[pos] = 0;
......@@ -972,7 +1009,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
curr_y0 = saved_y0;
curr_y1 = saved_y1;
#endif
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
// reset to start of next slice of patch
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
}
//while ( ++patch_depth < 1 ); //debug
......@@ -980,14 +1018,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
int out0_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
int out0_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset
int out1_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset
int out1_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset
#if APPLY_BIAS
Dtype bias[4];
......@@ -1049,11 +1087,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int saved_y1 = curr_y1;
#endif
const __global Dtype *src0_read0 = src0
+ aligned_input_size * global_z // batch offset
+ aligned_input_size * global_z // batch offset
+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset
+ curr_x0 - INPUT_PAD_W; // x offset
const __global Dtype *src0_read1 = src0
+ aligned_input_size * global_z // batch offset
+ aligned_input_size * global_z // batch offset
+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset
+ curr_x1 - INPUT_PAD_W; // x offset
......@@ -1084,7 +1122,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
if (curr_y0 >= INPUT_PAD_H &&
curr_y0 < input_height + INPUT_PAD_H &&
curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read0[pos * DILATION_X];
else
pblockA00[pos] = 0;
......@@ -1095,7 +1136,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
if (curr_y1 >= INPUT_PAD_H &&
curr_y1 < input_height + INPUT_PAD_H &&
curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA01[pos] = src0_read1[pos * DILATION_X];
else
pblockA01[pos] = 0;
......@@ -1185,7 +1229,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
curr_y0 = saved_y0;
curr_y1 = saved_y1;
#endif
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
// reset to start of next slice of patch
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
}
//while ( ++patch_depth < 1 ); //debug
......@@ -1193,14 +1238,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
int out0_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
int out0_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset
int out1_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset
int out1_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset
__global Dtype *out1 = dst + out1_offset;
#if APPLY_BIAS
......@@ -1352,9 +1397,9 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int saved_y = curr_y;
#endif
const __global Dtype *src0_read = src0
+ aligned_input_size * global_z // batch offset
+ aligned_input_size * global_z // batch offset
+ (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset
+ curr_x - INPUT_PAD_W; // x offset
+ curr_x - INPUT_PAD_W; // x offset
const __global Dtype *src0_read_orig = src0_read;
// Src1 (filter) is directly used as btile.
......@@ -1409,15 +1454,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
#if KERNEL_WIDTH == 3
Dtype_t blockA00 = vload3(0, src0_read);
Dtype* pblockA00 = (Dtype*)(&blockA00);
#else
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
Dtype* pblockA00 = (Dtype*)(&blockA00);
#endif
#else
Dtype_t blockA00;
Dtype* pblockA00 = (Dtype*)(&blockA00);
int pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
if (curr_y >= INPUT_PAD_H &&
curr_y < input_height + INPUT_PAD_H &&
curr_x + pos * DILATION_X >= INPUT_PAD_W &&
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read[pos * DILATION_X];
else
pblockA00[pos] = 0;
......@@ -1463,17 +1516,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
//while( ++patch_row < 1 ); //debug
while( ++patch_row < KERNEL_HEIGHT );
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
// reset to start of next slice of patch
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
}
//while ( ++patch_depth < 1 ); //debug
while ( ++patch_depth < INPUT_DEPTH );
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
int out_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
int out_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset
__global Dtype *out = dst + out_offset;
#if APPLY_BIAS
......@@ -1556,11 +1610,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int saved_y1 = curr_y1;
#endif
const __global Dtype *src0_read0 = src0
+ aligned_input_size * global_z // batch offset
+ aligned_input_size * global_z // batch offset
+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset
+ curr_x0 - INPUT_PAD_W; // x offset
const __global Dtype *src0_read1 = src0
+ aligned_input_size * global_z // batch offset
+ aligned_input_size * global_z // batch offset
+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset
+ curr_x1 - INPUT_PAD_W; // x offset
......@@ -1600,7 +1654,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
if (curr_y0 >= INPUT_PAD_H &&
curr_y0 < input_height + INPUT_PAD_H &&
curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA00[pos] = src0_read0[pos * DILATION_X];
else
pblockA00[pos] = 0;
......@@ -1611,7 +1668,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
pos = 0;
LOOP(KERNEL_WIDTH, pos,
{
if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
if (curr_y1 >= INPUT_PAD_H &&
curr_y1 < input_height + INPUT_PAD_H &&
curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
pblockA01[pos] = src0_read1[pos * DILATION_X];
else
pblockA01[pos] = 0;
......@@ -1667,7 +1727,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
curr_y0 = saved_y0;
curr_y1 = saved_y1;
#endif
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch
// reset to start of next slice of patch
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
}
//while ( ++patch_depth < 1 ); //debug
......@@ -1675,14 +1736,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
int out0_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
int out0_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset
int out1_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset
int out1_offset = global_z * out_pitch_z // batch offset
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset
#if APPLY_BIAS
Dtype bias[2];
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment