Commit 15677d6d authored by Vladislav Vinogradov

minor optimization of SURF_GPU (orientation calculation, changed block size to 32x4)

parent eaa66141
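For context, here is a minimal, self-contained sketch (not part of this commit) of the warp-synchronous reduction pattern that the new reduceSum32() relies on, together with the 32x4 launch shape introduced below. Only reduceSum32() mirrors the committed function; the kernel name rowSumKernel, its buffers, and the host driver are illustrative assumptions, not OpenCV code.

// Sketch only: warp-synchronous 32-wide sum reduction per block row.
#include <cstdio>
#include <cuda_runtime.h>

// Same shape as the reduceSum32() added in this commit: the 32 threads of one
// warp fold a 32-element shared-memory row (relying on implicit warp-synchronous
// execution, as the committed code does); afterwards lane 0 holds the full sum
// in both `sum` and v_sum[0].
__device__ void reduceSum32(volatile float* v_sum, float& sum)
{
    v_sum[threadIdx.x] = sum;

    if (threadIdx.x < 16)
    {
        v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 16];
        v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 8];
        v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 4];
        v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 2];
        v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 1];
    }
}

// Illustrative kernel (not in OpenCV): a 32x4 block where each row of 32
// threads is one warp and reduces its own 32 input values, mirroring how
// icvCalcOrientation gives every row its own s_sum slice.
__global__ void rowSumKernel(const float* in, float* out)
{
    __shared__ float s_sum[32 * 4];

    float val = in[threadIdx.y * 32 + threadIdx.x];
    float* s_sum_row = s_sum + threadIdx.y * 32;

    reduceSum32(s_sum_row, val);

    if (threadIdx.x == 0)
        out[threadIdx.y] = val;   // lane 0 of each row holds the row total
}

int main()
{
    float h_in[32 * 4], h_out[4];
    for (int i = 0; i < 32 * 4; ++i)
        h_in[i] = 1.0f;                     // every row should sum to 32

    float *d_in, *d_out;
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

    rowSumKernel<<<1, dim3(32, 4)>>>(d_in, d_out);  // the new 32x4 block shape

    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 4; ++i)
        printf("row %d sum = %f\n", i, h_out[i]);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}

The point of the 32-wide rows is that each row of the block is a single warp, so the shared-memory reduction can run without __syncthreads(); that is what lets the orientation kernel drop the explicit 64-wide tree reduction and the extra synchronization in its loop. This implicit warp synchronization is only safe on the pre-Volta architectures this code targets; newer GPUs would need __syncwarp() or warp shuffles.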
@@ -500,6 +500,20 @@ namespace cv { namespace gpu { namespace surf
     __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
     __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
 
+    __device__ void reduceSum32(volatile float* v_sum, float& sum)
+    {
+        v_sum[threadIdx.x] = sum;
+
+        if (threadIdx.x < 16)
+        {
+            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 16];
+            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 8];
+            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 4];
+            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 2];
+            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 1];
+        }
+    }
+
     __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
     {
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
@@ -508,8 +522,7 @@ namespace cv { namespace gpu { namespace surf
         __shared__ float s_Y[128];
         __shared__ float s_angle[128];
 
-        __shared__ float s_sumx[64 * 4];
-        __shared__ float s_sumy[64 * 4];
+        __shared__ float s_sum[32 * 4];
 
         /* The sampling intervals and wavelet sized for selecting an orientation
            and building the keypoint descriptor are defined relative to 's' */
@@ -525,7 +538,6 @@ namespace cv { namespace gpu { namespace surf
         if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
         {
             // Calc X, Y, angle and store it to shared memory
-            {
             const int tid = threadIdx.y * blockDim.x + threadIdx.x;
 
             float X = 0.0f, Y = 0.0f, angle = 0.0f;
@@ -547,13 +559,9 @@ namespace cv { namespace gpu { namespace surf
                     angle *= 180.0f / CV_PI;
                 }
             }
 
-            if (tid < 128)
-            {
             s_X[tid] = X;
             s_Y[tid] = Y;
             s_angle[tid] = angle;
-            }
-            }
 
             __syncthreads();
 
             float bestx = 0, besty = 0, best_mod = 0;
@@ -570,43 +578,29 @@ namespace cv { namespace gpu { namespace surf
                     sumx = s_X[threadIdx.x];
                     sumy = s_Y[threadIdx.x];
                 }
+
+                d = abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                {
+                    sumx += s_X[threadIdx.x + 32];
+                    sumy += s_Y[threadIdx.x + 32];
+                }
 
                 d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
                 if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
                 {
                     sumx += s_X[threadIdx.x + 64];
                     sumy += s_Y[threadIdx.x + 64];
                 }
 
-                float* s_sumx_row = s_sumx + threadIdx.y * 64;
-                float* s_sumy_row = s_sumy + threadIdx.y * 64;
-
-                s_sumx_row[threadIdx.x] = sumx;
-                s_sumy_row[threadIdx.x] = sumy;
-
-                __syncthreads();
-
-                if (threadIdx.x < 32)
-                {
-                    volatile float* v_sumx_row = s_sumx_row;
-                    volatile float* v_sumy_row = s_sumy_row;
-
-                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 32];
-                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 32];
-                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 16];
-                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 16];
-                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 8];
-                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 8];
-                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 4];
-                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 4];
-                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 2];
-                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 2];
-                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 1];
-                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 1];
-                }
+                d = abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                {
+                    sumx += s_X[threadIdx.x + 96];
+                    sumy += s_Y[threadIdx.x + 96];
+                }
+
+                float* s_sum_row = s_sum + threadIdx.y * 32;
+
+                reduceSum32(s_sum_row, sumx);
+                reduceSum32(s_sum_row, sumy);
 
                 const float temp_mod = sumx * sumx + sumy * sumy;
                 if (temp_mod > best_mod)
@@ -615,7 +609,6 @@ namespace cv { namespace gpu { namespace surf
                     bestx = sumx;
                     besty = sumy;
                 }
-                __syncthreads();
             }
 
             if (threadIdx.x == 0)
@@ -672,7 +665,7 @@ namespace cv { namespace gpu { namespace surf
     void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
     {
         dim3 threads;
-        threads.x = 64;
+        threads.x = 32;
         threads.y = 4;
 
         dim3 grid;
@@ -742,8 +735,7 @@ namespace cv { namespace gpu { namespace surf
     }
 
     __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25],
-                               const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
-                               int tid)
+                               const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
     {
         __shared__ float s_PATCH[6][6];
@@ -778,7 +770,7 @@ namespace cv { namespace gpu { namespace surf
         if (threadIdx.x < 5 && threadIdx.y < 5)
         {
-            tid = threadIdx.y * 5 + threadIdx.x;
+            const int tid = threadIdx.y * 5 + threadIdx.x;
 
             const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
@@ -834,11 +826,11 @@ namespace cv { namespace gpu { namespace surf
         __shared__ float sdxabs[25];
         __shared__ float sdyabs[25];
 
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid);
+        calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
         __syncthreads();
 
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
         sdxabs[tid] = fabs(sdx[tid]); // |dx| array
         sdyabs[tid] = fabs(sdy[tid]); // |dy| array
         __syncthreads();
@@ -870,11 +862,11 @@ namespace cv { namespace gpu { namespace surf
         __shared__ float sdabs1[25];
        __shared__ float sdabs2[25];
 
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid);
+        calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
         __syncthreads();
 
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
         if (sdy[tid] >= 0)
         {
             sd1[tid] = sdx[tid];