Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
15677d6d
Commit
15677d6d
authored
May 24, 2011
by
Vladislav Vinogradov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
minor optimization of SURF_GPU (orientation calculation, changed block size to 32x4)
parent
eaa66141
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
56 additions
and
64 deletions
+56
-64
surf.cu
modules/gpu/src/cuda/surf.cu
+56
-64
No files found.
modules/gpu/src/cuda/surf.cu
View file @
15677d6d
...
@@ -500,6 +500,20 @@ namespace cv { namespace gpu { namespace surf
...
@@ -500,6 +500,20 @@ namespace cv { namespace gpu { namespace surf
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
// Sums the values that threads with threadIdx.x in [0, 32) pass in `sum`,
// using `v_sum` as a scratch buffer indexed by threadIdx.x.
//
// v_sum - scratch buffer; every calling thread writes slot threadIdx.x, and
//         threads with threadIdx.x < 16 read slots up to threadIdx.x + 16,
//         i.e. slots 0..31. It is volatile-qualified, so each access below is
//         performed on memory instead of being kept in a register.
// sum   - in: this thread's contribution. out: modified in place only for
//         threadIdx.x < 16; threads with threadIdx.x >= 16 keep their input.
//
// NOTE(review): there is no barrier between a thread's store and the reads of
// that slot by other threads. The result in thread 0 (the total of all 32
// inputs) is only correct if the participating threads execute these
// statements in lockstep - presumably they are expected to form a single
// warp; confirm against the launch configuration and target architectures.
// Threads 1..15 finish with intermediate values, not the full total.
__device__ void reduceSum32(volatile float* v_sum, float& sum)
{
// Publish this thread's value so the lower half can pick it up.
v_sum[threadIdx.x] = sum;
if (threadIdx.x < 16)
{
// Each step adds the slot `offset` positions higher (16, 8, 4, 2, 1) and
// writes the running total back for the next step to consume.
v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 16];
v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 8];
v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 4];
v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 2];
v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 1];
}
}
__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
{
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
...
@@ -508,8 +522,7 @@ namespace cv { namespace gpu { namespace surf
...
@@ -508,8 +522,7 @@ namespace cv { namespace gpu { namespace surf
__shared__ float s_Y[128];
__shared__ float s_Y[128];
__shared__ float s_angle[128];
__shared__ float s_angle[128];
__shared__ float s_sumx[64 * 4];
__shared__ float s_sum[32 * 4];
__shared__ float s_sumy[64 * 4];
/* The sampling intervals and wavelet sizes for selecting an orientation
/* The sampling intervals and wavelet sizes for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
and building the keypoint descriptor are defined relative to 's' */
...
@@ -525,35 +538,30 @@ namespace cv { namespace gpu { namespace surf
...
@@ -525,35 +538,30 @@ namespace cv { namespace gpu { namespace surf
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
{
{
// Calc X, Y, angle and store it to shared memory
// Calc X, Y, angle and store it to shared memory
{
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
float X = 0.0f, Y = 0.0f, angle = 0.0f;
float X = 0.0f, Y = 0.0f, angle = 0.0f;
if (tid < ORI_SAMPLES)
if (tid < ORI_SAMPLES)
{
{
const float margin = (float)(grad_wav_size - 1) / 2.0f;
const float margin = (float)(grad_wav_size - 1) / 2.0f;
const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);
const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);
if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))
if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))
{
X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
angle = atan2f(Y, X);
if (angle < 0)
angle += 2.0f * CV_PI;
angle *= 180.0f / CV_PI;
}
}
if (tid < 128)
{
{
s_X[tid] = X;
X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
s_Y[tid] = Y;
Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
s_angle[tid] = angle;
angle = atan2f(Y, X);
if (angle < 0)
angle += 2.0f * CV_PI;
angle *= 180.0f / CV_PI;
}
}
}
}
s_X[tid] = X;
s_Y[tid] = Y;
s_angle[tid] = angle;
__syncthreads();
__syncthreads();
float bestx = 0, besty = 0, best_mod = 0;
float bestx = 0, besty = 0, best_mod = 0;
...
@@ -570,43 +578,29 @@ namespace cv { namespace gpu { namespace surf
...
@@ -570,43 +578,29 @@ namespace cv { namespace gpu { namespace surf
sumx = s_X[threadIdx.x];
sumx = s_X[threadIdx.x];
sumy = s_Y[threadIdx.x];
sumy = s_Y[threadIdx.x];
}
}
d = abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[threadIdx.x + 32];
sumy += s_Y[threadIdx.x + 32];
}
d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
{
sumx += s_X[threadIdx.x + 64];
sumx += s_X[threadIdx.x + 64];
sumy += s_Y[threadIdx.x + 64];
sumy += s_Y[threadIdx.x + 64];
}
}
d = abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
float* s_sumx_row = s_sumx + threadIdx.y * 64;
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
float* s_sumy_row = s_sumy + threadIdx.y * 64;
s_sumx_row[threadIdx.x] = sumx;
s_sumy_row[threadIdx.x] = sumy;
__syncthreads();
if (threadIdx.x < 32)
{
{
volatile float* v_sumx_row = s_sumx_row;
sumx += s_X[threadIdx.x + 96];
volatile float* v_sumy_row = s_sumy_row;
sumy += s_Y[threadIdx.x + 96];
}
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 32];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 32];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 16];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 16];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 8];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 8];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 4];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 4];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 2];
float* s_sum_row = s_sum + threadIdx.y * 32;
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 2];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 1];
reduceSum32(s_sum_row, sumx);
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 1];
reduceSum32(s_sum_row, sumy);
}
const float temp_mod = sumx * sumx + sumy * sumy;
const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod)
if (temp_mod > best_mod)
...
@@ -615,7 +609,6 @@ namespace cv { namespace gpu { namespace surf
...
@@ -615,7 +609,6 @@ namespace cv { namespace gpu { namespace surf
bestx = sumx;
bestx = sumx;
besty = sumy;
besty = sumy;
}
}
__syncthreads();
}
}
if (threadIdx.x == 0)
if (threadIdx.x == 0)
...
@@ -672,7 +665,7 @@ namespace cv { namespace gpu { namespace surf
...
@@ -672,7 +665,7 @@ namespace cv { namespace gpu { namespace surf
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
{
{
dim3 threads;
dim3 threads;
threads.x =
64
;
threads.x =
32
;
threads.y = 4;
threads.y = 4;
dim3 grid;
dim3 grid;
...
@@ -742,8 +735,7 @@ namespace cv { namespace gpu { namespace surf
...
@@ -742,8 +735,7 @@ namespace cv { namespace gpu { namespace surf
}
}
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25],
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25],
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
int tid)
{
{
__shared__ float s_PATCH[6][6];
__shared__ float s_PATCH[6][6];
...
@@ -778,7 +770,7 @@ namespace cv { namespace gpu { namespace surf
...
@@ -778,7 +770,7 @@ namespace cv { namespace gpu { namespace surf
if (threadIdx.x < 5 && threadIdx.y < 5)
if (threadIdx.x < 5 && threadIdx.y < 5)
{
{
tid = threadIdx.y * 5 + threadIdx.x;
const int
tid = threadIdx.y * 5 + threadIdx.x;
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
...
@@ -834,11 +826,11 @@ namespace cv { namespace gpu { namespace surf
...
@@ -834,11 +826,11 @@ namespace cv { namespace gpu { namespace surf
__shared__ float sdxabs[25];
__shared__ float sdxabs[25];
__shared__ float sdyabs[25];
__shared__ float sdyabs[25];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid);
__syncthreads();
__syncthreads();
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
sdyabs[tid] = fabs(sdy[tid]); // |dy| array
sdyabs[tid] = fabs(sdy[tid]); // |dy| array
__syncthreads();
__syncthreads();
...
@@ -870,11 +862,11 @@ namespace cv { namespace gpu { namespace surf
...
@@ -870,11 +862,11 @@ namespace cv { namespace gpu { namespace surf
__shared__ float sdabs1[25];
__shared__ float sdabs1[25];
__shared__ float sdabs2[25];
__shared__ float sdabs2[25];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid);
__syncthreads();
__syncthreads();
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (sdy[tid] >= 0)
if (sdy[tid] >= 0)
{
{
sd1[tid] = sdx[tid];
sd1[tid] = sdx[tid];
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment