Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
fcff1260
Commit
fcff1260
authored
Mar 16, 2011
by
Vladislav Vinogradov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fixed some warnings in surf.cu
parent
6c2cdb67
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
66 additions
and
85 deletions
+66
-85
surf.cu
modules/gpu/src/cuda/surf.cu
+66
-85
No files found.
modules/gpu/src/cuda/surf.cu
View file @
fcff1260
...
...
@@ -47,6 +47,7 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
...
...
@@ -85,14 +86,14 @@ namespace cv { namespace gpu { namespace surf
texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
typedef double real_t;
#else
typedef float real_t;
#endif
float ratio = (float)newSize / oldSize;
float ratio = (float)newSize / oldSize;
real_t d = 0;
...
...
@@ -202,30 +203,24 @@ namespace cv { namespace gpu { namespace surf
{
static __device__ bool check(int sum_i, int sum_j, int size)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
typedef double real_t;
#else
typedef float real_t;
#endif
float ratio = (float)size / 9.0f;
real_t d = 0;
int dx1 = __float2int_rn(ratio * c_DM[0]);
int dy1 = __float2int_rn(ratio * c_DM[1]);
int dx2 = __float2int_rn(ratio * c_DM[2]);
int dy2 = __float2int_rn(ratio * c_DM[3]);
real_t t = 0;
t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
return (d >= 0.5);
float ratio = (float)size / 9.0f;
float d = 0;
int dx1 = __float2int_rn(ratio * c_DM[0]);
int dy1 = __float2int_rn(ratio * c_DM[1]);
int dx2 = __float2int_rn(ratio * c_DM[2]);
int dy2 = __float2int_rn(ratio * c_DM[3]);
float t = 0;
t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
return (d >= 0.5f);
}
};
...
...
@@ -727,27 +722,16 @@ namespace cv { namespace gpu { namespace surf
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
};
__device__
void calcPATCH(float s_PATCH[6][6], float s_pt[5], int i1, int j1, int i2, int j2
)
__device__
unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir
)
{
const float centerX = s_pt[SF_X];
const float centerY = s_pt[SF_Y];
const float size = s_pt[SF_SIZE];
const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180);
/* The sampling intervals and wavelet sized for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = size * 1.2f / 9.0f;
/* Extract a window of pixels around the keypoint of size 20s */
const int win_size = (int)((PATCH_SZ + 1) * s);
float sin_dir;
float cos_dir;
sincosf(descriptor_dir, &sin_dir, &cos_dir);
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
/* Nearest neighbour version (faster) */
const float win_offset = -(float)(win_size - 1) / 2;
return tex2D(imgTex, pixel_x, pixel_y);
}
__device__ unsigned char calcPATCH(int i1, int j1, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir, int win_size)
{
/* Scale the window to size PATCH_SZ so each pixel's size is s. This
makes calculating the gradients with wavelets of size 2s easy */
const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size;
...
...
@@ -756,38 +740,42 @@ namespace cv { namespace gpu { namespace surf
const int i = __float2int_rd(icoo);
const int j = __float2int_rd(jcoo);
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
float res = calcWin(i, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (j + 1 - jcoo);
res += calcWin(i + 1, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (j + 1 - jcoo);
res += calcWin(i + 1, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (jcoo - j);
res += calcWin(i, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (jcoo - j);
float res = tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (j + 1 - jcoo);
return saturate_cast<unsigned char>(res);
}
pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i + 1) * sin_dir;
pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i + 1) * cos_dir;
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid)
{
__shared__ float s_PATCH[6][6];
res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (j + 1 - jcoo);
// get the interest point parameters (x, y, size, response, angle)
__shared__ float s_pt[5];
if (threadIdx.y == 0)
s_pt[threadIdx.x] = ((float*)(&keypoints[blockIdx.x]))[threadIdx.x];
__syncthreads();
pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i + 1) * sin_dir;
pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i + 1) * cos_dir;
const float centerX = s_pt[SF_X];
const float centerY = s_pt[SF_Y];
const float size = s_pt[SF_SIZE];
const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180);
res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (jcoo - j);
pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i) * sin_dir;
pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i) * cos_dir;
/* The sampling intervals and wavelet sized for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = size * 1.2f / 9.0f;
res += tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (jcoo - j);
/* Extract a window of pixels around the keypoint of size 20s */
const int win_size = (int)((PATCH_SZ + 1) * s);
s_PATCH[i2][j2] = (unsigned char)res;
}
float sin_dir;
float cos_dir;
sincosf(descriptor_dir, &sin_dir, &cos_dir);
__device__ void calc_dx_dy(float s_PATCH[6][6], float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid)
{
// get the interest point parameters (x, y, size, response, angle)
__shared__ float s_pt[5];
if (tid < 5)
{
s_pt[tid] = ((float*)(&keypoints[blockIdx.x]))[tid];
}
__syncthreads();
/* Nearest neighbour version (faster) */
const float win_offset = -(float)(win_size - 1) / 2;
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
...
...
@@ -796,13 +784,13 @@ namespace cv { namespace gpu { namespace surf
const int xIndex = xBlock * blockDim.x + threadIdx.x;
const int yIndex = yBlock * blockDim.y + threadIdx.y;
calcPATCH(s_PATCH, s_pt, yIndex, xIndex, threadIdx.y, threadIdx.x
);
s_PATCH[threadIdx.y][threadIdx.x] = calcPATCH(yIndex, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size
);
if (threadIdx.x == 0)
calcPATCH(s_PATCH, s_pt, yIndex, xBlock * blockDim.x + 5, threadIdx.y, 5
);
s_PATCH[threadIdx.y][5] = calcPATCH(yIndex, xBlock * blockDim.x + 5, centerX, centerY, win_offset, cos_dir, sin_dir, win_size
);
if (threadIdx.y == 0)
calcPATCH(s_PATCH, s_pt, yBlock * blockDim.y + 5, xIndex, 5, threadIdx.x
);
s_PATCH[5][threadIdx.x] = calcPATCH(yBlock * blockDim.y + 5, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size
);
if (threadIdx.x == 0 && threadIdx.y == 0)
calcPATCH(s_PATCH, s_pt, xBlock * blockDim.x + 5, yBlock * blockDim.y + 5, 5, 5
);
s_PATCH[5][5] = calcPATCH(yBlock * blockDim.y + 5, xBlock * blockDim.x + 5, centerX, centerY, win_offset, cos_dir, sin_dir, win_size
);
__syncthreads();
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
...
...
@@ -814,8 +802,7 @@ namespace cv { namespace gpu { namespace surf
s_dy_bin[tid] = vy;
}
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2,
volatile float* sdata3, volatile float* sdata4, int tid)
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)
{
// first step is to reduce from 25 to 16
if (tid < 9) // use 9 threads
...
...
@@ -851,12 +838,9 @@ namespace cv { namespace gpu { namespace surf
}
}
// Spawn 16 blocks per interest point
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)
__global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float s_PATCH[6][6];
__shared__ float sdx[25];
__shared__ float sdy[25];
__shared__ float sdxabs[25];
...
...
@@ -864,7 +848,7 @@ namespace cv { namespace gpu { namespace surf
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy(s
_PATCH, s
dx, sdy, features, tid);
calc_dx_dy(sdx, sdy, features, tid);
__syncthreads();
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
...
...
@@ -886,12 +870,9 @@ namespace cv { namespace gpu { namespace surf
}
}
// Spawn 16 blocks per interest point
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
__global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float s_PATCH[6][6];
__shared__ float sdx[25];
__shared__ float sdy[25];
...
...
@@ -903,7 +884,7 @@ namespace cv { namespace gpu { namespace surf
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy(s
_PATCH, s
dx, sdy, features, tid);
calc_dx_dy(sdx, sdy, features, tid);
__syncthreads();
if (sdy[tid] >= 0)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment