Commit 32a2fde8 authored by Vladislav Vinogradov

temporarily disabled the compute-descriptor kernels for new cards (some problems with thread synchronization); the old version of the kernels is used instead.
parent 5b3d786e
@@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace surf
__constant__ float c_dxy_scale;
// The scale associated with the first interval of the first octave
__constant__ float c_initialScale;
//! The interest operator threshold
// The interest operator threshold
__constant__ float c_threshold;
// The octave
@@ -685,7 +685,6 @@ namespace cv { namespace gpu { namespace surf
// - SURF says to only use a circle, but the branching logic would slow it down
// - Gaussian weighting should reduce the effects of the outer points anyway
if (tid2 < 169)
{
dx -= texLookups[threadIdx.x ][threadIdx.y ];
dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ];
@@ -835,24 +834,12 @@ namespace cv { namespace gpu { namespace surf
descriptor_base[threadIdx.x] = lookup / len;
}
__device__ void calc_dx_dy(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features)
__device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const float* ipt,
int xIndex, int yIndex, int tid)
{
// get the interest point parameters (x, y, size, response, angle)
__shared__ float ipt[5];
if (threadIdx.x < 5 && threadIdx.y == 0 && threadIdx.z == 0)
{
ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x];
}
__syncthreads();
float sin_theta, cos_theta;
sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
const int xIndex = threadIdx.y * 5 + threadIdx.x % 5;
const int yIndex = threadIdx.z * 5 + threadIdx.x / 5;
// Compute rotated sampling points
// (clockwise rotation since we are rotating the lattice)
// (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)
@@ -865,7 +852,6 @@ namespace cv { namespace gpu { namespace surf
// a b c
// d f
// g h i
const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]);
const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
@@ -883,53 +869,64 @@ namespace cv { namespace gpu { namespace surf
// rotate responses (store all dxs then all dys)
// - counterclockwise rotation to rotate back to zero orientation
sdx[threadIdx.z][threadIdx.y][threadIdx.x] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx
sdy[threadIdx.z][threadIdx.y][threadIdx.x] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy
sdx_bin[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx
sdy_bin[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy
}
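For reference, the two rotations in calc_dx_dy are inverses of one another: the lattice offsets are rotated clockwise into image space, and the Haar responses are rotated counterclockwise back to the keypoint's canonical orientation. In the notation of the code (a gloss on the kernel above, with u = xIndex - 9.5, v = yIndex - 9.5, s = ipt[SF_SIZE], and theta = ipt[SF_ANGLE] converted to radians):

R(\theta) = \begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix}, \qquad
\begin{pmatrix} \text{sample\_x} \\ \text{sample\_y} \end{pmatrix} = \begin{pmatrix} x \\ y \end{pmatrix} + s\, R(-\theta) \begin{pmatrix} u \\ v \end{pmatrix}, \qquad
\begin{pmatrix} dx_{\text{rot}} \\ dy_{\text{rot}} \end{pmatrix} = R(\theta) \begin{pmatrix} dx \\ dy \end{pmatrix}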
__device__ void reduce_sum(float sdata1[4][4][25], float sdata2[4][4][25], float sdata3[4][4][25],
float sdata4[4][4][25])
__device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const KeyPoint_GPU* features)//(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features)
{
// first step is to reduce from 25 to 16
if (threadIdx.x < 9) // use 9 threads
// get the interest point parameters (x, y, size, response, angle)
__shared__ float ipt[5];
if (threadIdx.x < 5 && threadIdx.y == 0)
{
sdata1[threadIdx.z][threadIdx.y][threadIdx.x] += sdata1[threadIdx.z][threadIdx.y][threadIdx.x + 16];
sdata2[threadIdx.z][threadIdx.y][threadIdx.x] += sdata2[threadIdx.z][threadIdx.y][threadIdx.x + 16];
sdata3[threadIdx.z][threadIdx.y][threadIdx.x] += sdata3[threadIdx.z][threadIdx.y][threadIdx.x + 16];
sdata4[threadIdx.z][threadIdx.y][threadIdx.x] += sdata4[threadIdx.z][threadIdx.y][threadIdx.x + 16];
ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x];
}
__syncthreads();
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
if (threadIdx.x < 16)
{
volatile float* smem = sdata1[threadIdx.z][threadIdx.y];
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
const int xBlock = (threadIdx.y & 3); // threadIdx.y % 4
const int yBlock = (threadIdx.y >> 2); // floor(threadIdx.y / 4)
const int xIndex = (xBlock * 5) + (threadIdx.x % 5);
const int yIndex = (yBlock * 5) + (threadIdx.x / 5);
smem = sdata2[threadIdx.z][threadIdx.y];
calc_dx_dy(sdx_bin, sdy_bin, ipt, xIndex, yIndex, threadIdx.x);
}
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2,
volatile float* sdata3, volatile float* sdata4, int tid)
{
// first step is to reduce from 25 to 16
if (tid < 9) // use 9 threads
{
sdata1[tid] += sdata1[tid + 16];
sdata2[tid] += sdata2[tid + 16];
sdata3[tid] += sdata3[tid + 16];
sdata4[tid] += sdata4[tid + 16];
}
smem = sdata3[threadIdx.z][threadIdx.y];
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
if (tid < 16)
{
sdata1[tid] += sdata1[tid + 8];
sdata1[tid] += sdata1[tid + 4];
sdata1[tid] += sdata1[tid + 2];
sdata1[tid] += sdata1[tid + 1];
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
sdata2[tid] += sdata2[tid + 8];
sdata2[tid] += sdata2[tid + 4];
sdata2[tid] += sdata2[tid + 2];
sdata2[tid] += sdata2[tid + 1];
smem = sdata4[threadIdx.z][threadIdx.y];
sdata3[tid] += sdata3[tid + 8];
sdata3[tid] += sdata3[tid + 4];
sdata3[tid] += sdata3[tid + 2];
sdata3[tid] += sdata3[tid + 1];
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
sdata4[tid] += sdata4[tid + 8];
sdata4[tid] += sdata4[tid + 4];
sdata4[tid] += sdata4[tid + 2];
sdata4[tid] += sdata4[tid + 1];
}
}
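One plausible reading of the "problems with threads synchronization" from the commit message (an inference on my part, not stated in the diff): with the new dim3(25, 16, 1) launch, the 25 threads of a bin straddle a 32-thread warp boundary, so the unrolled 16-to-1 steps in reduce_sum25 are no longer warp-synchronous, and there is no barrier between the 25-to-16 fold and the reads of its result. A defensively synchronized sketch of the same reduction (a hypothetical variant, not code from the commit; like reduce_sum25 it must be reached by every thread of the block, since __syncthreads() is a block-wide barrier):

__device__ void reduce_sum25_sync(volatile float* s1, volatile float* s2,
                                  volatile float* s3, volatile float* s4, int tid)
{
    // fold 25 values down to 16 (elements 16..24 onto elements 0..8)
    if (tid < 9)
    {
        s1[tid] += s1[tid + 16];
        s2[tid] += s2[tid + 16];
        s3[tid] += s3[tid + 16];
        s4[tid] += s4[tid + 16];
    }
    __syncthreads(); // bins cross warp boundaries, so a full barrier is required

    // tree-reduce 16 values to 1 with a barrier after every step
    for (int d = 8; d >= 1; d >>= 1)
    {
        if (tid < d)
        {
            s1[tid] += s1[tid + d];
            s2[tid] += s2[tid + d];
            s3[tid] += s3[tid + d];
            s4[tid] += s4[tid + d];
        }
        __syncthreads();
    }
}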
@@ -938,31 +935,43 @@ namespace cv { namespace gpu { namespace surf
__global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)
{
// 2 floats (dx, dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[4][4][25];
__shared__ float sdy[4][4][25];
__shared__ float sdx [16 * 25];
__shared__ float sdy [16 * 25];
__shared__ float sdxabs[16 * 25];
__shared__ float sdyabs[16 * 25];
calc_dx_dy(sdx, sdy, features);
__syncthreads();
__shared__ float sdesc[64];
__shared__ float sdxabs[4][4][25];
__shared__ float sdyabs[4][4][25];
float* sdx_bin = sdx + (threadIdx.y * 25);
float* sdy_bin = sdy + (threadIdx.y * 25);
float* sdxabs_bin = sdxabs + (threadIdx.y * 25);
float* sdyabs_bin = sdyabs + (threadIdx.y * 25);
sdxabs[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]); // |dx| array
sdyabs[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]); // |dy| array
calc_dx_dy(sdx_bin, sdy_bin, features);
__syncthreads();
reduce_sum(sdx, sdy, sdxabs, sdyabs);
sdxabs_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]); // |dx| array
sdyabs_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); // |dy| array
__syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.z * 16 + threadIdx.y * 4;
reduce_sum25(sdx_bin, sdy_bin, sdxabs_bin, sdyabs_bin, threadIdx.x);
__syncthreads();
float* sdesc_bin = sdesc + (threadIdx.y << 2);
// write dx, dy, |dx|, |dy|
if (threadIdx.x == 0)
{
descriptors_block[0] = sdx[threadIdx.z][threadIdx.y][0];
descriptors_block[1] = sdy[threadIdx.z][threadIdx.y][0];
descriptors_block[2] = sdxabs[threadIdx.z][threadIdx.y][0];
descriptors_block[3] = sdyabs[threadIdx.z][threadIdx.y][0];
sdesc_bin[0] = sdx_bin[0];
sdesc_bin[1] = sdy_bin[0];
sdesc_bin[2] = sdxabs_bin[0];
sdesc_bin[3] = sdyabs_bin[0];
}
__syncthreads();
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < 64)
descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
}
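The block-shape change (visible in the launcher below: dim3(25, 4, 4) becomes dim3(25, 16, 1)) keeps 400 threads per feature but flattens the 4x4 grid of sub-regions into a single bin index. A sketch of the correspondence (my summary of the indexing, not text from the commit):

// old layout: bin = (threadIdx.z, threadIdx.y), 4 * 4 = 16 bins of 25 threads
//   descriptor offset = threadIdx.z * 16 + threadIdx.y * 4
// new layout: bin = threadIdx.y alone, 16 bins of 25 threads
//   shared-memory bin:   sdx_bin   = sdx   + threadIdx.y * 25
//   descriptor channels: sdesc_bin = sdesc + (threadIdx.y << 2)  // * 4
// after the reduction, the first 64 of the 400 threads copy sdesc to global
// memory in one coalesced row write: if (tid < 64) desc[tid] = sdesc[tid];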
// Spawn 16 blocks per interest point
@@ -970,74 +979,90 @@ namespace cv { namespace gpu { namespace surf
__global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[4][4][25];
__shared__ float sdy[4][4][25];
calc_dx_dy(sdx, sdy, features);
__syncthreads();
__shared__ float sdx[16 * 25];
__shared__ float sdy[16 * 25];
// sum (reduce) 5x5 area response
__shared__ float sd1[4][4][25];
__shared__ float sd2[4][4][25];
__shared__ float sdabs1[4][4][25];
__shared__ float sdabs2[4][4][25];
__shared__ float sd1[16 * 25];
__shared__ float sd2[16 * 25];
__shared__ float sdabs1[16 * 25];
__shared__ float sdabs2[16 * 25];
__shared__ float sdesc[128];
float* sdx_bin = sdx + (threadIdx.y * 25);
float* sdy_bin = sdy + (threadIdx.y * 25);
float* sd1_bin = sd1 + (threadIdx.y * 25);
float* sd2_bin = sd2 + (threadIdx.y * 25);
float* sdabs1_bin = sdabs1 + (threadIdx.y * 25);
float* sdabs2_bin = sdabs2 + (threadIdx.y * 25);
calc_dx_dy(sdx_bin, sdy_bin, features);
__syncthreads();
if (sdy[threadIdx.z][threadIdx.y][threadIdx.x] >= 0)
if (sdy_bin[threadIdx.x] >= 0)
{
sd1[threadIdx.z][threadIdx.y][threadIdx.x] = sdx[threadIdx.z][threadIdx.y][threadIdx.x];
sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]);
sd2[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sd1_bin[threadIdx.x] = sdx_bin[threadIdx.x];
sdabs1_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]);
sd2_bin[threadIdx.x] = 0;
sdabs2_bin[threadIdx.x] = 0;
}
else
{
sd1[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sd2[threadIdx.z][threadIdx.y][threadIdx.x] = sdx[threadIdx.z][threadIdx.y][threadIdx.x];
sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]);
sd1_bin[threadIdx.x] = 0;
sdabs1_bin[threadIdx.x] = 0;
sd2_bin[threadIdx.x] = sdx_bin[threadIdx.x];
sdabs2_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]);
}
__syncthreads();
reduce_sum(sd1, sd2, sdabs1, sdabs2);
reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
__syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.z * 32 + threadIdx.y * 8;
float* sdesc_bin = sdesc + (threadIdx.y << 3);
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (threadIdx.x == 0)
{
descriptors_block[0] = sd1[threadIdx.z][threadIdx.y][0];
descriptors_block[1] = sdabs1[threadIdx.z][threadIdx.y][0];
descriptors_block[2] = sd2[threadIdx.z][threadIdx.y][0];
descriptors_block[3] = sdabs2[threadIdx.z][threadIdx.y][0];
sdesc_bin[0] = sd1_bin[0];
sdesc_bin[1] = sdabs1_bin[0];
sdesc_bin[2] = sd2_bin[0];
sdesc_bin[3] = sdabs2_bin[0];
}
__syncthreads();
if (sdx[threadIdx.z][threadIdx.y][threadIdx.x] >= 0)
if (sdx_bin[threadIdx.x] >= 0)
{
sd1[threadIdx.z][threadIdx.y][threadIdx.x] = sdy[threadIdx.z][threadIdx.y][threadIdx.x];
sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]);
sd2[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sd1_bin[threadIdx.x] = sdy_bin[threadIdx.x];
sdabs1_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);
sd2_bin[threadIdx.x] = 0;
sdabs2_bin[threadIdx.x] = 0;
}
else
{
sd1[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sd2[threadIdx.z][threadIdx.y][threadIdx.x] = sdy[threadIdx.z][threadIdx.y][threadIdx.x];
sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]);
sd1_bin[threadIdx.x] = 0;
sdabs1_bin[threadIdx.x] = 0;
sd2_bin[threadIdx.x] = sdy_bin[threadIdx.x];
sdabs2_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);
}
__syncthreads();
reduce_sum(sd1, sd2, sdabs1, sdabs2);
reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
__syncthreads();
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (threadIdx.x == 0)
{
descriptors_block[4] = sd1[threadIdx.z][threadIdx.y][0];
descriptors_block[5] = sdabs1[threadIdx.z][threadIdx.y][0];
descriptors_block[6] = sd2[threadIdx.z][threadIdx.y][0];
descriptors_block[7] = sdabs2[threadIdx.z][threadIdx.y][0];
sdesc_bin[4] = sd1_bin[0];
sdesc_bin[5] = sdabs1_bin[0];
sdesc_bin[6] = sd2_bin[0];
sdesc_bin[7] = sdabs2_bin[0];
}
__syncthreads();
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < 128)
descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
}
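For contrast with the 64-dimensional kernel (my summary of the two passes above): each bin accumulates eight sums instead of four, splitting dx and |dx| by the sign of dy, and dy and |dy| by the sign of dx, hence 16 bins * 8 channels = 128 floats:

// per-bin channel layout of the 128-d descriptor (16 bins * 8 channels):
//   [0] sum dx  (dy >= 0)    [1] sum |dx| (dy >= 0)
//   [2] sum dx  (dy <  0)    [3] sum |dx| (dy <  0)
//   [4] sum dy  (dx >= 0)    [5] sum |dy| (dx >= 0)
//   [6] sum dy  (dx <  0)    [7] sum |dy| (dx <  0)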
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
@@ -1046,7 +1071,7 @@ namespace cv { namespace gpu { namespace surf
if (descriptors.cols == 64)
{
compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);
compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
@@ -1058,7 +1083,7 @@ namespace cv { namespace gpu { namespace surf
}
else
{
compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);
compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
@@ -1080,9 +1105,6 @@ namespace cv { namespace gpu { namespace surf
}
__syncthreads();
float sin_theta, cos_theta;
sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
@@ -1090,100 +1112,40 @@ namespace cv { namespace gpu { namespace surf
const int xIndex = xBlock * blockDim.x + threadIdx.x;
const int yIndex = yBlock * blockDim.y + threadIdx.y;
// Compute rotated sampling points
// (clockwise rotation since we are rotating the lattice)
// (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)
const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+ sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+ cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
// gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)
// a b c
// d f
// g h i
const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]);
const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y);
const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y);
const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]);
const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
// compute axis-aligned HaarX, HaarY
// (could group the additions together into multiplications)
const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular)
const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx
const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy
// rotate responses (store all dxs then all dys)
// - counterclockwise rotation to rotate back to zero orientation
sdx[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx
sdy[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy
}
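The nine lookups a..i and the aa_dx/aa_dy expressions are the standard integral-image box sums (a gloss on the code, not text from the commit). For an integral image I, the sum over the rectangle with corners (x_1, y_1) and (x_2, y_2) is

\mathrm{sum} = I(x_2, y_2) - I(x_1, y_2) - I(x_2, y_1) + I(x_1, y_1)

so (a - b - g + h) is the left half-box, (b - c - h + i) the right half-box, and aa_dx is right minus left; likewise (a - c - d + f) is the top half-box, (d - f - g + i) the bottom, and aa_dy is bottom minus top, each weighted by the separable Gaussian gauss.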
__device__ void reduce_sum_old(float sdata[25], int tid)
{
// first step is to reduce from 25 to 16
if (tid < 9) // use 9 threads
sdata[tid] += sdata[tid + 16];
__syncthreads();
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
if (tid < 16)
{
volatile float* smem = sdata;
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
}
calc_dx_dy(sdx, sdy, ipt, xIndex, yIndex, tid);
}
// Spawn 16 blocks per interest point
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features)
{
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25];
__shared__ float sdy[25];
__shared__ float sdxabs[25];
__shared__ float sdyabs[25];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy_old(sdx, sdy, features, tid);
__syncthreads();
__shared__ float sabs[25];
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
sdyabs[tid] = fabs(sdy[tid]); // |dy| array
__syncthreads();
sabs[tid] = fabs(sdx[tid]); // |dx| array
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
__syncthreads();
reduce_sum_old(sdx, tid);
reduce_sum_old(sdy, tid);
reduce_sum_old(sabs, tid);
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);
// write dx, dy, |dx|
// write dx, dy, |dx|, |dy|
if (tid == 0)
{
descriptors_block[0] = sdx[0];
descriptors_block[1] = sdy[0];
descriptors_block[2] = sabs[0];
}
__syncthreads();
sabs[tid] = fabs(sdy[tid]); // |dy| array
__syncthreads();
reduce_sum_old(sabs, tid);
// write |dy|
if (tid == 0)
{
descriptors_block[3] = sabs[0];
descriptors_block[2] = sdxabs[0];
descriptors_block[3] = sdyabs[0];
}
}
@@ -1191,23 +1153,21 @@ namespace cv { namespace gpu { namespace surf
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors128_old(PtrStepf descriptors, const KeyPoint_GPU* features)
{
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25];
__shared__ float sdy[25];
calc_dx_dy_old(sdx, sdy, features, tid);
__syncthreads();
// sum (reduce) 5x5 area response
__shared__ float sd1[25];
__shared__ float sd2[25];
__shared__ float sdabs1[25];
__shared__ float sdabs2[25];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy_old(sdx, sdy, features, tid);
__syncthreads();
if (sdy[tid] >= 0)
{
sd1[tid] = sdx[tid];
@@ -1224,10 +1184,10 @@ namespace cv { namespace gpu { namespace surf
}
__syncthreads();
reduce_sum_old(sd1, tid);
reduce_sum_old(sd2, tid);
reduce_sum_old(sdabs1, tid);
reduce_sum_old(sdabs2, tid);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (tid == 0)
@@ -1255,10 +1215,8 @@ namespace cv { namespace gpu { namespace surf
}
__syncthreads();
reduce_sum_old(sd1, tid);
reduce_sum_old(sd2, tid);
reduce_sum_old(sdabs1, tid);
reduce_sum_old(sdabs2, tid);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads();
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (tid == 0)
......
@@ -233,8 +233,8 @@ namespace
typedef void (*compute_descriptors_t)(const DevMem2Df& descriptors,
const KeyPoint_GPU* features, int nFeatures);
const compute_descriptors_t compute_descriptors =
DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;
const compute_descriptors_t compute_descriptors = compute_descriptors_gpu_old;
//DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;
if (keypoints.cols > 0)
{
......
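The launcher for the old kernels is not shown in this diff; a sketch of what it presumably looks like, with the grid and block shapes inferred from blockIdx.y and the 5x5 tid arithmetic in the kernels above (an assumption, not code from the commit; the follow-up normalization kernels are omitted):

void compute_descriptors_gpu_old(const DevMem2Df& descriptors,
                                 const KeyPoint_GPU* features, int nFeatures)
{
    dim3 grid(nFeatures, 16, 1); // 16 blocks (sub-regions) per interest point
    dim3 threads(5, 5, 1);       // 5x5 sample points per sub-region

    if (descriptors.cols == 64)
        compute_descriptors64_old<<<grid, threads>>>(descriptors, features);
    else
        compute_descriptors128_old<<<grid, threads>>>(descriptors, features);

    cudaSafeCall( cudaGetLastError() );
    cudaSafeCall( cudaThreadSynchronize() );
}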