Commit 58f69197 authored by Vladislav Vinogradov

made GPU version of SURF more consistent with CPU one

parent c067c633
......@@ -1537,83 +1537,55 @@ namespace cv
};
////////////////////////////////// SURF //////////////////////////////////////////
struct CV_EXPORTS SURFParams_GPU
{
SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), initialScale(2.f),
l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), l4(1.f/1.5f),
edgeScale(0.81f), initialStep(1), extended(true), featuresRatio(0.01f) {}
//! The interest operator threshold
float threshold;
//! The number of octaves to process
int nOctaves;
//! The number of intervals in each octave
int nIntervals;
//! The scale associated with the first interval of the first octave
float initialScale;
//! mask parameter l_1
float l1;
//! mask parameter l_2
float l2;
//! mask parameter l_3
float l3;
//! mask parameter l_4
float l4;
//! The amount to scale the edge rejection mask
float edgeScale;
//! The initial sampling step in pixels.
int initialStep;
//! True, if generate 128-len descriptors, false - 64-len descriptors
bool extended;
//! max features = featuresRatio * img.size().srea()
float featuresRatio;
};
class CV_EXPORTS SURF_GPU : public SURFParams_GPU
class CV_EXPORTS SURF_GPU : public CvSURFParams
{
public:
//! the default constructor
SURF_GPU();
//! the full constructor taking all the necessary parameters
explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4,
int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f);
//! returns the descriptor size in float's (64 or 128)
int descriptorSize() const;
//! upload host keypoints to device memory
static void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);
void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);
//! download keypoints from device to host memory
static void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);
void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);
//! download descriptors from device to host memory
static void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);
void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);
//! finds the keypoints using fast hessian detector used in SURF
//! supports CV_8UC1 images
//! keypoints will have 1 row and type CV_32FC(6)
//! keypoints.at<float[6]>(0, i) contains i'th keypoint
//! format: (x, y, size, response, angle, octave)
//! format: (x, y, laplacian, size, dir, hessian)
void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);
//! finds the keypoints and computes their descriptors.
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
bool useProvidedKeypoints = false, bool calcOrientation = true);
bool useProvidedKeypoints = false);
void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
bool useProvidedKeypoints = false, bool calcOrientation = true);
bool useProvidedKeypoints = false);
void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
bool useProvidedKeypoints = false, bool calcOrientation = true);
bool useProvidedKeypoints = false);
//! max keypoints = keypointsRatio * img.size().area()
float keypointsRatio;
GpuMat sum;
GpuMat sumf;
GpuMat sum, mask1, maskSum, intBuffer;
GpuMat mask1;
GpuMat maskSum;
GpuMat det, trace;
GpuMat hessianBuffer;
GpuMat maxPosBuffer;
GpuMat featuresBuffer;
GpuMat keypointsBuffer;
};
}
......
......@@ -111,20 +111,20 @@ namespace cv
{
float x;
float y;
float laplacian;
float size;
float response;
float angle;
float octave;
float dir;
float hessian;
};
enum KeypointLayout
{
SF_X,
SF_Y,
SF_LAPLACIAN,
SF_SIZE,
SF_RESPONSE,
SF_ANGLE,
SF_OCTAVE,
SF_DIR,
SF_HESSIAN,
SF_FEATURE_STRIDE
};
}
......
......@@ -55,52 +55,6 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace surf
{
////////////////////////////////////////////////////////////////////////
// Help funcs
// Wrapper for host reference to pass into kernel
// Scoped device-side mirror of one host variable.
//
// The constructor allocates sizeof(T) bytes of device memory and copies the
// current host value into it. The object converts implicitly to a device
// pointer, so it can be passed straight to a kernel launch. The destructor
// copies the (possibly modified) device value back into the host variable
// and releases the device allocation.
//
// The object owns its device allocation, so it is non-copyable: a copy would
// free the same pointer twice and write back to the host variable twice.
//
// NOTE(review): the destructor goes through cudaSafeCall; if that macro
// reports failures by throwing, an error here would escape a destructor and
// skip cudaFree - confirm against the macro's definition.
template <typename T>
class DeviceReference
{
public:
    // host_val must outlive this object: its address is kept for the
    // write-back performed in the destructor.
    explicit DeviceReference(T& host_val) : d_ptr(0), h_ptr(&host_val)
    {
        cudaSafeCall( cudaMalloc((void**)&d_ptr, sizeof(T)) );
        cudaSafeCall( cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice) );
    }

    ~DeviceReference()
    {
        // Publish the device-side value back to the host before freeing.
        cudaSafeCall( cudaMemcpy(h_ptr, d_ptr, sizeof(T), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaFree(d_ptr) );
    }

    // Casting to device pointer
    operator T*() {return d_ptr;}
    operator const T*() const {return d_ptr;}

private:
    // Copying is disabled (declared, intentionally never defined).
    DeviceReference(const DeviceReference&);
    DeviceReference& operator=(const DeviceReference&);

    T* d_ptr; // device copy of the value
    T* h_ptr; // host variable being mirrored
};
// Resets the least significant bit of the integer pointed to by f to 0.
__device__ void clearLastBit(int* f)
{
    const int lowBit = 0x1;
    *f = *f & ~lowBit;
}
// Resets the least significant bit of the bit pattern of f to 0.
// fasthessian uses that bit as a flag for the sign of the Laplacian
// (cleared when Dxx + Dyy <= 0).
//
// The bits are reinterpreted with the __float_as_int / __int_as_float
// intrinsics rather than through an (int*) cast of the float's address,
// which avoids a strict-aliasing violation and taking the address of f.
__device__ void clearLastBit(float& f)
{
    f = __int_as_float(__float_as_int(f) & ~0x1);
}
// Forces the least significant bit of the integer pointed to by f to 1.
__device__ void setLastBit(int* f)
{
    const int lowBit = 0x1;
    *f = *f | lowBit;
}
// Forces the least significant bit of the bit pattern of f to 1.
// fasthessian uses that bit as a flag for the sign of the Laplacian
// (set when Dxx + Dyy > 0).
//
// The bits are reinterpreted with the __float_as_int / __int_as_float
// intrinsics rather than through an (int*) cast of the float's address,
// which avoids a strict-aliasing violation and taking the address of f.
__device__ void setLastBit(float& f)
{
    f = __int_as_float(__float_as_int(f) | 0x1);
}
////////////////////////////////////////////////////////////////////////
// Global parameters
......@@ -108,215 +62,126 @@ namespace cv { namespace gpu { namespace surf
__constant__ int c_max_candidates;
// The maximum number of features that memory is reserved for.
__constant__ int c_max_features;
// The number of intervals in the octave.
__constant__ int c_nIntervals;
// Mask sizes derived from the mask parameters
__constant__ float c_mask_width;
// Mask sizes derived from the mask parameters
__constant__ float c_mask_height;
// Mask sizes derived from the mask parameters
__constant__ float c_dxy_center_offset;
// Mask sizes derived from the mask parameters
__constant__ float c_dxy_half_width;
// Mask sizes derived from the mask parameters
__constant__ float c_dxy_scale;
// The scale associated with the first interval of the first octave
__constant__ float c_initialScale;
// The interest operator threshold
__constant__ float c_threshold;
// The octave
// The maximum number of keypoints that memory is reserved for.
__constant__ int c_max_keypoints;
// The image size.
__constant__ int c_img_rows;
__constant__ int c_img_cols;
// The number of layers.
__constant__ int c_nOctaveLayers;
// The hessian threshold.
__constant__ float c_hessianThreshold;
// The current octave.
__constant__ int c_octave;
// The width of the octave buffer.
__constant__ int c_x_size;
// The height of the octave buffer.
__constant__ int c_y_size;
// The size of the octave border in pixels.
__constant__ int c_border;
// The step size used in this octave in pixels.
__constant__ int c_step;
// The current layer size.
__constant__ int c_layer_rows;
__constant__ int c_layer_cols;
////////////////////////////////////////////////////////////////////////
// Integral image texture
texture<float, 2, cudaReadModeElementType> sumTex(0, cudaFilterModeLinear, cudaAddressModeClamp);
typedef texture<unsigned int, 2, cudaReadModeElementType> IntTex;
// Signed four-corner combination of sumTex around the centre (cx, cy):
//   + sumTex(cx - halfWidth, cy - halfHeight)
//   - sumTex(cx + halfWidth, cy - halfHeight)
//   - sumTex(cx - halfWidth, cy + halfHeight)
//   + sumTex(cx + halfWidth, cy + halfHeight)
// The terms are accumulated in exactly this order, starting from 0.
__device__ float iiAreaLookupCDHalfWH(float cx, float cy, float halfWidth, float halfHeight)
{
    const float xLo = cx - halfWidth;
    const float xHi = cx + halfWidth;
    const float yLo = cy - halfHeight;
    const float yHi = cy + halfHeight;

    const float cornerLoLo = tex2D(sumTex, xLo, yLo);
    const float cornerHiLo = tex2D(sumTex, xHi, yLo);
    const float cornerLoHi = tex2D(sumTex, xLo, yHi);
    const float cornerHiHi = tex2D(sumTex, xHi, yHi);

    return 0.f + cornerLoLo - cornerHiLo - cornerLoHi + cornerHiHi;
}
////////////////////////////////////////////////////////////////////////
// Hessian
IntTex sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__device__ float evalDyy(float x, float y, float t, float mask_width, float mask_height, float fscale)
template <int N>
__device__ float icvCalcHaarPattern(const IntTex& tex, const float src[][5], int oldSize, int newSize, int y, int x)
{
float Dyy = 0.f;
Dyy += iiAreaLookupCDHalfWH(x, y, mask_width, mask_height);
Dyy -= t * iiAreaLookupCDHalfWH(x, y, mask_width, fscale);
Dyy *= 1.0f / (fscale * fscale);
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
typedef double real_t;
#else
typedef float real_t;
#endif
return Dyy;
}
float ratio = (float)newSize / oldSize;
real_t d = 0;
__device__ float evalDxx(float x, float y, float t, float mask_width, float mask_height, float fscale)
{
float Dxx = 0.f;
#pragma unroll
for (int k = 0; k < N; ++k)
{
int dx1 = __float2int_rn(ratio * src[k][0]);
int dy1 = __float2int_rn(ratio * src[k][1]);
int dx2 = __float2int_rn(ratio * src[k][2]);
int dy2 = __float2int_rn(ratio * src[k][3]);
Dxx += iiAreaLookupCDHalfWH(x, y, mask_height, mask_width);
Dxx -= t * iiAreaLookupCDHalfWH(x, y, fscale , mask_width);
real_t t = 0;
t += tex2D(tex, x + dx1, y + dy1);
t -= tex2D(tex, x + dx1, y + dy2);
t -= tex2D(tex, x + dx2, y + dy1);
t += tex2D(tex, x + dx2, y + dy2);
Dxx *= 1.0f / (fscale * fscale);
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
return Dxx;
return (float)d;
}
// Evaluates the mixed (xy) filter response at (x, y) for scale fscale.
// Four square box lookups of half-size c_dxy_half_width * fscale are taken at
// the diagonal offsets (+/-c_dxy_center_offset * fscale) from (x, y) and
// combined with alternating signs; the sum is divided by fscale^2.
__device__ float evalDxy(float x, float y, float fscale)
{
// Mask geometry for this scale.
float center_offset = c_dxy_center_offset * fscale;
float half_width = c_dxy_half_width * fscale;
float Dxy = 0.f;
// (-,-) and (+,+) boxes are added, (-,+) and (+,-) boxes are subtracted.
Dxy += iiAreaLookupCDHalfWH(x - center_offset, y - center_offset, half_width, half_width);
Dxy -= iiAreaLookupCDHalfWH(x - center_offset, y + center_offset, half_width, half_width);
Dxy += iiAreaLookupCDHalfWH(x + center_offset, y + center_offset, half_width, half_width);
Dxy -= iiAreaLookupCDHalfWH(x + center_offset, y - center_offset, half_width, half_width);
// Normalize by the squared scale.
Dxy *= 1.0f / (fscale * fscale);
////////////////////////////////////////////////////////////////////////
// Hessian
return Dxy;
}
// Box-filter tables consumed by icvCalcHaarPattern.
// Each row is {x1, y1, x2, y2, weight}: the first four entries are corner
// offsets that icvCalcHaarPattern multiplies by newSize / oldSize and rounds
// before sampling the texture; the fifth is the weight applied to that box.
// icvCalcLayerDetAndTrace passes oldSize = 9 for all three tables.
// c_DX and c_DY hold 3 boxes each, c_DXY holds 4.
__constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
__constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
__constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
__device__ float calcScale(int hidx_z)
__host__ __device__ int calcSize(int octave, int layer)
{
float d = (c_initialScale * (1 << c_octave)) / (c_nIntervals - 2);
return c_initialScale * (1 << c_octave) + d * (hidx_z - 1.0f) + 0.5f;
}
__global__ void fasthessian(PtrStepf hessianBuffer)
{
// Determine the indices in the Hessian buffer
int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;
int hidx_y = threadIdx.y + blockIdx.y * blockDim.y;
int hidx_z = threadIdx.z;
float fscale = calcScale(hidx_z);
/* Wavelet size at first layer of first octave. */
const int HAAR_SIZE0 = 9;
// Compute the lookup location of the mask center
float x = hidx_x * c_step + c_border;
float y = hidx_y * c_step + c_border;
/* Wavelet size increment between layers. This should be an even number,
such that the wavelet sizes in an octave are either all even or all odd.
This ensures that when looking for the neighbours of a sample, the layers
above and below are aligned correctly. */
const int HAAR_SIZE_INC = 6;
// Scale the mask dimensions according to the scale
if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
{
float mask_width = c_mask_width * fscale;
float mask_height = c_mask_height * fscale;
// Compute the filter responses
float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxy = evalDxy(x, y, fscale);
// Combine the responses and store the Laplacian sign
float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);
if (Dxx + Dyy > 0.f)
setLastBit(result);
else
clearLastBit(result);
hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
}
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
__global__ void fasthessian_old(PtrStepf hessianBuffer)
__global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
{
// Determine the indices in the Hessian buffer
int gridDim_y = gridDim.y / c_nIntervals;
int blockIdx_y = blockIdx.y % gridDim_y;
int blockIdx_z = blockIdx.y / gridDim_y;
int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;
int hidx_y = threadIdx.y + blockIdx_y * blockDim.y;
int hidx_z = blockIdx_z;
// Determine the indices
const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
const int blockIdx_y = blockIdx.y % gridDim_y;
const int blockIdx_z = blockIdx.y / gridDim_y;
float fscale = calcScale(hidx_z);
const int j = threadIdx.x + blockIdx.x * blockDim.x;
const int i = threadIdx.y + blockIdx_y * blockDim.y;
const int layer = blockIdx_z;
// Compute the lookup location of the mask center
float x = hidx_x * c_step + c_border;
float y = hidx_y * c_step + c_border;
const int size = calcSize(c_octave, layer);
// Scale the mask dimensions according to the scale
if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
{
float mask_width = c_mask_width * fscale;
float mask_height = c_mask_height * fscale;
// Compute the filter responses
float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxy = evalDxy(x, y, fscale);
const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
// Combine the responses and store the Laplacian sign
float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);
/* Ignore pixels where some of the kernel is outside the image */
const int margin = (size >> 1) >> c_octave;
if (Dxx + Dyy > 0.f)
setLastBit(result);
else
clearLastBit(result);
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
{
const float dx = icvCalcHaarPattern<3>(sumTex, c_DX , 9, size, i << c_octave, j << c_octave);
const float dy = icvCalcHaarPattern<3>(sumTex, c_DY , 9, size, i << c_octave, j << c_octave);
const float dxy = icvCalcHaarPattern<4>(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);
hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
}
}
// Chooses a 3D thread-block shape for a budget of 512 threads.
//   z = nIntervals
//   x = 16 if at least 48 threads remain per interval (512 / nIntervals), else 8
//   y = (512 / nIntervals) / x
// All divisions are integer divisions.
dim3 calcBlockSize(int nIntervals)
{
    const int threadBudget = 512;
    const int threadsPerInterval = threadBudget / nIntervals;

    const int blockX = (threadsPerInterval >= 48) ? 16 : 8;
    const int blockY = threadsPerInterval / blockX;

    return dim3(blockX, blockY, nIntervals);
}
void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads)
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers)
{
dim3 grid;
grid.x = divUp(x_size, threads.x);
grid.y = divUp(y_size, threads.y);
fasthessian<<<grid, threads>>>(hessianBuffer);
cudaSafeCall( cudaGetLastError() );
const int min_size = calcSize(octave, 0);
const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
cudaSafeCall( cudaThreadSynchronize() );
}
void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld)
{
dim3 threads(16, 16);
dim3 grid;
grid.x = divUp(x_size, threads.x);
grid.y = divUp(y_size, threads.y) * threadsOld.z;
fasthessian_old<<<grid, threads>>>(hessianBuffer);
grid.x = divUp(max_samples_j, threads.x);
grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
......@@ -325,109 +190,113 @@ namespace cv { namespace gpu { namespace surf
////////////////////////////////////////////////////////////////////////
// NONMAX
texture<int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
IntTex maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
struct WithOutMask
{
static __device__ bool check(float, float, float)
static __device__ bool check(int, int, int)
{
return true;
}
};
__constant__ float c_DM[1][5] = {{0, 0, 9, 9, 1}};
struct WithMask
{
static __device__ bool check(float x, float y, float fscale)
static __device__ bool check(int sum_i, int sum_j, int size)
{
float half_width = fscale / 2;
float result = 0.f;
result += tex2D(maskSumTex, x - half_width, y - half_width);
result -= tex2D(maskSumTex, x + half_width, y - half_width);
result -= tex2D(maskSumTex, x - half_width, y + half_width);
result += tex2D(maskSumTex, x + half_width, y + half_width);
result /= (fscale * fscale);
return (result >= 0.5f);
float mval = icvCalcHaarPattern<1>(maskSumTex, c_DM , 9, size, sum_i, sum_j);
return (mval >= 0.5);
}
};
template <typename Mask>
__global__ void nonmaxonly(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int* maxCounter)
__global__ void icvFindMaximaInLayer(PtrStepf det, PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
extern __shared__ float fh_vals[];
extern __shared__ float N9[];
// The hidx variables are the indices to the hessian buffer.
int hidx_x = threadIdx.x + blockIdx.x * (blockDim.x - 2);
int hidx_y = threadIdx.y + blockIdx.y * (blockDim.y - 2);
int hidx_z = threadIdx.z;
int localLin = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
const int gridDim_y = gridDim.y / c_nOctaveLayers;
const int blockIdx_y = blockIdx.y % gridDim_y;
const int blockIdx_z = blockIdx.y / gridDim_y;
// Is this thread within the hessian buffer?
if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
{
fh_vals[localLin] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x];
}
__syncthreads();
const int layer = blockIdx_z + 1;
const int size = calcSize(c_octave, layer);
// Is this location one of the ones being processed for nonmax suppression.
// Blocks overlap by one so we don't process the border threads.
bool inBounds2 = threadIdx.x > 0 && threadIdx.x < blockDim.x-1 && hidx_x < c_x_size - 1
&& threadIdx.y > 0 && threadIdx.y < blockDim.y-1 && hidx_y < c_y_size - 1
&& threadIdx.z > 0 && threadIdx.z < blockDim.z-1;
/* Ignore pixels without a 3x3x3 neighbourhood in the layer above */
const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
float val = fh_vals[localLin];
const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;
const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;
// Compute the lookup location of the mask center
float x = hidx_x * c_step + c_border;
float y = hidx_y * c_step + c_border;
float fscale = calcScale(hidx_z);
// Is this thread within the hessian buffer?
const int zoff = blockDim.x * blockDim.y;
const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;
N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + i)[j];
N9[localLin ] = det.ptr(c_layer_rows * (layer ) + i)[j];
N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + i)[j];
__syncthreads();
if (inBounds2 && val >= c_threshold && Mask::check(x, y, fscale))
if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)
{
// Check to see if we have a max (in its 26 neighbours)
int zoff = blockDim.x * blockDim.y;
bool condmax = val > fh_vals[localLin + 1]
&& val > fh_vals[localLin - 1]
&& val > fh_vals[localLin - blockDim.x + 1]
&& val > fh_vals[localLin - blockDim.x ]
&& val > fh_vals[localLin - blockDim.x - 1]
&& val > fh_vals[localLin + blockDim.x + 1]
&& val > fh_vals[localLin + blockDim.x ]
&& val > fh_vals[localLin + blockDim.x - 1]
&& val > fh_vals[localLin - zoff + 1]
&& val > fh_vals[localLin - zoff ]
&& val > fh_vals[localLin - zoff - 1]
&& val > fh_vals[localLin - zoff - blockDim.x + 1]
&& val > fh_vals[localLin - zoff - blockDim.x ]
&& val > fh_vals[localLin - zoff - blockDim.x - 1]
&& val > fh_vals[localLin - zoff + blockDim.x + 1]
&& val > fh_vals[localLin - zoff + blockDim.x ]
&& val > fh_vals[localLin - zoff + blockDim.x - 1]
&& val > fh_vals[localLin + zoff + 1]
&& val > fh_vals[localLin + zoff ]
&& val > fh_vals[localLin + zoff - 1]
&& val > fh_vals[localLin + zoff - blockDim.x + 1]
&& val > fh_vals[localLin + zoff - blockDim.x ]
&& val > fh_vals[localLin + zoff - blockDim.x - 1]
&& val > fh_vals[localLin + zoff + blockDim.x + 1]
&& val > fh_vals[localLin + zoff + blockDim.x ]
&& val > fh_vals[localLin + zoff + blockDim.x - 1]
;
if(condmax)
float val0 = N9[localLin];
if (val0 > c_hessianThreshold)
{
unsigned int i = atomicInc(maxCounter,(unsigned int) -1);
// Coordinates for the start of the wavelet in the sum image. There
// is some integer division involved, so don't try to simplify this
// (cancel out sampleStep) without checking the result is the same
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (i < c_max_candidates)
if (Mask::check(sum_i, sum_j, size))
{
int4 f = {hidx_x, hidx_y, threadIdx.z, c_octave};
maxPosBuffer[i] = f;
// Check to see if we have a max (in its 26 neighbours)
const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
&& val0 > N9[localLin - blockDim.x - zoff]
&& val0 > N9[localLin + 1 - blockDim.x - zoff]
&& val0 > N9[localLin - 1 - zoff]
&& val0 > N9[localLin - zoff]
&& val0 > N9[localLin + 1 - zoff]
&& val0 > N9[localLin - 1 + blockDim.x - zoff]
&& val0 > N9[localLin + blockDim.x - zoff]
&& val0 > N9[localLin + 1 + blockDim.x - zoff]
&& val0 > N9[localLin - 1 - blockDim.x]
&& val0 > N9[localLin - blockDim.x]
&& val0 > N9[localLin + 1 - blockDim.x]
&& val0 > N9[localLin - 1 ]
&& val0 > N9[localLin + 1 ]
&& val0 > N9[localLin - 1 + blockDim.x]
&& val0 > N9[localLin + blockDim.x]
&& val0 > N9[localLin + 1 + blockDim.x]
&& val0 > N9[localLin - 1 - blockDim.x + zoff]
&& val0 > N9[localLin - blockDim.x + zoff]
&& val0 > N9[localLin + 1 - blockDim.x + zoff]
&& val0 > N9[localLin - 1 + zoff]
&& val0 > N9[localLin + zoff]
&& val0 > N9[localLin + 1 + zoff]
&& val0 > N9[localLin - 1 + blockDim.x + zoff]
&& val0 > N9[localLin + blockDim.x + zoff]
&& val0 > N9[localLin + 1 + blockDim.x + zoff]
;
if(condmax)
{
unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);
if (ind < c_max_candidates)
{
const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);
maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);
}
}
}
}
}
......@@ -435,21 +304,26 @@ namespace cv { namespace gpu { namespace surf
#endif
}
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
int x_size, int y_size, bool use_mask, const dim3& threads)
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
{
dim3 grid;
grid.x = divUp(x_size, threads.x - 2);
grid.y = divUp(y_size, threads.y - 2);
const int layer_rows = img_rows >> octave;
const int layer_cols = img_cols >> octave;
int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
dim3 threads(16, 16);
const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);
dim3 grid;
grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);
grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;
DeviceReference<unsigned int> maxCounterWrapper(maxCounter);
const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
if (use_mask)
nonmaxonly<WithMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
else
nonmaxonly<WithOutMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
cudaSafeCall( cudaGetLastError() );
......@@ -459,166 +333,117 @@ namespace cv { namespace gpu { namespace surf
////////////////////////////////////////////////////////////////////////
// INTERPOLATION
#define MID_IDX 1
__global__ void fh_interp_extremum(PtrStepf hessianBuffer, const int4* maxPosBuffer,
KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
__global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
int hidx_x = maxPosBuffer[blockIdx.x].x - 1 + threadIdx.x;
int hidx_y = maxPosBuffer[blockIdx.x].y - 1 + threadIdx.y;
int hidx_z = maxPosBuffer[blockIdx.x].z - 1 + threadIdx.z;
const int4 maxPos = maxPosBuffer[blockIdx.x];
const int j = maxPos.x - 1 + threadIdx.x;
const int i = maxPos.y - 1 + threadIdx.y;
const int layer = maxPos.z - 1 + threadIdx.z;
__shared__ float fh_vals[3][3][3];
__shared__ float N9[3][3][3];
__shared__ KeyPoint_GPU p;
fh_vals[threadIdx.z][threadIdx.y][threadIdx.x] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x];
N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
{
__shared__ float H[3][3];
//dxx
H[0][0] = fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ]
- 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]
+ fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ];
__shared__ float dD[3];
//dyy
H[1][1] = fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1]
- 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]
+ fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1];
//dx
dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
//dy
dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
//ds
dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
//dss
H[2][2] = fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ]
- 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]
+ fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ];
__shared__ float H[3][3];
//dxx
H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
//dxy
H[0][1]= 0.25f*
(fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX + 1] -
fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX + 1] -
fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX - 1] +
fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX - 1]);
H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
//dxs
H[0][2]= 0.25f*
(fh_vals[MID_IDX + 1][MID_IDX + 1][MID_IDX ] -
fh_vals[MID_IDX + 1][MID_IDX - 1][MID_IDX ] -
fh_vals[MID_IDX - 1][MID_IDX + 1][MID_IDX ] +
fh_vals[MID_IDX - 1][MID_IDX - 1][MID_IDX ]);
//dys
H[1][2]= 0.25f*
(fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX + 1] -
fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX - 1] -
fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX + 1] +
fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX - 1]);
H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
//dyx = dxy
H[1][0] = H[0][1];
//dyy
H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
//dys
H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
//dsx = dxs
H[2][0] = H[0][2];
//dsy = dys
H[2][1] = H[1][2];
//dss
H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
__shared__ float dD[3];
float det = H[0][0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1])
- H[0][1] * (H[1][0] * H[2][2] - H[1][2] * H[2][0])
+ H[0][2] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]);
//dx
dD[0] = 0.5f*(fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ] -
fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ]);
//dy
dD[1] = 0.5f*(fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1] -
fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1]);
//ds
dD[2] = 0.5f*(fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ] -
fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ]);
__shared__ float invdet;
invdet = 1.f /
(
H[0][0]*H[1][1]*H[2][2]
+ H[0][1]*H[1][2]*H[2][0]
+ H[0][2]*H[1][0]*H[2][1]
- H[0][0]*H[1][2]*H[2][1]
- H[0][1]*H[1][0]*H[2][2]
- H[0][2]*H[1][1]*H[2][0]
);
// // 1-based entries of a 3x3 inverse
// /* [ |a22 a23| |a12 a13| |a12 a13|] */
// /* [ |a32 a33| -|a32 a33| |a22 a23|] */
// /* [ ] */
// /* [ |a21 a23| |a11 a13| |a11 a13|] */
// /* A^(-1) = [-|a31 a33| |a31 a33| -|a21 a23|] / d */
// /* [ ] */
// /* [ |a21 a22| |a11 a12| |a11 a12|] */
// /* [ |a31 a32| -|a31 a32| |a21 a22|] */
__shared__ float Hinv[3][3];
Hinv[0][0] = invdet*(H[1][1]*H[2][2]-H[1][2]*H[2][1]);
Hinv[0][1] = -invdet*(H[0][1]*H[2][2]-H[0][2]*H[2][1]);
Hinv[0][2] = invdet*(H[0][1]*H[1][2]-H[0][2]*H[1][1]);
Hinv[1][0] = -invdet*(H[1][0]*H[2][2]-H[1][2]*H[2][0]);
Hinv[1][1] = invdet*(H[0][0]*H[2][2]-H[0][2]*H[2][0]);
Hinv[1][2] = -invdet*(H[0][0]*H[1][2]-H[0][2]*H[1][0]);
Hinv[2][0] = invdet*(H[1][0]*H[2][1]-H[1][1]*H[2][0]);
Hinv[2][1] = -invdet*(H[0][0]*H[2][1]-H[0][1]*H[2][0]);
Hinv[2][2] = invdet*(H[0][0]*H[1][1]-H[0][1]*H[1][0]);
__shared__ float x[3];
x[0] = -(Hinv[0][0]*(dD[0]) + Hinv[0][1]*(dD[1]) + Hinv[0][2]*(dD[2]));
x[1] = -(Hinv[1][0]*(dD[0]) + Hinv[1][1]*(dD[1]) + Hinv[1][2]*(dD[2]));
x[2] = -(Hinv[2][0]*(dD[0]) + Hinv[2][1]*(dD[1]) + Hinv[2][2]*(dD[2]));
if (fabs(x[0]) < 1.f && fabs(x[1]) < 1.f && fabs(x[2]) < 1.f)
if (det != 0.0f)
{
// if the step is within the interpolation region, perform it
float invdet = 1.0f / det;
// Get a new feature index.
unsigned int i = atomicInc(featureCounter, (unsigned int)-1);
__shared__ float x[3];
if (i < c_max_features)
x[0] = invdet *
(dD[0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1]) -
H[0][1] * (dD[1] * H[2][2] - H[1][2] * dD[2]) +
H[0][2] * (dD[1] * H[2][1] - H[1][1] * dD[2]));
x[1] = invdet *
(H[0][0] * (dD[1] * H[2][2] - H[1][2] * dD[2]) -
dD[0] * (H[1][0] * H[2][2] - H[1][2] * H[2][0]) +
H[0][2] * (H[1][0] * dD[2] - dD[1] * H[2][0]));
x[2] = invdet *
(H[0][0] * (H[1][1] * dD[2] - dD[1] * H[2][1]) -
H[0][1] * (H[1][0] * dD[2] - dD[1] * H[2][0]) +
dD[0] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]));
if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)
{
p.x = ((float)maxPosBuffer[blockIdx.x].x + x[1]) * (float)c_step + c_border;
p.y = ((float)maxPosBuffer[blockIdx.x].y + x[0]) * (float)c_step + c_border;
// if the step is within the interpolation region, perform it
if (x[2] > 0)
{
float a = calcScale(maxPosBuffer[blockIdx.x].z);
float b = calcScale(maxPosBuffer[blockIdx.x].z + 1);
// Get a new feature index.
unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);
p.size = (1.f - x[2]) * a + x[2] * b;
}
else
if (ind < c_max_features)
{
float a = calcScale(maxPosBuffer[blockIdx.x].z);
float b = calcScale(maxPosBuffer[blockIdx.x].z - 1);
const int size = calcSize(c_octave, maxPos.z);
p.size = (1.f + x[2]) * a - x[2] * b;
}
const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
const float center_i = sum_i + (float)(size - 1) / 2;
const float center_j = sum_j + (float)(size - 1) / 2;
p.octave = c_octave;
p.x = center_j + x[0] * (1 << c_octave);
p.y = center_i + x[1] * (1 << c_octave);
p.response = fh_vals[MID_IDX][MID_IDX][MID_IDX];
int ds = size - calcSize(c_octave, maxPos.z - 1);
p.size = roundf(size + x[2] * ds);
// Should we split up this transfer over many threads?
featuresBuffer[i] = p;
}
} // If the subpixel interpolation worked
p.laplacian = maxPos.w;
p.dir = 0.0f;
p.hessian = N9[1][1][1];
// Should we split up this transfer over many threads?
featuresBuffer[ind] = p;
}
} // If the subpixel interpolation worked
}
} // If this is thread 0.
#endif
}
#undef MID_IDX
void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter,
KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter)
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
{
dim3 threads;
threads.x = 3;
......@@ -628,9 +453,7 @@ namespace cv { namespace gpu { namespace surf
dim3 grid;
grid.x = maxCounter;
DeviceReference<unsigned int> featureCounterWrapper(featureCounter);
fh_interp_extremum<<<grid, threads>>>(hessianBuffer, maxPosBuffer, featuresBuffer, featureCounterWrapper);
icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featuresBuffer, featureCounter);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
......@@ -639,139 +462,217 @@ namespace cv { namespace gpu { namespace surf
////////////////////////////////////////////////////////////////////////
// Orientation
// Precomputed 1D Gaussian weights for a standard deviation of 2, sampled at the
// 13 integer offsets -6..+6. The table is symmetric and its peak,
// 1 / (2 * sqrt(2 * pi)) ~= 0.1995, sits at index 6.
// A 2D weight is obtained in this file as the separable product
// c_gauss1D[threadIdx.x] * c_gauss1D[threadIdx.y].
__constant__ float c_gauss1D[13] =
{
0.002215924206f, 0.008764150247f, 0.026995483257f, 0.064758797833f,
0.120985362260f, 0.176032663382f, 0.199471140201f, 0.176032663382f,
0.120985362260f, 0.064758797833f, 0.026995483257f, 0.008764150247f,
0.002215924206f
};
#define ORI_SEARCH_INC 5
#define ORI_WIN 60
#define ORI_SAMPLES 113
__global__ void find_orientation(KeyPoint_GPU* features)
{
int tid = threadIdx.y * 17 + threadIdx.x;
int tid2 = numeric_limits_gpu<int>::max();
__constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
__constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 
0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
if (threadIdx.x < 13 && threadIdx.y < 13)
{
tid2 = threadIdx.y * 13 + threadIdx.x;
}
__global__ void icvCalcOrientation(const KeyPoint_GPU* featureBuffer, KeyPoint_GPU* keypoints, unsigned int* keypointCounter)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ float texLookups[17][17];
__shared__ float s_X[128];
__shared__ float s_Y[128];
__shared__ float s_angle[128];
__shared__ float Edx[13*13];
__shared__ float Edy[13*13];
__shared__ float xys[3];
__shared__ float s_sumx[64 * 4];
__shared__ float s_sumy[64 * 4];
// Read my x, y, size.
if (tid < 3)
{
xys[tid] = ((float*)(&features[blockIdx.x]))[tid];
}
__shared__ float s_feature[6];
if (threadIdx.x < 6 && threadIdx.y == 0)
s_feature[threadIdx.x] = ((float*)(&featureBuffer[blockIdx.x]))[threadIdx.x];
__syncthreads();
// Read all texture locations into memory
// Maybe I should use __mul24 here?
texLookups[threadIdx.x][threadIdx.y] = tex2D(sumTex, xys[SF_X] + ((int)threadIdx.x - 8) * xys[SF_SIZE],
xys[SF_Y] + ((int)threadIdx.y - 8) * xys[SF_SIZE]);
__syncthreads();
/* The sampling intervals and wavelet sized for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = s_feature[SF_SIZE] * 1.2f / 9.0f;
float dx = 0.f;
float dy = 0.f;
/* To find the dominant orientation, the gradients in x and y are
sampled in a circle of radius 6s using wavelets of size 4s.
We ensure the gradient wavelet size is even to ensure the
wavelet pattern is balanced and symmetric around its center */
const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
// Computes lookups for all points in a 13x13 lattice.
// - SURF says to only use a circle, but the branching logic would slow it down
// - Gaussian weighting should reduce the effects of the outer points anyway
if (tid2 < 169)
// check when grad_wav_size is too big
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
{
dx -= texLookups[threadIdx.x ][threadIdx.y ];
dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ];
dx -= texLookups[threadIdx.x + 4][threadIdx.y ];
dx += texLookups[threadIdx.x ][threadIdx.y + 4];
dx -= 2.f*texLookups[threadIdx.x + 2][threadIdx.y + 4];
dx += texLookups[threadIdx.x + 4][threadIdx.y + 4];
dy -= texLookups[threadIdx.x ][threadIdx.y ];
dy += 2.f*texLookups[threadIdx.x ][threadIdx.y + 2];
dy -= texLookups[threadIdx.x ][threadIdx.y + 4];
dy += texLookups[threadIdx.x + 4][threadIdx.y ];
dy -= 2.f*texLookups[threadIdx.x + 4][threadIdx.y + 2];
dy += texLookups[threadIdx.x + 4][threadIdx.y + 4];
float g = c_gauss1D[threadIdx.x] * c_gauss1D[threadIdx.y];
Edx[tid2] = dx * g;
Edy[tid2] = dy * g;
}
// Calc X, Y, angle and store it to shared memory
{
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
__syncthreads();
float X = 0.0f, Y = 0.0f, angle = 0.0f;
// This is a scan to get the summed dx, dy values.
// Gets 128-168
if (tid < 41)
{
Edx[tid] += Edx[tid + 128];
}
__syncthreads();
if (tid < 64)
{
Edx[tid] += Edx[tid + 64];
}
__syncthreads();
if (tid < 32)
{
volatile float* smem = Edx;
smem[tid] += smem[tid + 32];
smem[tid] += smem[tid + 16];
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
}
if (tid < ORI_SAMPLES)
{
const float margin = (float)(grad_wav_size - 1) / 2.0f;
const int x = __float2int_rn(s_feature[SF_X] + c_aptX[tid] * s - margin);
const int y = __float2int_rn(s_feature[SF_Y] + c_aptY[tid] * s - margin);
// Gets 128-168
if (tid < 41)
{
Edy[tid] += Edy[tid + 128];
}
__syncthreads();
if (tid < 64)
{
Edy[tid] += Edy[tid + 64];
}
__syncthreads();
if (tid < 32)
{
volatile float* smem = Edy;
smem[tid] += smem[tid + 32];
smem[tid] += smem[tid + 16];
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
}
if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))
{
X = c_aptW[tid] * icvCalcHaarPattern<2>(sumTex, c_NX, 4, grad_wav_size, y, x);
Y = c_aptW[tid] * icvCalcHaarPattern<2>(sumTex, c_NY, 4, grad_wav_size, y, x);
angle = atan2f(Y, X);
if (angle < 0)
angle += 2.0f * CV_PI;
angle *= 180.0f / CV_PI;
}
}
if (tid < 128)
{
s_X[tid] = X;
s_Y[tid] = Y;
s_angle[tid] = angle;
}
}
__syncthreads();
// Thread 0 saves back the result.
if (tid == 0)
{
features[blockIdx.x].angle = -atan2(Edy[0], Edx[0]) * (180.0f / CV_PI);
float bestx = 0, besty = 0, best_mod = 0;
#pragma unroll
for (int i = 0; i < 18; ++i)
{
const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
float sumx = 0.0f, sumy = 0.0f;
int d = abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx = s_X[threadIdx.x];
sumy = s_Y[threadIdx.x];
}
d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[threadIdx.x + 64];
sumy += s_Y[threadIdx.x + 64];
}
float* s_sumx_row = s_sumx + threadIdx.y * 64;
float* s_sumy_row = s_sumy + threadIdx.y * 64;
s_sumx_row[threadIdx.x] = sumx;
s_sumy_row[threadIdx.x] = sumy;
__syncthreads();
if (threadIdx.x < 32)
{
volatile float* v_sumx_row = s_sumx_row;
volatile float* v_sumy_row = s_sumy_row;
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 32];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 32];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 16];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 16];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 8];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 8];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 4];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 4];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 2];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 2];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 1];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 1];
}
const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod)
{
best_mod = temp_mod;
bestx = sumx;
besty = sumy;
}
__syncthreads();
}
if (threadIdx.x == 0)
{
s_X[threadIdx.y] = bestx;
s_Y[threadIdx.y] = besty;
s_angle[threadIdx.y] = best_mod;
}
__syncthreads();
if (threadIdx.x < 2 && threadIdx.y == 0)
{
volatile float* v_x = s_X;
volatile float* v_y = s_Y;
volatile float* v_mod = s_angle;
bestx = v_x[threadIdx.x];
besty = v_y[threadIdx.x];
best_mod = v_mod[threadIdx.x];
float temp_mod = v_mod[threadIdx.x + 2];
if (temp_mod > best_mod)
{
v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2];
v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2];
v_mod[threadIdx.x] = best_mod = temp_mod;
}
temp_mod = v_mod[threadIdx.x + 1];
if (temp_mod > best_mod)
{
v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1];
v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1];
}
}
if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)
{
// Get a new feature index.
unsigned int ind = atomicInc(keypointCounter, (unsigned int)-1);
if (ind < c_max_keypoints)
{
float kp_dir = atan2f(besty, bestx);
if (kp_dir < 0)
kp_dir += 2.0f * CV_PI;
kp_dir *= 180.0f / CV_PI;
__shared__ KeyPoint_GPU kp;
kp.x = s_feature[SF_X];
kp.y = s_feature[SF_Y];
kp.laplacian = s_feature[SF_LAPLACIAN];
kp.size = s_feature[SF_SIZE];
kp.dir = kp_dir;
kp.hessian = s_feature[SF_HESSIAN];
keypoints[ind] = kp;
}
}
}
#endif
}
void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures)
#undef ORI_SEARCH_INC
#undef ORI_WIN
#undef ORI_SAMPLES
void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter)
{
dim3 threads;
threads.x = 17;
threads.y = 17;
threads.x = 64;
threads.y = 4;
dim3 grid;
grid.x = nFeatures;
find_orientation<<<grid, threads>>>(features);
icvCalcOrientation<<<grid, threads>>>(featureBuffer, keypoints, keypointCounter);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
......@@ -780,117 +681,119 @@ namespace cv { namespace gpu { namespace surf
////////////////////////////////////////////////////////////////////////
// Descriptors
// precomputed values for a Gaussian with a standard deviation of 3.3
// - it appears SURF uses a different value, but not sure what it is
__constant__ float c_3p3gauss1D[20] =
{
0.001917811039f, 0.004382549939f, 0.009136246641f, 0.017375153068f, 0.030144587513f,
0.047710056854f, 0.068885910797f, 0.090734146446f, 0.109026229640f, 0.119511889092f,
0.119511889092f, 0.109026229640f, 0.090734146446f, 0.068885910797f, 0.047710056854f,
0.030144587513f, 0.017375153068f, 0.009136246641f, 0.004382549939f, 0.001917811039f
#define PATCH_SZ 20
texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__constant__ float c_DW[PATCH_SZ * PATCH_SZ] =
{
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
};
template <int BLOCK_DIM_X>
__global__ void normalize_descriptors(PtrStepf descriptors)
__device__ void calcPATCH(float s_PATCH[6][6], float s_pt[5], int i1, int j1, int i2, int j2)
{
// no need for thread ID
float* descriptor_base = descriptors.ptr(blockIdx.x);
const float centerX = s_pt[SF_X];
const float centerY = s_pt[SF_Y];
const float size = s_pt[SF_SIZE];
const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180);
// read in the unnormalized descriptor values (squared)
__shared__ float sqDesc[BLOCK_DIM_X];
const float lookup = descriptor_base[threadIdx.x];
sqDesc[threadIdx.x] = lookup * lookup;
__syncthreads();
/* The sampling intervals and wavelet sized for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = size * 1.2f / 9.0f;
if (BLOCK_DIM_X >= 128)
{
if (threadIdx.x < 64)
sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
__syncthreads();
}
/* Extract a window of pixels around the keypoint of size 20s */
const int win_size = (int)((PATCH_SZ + 1) * s);
// reduction to get total
if (threadIdx.x < 32)
{
volatile float* smem = sqDesc;
float sin_dir;
float cos_dir;
sincosf(descriptor_dir, &sin_dir, &cos_dir);
smem[threadIdx.x] += smem[threadIdx.x + 32];
smem[threadIdx.x] += smem[threadIdx.x + 16];
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
}
/* Nearest neighbour version (faster) */
const float win_offset = -(float)(win_size - 1) / 2;
// compute length (square root)
__shared__ float len;
if (threadIdx.x == 0)
{
len = sqrtf(sqDesc[0]);
}
__syncthreads();
/* Scale the window to size PATCH_SZ so each pixel's size is s. This
makes calculating the gradients with wavelets of size 2s easy */
const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)j1 / (PATCH_SZ + 1)) * win_size;
// normalize and store in output
descriptor_base[threadIdx.x] = lookup / len;
}
const int i = __float2int_rd(icoo);
const int j = __float2int_rd(jcoo);
__device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const float* ipt,
int xIndex, int yIndex, int tid)
{
float sin_theta, cos_theta;
sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);
// Compute rotated sampling points
// (clockwise rotation since we are rotating the lattice)
// (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)
const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+ sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+ cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
// gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)
// a b c
// d f
// g h i
const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]);
const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y);
const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y);
const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]);
const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
// compute axis-aligned HaarX, HaarY
// (could group the additions together into multiplications)
const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular)
const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx
const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy
// rotate responses (store all dxs then all dys)
// - counterclockwise rotation to rotate back to zero orientation
sdx_bin[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx
sdy_bin[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
float res = tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (j + 1 - jcoo);
pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i + 1) * sin_dir;
pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i + 1) * cos_dir;
res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (j + 1 - jcoo);
pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i + 1) * sin_dir;
pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i + 1) * cos_dir;
res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (jcoo - j);
pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i) * sin_dir;
pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i) * cos_dir;
res += tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (jcoo - j);
s_PATCH[i2][j2] = (unsigned char)res;
}
__device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const KeyPoint_GPU* features)//(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features)
__device__ void calc_dx_dy(float s_PATCH[6][6], float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid)
{
// get the interest point parameters (x, y, size, response, angle)
__shared__ float ipt[5];
if (threadIdx.x < 5 && threadIdx.y == 0)
__shared__ float s_pt[5];
if (tid < 5)
{
ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x];
s_pt[tid] = ((float*)(&keypoints[blockIdx.x]))[tid];
}
__syncthreads();
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
const int xBlock = (threadIdx.y & 3); // threadIdx.y % 4
const int yBlock = (threadIdx.y >> 2); // floor(threadIdx.y / 4)
const int xIndex = (xBlock * 5) + (threadIdx.x % 5);
const int yIndex = (yBlock * 5) + (threadIdx.x / 5);
const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
const int xIndex = xBlock * blockDim.x + threadIdx.x;
const int yIndex = yBlock * blockDim.y + threadIdx.y;
calcPATCH(s_PATCH, s_pt, yIndex, xIndex, threadIdx.y, threadIdx.x);
if (threadIdx.x == 0)
calcPATCH(s_PATCH, s_pt, yIndex, xBlock * blockDim.x + 5, threadIdx.y, 5);
if (threadIdx.y == 0)
calcPATCH(s_PATCH, s_pt, yBlock * blockDim.y + 5, xIndex, 5, threadIdx.x);
if (threadIdx.x == 0 && threadIdx.y == 0)
calcPATCH(s_PATCH, s_pt, xBlock * blockDim.x + 5, yBlock * blockDim.y + 5, 5, 5);
__syncthreads();
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = (s_PATCH[threadIdx.y ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x ]) * dw;
const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y ][threadIdx.x + 1]) * dw;
calc_dx_dy(sdx_bin, sdy_bin, ipt, xIndex, yIndex, threadIdx.x);
s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy;
}
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2,
......@@ -933,193 +836,9 @@ namespace cv { namespace gpu { namespace surf
// Builds the unnormalized 64-element SURF descriptor of one interest point.
//
// Launch layout (as used by compute_descriptors_gpu): one block per interest point
// (blockIdx.x) with blockDim = (25, 16). threadIdx.y selects one of the 16 sub-regions,
// threadIdx.x one of the 5x5 sample points inside it.
//
// Every sub-region contributes four values, in this order: sum dx, sum dy, sum |dx|,
// sum |dy|. The result is written to row blockIdx.x of `descriptors`.
__global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)
{
    // One slot per sample point; the 25 slots of a sub-region are contiguous.
    __shared__ float s_dx   [16 * 25];
    __shared__ float s_dy   [16 * 25];
    __shared__ float s_absDx[16 * 25];
    __shared__ float s_absDy[16 * 25];

    // Assembled descriptor: 4 values for each of the 16 sub-regions.
    __shared__ float s_desc[64];

    const int binStart = threadIdx.y * 25;
    float* const dxBin    = s_dx    + binStart;
    float* const dyBin    = s_dy    + binStart;
    float* const absDxBin = s_absDx + binStart;
    float* const absDyBin = s_absDy + binStart;

    // Fills dxBin[threadIdx.x] / dyBin[threadIdx.x] with this thread's dx / dy response.
    calc_dx_dy(dxBin, dyBin, features);
    __syncthreads();

    absDxBin[threadIdx.x] = fabsf(dxBin[threadIdx.x]); // |dx|
    absDyBin[threadIdx.x] = fabsf(dyBin[threadIdx.x]); // |dy|
    __syncthreads();

    // Sum the 25 samples of each of the four bins. reduce_sum25 is expected to leave
    // every total in element 0 of its bin, which is what gets read below.
    reduce_sum25(dxBin, dyBin, absDxBin, absDyBin, threadIdx.x);
    __syncthreads();

    // The first thread of every sub-region publishes its four sums.
    if (threadIdx.x == 0)
    {
        float* const out = s_desc + threadIdx.y * 4;
        out[0] = dxBin[0];
        out[1] = dyBin[0];
        out[2] = absDxBin[0];
        out[3] = absDyBin[0];
    }
    __syncthreads();

    // The first 64 threads of the block copy the descriptor to global memory.
    const int flatTid = threadIdx.y * blockDim.x + threadIdx.x;
    if (flatTid < 64)
    {
        float* const row = descriptors.ptr(blockIdx.x);
        row[flatTid] = s_desc[flatTid];
    }
}
// Builds the unnormalized 128-element (extended) SURF descriptor of one interest point.
//
// Launch layout (as used by compute_descriptors_gpu): one block per interest point
// (blockIdx.x) with blockDim = (25, 16). threadIdx.y selects one of the 16 sub-regions,
// threadIdx.x one of the 5x5 sample points inside it.
//
// Every sub-region contributes eight values, in this order:
//   [0..3]  sum dx, sum |dx| over samples with dy >= 0, then the same over dy <  0
//   [4..7]  sum dy, sum |dy| over samples with dx >= 0, then the same over dx <  0
// The result is written to row blockIdx.x of `descriptors`.
__global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
{
    // 2 floats (dx, dy) for each thread (5x5 sample points in each sub-region)
    __shared__ float sdx[16 * 25];
    __shared__ float sdy[16 * 25];

    // scratch arrays for the sign-split sums of each 5x5 area
    __shared__ float sd1[16 * 25];
    __shared__ float sd2[16 * 25];
    __shared__ float sdabs1[16 * 25];
    __shared__ float sdabs2[16 * 25];

    __shared__ float sdesc[128];

    const int binStart = threadIdx.y * 25;
    float* sdx_bin    = sdx    + binStart;
    float* sdy_bin    = sdy    + binStart;
    float* sd1_bin    = sd1    + binStart;
    float* sd2_bin    = sd2    + binStart;
    float* sdabs1_bin = sdabs1 + binStart;
    float* sdabs2_bin = sdabs2 + binStart;

    // Fills sdx_bin[threadIdx.x] / sdy_bin[threadIdx.x] with this thread's dx / dy response.
    calc_dx_dy(sdx_bin, sdy_bin, features);
    __syncthreads();

    // Each thread only ever needs its own sample, so read it once into registers.
    // Taking both values from the per-sub-region bin also fixes the previous code, which
    // read |dx| for the dy < 0 case from sdx[threadIdx.x], i.e. from sub-region 0, for
    // every sub-region.
    const float dx = sdx_bin[threadIdx.x];
    const float dy = sdy_bin[threadIdx.x];

    // Pass 1: split dx by the sign of dy.
    if (dy >= 0)
    {
        sd1_bin[threadIdx.x]    = dx;
        sdabs1_bin[threadIdx.x] = fabsf(dx);
        sd2_bin[threadIdx.x]    = 0.0f;
        sdabs2_bin[threadIdx.x] = 0.0f;
    }
    else
    {
        sd1_bin[threadIdx.x]    = 0.0f;
        sdabs1_bin[threadIdx.x] = 0.0f;
        sd2_bin[threadIdx.x]    = dx;
        sdabs2_bin[threadIdx.x] = fabsf(dx);
    }
    __syncthreads();

    // reduce_sum25 is expected to leave every total in element 0 of its bin,
    // which is what gets read below.
    reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
    __syncthreads();

    float* sdesc_bin = sdesc + (threadIdx.y << 3);

    // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
    if (threadIdx.x == 0)
    {
        sdesc_bin[0] = sd1_bin[0];
        sdesc_bin[1] = sdabs1_bin[0];
        sdesc_bin[2] = sd2_bin[0];
        sdesc_bin[3] = sdabs2_bin[0];
    }
    // The scratch bins are about to be overwritten; make sure element 0 has been consumed.
    __syncthreads();

    // Pass 2: split dy by the sign of dx.
    if (dx >= 0)
    {
        sd1_bin[threadIdx.x]    = dy;
        sdabs1_bin[threadIdx.x] = fabsf(dy);
        sd2_bin[threadIdx.x]    = 0.0f;
        sdabs2_bin[threadIdx.x] = 0.0f;
    }
    else
    {
        sd1_bin[threadIdx.x]    = 0.0f;
        sdabs1_bin[threadIdx.x] = 0.0f;
        sd2_bin[threadIdx.x]    = dy;
        sdabs2_bin[threadIdx.x] = fabsf(dy);
    }
    __syncthreads();

    reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
    __syncthreads();

    // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
    if (threadIdx.x == 0)
    {
        sdesc_bin[4] = sd1_bin[0];
        sdesc_bin[5] = sdabs1_bin[0];
        sdesc_bin[6] = sd2_bin[0];
        sdesc_bin[7] = sdabs2_bin[0];
    }
    __syncthreads();

    // The first 128 threads of the block copy the descriptor to global memory.
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
    if (tid < 128)
        descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
}
// Host-side launcher: fills `descriptors` (one row per interest point) with
// normalized SURF descriptors for the `nFeatures` interest points in `features`.
//
// descriptors.cols selects the variant: 64 runs the 64-element kernels, any
// other width runs the 128-element kernels. Each kernel is launched with one
// block per interest point and is followed by an error check and a full
// synchronization, so the call returns only after the results are complete.
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
{
    // A grid with zero blocks is an invalid launch configuration; with no
    // interest points there is nothing to compute.
    if (nFeatures <= 0)
        return;

    // compute unnormalized descriptors, then normalize them
    if (descriptors.cols == 64)
    {
        compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaThreadSynchronize() );

        normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaThreadSynchronize() );
    }
    else
    {
        compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaThreadSynchronize() );

        normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaThreadSynchronize() );
    }
}
// Adapter for the 2D-grid descriptor kernels: stages the block's interest point
// in shared memory, derives this thread's sample coordinates from
// blockIdx.y / threadIdx, and forwards everything to calc_dx_dy.
//   sdx, sdy - output arrays handed straight to calc_dx_dy
//   features - interest point array, indexed by blockIdx.x
//   tid      - flat thread index inside the block
// Contains a __syncthreads(), so every thread of the block must call it.
__device__ void calc_dx_dy_old(float sdx[25], float sdy[25], const KeyPoint_GPU* features, int tid)
{
    // the first five floats of this block's feature record, shared by all threads
    __shared__ float ipt[5];

    if (tid < 5)
        ipt[tid] = ((float*)(features + blockIdx.x))[tid];

    __syncthreads();

    // blockIdx.y enumerates a 4-wide arrangement of sub-blocks:
    // the low two bits give the column, the remaining bits give the row
    const int subBlockCol = blockIdx.y & 3;
    const int subBlockRow = blockIdx.y >> 2;

    // position of this thread's sample across all sub-blocks
    const int sampleX = subBlockCol * blockDim.x + threadIdx.x;
    const int sampleY = subBlockRow * blockDim.y + threadIdx.y;

    calc_dx_dy(sdx, sdy, ipt, sampleX, sampleY, tid);
}
// Spawn 16 blocks per interest point
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float s_PATCH[6][6];
__shared__ float sdx[25];
__shared__ float sdy[25];
__shared__ float sdxabs[25];
......@@ -1127,7 +846,7 @@ namespace cv { namespace gpu { namespace surf
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy_old(sdx, sdy, features, tid);
calc_dx_dy(s_PATCH, sdx, sdy, features, tid);
__syncthreads();
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
......@@ -1151,9 +870,10 @@ namespace cv { namespace gpu { namespace surf
// Spawn 16 blocks per interest point
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors128_old(PtrStepf descriptors, const KeyPoint_GPU* features)
__global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float s_PATCH[6][6];
__shared__ float sdx[25];
__shared__ float sdy[25];
......@@ -1165,7 +885,7 @@ namespace cv { namespace gpu { namespace surf
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy_old(sdx, sdy, features, tid);
calc_dx_dy(s_PATCH, sdx, sdy, features, tid);
__syncthreads();
if (sdy[tid] >= 0)
......@@ -1184,7 +904,7 @@ namespace cv { namespace gpu { namespace surf
}
__syncthreads();
reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
......@@ -1215,7 +935,7 @@ namespace cv { namespace gpu { namespace surf
}
__syncthreads();
reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads();
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
......@@ -1228,13 +948,56 @@ namespace cv { namespace gpu { namespace surf
}
}
void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
// Scales one descriptor row to unit L2 length, in place.
//
// Launch layout: one block per descriptor (blockIdx.x selects the row) with
// blockDim.x == BLOCK_DIM_X == descriptor length. BLOCK_DIM_X must be a power
// of two (the launcher instantiates 64 and 128).
//
// NOTE: a row whose elements are all zero has length 0, so the division below
// produces non-finite values for it; no guard is applied here.
template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
{
    float* descriptor_base = descriptors.ptr(blockIdx.x);

    // each thread contributes the square of its own descriptor element
    __shared__ float sqDesc[BLOCK_DIM_X];
    const float lookup = descriptor_base[threadIdx.x];
    sqDesc[threadIdx.x] = lookup * lookup;
    __syncthreads();

    // Tree reduction with a block barrier after every step. The pairing order
    // (stride BLOCK_DIM_X/2 ... 1) is that of the classic warp-synchronous
    // version, so the sum is bit-identical, but it does not depend on the
    // threads of a warp executing in lockstep. The barrier sits outside the
    // `if` so that every thread of the block reaches it.
    for (int stride = BLOCK_DIM_X / 2; stride > 0; stride >>= 1)
    {
        if (threadIdx.x < stride)
            sqDesc[threadIdx.x] += sqDesc[threadIdx.x + stride];
        __syncthreads();
    }

    // sqDesc[0] now holds the squared length and is visible to all threads
    const float len = sqrtf(sqDesc[0]);

    // normalize and store in output
    descriptor_base[threadIdx.x] = lookup / len;
}
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
if (descriptors.cols == 64)
{
compute_descriptors64_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
......@@ -1246,7 +1009,7 @@ namespace cv { namespace gpu { namespace surf
}
else
{
compute_descriptors128_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
......
......@@ -48,123 +48,93 @@ using namespace std;
#if !defined (HAVE_CUDA)
cv::gpu::SURF_GPU::SURF_GPU() { throw_nogpu(); }
cv::gpu::SURF_GPU::SURF_GPU(double, int, int, bool, float) { throw_nogpu(); }
int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0;}
void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, vector<float>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, GpuMat&, bool, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, vector<float>&, bool, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, vector<float>&, bool) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace surf
{
dim3 calcBlockSize(int nIntervals);
void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld);
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
int x_size, int y_size, bool use_mask, const dim3& threads);
void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter,
KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);
void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures);
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter);
void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter);
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
}}}
using namespace cv::gpu::surf;
namespace
{
class SURF_GPU_Invoker : private SURFParams_GPU
class SURF_GPU_Invoker : private CvSURFParams
{
public:
SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) :
SURFParams_GPU(surf),
SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) :
CvSURFParams(surf),
sum(surf.sum), sumf(surf.sumf),
sum(surf.sum), mask1(surf.mask1), maskSum(surf.maskSum), intBuffer(surf.intBuffer), det(surf.det), trace(surf.trace),
mask1(surf.mask1), maskSum(surf.maskSum),
hessianBuffer(surf.hessianBuffer),
maxPosBuffer(surf.maxPosBuffer),
featuresBuffer(surf.featuresBuffer),
maxPosBuffer(surf.maxPosBuffer), featuresBuffer(surf.featuresBuffer), keypointsBuffer(surf.keypointsBuffer),
img_cols(img.cols), img_rows(img.rows),
use_mask(!mask.empty()),
mask_width(0), mask_height(0),
featureCounter(0), maxCounter(0)
use_mask(!mask.empty())
{
CV_Assert(!img.empty() && img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22);
CV_Assert(DeviceInfo().supports(GLOBAL_ATOMICS));
max_features = static_cast<int>(img.size().area() * featuresRatio);
max_candidates = static_cast<int>(1.5 * max_features);
CV_Assert(max_features > 0);
CV_Assert(nOctaves > 0 && nOctaveLayers > 0);
CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
featuresBuffer.create(1, max_features, CV_32FC(6));
maxPosBuffer.create(1, max_candidates, CV_32SC4);
maxKeypoints = static_cast<int>(img.size().area() * surf.keypointsRatio);
maxFeatures = static_cast<int>(1.5 * maxKeypoints);
maxCandidates = static_cast<int>(1.5 * maxFeatures);
mask_width = l2 * 0.5f;
mask_height = 1.0f + l1;
CV_Assert(maxKeypoints > 0);
cudaSafeCall( cudaMalloc((void**)&d_counters, (nOctaves + 2) * sizeof(unsigned int)) );
cudaSafeCall( cudaMemset(d_counters, 0, (nOctaves + 2) * sizeof(unsigned int)) );
// Dxy gap half-width
float dxy_center_offset = 0.5f * (l4 + l3);
// Dxy squares half-width
float dxy_half_width = 0.5f * l3;
uploadConstant("cv::gpu::surf::c_max_candidates", maxCandidates);
uploadConstant("cv::gpu::surf::c_max_features", maxFeatures);
uploadConstant("cv::gpu::surf::c_max_keypoints", maxKeypoints);
uploadConstant("cv::gpu::surf::c_img_rows", img_rows);
uploadConstant("cv::gpu::surf::c_img_cols", img_cols);
uploadConstant("cv::gpu::surf::c_nOctaveLayers", nOctaveLayers);
uploadConstant("cv::gpu::surf::c_hessianThreshold", static_cast<float>(hessianThreshold));
// rescale edge_scale to fit with the filter dimensions
float dxy_scale = edgeScale * std::pow((2.f + 2.f * l1) * l2 / (4.f * l3 * l3), 2.f);
// Compute border required such that the filters don't overstep the image boundaries
float smax0 = 2.0f * initialScale + 0.5f;
int border0 = static_cast<int>(std::ceil(smax0 * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f)));
int width0 = (img_cols - 2 * border0) / initialStep;
int height0 = (img_rows - 2 * border0) / initialStep;
uploadConstant("cv::gpu::surf::c_max_candidates", max_candidates);
uploadConstant("cv::gpu::surf::c_max_features", max_features);
uploadConstant("cv::gpu::surf::c_nIntervals", nIntervals);
uploadConstant("cv::gpu::surf::c_mask_width", mask_width);
uploadConstant("cv::gpu::surf::c_mask_height", mask_height);
uploadConstant("cv::gpu::surf::c_dxy_center_offset", dxy_center_offset);
uploadConstant("cv::gpu::surf::c_dxy_half_width", dxy_half_width);
uploadConstant("cv::gpu::surf::c_dxy_scale", dxy_scale);
uploadConstant("cv::gpu::surf::c_initialScale", initialScale);
uploadConstant("cv::gpu::surf::c_threshold", threshold);
hessianBuffer.create(height0 * nIntervals, width0, CV_32F);
bindTexture("cv::gpu::surf::imgTex", (DevMem2D)img);
integral(img, sum);
sum.convertTo(sumf, CV_32F, 1.0 / 255.0);
bindTexture("cv::gpu::surf::sumTex", (DevMem2Df)sumf);
integralBuffered(img, sum, intBuffer);
bindTexture("cv::gpu::surf::sumTex", (DevMem2D_<unsigned int>)sum);
if (!mask.empty())
{
if (use_mask)
{
min(mask, 1.0, mask1);
integral(mask1, maskSum);
bindTexture("cv::gpu::surf::maskSumTex", (DevMem2Di)maskSum);
}
integralBuffered(mask1, maskSum, intBuffer);
bindTexture("cv::gpu::surf::maskSumTex", (DevMem2D_<unsigned int>)maskSum);
}
}
~SURF_GPU_Invoker()
{
cudaSafeCall( cudaFree(d_counters) );
unbindTexture("cv::gpu::surf::imgTex");
unbindTexture("cv::gpu::surf::sumTex");
if (use_mask)
unbindTexture("cv::gpu::surf::maskSumTex");
......@@ -172,102 +142,115 @@ namespace
void detectKeypoints(GpuMat& keypoints)
{
typedef void (*fasthessian_t)(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
const fasthessian_t fasthessian =
DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? fasthessian_gpu : fasthessian_gpu_old;
ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, det);
ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, trace);
ensureSizeIsEnough(1, maxCandidates, CV_32SC4, maxPosBuffer);
ensureSizeIsEnough(1, maxFeatures, CV_32FC(6), featuresBuffer);
dim3 threads = calcBlockSize(nIntervals);
for(int octave = 0; octave < nOctaves; ++octave)
for (int octave = 0; octave < nOctaves; ++octave)
{
int step = initialStep * (1 << octave);
// Compute border required such that the filters don't overstep the image boundaries
float d = (initialScale * (1 << octave)) / (nIntervals - 2);
float smax = initialScale * (1 << octave) + d * (nIntervals - 2.0f) + 0.5f;
int border = static_cast<int>(std::ceil(smax * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f)));
int x_size = (img_cols - 2 * border) / step;
int y_size = (img_rows - 2 * border) / step;
if (x_size <= 0 || y_size <= 0)
break;
uploadConstant("cv::gpu::surf::c_octave", octave);
uploadConstant("cv::gpu::surf::c_x_size", x_size);
uploadConstant("cv::gpu::surf::c_y_size", y_size);
uploadConstant("cv::gpu::surf::c_border", border);
uploadConstant("cv::gpu::surf::c_step", step);
fasthessian(hessianBuffer, x_size, y_size, threads);
// Reset the candidate count.
maxCounter = 0;
nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, x_size, y_size, use_mask, threads);
maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));
const int layer_rows = img_rows >> octave;
const int layer_cols = img_cols >> octave;
uploadConstant("cv::gpu::surf::c_octave", octave);
uploadConstant("cv::gpu::surf::c_layer_rows", layer_rows);
uploadConstant("cv::gpu::surf::c_layer_cols", layer_cols);
icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers);
icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr<int4>(), d_counters + 2 + octave,
img_rows, img_cols, octave, use_mask, nOctaveLayers);
unsigned int maxCounter;
cudaSafeCall( cudaMemcpy(&maxCounter, d_counters + 2 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));
if (maxCounter > 0)
{
fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,
featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);
featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
icvInterpolateKeypoint_gpu(det, maxPosBuffer.ptr<int4>(), maxCounter,
featuresBuffer.ptr<KeyPoint_GPU>(), d_counters);
}
}
unsigned int featureCounter;
cudaSafeCall( cudaMemcpy(&featureCounter, d_counters, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));
if (featureCounter > 0)
featuresBuffer.colRange(0, featureCounter).copyTo(keypoints);
else
keypoints.release();
findOrientation(featuresBuffer.colRange(0, featureCounter), keypoints);
}
void findOrientation(GpuMat& keypoints)
void findOrientation(const GpuMat& features, GpuMat& keypoints)
{
if (keypoints.cols > 0)
find_orientation_gpu(keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
if (features.cols > 0)
{
ensureSizeIsEnough(1, maxKeypoints, CV_32FC(6), keypointsBuffer);
icvCalcOrientation_gpu(features.ptr<KeyPoint_GPU>(), features.cols, keypointsBuffer.ptr<KeyPoint_GPU>(),
d_counters + 1);
unsigned int keypointsCounter;
cudaSafeCall( cudaMemcpy(&keypointsCounter, d_counters + 1, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
keypointsCounter = std::min(keypointsCounter, static_cast<unsigned int>(maxKeypoints));
if (keypointsCounter > 0)
keypointsBuffer.colRange(0, keypointsCounter).copyTo(keypoints);
else
keypoints.release();
}
}
void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
{
typedef void (*compute_descriptors_t)(const DevMem2Df& descriptors,
const KeyPoint_GPU* features, int nFeatures);
const compute_descriptors_t compute_descriptors = compute_descriptors_gpu_old;
//DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;
if (keypoints.cols > 0)
{
descriptors.create(keypoints.cols, descriptorSize, CV_32F);
compute_descriptors(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
compute_descriptors_gpu(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
}
}
private:
GpuMat& sum;
GpuMat& sumf;
GpuMat& mask1;
GpuMat& maskSum;
GpuMat& intBuffer;
GpuMat& det;
GpuMat& trace;
GpuMat& hessianBuffer;
GpuMat& maxPosBuffer;
GpuMat& featuresBuffer;
GpuMat& keypointsBuffer;
int img_cols, img_rows;
bool use_mask;
float mask_width, mask_height;
unsigned int featureCounter;
unsigned int maxCounter;
int maxCandidates;
int maxFeatures;
int maxKeypoints;
int max_candidates;
int max_features;
unsigned int* d_counters;
};
}
// Default constructor: Hessian threshold 100, 4 octaves with 2 layers each,
// extended (128-element) descriptors, keypoint ratio 0.01.
cv::gpu::SURF_GPU::SURF_GPU()
{
    // detector settings
    this->hessianThreshold = 100;
    this->nOctaves = 4;
    this->nOctaveLayers = 2;

    // descriptor settings: non-zero selects the 128-element descriptor
    this->extended = 1;

    // scales the image area into the keypoint buffer capacity
    this->keypointsRatio = 0.01f;
}
// Full constructor: stores every argument in the corresponding parameter field.
//   _threshold      - Hessian threshold
//   _nOctaves       - number of octaves
//   _nOctaveLayers  - number of layers per octave
//   _extended       - true for 128-element descriptors, false for 64-element ones
//   _keypointsRatio - scales the image area into the keypoint buffer capacity
cv::gpu::SURF_GPU::SURF_GPU(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio)
{
    // detector settings
    this->hessianThreshold = _threshold;
    this->nOctaves = _nOctaves;
    this->nOctaveLayers = _nOctaveLayers;

    // descriptor settings
    this->extended = _extended;

    // buffer sizing
    this->keypointsRatio = _keypointsRatio;
}
int cv::gpu::SURF_GPU::descriptorSize() const
{
return extended ? 128 : 64;
......@@ -281,27 +264,64 @@ void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMa
{
Mat keypointsCPU(1, keypoints.size(), CV_32FC(6));
const KeyPoint* keypoints_ptr = &keypoints[0];
KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr<KeyPoint_GPU>();
for (size_t i = 0; i < keypoints.size(); ++i, ++keypoints_ptr, ++keypointsCPU_ptr)
for (size_t i = 0; i < keypoints.size(); ++i)
{
const KeyPoint& kp = *keypoints_ptr;
KeyPoint_GPU& gkp = *keypointsCPU_ptr;
const KeyPoint& kp = keypoints[i];
KeyPoint_GPU& gkp = keypointsCPU.ptr<KeyPoint_GPU>()[i];
gkp.x = kp.pt.x;
gkp.y = kp.pt.y;
gkp.laplacian = 1.0f;
gkp.size = kp.size;
gkp.octave = static_cast<float>(kp.octave);
gkp.angle = kp.angle;
gkp.response = kp.response;
gkp.dir = kp.angle;
gkp.hessian = kp.response;
}
keypointsGPU.upload(keypointsCPU);
}
}
namespace
{
    // Wavelet size used for the given layer of the given octave:
    // (9 + 6 * layer), doubled once per octave.
    int calcSize(int octave, int layer)
    {
        /* Wavelet size at first layer of first octave. */
        const int HAAR_SIZE0 = 9;

        /* Wavelet size increment between layers. It is kept even so that all
           wavelet sizes inside one octave share the same parity, which keeps
           the layers above and below a sample aligned when its neighbours
           are looked up. */
        const int HAAR_SIZE_INC = 6;

        const int firstOctaveSize = HAAR_SIZE0 + HAAR_SIZE_INC * layer;
        return firstOctaveSize << octave;
    }

    // Returns the octave whose wavelet size (over all of its layers) is
    // closest to kpt.size; stops early on an exact match.
    // NOTE(review): the search starts at octave 1, so 0 is returned only when
    // no candidate is examined (e.g. params.nOctaves <= 1) - confirm that
    // skipping octave 0 is intended.
    int getPointOctave(const KeyPoint_GPU& kpt, const CvSURFParams& params)
    {
        int nearestOctave = 0;
        float smallestGap = numeric_limits<float>::max();

        for (int octave = 1; octave < params.nOctaves; ++octave)
        {
            for (int layer = 0; layer < params.nOctaveLayers; ++layer)
            {
                const float gap = std::abs(kpt.size - (float)calcSize(octave, layer));

                // only a strictly smaller gap replaces the current best
                if (!(gap < smallestGap))
                    continue;

                smallestGap = gap;
                nearestOctave = octave;

                // exact match: nothing can be closer
                if (gap == 0)
                    return octave;
            }
        }

        return nearestOctave;
    }
}
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints)
{
if (keypointsGPU.empty())
......@@ -313,21 +333,23 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<Key
Mat keypointsCPU = keypointsGPU;
keypoints.resize(keypointsGPU.cols);
KeyPoint* keypoints_ptr = &keypoints[0];
const KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr<KeyPoint_GPU>();
for (int i = 0; i < keypointsGPU.cols; ++i, ++keypoints_ptr, ++keypointsCPU_ptr)
for (int i = 0; i < keypointsGPU.cols; ++i)
{
KeyPoint& kp = *keypoints_ptr;
const KeyPoint_GPU& gkp = *keypointsCPU_ptr;
KeyPoint& kp = keypoints[i];
const KeyPoint_GPU& gkp = keypointsCPU.ptr<KeyPoint_GPU>()[i];
kp.pt.x = gkp.x;
kp.pt.y = gkp.y;
kp.size = gkp.size;
kp.octave = static_cast<int>(gkp.octave);
kp.angle = gkp.angle;
kp.response = gkp.response;
kp.angle = gkp.dir;
kp.response = gkp.hessian;
kp.octave = getPointOctave(gkp, *this);
kp.class_id = static_cast<int>(gkp.laplacian);
}
}
}
......@@ -353,23 +375,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat
SURF_GPU_Invoker surf(*this, img, mask);
surf.detectKeypoints(keypoints);
surf.findOrientation(keypoints);
}
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
bool useProvidedKeypoints, bool calcOrientation)
bool useProvidedKeypoints)
{
if (!img.empty())
{
SURF_GPU_Invoker surf(*this, img, mask);
if (!useProvidedKeypoints)
surf.detectKeypoints(keypoints);
if (calcOrientation)
surf.findOrientation(keypoints);
else
{
GpuMat keypointsBuf;
surf.findOrientation(keypoints, keypointsBuf);
keypointsBuf.copyTo(keypoints);
}
surf.computeDescriptors(keypoints, descriptors, descriptorSize());
}
......@@ -385,24 +408,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints,
GpuMat& descriptors, bool useProvidedKeypoints, bool calcOrientation)
GpuMat& descriptors, bool useProvidedKeypoints)
{
GpuMat keypointsGPU;
if (useProvidedKeypoints)
uploadKeypoints(keypoints, keypointsGPU);
(*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints, calcOrientation);
(*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);
downloadKeypoints(keypointsGPU, keypoints);
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints,
vector<float>& descriptors, bool useProvidedKeypoints, bool calcOrientation)
vector<float>& descriptors, bool useProvidedKeypoints)
{
GpuMat descriptorsGPU;
(*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints, calcOrientation);
(*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);
downloadDescriptors(descriptorsGPU, descriptors);
}
......
......@@ -48,7 +48,6 @@ using namespace std;
const string FEATURES2D_DIR = "features2d";
const string IMAGE_FILENAME = "aloe.png";
const string VALID_FILE_NAME = "surf.xml.gz";
class CV_GPU_SURFTest : public cvtest::BaseTest
{
......@@ -59,17 +58,20 @@ public:
protected:
bool isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2);
int getValidCount(const vector<KeyPoint>& keypoints1, const vector<KeyPoint>& keypoints2, const vector<DMatch>& matches);
void compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints,
const Mat& validDescriptors, const Mat& calcDescriptors);
void emptyDataTest(SURF_GPU& fdetector);
void regressionTest(SURF_GPU& fdetector);
void emptyDataTest();
void accuracyTest();
virtual void run(int);
};
void CV_GPU_SURFTest::emptyDataTest(SURF_GPU& fdetector)
void CV_GPU_SURFTest::emptyDataTest()
{
SURF_GPU fdetector;
GpuMat image;
vector<KeyPoint> keypoints;
vector<float> descriptors;
......@@ -114,116 +116,80 @@ bool CV_GPU_SURFTest::isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2)
p1.class_id == p2.class_id );
}
void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints,
const Mat& validDescriptors, const Mat& calcDescriptors)
int CV_GPU_SURFTest::getValidCount(const vector<KeyPoint>& keypoints1, const vector<KeyPoint>& keypoints2,
const vector<DMatch>& matches)
{
if (validKeypoints.size() != calcKeypoints.size())
int count = 0;
for (size_t i = 0; i < matches.size(); ++i)
{
ts->printf(cvtest::TS::LOG, "Keypoints sizes doesn't equal (validCount = %d, calcCount = %d).\n",
validKeypoints.size(), calcKeypoints.size());
ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
return;
const DMatch& m = matches[i];
const KeyPoint& kp1 = keypoints1[m.queryIdx];
const KeyPoint& kp2 = keypoints2[m.trainIdx];
if (isSimilarKeypoints(kp1, kp2))
++count;
}
if (validDescriptors.size() != calcDescriptors.size())
return count;
}
void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints,
const Mat& validDescriptors, const Mat& calcDescriptors)
{
BruteForceMatcher< L2<float> > matcher;
vector<DMatch> matches;
matcher.match(validDescriptors, calcDescriptors, matches);
int validCount = getValidCount(validKeypoints, calcKeypoints, matches);
float validRatio = (float)validCount / matches.size();
if (validRatio < 0.5f)
{
ts->printf(cvtest::TS::LOG, "Descriptors sizes doesn't equal.\n");
ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
ts->printf(cvtest::TS::LOG, "Bad accuracy - %f.\n", validRatio);
ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );
return;
}
for (size_t v = 0; v < validKeypoints.size(); v++)
{
int nearestIdx = -1;
float minDist = std::numeric_limits<float>::max();
for (size_t c = 0; c < calcKeypoints.size(); c++)
{
float curDist = (float)norm(calcKeypoints[c].pt - validKeypoints[v].pt);
if (curDist < minDist)
{
minDist = curDist;
nearestIdx = c;
}
}
assert(minDist >= 0);
if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))
{
ts->printf(cvtest::TS::LOG, "Bad keypoints accuracy.\n");
ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );
return;
}
if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.5f)
{
ts->printf(cvtest::TS::LOG, "Bad descriptors accuracy.\n");
ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );
return;
}
}
}
void CV_GPU_SURFTest::regressionTest(SURF_GPU& fdetector)
void CV_GPU_SURFTest::accuracyTest()
{
string imgFilename = string(ts->get_data_path()) + FEATURES2D_DIR + "/" + IMAGE_FILENAME;
string resFilename = string(ts->get_data_path()) + FEATURES2D_DIR + "/" + VALID_FILE_NAME;
// Read the test image.
GpuMat image(imread(imgFilename, 0));
Mat image = imread(imgFilename, 0);
if (image.empty())
{
ts->printf( cvtest::TS::LOG, "Image %s can not be read.\n", imgFilename.c_str() );
ts->set_failed_test_info( cvtest::TS::FAIL_INVALID_TEST_DATA );
return;
}
FileStorage fs(resFilename, FileStorage::READ);
Mat mask(image.size(), CV_8UC1, Scalar::all(1));
mask(Range(0, image.rows / 2), Range(0, image.cols / 2)).setTo(Scalar::all(0));
// Compute keypoints.
GpuMat mask(image.size(), CV_8UC1, Scalar::all(1));
mask(Range(0, image.rows / 2), Range(0, image.cols / 2)).setTo(Scalar::all(0));
vector<KeyPoint> calcKeypoints;
GpuMat calcDespcriptors;
fdetector(image, mask, calcKeypoints, calcDespcriptors);
if (fs.isOpened()) // Compare computed and valid keypoints.
{
// Read validation keypoints set.
vector<KeyPoint> validKeypoints;
Mat validDespcriptors;
read(fs["keypoints"], validKeypoints);
read(fs["descriptors"], validDespcriptors);
if (validKeypoints.empty() || validDespcriptors.empty())
{
ts->printf(cvtest::TS::LOG, "Validation file can not be read.\n");
ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
return;
}
compareKeypointSets(validKeypoints, calcKeypoints, validDespcriptors, calcDespcriptors);
}
else // Write detector parameters and computed keypoints as validation data.
{
fs.open(resFilename, FileStorage::WRITE);
if (!fs.isOpened())
{
ts->printf(cvtest::TS::LOG, "File %s can not be opened to write.\n", resFilename.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
return;
}
else
{
write(fs, "keypoints", calcKeypoints);
write(fs, "descriptors", (Mat)calcDespcriptors);
}
}
GpuMat calcDescriptors;
SURF_GPU fdetector; fdetector.extended = false;
fdetector(GpuMat(image), GpuMat(mask), calcKeypoints, calcDescriptors);
// Calc validation keypoints set.
vector<KeyPoint> validKeypoints;
vector<float> validDescriptors;
SURF fdetector_gold; fdetector_gold.extended = false;
fdetector_gold(image, mask, validKeypoints, validDescriptors);
compareKeypointSets(validKeypoints, calcKeypoints,
Mat(validKeypoints.size(), fdetector_gold.descriptorSize(), CV_32F, &validDescriptors[0]), calcDescriptors);
}
void CV_GPU_SURFTest::run( int /*start_from*/ )
{
SURF_GPU fdetector;
emptyDataTest(fdetector);
regressionTest(fdetector);
emptyDataTest();
accuracyTest();
}
TEST(SURF, empty_data_and_regression) { CV_GPU_SURFTest test; test.safe_run(); }
TEST(SURF, empty_data_and_accuracy) { CV_GPU_SURFTest test; test.safe_run(); }
......@@ -264,10 +264,11 @@ TEST(SURF)
SURF surf;
vector<KeyPoint> keypoints1, keypoints2;
vector<float> descriptors1, descriptors2;
CPU_ON;
surf(src1, Mat(), keypoints1);
surf(src2, Mat(), keypoints2);
surf(src1, Mat(), keypoints1, descriptors1);
surf(src2, Mat(), keypoints2, descriptors2);
CPU_OFF;
gpu::SURF_GPU d_surf;
......@@ -275,8 +276,8 @@ TEST(SURF)
gpu::GpuMat d_descriptors1, d_descriptors2;
GPU_ON;
d_surf(d_src1, gpu::GpuMat(), d_keypoints1);
d_surf(d_src2, gpu::GpuMat(), d_keypoints2);
d_surf(d_src1, gpu::GpuMat(), d_keypoints1, d_descriptors1);
d_surf(d_src2, gpu::GpuMat(), d_keypoints2, d_descriptors2);
GPU_OFF;
}
......
......@@ -51,10 +51,10 @@ int main(int argc, char* argv[])
vector<KeyPoint> keypoints1, keypoints2;
vector<float> descriptors1, descriptors2;
vector<DMatch> matches;
SURF_GPU::downloadKeypoints(keypoints1GPU, keypoints1);
SURF_GPU::downloadKeypoints(keypoints2GPU, keypoints2);
SURF_GPU::downloadDescriptors(descriptors1GPU, descriptors1);
SURF_GPU::downloadDescriptors(descriptors2GPU, descriptors2);
surf.downloadKeypoints(keypoints1GPU, keypoints1);
surf.downloadKeypoints(keypoints2GPU, keypoints2);
surf.downloadDescriptors(descriptors1GPU, descriptors1);
surf.downloadDescriptors(descriptors2GPU, descriptors2);
BruteForceMatcher_GPU< L2<float> >::matchDownload(trainIdx, distance, matches);
// drawing the results
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment