Commit 58f69197 authored by Vladislav Vinogradov's avatar Vladislav Vinogradov

made GPU version of SURF more consistent with CPU one

parent c067c633
...@@ -1537,83 +1537,55 @@ namespace cv ...@@ -1537,83 +1537,55 @@ namespace cv
}; };
////////////////////////////////// SURF ////////////////////////////////////////// ////////////////////////////////// SURF //////////////////////////////////////////
struct CV_EXPORTS SURFParams_GPU
{
SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), initialScale(2.f),
l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), l4(1.f/1.5f),
edgeScale(0.81f), initialStep(1), extended(true), featuresRatio(0.01f) {}
//! The interest operator threshold
float threshold;
//! The number of octaves to process
int nOctaves;
//! The number of intervals in each octave
int nIntervals;
//! The scale associated with the first interval of the first octave
float initialScale;
//! mask parameter l_1
float l1;
//! mask parameter l_2
float l2;
//! mask parameter l_3
float l3;
//! mask parameter l_4
float l4;
//! The amount to scale the edge rejection mask
float edgeScale;
//! The initial sampling step in pixels.
int initialStep;
//! True, if generate 128-len descriptors, false - 64-len descriptors
bool extended;
//! max features = featuresRatio * img.size().srea()
float featuresRatio;
};
class CV_EXPORTS SURF_GPU : public SURFParams_GPU class CV_EXPORTS SURF_GPU : public CvSURFParams
{ {
public: public:
//! the default constructor
SURF_GPU();
//! the full constructor taking all the necessary parameters
explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4,
int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f);
//! returns the descriptor size in float's (64 or 128) //! returns the descriptor size in float's (64 or 128)
int descriptorSize() const; int descriptorSize() const;
//! upload host keypoints to device memory //! upload host keypoints to device memory
static void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU); void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);
//! download keypoints from device to host memory //! download keypoints from device to host memory
static void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints); void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);
//! download descriptors from device to host memory //! download descriptors from device to host memory
static void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors); void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);
//! finds the keypoints using fast hessian detector used in SURF //! finds the keypoints using fast hessian detector used in SURF
//! supports CV_8UC1 images //! supports CV_8UC1 images
//! keypoints will have 1 row and type CV_32FC(6) //! keypoints will have 1 row and type CV_32FC(6)
//! keypoints.at<float[6]>(1, i) contains i'th keypoint //! keypoints.at<float[6]>(1, i) contains i'th keypoint
//! format: (x, y, size, response, angle, octave) //! format: (x, y, laplacian, size, dir, hessian)
void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints); void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);
//! finds the keypoints and computes their descriptors. //! finds the keypoints and computes their descriptors.
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
bool useProvidedKeypoints = false, bool calcOrientation = true); bool useProvidedKeypoints = false);
void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints); void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors, void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
bool useProvidedKeypoints = false, bool calcOrientation = true); bool useProvidedKeypoints = false);
void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors, void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
bool useProvidedKeypoints = false, bool calcOrientation = true); bool useProvidedKeypoints = false);
//! max keypoints = keypointsRatio * img.size().area()
float keypointsRatio;
GpuMat sum; GpuMat sum, mask1, maskSum, intBuffer;
GpuMat sumf;
GpuMat mask1; GpuMat det, trace;
GpuMat maskSum;
GpuMat hessianBuffer;
GpuMat maxPosBuffer; GpuMat maxPosBuffer;
GpuMat featuresBuffer; GpuMat featuresBuffer;
GpuMat keypointsBuffer;
}; };
} }
......
...@@ -111,20 +111,20 @@ namespace cv ...@@ -111,20 +111,20 @@ namespace cv
{ {
float x; float x;
float y; float y;
float laplacian;
float size; float size;
float response; float dir;
float angle; float hessian;
float octave;
}; };
enum KeypointLayout enum KeypointLayout
{ {
SF_X, SF_X,
SF_Y, SF_Y,
SF_LAPLACIAN,
SF_SIZE, SF_SIZE,
SF_RESPONSE, SF_DIR,
SF_ANGLE, SF_HESSIAN,
SF_OCTAVE,
SF_FEATURE_STRIDE SF_FEATURE_STRIDE
}; };
} }
......
...@@ -55,52 +55,6 @@ using namespace cv::gpu::device; ...@@ -55,52 +55,6 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace surf namespace cv { namespace gpu { namespace surf
{ {
////////////////////////////////////////////////////////////////////////
// Help funcs
// Wrapper for host reference to pass into kernel
template <typename T>
class DeviceReference
{
public:
explicit DeviceReference(T& host_val) : d_ptr(0), h_ptr(&host_val)
{
cudaSafeCall( cudaMalloc((void**)&d_ptr, sizeof(T)) );
cudaSafeCall( cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice) );
}
~DeviceReference()
{
cudaSafeCall( cudaMemcpy(h_ptr, d_ptr, sizeof(T), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaFree(d_ptr) );
}
// Casting to device pointer
operator T*() {return d_ptr;}
operator const T*() const {return d_ptr;}
private:
T* d_ptr;
T* h_ptr;
};
__device__ void clearLastBit(int* f)
{
*f &= ~0x1;
}
__device__ void clearLastBit(float& f)
{
clearLastBit((int*)&f);
}
__device__ void setLastBit(int* f)
{
*f |= 0x1;
}
__device__ void setLastBit(float& f)
{
setLastBit((int*)&f);
}
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Global parameters // Global parameters
...@@ -108,215 +62,126 @@ namespace cv { namespace gpu { namespace surf ...@@ -108,215 +62,126 @@ namespace cv { namespace gpu { namespace surf
__constant__ int c_max_candidates; __constant__ int c_max_candidates;
// The maximum number of features that memory is reserved for. // The maximum number of features that memory is reserved for.
__constant__ int c_max_features; __constant__ int c_max_features;
// The number of intervals in the octave. // The maximum number of keypoints that memory is reserved for.
__constant__ int c_nIntervals; __constant__ int c_max_keypoints;
// Mask sizes derived from the mask parameters // The image size.
__constant__ float c_mask_width; __constant__ int c_img_rows;
// Mask sizes derived from the mask parameters __constant__ int c_img_cols;
__constant__ float c_mask_height; // The number of layers.
// Mask sizes derived from the mask parameters __constant__ int c_nOctaveLayers;
__constant__ float c_dxy_center_offset; // The hessian threshold.
// Mask sizes derived from the mask parameters __constant__ float c_hessianThreshold;
__constant__ float c_dxy_half_width;
// Mask sizes derived from the mask parameters // The current octave.
__constant__ float c_dxy_scale;
// The scale associated with the first interval of the first octave
__constant__ float c_initialScale;
// The interest operator threshold
__constant__ float c_threshold;
// Ther octave
__constant__ int c_octave; __constant__ int c_octave;
// The width of the octave buffer. // The current layer size.
__constant__ int c_x_size; __constant__ int c_layer_rows;
// The height of the octave buffer. __constant__ int c_layer_cols;
__constant__ int c_y_size;
// The size of the octave border in pixels.
__constant__ int c_border;
// The step size used in this octave in pixels.
__constant__ int c_step;
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Integral image texture // Integral image texture
texture<float, 2, cudaReadModeElementType> sumTex(0, cudaFilterModeLinear, cudaAddressModeClamp); typedef texture<unsigned int, 2, cudaReadModeElementType> IntTex;
__device__ float iiAreaLookupCDHalfWH(float cx, float cy, float halfWidth, float halfHeight) IntTex sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
{
float result = 0.f;
result += tex2D(sumTex, cx - halfWidth, cy - halfHeight);
result -= tex2D(sumTex, cx + halfWidth, cy - halfHeight);
result -= tex2D(sumTex, cx - halfWidth, cy + halfHeight);
result += tex2D(sumTex, cx + halfWidth, cy + halfHeight);
return result;
}
////////////////////////////////////////////////////////////////////////
// Hessian
__device__ float evalDyy(float x, float y, float t, float mask_width, float mask_height, float fscale) template <int N>
__device__ float icvCalcHaarPattern(const IntTex& tex, const float src[][5], int oldSize, int newSize, int y, int x)
{ {
float Dyy = 0.f; #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
typedef double real_t;
Dyy += iiAreaLookupCDHalfWH(x, y, mask_width, mask_height); #else
Dyy -= t * iiAreaLookupCDHalfWH(x, y, mask_width, fscale); typedef float real_t;
#endif
Dyy *= 1.0f / (fscale * fscale);
return Dyy; float ratio = (float)newSize / oldSize;
}
real_t d = 0;
__device__ float evalDxx(float x, float y, float t, float mask_width, float mask_height, float fscale) #pragma unroll
{ for (int k = 0; k < N; ++k)
float Dxx = 0.f; {
int dx1 = __float2int_rn(ratio * src[k][0]);
int dy1 = __float2int_rn(ratio * src[k][1]);
int dx2 = __float2int_rn(ratio * src[k][2]);
int dy2 = __float2int_rn(ratio * src[k][3]);
Dxx += iiAreaLookupCDHalfWH(x, y, mask_height, mask_width); real_t t = 0;
Dxx -= t * iiAreaLookupCDHalfWH(x, y, fscale , mask_width); t += tex2D(tex, x + dx1, y + dy1);
t -= tex2D(tex, x + dx1, y + dy2);
t -= tex2D(tex, x + dx2, y + dy1);
t += tex2D(tex, x + dx2, y + dy2);
Dxx *= 1.0f / (fscale * fscale); d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
return Dxx; return (float)d;
} }
__device__ float evalDxy(float x, float y, float fscale) ////////////////////////////////////////////////////////////////////////
{ // Hessian
float center_offset = c_dxy_center_offset * fscale;
float half_width = c_dxy_half_width * fscale;
float Dxy = 0.f;
Dxy += iiAreaLookupCDHalfWH(x - center_offset, y - center_offset, half_width, half_width);
Dxy -= iiAreaLookupCDHalfWH(x - center_offset, y + center_offset, half_width, half_width);
Dxy += iiAreaLookupCDHalfWH(x + center_offset, y + center_offset, half_width, half_width);
Dxy -= iiAreaLookupCDHalfWH(x + center_offset, y - center_offset, half_width, half_width);
Dxy *= 1.0f / (fscale * fscale);
return Dxy; __constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
} __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
__constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
__device__ float calcScale(int hidx_z) __host__ __device__ int calcSize(int octave, int layer)
{ {
float d = (c_initialScale * (1 << c_octave)) / (c_nIntervals - 2); /* Wavelet size at first layer of first octave. */
return c_initialScale * (1 << c_octave) + d * (hidx_z - 1.0f) + 0.5f; const int HAAR_SIZE0 = 9;
}
__global__ void fasthessian(PtrStepf hessianBuffer)
{
// Determine the indices in the Hessian buffer
int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;
int hidx_y = threadIdx.y + blockIdx.y * blockDim.y;
int hidx_z = threadIdx.z;
float fscale = calcScale(hidx_z);
// Compute the lookup location of the mask center /* Wavelet size increment between layers. This should be an even number,
float x = hidx_x * c_step + c_border; such that the wavelet sizes in an octave are either all even or all odd.
float y = hidx_y * c_step + c_border; This ensures that when looking for the neighbours of a sample, the layers
above and below are aligned correctly. */
const int HAAR_SIZE_INC = 6;
// Scale the mask dimensions according to the scale return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
{
float mask_width = c_mask_width * fscale;
float mask_height = c_mask_height * fscale;
// Compute the filter responses
float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxy = evalDxy(x, y, fscale);
// Combine the responses and store the Laplacian sign
float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);
if (Dxx + Dyy > 0.f)
setLastBit(result);
else
clearLastBit(result);
hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
}
} }
__global__ void fasthessian_old(PtrStepf hessianBuffer) __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
{ {
// Determine the indices in the Hessian buffer // Determine the indices
int gridDim_y = gridDim.y / c_nIntervals; const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
int blockIdx_y = blockIdx.y % gridDim_y; const int blockIdx_y = blockIdx.y % gridDim_y;
int blockIdx_z = blockIdx.y / gridDim_y; const int blockIdx_z = blockIdx.y / gridDim_y;
int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;
int hidx_y = threadIdx.y + blockIdx_y * blockDim.y;
int hidx_z = blockIdx_z;
float fscale = calcScale(hidx_z); const int j = threadIdx.x + blockIdx.x * blockDim.x;
const int i = threadIdx.y + blockIdx_y * blockDim.y;
const int layer = blockIdx_z;
// Compute the lookup location of the mask center const int size = calcSize(c_octave, layer);
float x = hidx_x * c_step + c_border;
float y = hidx_y * c_step + c_border;
// Scale the mask dimensions according to the scale const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals) const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
{
float mask_width = c_mask_width * fscale;
float mask_height = c_mask_height * fscale;
// Compute the filter responses
float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxy = evalDxy(x, y, fscale);
// Combine the responses and store the Laplacian sign /* Ignore pixels where some of the kernel is outside the image */
float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy); const int margin = (size >> 1) >> c_octave;
if (Dxx + Dyy > 0.f) if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
setLastBit(result); {
else const float dx = icvCalcHaarPattern<3>(sumTex, c_DX , 9, size, i << c_octave, j << c_octave);
clearLastBit(result); const float dy = icvCalcHaarPattern<3>(sumTex, c_DY , 9, size, i << c_octave, j << c_octave);
const float dxy = icvCalcHaarPattern<4>(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);
hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result; det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
} }
} }
dim3 calcBlockSize(int nIntervals) void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers)
{
int threadsPerBlock = 512;
dim3 threads;
threads.z = nIntervals;
threadsPerBlock /= nIntervals;
if (threadsPerBlock >= 48)
threads.x = 16;
else
threads.x = 8;
threadsPerBlock /= threads.x;
threads.y = threadsPerBlock;
return threads;
}
void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads)
{ {
dim3 grid; const int min_size = calcSize(octave, 0);
grid.x = divUp(x_size, threads.x); const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
grid.y = divUp(y_size, threads.y); const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
fasthessian<<<grid, threads>>>(hessianBuffer);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld)
{
dim3 threads(16, 16); dim3 threads(16, 16);
dim3 grid; dim3 grid;
grid.x = divUp(x_size, threads.x); grid.x = divUp(max_samples_j, threads.x);
grid.y = divUp(y_size, threads.y) * threadsOld.z; grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
fasthessian_old<<<grid, threads>>>(hessianBuffer); icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -325,109 +190,113 @@ namespace cv { namespace gpu { namespace surf ...@@ -325,109 +190,113 @@ namespace cv { namespace gpu { namespace surf
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// NONMAX // NONMAX
texture<int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp); IntTex maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
struct WithOutMask struct WithOutMask
{ {
static __device__ bool check(float, float, float) static __device__ bool check(int, int, int)
{ {
return true; return true;
} }
}; };
__constant__ float c_DM[1][5] = {{0, 0, 9, 9, 1}};
struct WithMask struct WithMask
{ {
static __device__ bool check(float x, float y, float fscale) static __device__ bool check(int sum_i, int sum_j, int size)
{ {
float half_width = fscale / 2; float mval = icvCalcHaarPattern<1>(maskSumTex, c_DM , 9, size, sum_i, sum_j);
return (mval >= 0.5);
float result = 0.f;
result += tex2D(maskSumTex, x - half_width, y - half_width);
result -= tex2D(maskSumTex, x + half_width, y - half_width);
result -= tex2D(maskSumTex, x - half_width, y + half_width);
result += tex2D(maskSumTex, x + half_width, y + half_width);
result /= (fscale * fscale);
return (result >= 0.5f);
} }
}; };
template <typename Mask> template <typename Mask>
__global__ void nonmaxonly(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int* maxCounter) __global__ void icvFindMaximaInLayer(PtrStepf det, PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
{ {
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
extern __shared__ float fh_vals[]; extern __shared__ float N9[];
// The hidx variables are the indices to the hessian buffer. // The hidx variables are the indices to the hessian buffer.
int hidx_x = threadIdx.x + blockIdx.x * (blockDim.x - 2); const int gridDim_y = gridDim.y / c_nOctaveLayers;
int hidx_y = threadIdx.y + blockIdx.y * (blockDim.y - 2); const int blockIdx_y = blockIdx.y % gridDim_y;
int hidx_z = threadIdx.z; const int blockIdx_z = blockIdx.y / gridDim_y;
int localLin = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
// Is this thread within the hessian buffer? const int layer = blockIdx_z + 1;
if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
{ const int size = calcSize(c_octave, layer);
fh_vals[localLin] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x];
}
__syncthreads();
// Is this location one of the ones being processed for nonmax suppression. /* Ignore pixels without a 3x3x3 neighbourhood in the layer above */
// Blocks overlap by one so we don't process the border threads. const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
bool inBounds2 = threadIdx.x > 0 && threadIdx.x < blockDim.x-1 && hidx_x < c_x_size - 1
&& threadIdx.y > 0 && threadIdx.y < blockDim.y-1 && hidx_y < c_y_size - 1
&& threadIdx.z > 0 && threadIdx.z < blockDim.z-1;
float val = fh_vals[localLin]; const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;
const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;
// Compute the lookup location of the mask center // Is this thread within the hessian buffer?
float x = hidx_x * c_step + c_border; const int zoff = blockDim.x * blockDim.y;
float y = hidx_y * c_step + c_border; const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;
float fscale = calcScale(hidx_z); N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + i)[j];
N9[localLin ] = det.ptr(c_layer_rows * (layer ) + i)[j];
N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + i)[j];
__syncthreads();
if (inBounds2 && val >= c_threshold && Mask::check(x, y, fscale)) if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)
{ {
// Check to see if we have a max (in its 26 neighbours) float val0 = N9[localLin];
int zoff = blockDim.x * blockDim.y;
bool condmax = val > fh_vals[localLin + 1] if (val0 > c_hessianThreshold)
&& val > fh_vals[localLin - 1]
&& val > fh_vals[localLin - blockDim.x + 1]
&& val > fh_vals[localLin - blockDim.x ]
&& val > fh_vals[localLin - blockDim.x - 1]
&& val > fh_vals[localLin + blockDim.x + 1]
&& val > fh_vals[localLin + blockDim.x ]
&& val > fh_vals[localLin + blockDim.x - 1]
&& val > fh_vals[localLin - zoff + 1]
&& val > fh_vals[localLin - zoff ]
&& val > fh_vals[localLin - zoff - 1]
&& val > fh_vals[localLin - zoff - blockDim.x + 1]
&& val > fh_vals[localLin - zoff - blockDim.x ]
&& val > fh_vals[localLin - zoff - blockDim.x - 1]
&& val > fh_vals[localLin - zoff + blockDim.x + 1]
&& val > fh_vals[localLin - zoff + blockDim.x ]
&& val > fh_vals[localLin - zoff + blockDim.x - 1]
&& val > fh_vals[localLin + zoff + 1]
&& val > fh_vals[localLin + zoff ]
&& val > fh_vals[localLin + zoff - 1]
&& val > fh_vals[localLin + zoff - blockDim.x + 1]
&& val > fh_vals[localLin + zoff - blockDim.x ]
&& val > fh_vals[localLin + zoff - blockDim.x - 1]
&& val > fh_vals[localLin + zoff + blockDim.x + 1]
&& val > fh_vals[localLin + zoff + blockDim.x ]
&& val > fh_vals[localLin + zoff + blockDim.x - 1]
;
if(condmax)
{ {
unsigned int i = atomicInc(maxCounter,(unsigned int) -1); // Coordinates for the start of the wavelet in the sum image. There
// is some integer division involved, so don't try to simplify this
// (cancel out sampleStep) without checking the result is the same
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (i < c_max_candidates) if (Mask::check(sum_i, sum_j, size))
{ {
int4 f = {hidx_x, hidx_y, threadIdx.z, c_octave}; // Check to see if we have a max (in its 26 neighbours)
maxPosBuffer[i] = f; const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
&& val0 > N9[localLin - blockDim.x - zoff]
&& val0 > N9[localLin + 1 - blockDim.x - zoff]
&& val0 > N9[localLin - 1 - zoff]
&& val0 > N9[localLin - zoff]
&& val0 > N9[localLin + 1 - zoff]
&& val0 > N9[localLin - 1 + blockDim.x - zoff]
&& val0 > N9[localLin + blockDim.x - zoff]
&& val0 > N9[localLin + 1 + blockDim.x - zoff]
&& val0 > N9[localLin - 1 - blockDim.x]
&& val0 > N9[localLin - blockDim.x]
&& val0 > N9[localLin + 1 - blockDim.x]
&& val0 > N9[localLin - 1 ]
&& val0 > N9[localLin + 1 ]
&& val0 > N9[localLin - 1 + blockDim.x]
&& val0 > N9[localLin + blockDim.x]
&& val0 > N9[localLin + 1 + blockDim.x]
&& val0 > N9[localLin - 1 - blockDim.x + zoff]
&& val0 > N9[localLin - blockDim.x + zoff]
&& val0 > N9[localLin + 1 - blockDim.x + zoff]
&& val0 > N9[localLin - 1 + zoff]
&& val0 > N9[localLin + zoff]
&& val0 > N9[localLin + 1 + zoff]
&& val0 > N9[localLin - 1 + blockDim.x + zoff]
&& val0 > N9[localLin + blockDim.x + zoff]
&& val0 > N9[localLin + 1 + blockDim.x + zoff]
;
if(condmax)
{
unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);
if (ind < c_max_candidates)
{
const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);
maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);
}
}
} }
} }
} }
...@@ -435,21 +304,26 @@ namespace cv { namespace gpu { namespace surf ...@@ -435,21 +304,26 @@ namespace cv { namespace gpu { namespace surf
#endif #endif
} }
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int x_size, int y_size, bool use_mask, const dim3& threads) int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
{ {
dim3 grid; const int layer_rows = img_rows >> octave;
grid.x = divUp(x_size, threads.x - 2); const int layer_cols = img_cols >> octave;
grid.y = divUp(y_size, threads.y - 2);
int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
dim3 threads(16, 16);
const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float); dim3 grid;
grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);
grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;
DeviceReference<unsigned int> maxCounterWrapper(maxCounter); const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
if (use_mask) if (use_mask)
nonmaxonly<WithMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper); icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
else else
nonmaxonly<WithOutMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper); icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
...@@ -459,166 +333,117 @@ namespace cv { namespace gpu { namespace surf ...@@ -459,166 +333,117 @@ namespace cv { namespace gpu { namespace surf
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// INTERPOLATION // INTERPOLATION
#define MID_IDX 1 __global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
__global__ void fh_interp_extremum(PtrStepf hessianBuffer, const int4* maxPosBuffer,
KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
{ {
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
int hidx_x = maxPosBuffer[blockIdx.x].x - 1 + threadIdx.x; const int4 maxPos = maxPosBuffer[blockIdx.x];
int hidx_y = maxPosBuffer[blockIdx.x].y - 1 + threadIdx.y;
int hidx_z = maxPosBuffer[blockIdx.x].z - 1 + threadIdx.z; const int j = maxPos.x - 1 + threadIdx.x;
const int i = maxPos.y - 1 + threadIdx.y;
const int layer = maxPos.z - 1 + threadIdx.z;
__shared__ float fh_vals[3][3][3]; __shared__ float N9[3][3][3];
__shared__ KeyPoint_GPU p; __shared__ KeyPoint_GPU p;
fh_vals[threadIdx.z][threadIdx.y][threadIdx.x] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x]; N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];
__syncthreads(); __syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
{ {
__shared__ float H[3][3]; __shared__ float dD[3];
//dxx
H[0][0] = fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ]
- 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]
+ fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ];
//dyy //dx
H[1][1] = fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1] dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
- 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ] //dy
+ fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1]; dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
//ds
dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
//dss __shared__ float H[3][3];
H[2][2] = fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ]
- 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]
+ fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ];
//dxx
H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
//dxy //dxy
H[0][1]= 0.25f* H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
(fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX + 1] -
fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX + 1] -
fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX - 1] +
fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX - 1]);
//dxs //dxs
H[0][2]= 0.25f* H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
(fh_vals[MID_IDX + 1][MID_IDX + 1][MID_IDX ] -
fh_vals[MID_IDX + 1][MID_IDX - 1][MID_IDX ] -
fh_vals[MID_IDX - 1][MID_IDX + 1][MID_IDX ] +
fh_vals[MID_IDX - 1][MID_IDX - 1][MID_IDX ]);
//dys
H[1][2]= 0.25f*
(fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX + 1] -
fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX - 1] -
fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX + 1] +
fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX - 1]);
//dyx = dxy //dyx = dxy
H[1][0] = H[0][1]; H[1][0] = H[0][1];
//dyy
H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
//dys
H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
//dsx = dxs //dsx = dxs
H[2][0] = H[0][2]; H[2][0] = H[0][2];
//dsy = dys //dsy = dys
H[2][1] = H[1][2]; H[2][1] = H[1][2];
//dss
H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
__shared__ float dD[3]; float det = H[0][0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1])
- H[0][1] * (H[1][0] * H[2][2] - H[1][2] * H[2][0])
+ H[0][2] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]);
//dx if (det != 0.0f)
dD[0] = 0.5f*(fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ] -
fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ]);
//dy
dD[1] = 0.5f*(fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1] -
fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1]);
//ds
dD[2] = 0.5f*(fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ] -
fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ]);
__shared__ float invdet;
invdet = 1.f /
(
H[0][0]*H[1][1]*H[2][2]
+ H[0][1]*H[1][2]*H[2][0]
+ H[0][2]*H[1][0]*H[2][1]
- H[0][0]*H[1][2]*H[2][1]
- H[0][1]*H[1][0]*H[2][2]
- H[0][2]*H[1][1]*H[2][0]
);
// // 1-based entries of a 3x3 inverse
// /* [ |a22 a23| |a12 a13| |a12 a13|] */
// /* [ |a32 a33| -|a32 a33| |a22 a23|] */
// /* [ ] */
// /* [ |a21 a23| |a11 a13| |a11 a13|] */
// /* A^(-1) = [-|a31 a33| |a31 a33| -|a21 a23|] / d */
// /* [ ] */
// /* [ |a21 a22| |a11 a12| |a11 a12|] */
// /* [ |a31 a32| -|a31 a32| |a21 a22|] */
__shared__ float Hinv[3][3];
Hinv[0][0] = invdet*(H[1][1]*H[2][2]-H[1][2]*H[2][1]);
Hinv[0][1] = -invdet*(H[0][1]*H[2][2]-H[0][2]*H[2][1]);
Hinv[0][2] = invdet*(H[0][1]*H[1][2]-H[0][2]*H[1][1]);
Hinv[1][0] = -invdet*(H[1][0]*H[2][2]-H[1][2]*H[2][0]);
Hinv[1][1] = invdet*(H[0][0]*H[2][2]-H[0][2]*H[2][0]);
Hinv[1][2] = -invdet*(H[0][0]*H[1][2]-H[0][2]*H[1][0]);
Hinv[2][0] = invdet*(H[1][0]*H[2][1]-H[1][1]*H[2][0]);
Hinv[2][1] = -invdet*(H[0][0]*H[2][1]-H[0][1]*H[2][0]);
Hinv[2][2] = invdet*(H[0][0]*H[1][1]-H[0][1]*H[1][0]);
__shared__ float x[3];
x[0] = -(Hinv[0][0]*(dD[0]) + Hinv[0][1]*(dD[1]) + Hinv[0][2]*(dD[2]));
x[1] = -(Hinv[1][0]*(dD[0]) + Hinv[1][1]*(dD[1]) + Hinv[1][2]*(dD[2]));
x[2] = -(Hinv[2][0]*(dD[0]) + Hinv[2][1]*(dD[1]) + Hinv[2][2]*(dD[2]));
if (fabs(x[0]) < 1.f && fabs(x[1]) < 1.f && fabs(x[2]) < 1.f)
{ {
// if the step is within the interpolation region, perform it float invdet = 1.0f / det;
// Get a new feature index. __shared__ float x[3];
unsigned int i = atomicInc(featureCounter, (unsigned int)-1);
if (i < c_max_features) x[0] = invdet *
(dD[0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1]) -
H[0][1] * (dD[1] * H[2][2] - H[1][2] * dD[2]) +
H[0][2] * (dD[1] * H[2][1] - H[1][1] * dD[2]));
x[1] = invdet *
(H[0][0] * (dD[1] * H[2][2] - H[1][2] * dD[2]) -
dD[0] * (H[1][0] * H[2][2] - H[1][2] * H[2][0]) +
H[0][2] * (H[1][0] * dD[2] - dD[1] * H[2][0]));
x[2] = invdet *
(H[0][0] * (H[1][1] * dD[2] - dD[1] * H[2][1]) -
H[0][1] * (H[1][0] * dD[2] - dD[1] * H[2][0]) +
dD[0] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]));
if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)
{ {
p.x = ((float)maxPosBuffer[blockIdx.x].x + x[1]) * (float)c_step + c_border; // if the step is within the interpolation region, perform it
p.y = ((float)maxPosBuffer[blockIdx.x].y + x[0]) * (float)c_step + c_border;
if (x[2] > 0) // Get a new feature index.
{ unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);
float a = calcScale(maxPosBuffer[blockIdx.x].z);
float b = calcScale(maxPosBuffer[blockIdx.x].z + 1);
p.size = (1.f - x[2]) * a + x[2] * b; if (ind < c_max_features)
}
else
{ {
float a = calcScale(maxPosBuffer[blockIdx.x].z); const int size = calcSize(c_octave, maxPos.z);
float b = calcScale(maxPosBuffer[blockIdx.x].z - 1);
p.size = (1.f + x[2]) * a - x[2] * b; const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
} const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
const float center_i = sum_i + (float)(size - 1) / 2;
const float center_j = sum_j + (float)(size - 1) / 2;
p.octave = c_octave; p.x = center_j + x[0] * (1 << c_octave);
p.y = center_i + x[1] * (1 << c_octave);
p.response = fh_vals[MID_IDX][MID_IDX][MID_IDX]; int ds = size - calcSize(c_octave, maxPos.z - 1);
p.size = roundf(size + x[2] * ds);
// Should we split up this transfer over many threads? p.laplacian = maxPos.w;
featuresBuffer[i] = p; p.dir = 0.0f;
} p.hessian = N9[1][1][1];
} // If the subpixel interpolation worked
// Should we split up this transfer over many threads?
featuresBuffer[ind] = p;
}
} // If the subpixel interpolation worked
}
} // If this is thread 0. } // If this is thread 0.
#endif #endif
} }
#undef MID_IDX
void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter)
{ {
dim3 threads; dim3 threads;
threads.x = 3; threads.x = 3;
...@@ -628,9 +453,7 @@ namespace cv { namespace gpu { namespace surf ...@@ -628,9 +453,7 @@ namespace cv { namespace gpu { namespace surf
dim3 grid; dim3 grid;
grid.x = maxCounter; grid.x = maxCounter;
DeviceReference<unsigned int> featureCounterWrapper(featureCounter); icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featuresBuffer, featureCounter);
fh_interp_extremum<<<grid, threads>>>(hessianBuffer, maxPosBuffer, featuresBuffer, featureCounterWrapper);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -639,139 +462,217 @@ namespace cv { namespace gpu { namespace surf ...@@ -639,139 +462,217 @@ namespace cv { namespace gpu { namespace surf
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Orientation // Orientation
// precomputed values for a Gaussian with a standard deviation of 2 #define ORI_SEARCH_INC 5
__constant__ float c_gauss1D[13] = #define ORI_WIN 60
{ #define ORI_SAMPLES 113
0.002215924206f, 0.008764150247f, 0.026995483257f, 0.064758797833f,
0.120985362260f, 0.176032663382f, 0.199471140201f, 0.176032663382f,
0.120985362260f, 0.064758797833f, 0.026995483257f, 0.008764150247f,
0.002215924206f
};
__global__ void find_orientation(KeyPoint_GPU* features) __constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
{ __constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
int tid = threadIdx.y * 17 + threadIdx.x; __constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};
int tid2 = numeric_limits_gpu<int>::max();
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
if (threadIdx.x < 13 && threadIdx.y < 13) __global__ void icvCalcOrientation(const KeyPoint_GPU* featureBuffer, KeyPoint_GPU* keypoints, unsigned int* keypointCounter)
{ {
tid2 = threadIdx.y * 13 + threadIdx.x; #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
}
__shared__ float texLookups[17][17]; __shared__ float s_X[128];
__shared__ float s_Y[128];
__shared__ float s_angle[128];
__shared__ float Edx[13*13]; __shared__ float s_sumx[64 * 4];
__shared__ float Edy[13*13]; __shared__ float s_sumy[64 * 4];
__shared__ float xys[3];
// Read my x, y, size. __shared__ float s_feature[6];
if (tid < 3)
{ if (threadIdx.x < 6 && threadIdx.y == 0)
xys[tid] = ((float*)(&features[blockIdx.x]))[tid]; s_feature[threadIdx.x] = ((float*)(&featureBuffer[blockIdx.x]))[threadIdx.x];
}
__syncthreads(); __syncthreads();
// Read all texture locations into memory
// Maybe I should use __mul24 here?
texLookups[threadIdx.x][threadIdx.y] = tex2D(sumTex, xys[SF_X] + ((int)threadIdx.x - 8) * xys[SF_SIZE],
xys[SF_Y] + ((int)threadIdx.y - 8) * xys[SF_SIZE]);
__syncthreads(); /* The sampling intervals and wavelet sized for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = s_feature[SF_SIZE] * 1.2f / 9.0f;
float dx = 0.f; /* To find the dominant orientation, the gradients in x and y are
float dy = 0.f; sampled in a circle of radius 6s using wavelets of size 4s.
We ensure the gradient wavelet size is even to ensure the
wavelet pattern is balanced and symmetric around its center */
const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
// Computes lookups for all points in a 13x13 lattice. // check when grad_wav_size is too big
// - SURF says to only use a circle, but the branching logic would slow it down if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
// - Gaussian weighting should reduce the effects of the outer points anyway
if (tid2 < 169)
{ {
dx -= texLookups[threadIdx.x ][threadIdx.y ]; // Calc X, Y, angle and store it to shared memory
dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ]; {
dx -= texLookups[threadIdx.x + 4][threadIdx.y ]; const int tid = threadIdx.y * blockDim.x + threadIdx.x;
dx += texLookups[threadIdx.x ][threadIdx.y + 4];
dx -= 2.f*texLookups[threadIdx.x + 2][threadIdx.y + 4];
dx += texLookups[threadIdx.x + 4][threadIdx.y + 4];
dy -= texLookups[threadIdx.x ][threadIdx.y ];
dy += 2.f*texLookups[threadIdx.x ][threadIdx.y + 2];
dy -= texLookups[threadIdx.x ][threadIdx.y + 4];
dy += texLookups[threadIdx.x + 4][threadIdx.y ];
dy -= 2.f*texLookups[threadIdx.x + 4][threadIdx.y + 2];
dy += texLookups[threadIdx.x + 4][threadIdx.y + 4];
float g = c_gauss1D[threadIdx.x] * c_gauss1D[threadIdx.y];
Edx[tid2] = dx * g;
Edy[tid2] = dy * g;
}
__syncthreads(); float X = 0.0f, Y = 0.0f, angle = 0.0f;
// This is a scan to get the summed dx, dy values. if (tid < ORI_SAMPLES)
// Gets 128-168 {
if (tid < 41) const float margin = (float)(grad_wav_size - 1) / 2.0f;
{ const int x = __float2int_rn(s_feature[SF_X] + c_aptX[tid] * s - margin);
Edx[tid] += Edx[tid + 128]; const int y = __float2int_rn(s_feature[SF_Y] + c_aptY[tid] * s - margin);
}
__syncthreads();
if (tid < 64)
{
Edx[tid] += Edx[tid + 64];
}
__syncthreads();
if (tid < 32)
{
volatile float* smem = Edx;
smem[tid] += smem[tid + 32];
smem[tid] += smem[tid + 16];
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
}
// Gets 128-168 if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))
if (tid < 41) {
{ X = c_aptW[tid] * icvCalcHaarPattern<2>(sumTex, c_NX, 4, grad_wav_size, y, x);
Edy[tid] += Edy[tid + 128]; Y = c_aptW[tid] * icvCalcHaarPattern<2>(sumTex, c_NY, 4, grad_wav_size, y, x);
}
__syncthreads(); angle = atan2f(Y, X);
if (tid < 64) if (angle < 0)
{ angle += 2.0f * CV_PI;
Edy[tid] += Edy[tid + 64]; angle *= 180.0f / CV_PI;
} }
__syncthreads(); }
if (tid < 32) if (tid < 128)
{ {
volatile float* smem = Edy; s_X[tid] = X;
s_Y[tid] = Y;
smem[tid] += smem[tid + 32]; s_angle[tid] = angle;
smem[tid] += smem[tid + 16]; }
smem[tid] += smem[tid + 8]; }
smem[tid] += smem[tid + 4]; __syncthreads();
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
}
// Thread 0 saves back the result. float bestx = 0, besty = 0, best_mod = 0;
if (tid == 0)
{ #pragma unroll
features[blockIdx.x].angle = -atan2(Edy[0], Edx[0]) * (180.0f / CV_PI); for (int i = 0; i < 18; ++i)
{
const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
float sumx = 0.0f, sumy = 0.0f;
int d = abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx = s_X[threadIdx.x];
sumy = s_Y[threadIdx.x];
}
d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[threadIdx.x + 64];
sumy += s_Y[threadIdx.x + 64];
}
float* s_sumx_row = s_sumx + threadIdx.y * 64;
float* s_sumy_row = s_sumy + threadIdx.y * 64;
s_sumx_row[threadIdx.x] = sumx;
s_sumy_row[threadIdx.x] = sumy;
__syncthreads();
if (threadIdx.x < 32)
{
volatile float* v_sumx_row = s_sumx_row;
volatile float* v_sumy_row = s_sumy_row;
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 32];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 32];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 16];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 16];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 8];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 8];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 4];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 4];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 2];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 2];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 1];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 1];
}
const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod)
{
best_mod = temp_mod;
bestx = sumx;
besty = sumy;
}
__syncthreads();
}
if (threadIdx.x == 0)
{
s_X[threadIdx.y] = bestx;
s_Y[threadIdx.y] = besty;
s_angle[threadIdx.y] = best_mod;
}
__syncthreads();
if (threadIdx.x < 2 && threadIdx.y == 0)
{
volatile float* v_x = s_X;
volatile float* v_y = s_Y;
volatile float* v_mod = s_angle;
bestx = v_x[threadIdx.x];
besty = v_y[threadIdx.x];
best_mod = v_mod[threadIdx.x];
float temp_mod = v_mod[threadIdx.x + 2];
if (temp_mod > best_mod)
{
v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2];
v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2];
v_mod[threadIdx.x] = best_mod = temp_mod;
}
temp_mod = v_mod[threadIdx.x + 1];
if (temp_mod > best_mod)
{
v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1];
v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1];
}
}
if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)
{
// Get a new feature index.
unsigned int ind = atomicInc(keypointCounter, (unsigned int)-1);
if (ind < c_max_keypoints)
{
float kp_dir = atan2f(besty, bestx);
if (kp_dir < 0)
kp_dir += 2.0f * CV_PI;
kp_dir *= 180.0f / CV_PI;
__shared__ KeyPoint_GPU kp;
kp.x = s_feature[SF_X];
kp.y = s_feature[SF_Y];
kp.laplacian = s_feature[SF_LAPLACIAN];
kp.size = s_feature[SF_SIZE];
kp.dir = kp_dir;
kp.hessian = s_feature[SF_HESSIAN];
keypoints[ind] = kp;
}
}
} }
#endif
} }
void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures) #undef ORI_SEARCH_INC
#undef ORI_WIN
#undef ORI_SAMPLES
void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter)
{ {
dim3 threads; dim3 threads;
threads.x = 17; threads.x = 64;
threads.y = 17; threads.y = 4;
dim3 grid; dim3 grid;
grid.x = nFeatures; grid.x = nFeatures;
find_orientation<<<grid, threads>>>(features); icvCalcOrientation<<<grid, threads>>>(featureBuffer, keypoints, keypointCounter);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -780,117 +681,119 @@ namespace cv { namespace gpu { namespace surf ...@@ -780,117 +681,119 @@ namespace cv { namespace gpu { namespace surf
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Descriptors // Descriptors
// precomputed values for a Gaussian with a standard deviation of 3.3 #define PATCH_SZ 20
// - it appears SURF uses a different value, but not sure what it is
__constant__ float c_3p3gauss1D[20] = texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
{
0.001917811039f, 0.004382549939f, 0.009136246641f, 0.017375153068f, 0.030144587513f, __constant__ float c_DW[PATCH_SZ * PATCH_SZ] =
0.047710056854f, 0.068885910797f, 0.090734146446f, 0.109026229640f, 0.119511889092f, {
0.119511889092f, 0.109026229640f, 0.090734146446f, 0.068885910797f, 0.047710056854f, 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
0.030144587513f, 0.017375153068f, 0.009136246641f, 0.004382549939f, 0.001917811039f 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
}; };
template <int BLOCK_DIM_X> __device__ void calcPATCH(float s_PATCH[6][6], float s_pt[5], int i1, int j1, int i2, int j2)
__global__ void normalize_descriptors(PtrStepf descriptors)
{ {
// no need for thread ID const float centerX = s_pt[SF_X];
float* descriptor_base = descriptors.ptr(blockIdx.x); const float centerY = s_pt[SF_Y];
const float size = s_pt[SF_SIZE];
const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180);
// read in the unnormalized descriptor values (squared) /* The sampling intervals and wavelet sized for selecting an orientation
__shared__ float sqDesc[BLOCK_DIM_X]; and building the keypoint descriptor are defined relative to 's' */
const float lookup = descriptor_base[threadIdx.x]; const float s = size * 1.2f / 9.0f;
sqDesc[threadIdx.x] = lookup * lookup;
__syncthreads();
if (BLOCK_DIM_X >= 128) /* Extract a window of pixels around the keypoint of size 20s */
{ const int win_size = (int)((PATCH_SZ + 1) * s);
if (threadIdx.x < 64)
sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
__syncthreads();
}
// reduction to get total float sin_dir;
if (threadIdx.x < 32) float cos_dir;
{ sincosf(descriptor_dir, &sin_dir, &cos_dir);
volatile float* smem = sqDesc;
smem[threadIdx.x] += smem[threadIdx.x + 32]; /* Nearest neighbour version (faster) */
smem[threadIdx.x] += smem[threadIdx.x + 16]; const float win_offset = -(float)(win_size - 1) / 2;
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
}
// compute length (square root) /* Scale the window to size PATCH_SZ so each pixel's size is s. This
__shared__ float len; makes calculating the gradients with wavelets of size 2s easy */
if (threadIdx.x == 0) const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size;
{ const float jcoo = ((float)j1 / (PATCH_SZ + 1)) * win_size;
len = sqrtf(sqDesc[0]);
}
__syncthreads();
// normalize and store in output const int i = __float2int_rd(icoo);
descriptor_base[threadIdx.x] = lookup / len; const int j = __float2int_rd(jcoo);
}
__device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const float* ipt, float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
int xIndex, int yIndex, int tid) float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
{
float sin_theta, cos_theta; float res = tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (j + 1 - jcoo);
sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);
pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i + 1) * sin_dir;
// Compute rotated sampling points pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i + 1) * cos_dir;
// (clockwise rotation since we are rotating the lattice)
// (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel) res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (j + 1 - jcoo);
const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+ sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]); pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i + 1) * sin_dir;
const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE] pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i + 1) * cos_dir;
+ cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (jcoo - j);
// gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)
// a b c pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i) * sin_dir;
// d f pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i) * cos_dir;
// g h i
const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]); res += tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (jcoo - j);
const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]);
const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]); s_PATCH[i2][j2] = (unsigned char)res;
const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y);
const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y);
const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]);
const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
// compute axis-aligned HaarX, HaarY
// (could group the additions together into multiplications)
const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular)
const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx
const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy
// rotate responses (store all dxs then all dys)
// - counterclockwise rotation to rotate back to zero orientation
sdx_bin[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx
sdy_bin[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy
} }
__device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const KeyPoint_GPU* features)//(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features) __device__ void calc_dx_dy(float s_PATCH[6][6], float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid)
{ {
// get the interest point parameters (x, y, size, response, angle) // get the interest point parameters (x, y, size, response, angle)
__shared__ float ipt[5]; __shared__ float s_pt[5];
if (threadIdx.x < 5 && threadIdx.y == 0) if (tid < 5)
{ {
ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x]; s_pt[tid] = ((float*)(&keypoints[blockIdx.x]))[tid];
} }
__syncthreads(); __syncthreads();
// Compute sampling points // Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices // since grids are 2D, need to compute xBlock and yBlock indices
const int xBlock = (threadIdx.y & 3); // threadIdx.y % 4 const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
const int yBlock = (threadIdx.y >> 2); // floor(threadIdx.y / 4) const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
const int xIndex = (xBlock * 5) + (threadIdx.x % 5); const int xIndex = xBlock * blockDim.x + threadIdx.x;
const int yIndex = (yBlock * 5) + (threadIdx.x / 5); const int yIndex = yBlock * blockDim.y + threadIdx.y;
calcPATCH(s_PATCH, s_pt, yIndex, xIndex, threadIdx.y, threadIdx.x);
if (threadIdx.x == 0)
calcPATCH(s_PATCH, s_pt, yIndex, xBlock * blockDim.x + 5, threadIdx.y, 5);
if (threadIdx.y == 0)
calcPATCH(s_PATCH, s_pt, yBlock * blockDim.y + 5, xIndex, 5, threadIdx.x);
if (threadIdx.x == 0 && threadIdx.y == 0)
calcPATCH(s_PATCH, s_pt, xBlock * blockDim.x + 5, yBlock * blockDim.y + 5, 5, 5);
__syncthreads();
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = (s_PATCH[threadIdx.y ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x ]) * dw;
const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y ][threadIdx.x + 1]) * dw;
calc_dx_dy(sdx_bin, sdy_bin, ipt, xIndex, yIndex, threadIdx.x); s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy;
} }
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2,
...@@ -933,193 +836,9 @@ namespace cv { namespace gpu { namespace surf ...@@ -933,193 +836,9 @@ namespace cv { namespace gpu { namespace surf
// Spawn 16 blocks per interest point // Spawn 16 blocks per interest point
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features) __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)
{
// 2 floats (dx, dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx [16 * 25];
__shared__ float sdy [16 * 25];
__shared__ float sdxabs[16 * 25];
__shared__ float sdyabs[16 * 25];
__shared__ float sdesc[64];
float* sdx_bin = sdx + (threadIdx.y * 25);
float* sdy_bin = sdy + (threadIdx.y * 25);
float* sdxabs_bin = sdxabs + (threadIdx.y * 25);
float* sdyabs_bin = sdyabs + (threadIdx.y * 25);
calc_dx_dy(sdx_bin, sdy_bin, features);
__syncthreads();
sdxabs_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]); // |dx| array
sdyabs_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); // |dy| array
__syncthreads();
reduce_sum25(sdx_bin, sdy_bin, sdxabs_bin, sdyabs_bin, threadIdx.x);
__syncthreads();
float* sdesc_bin = sdesc + (threadIdx.y << 2);
// write dx, dy, |dx|, |dy|
if (threadIdx.x == 0)
{
sdesc_bin[0] = sdx_bin[0];
sdesc_bin[1] = sdy_bin[0];
sdesc_bin[2] = sdxabs_bin[0];
sdesc_bin[3] = sdyabs_bin[0];
}
__syncthreads();
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < 64)
descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
}
// Spawn 16 blocks per interest point
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[16 * 25];
__shared__ float sdy[16 * 25];
// sum (reduce) 5x5 area response
__shared__ float sd1[16 * 25];
__shared__ float sd2[16 * 25];
__shared__ float sdabs1[16 * 25];
__shared__ float sdabs2[16 * 25];
__shared__ float sdesc[128];
float* sdx_bin = sdx + (threadIdx.y * 25);
float* sdy_bin = sdy + (threadIdx.y * 25);
float* sd1_bin = sd1 + (threadIdx.y * 25);
float* sd2_bin = sd2 + (threadIdx.y * 25);
float* sdabs1_bin = sdabs1 + (threadIdx.y * 25);
float* sdabs2_bin = sdabs2 + (threadIdx.y * 25);
calc_dx_dy(sdx_bin, sdy_bin, features);
__syncthreads();
if (sdy_bin[threadIdx.x] >= 0)
{
sd1_bin[threadIdx.x] = sdx_bin[threadIdx.x];
sdabs1_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]);
sd2_bin[threadIdx.x] = 0;
sdabs2_bin[threadIdx.x] = 0;
}
else
{
sd1_bin[threadIdx.x] = 0;
sdabs1_bin[threadIdx.x] = 0;
sd2_bin[threadIdx.x] = sdx_bin[threadIdx.x];
sdabs2_bin[threadIdx.x] = fabs(sdx[threadIdx.x]);
}
__syncthreads();
reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
__syncthreads();
float* sdesc_bin = sdesc + (threadIdx.y << 3);
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (threadIdx.x == 0)
{
sdesc_bin[0] = sd1_bin[0];
sdesc_bin[1] = sdabs1_bin[0];
sdesc_bin[2] = sd2_bin[0];
sdesc_bin[3] = sdabs2_bin[0];
}
__syncthreads();
if (sdx_bin[threadIdx.x] >= 0)
{
sd1_bin[threadIdx.x] = sdy_bin[threadIdx.x];
sdabs1_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);
sd2_bin[threadIdx.x] = 0;
sdabs2_bin[threadIdx.x] = 0;
}
else
{
sd1_bin[threadIdx.x] = 0;
sdabs1_bin[threadIdx.x] = 0;
sd2_bin[threadIdx.x] = sdy_bin[threadIdx.x];
sdabs2_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);
}
__syncthreads();
reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
__syncthreads();
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (threadIdx.x == 0)
{
sdesc_bin[4] = sd1_bin[0];
sdesc_bin[5] = sdabs1_bin[0];
sdesc_bin[6] = sd2_bin[0];
sdesc_bin[7] = sdabs2_bin[0];
}
__syncthreads();
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < 128)
descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
}
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
if (descriptors.cols == 64)
{
compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
}
__device__ void calc_dx_dy_old(float sdx[25], float sdy[25], const KeyPoint_GPU* features, int tid)
{
// get the interest point parameters (x, y, scale, strength, theta)
__shared__ float ipt[5];
if (tid < 5)
{
ipt[tid] = ((float*)&features[blockIdx.x])[tid];
}
__syncthreads();
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
const int xIndex = xBlock * blockDim.x + threadIdx.x;
const int yIndex = yBlock * blockDim.y + threadIdx.y;
calc_dx_dy(sdx, sdy, ipt, xIndex, yIndex, tid);
}
// Spawn 16 blocks per interest point
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features)
{ {
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float s_PATCH[6][6];
__shared__ float sdx[25]; __shared__ float sdx[25];
__shared__ float sdy[25]; __shared__ float sdy[25];
__shared__ float sdxabs[25]; __shared__ float sdxabs[25];
...@@ -1127,7 +846,7 @@ namespace cv { namespace gpu { namespace surf ...@@ -1127,7 +846,7 @@ namespace cv { namespace gpu { namespace surf
const int tid = threadIdx.y * blockDim.x + threadIdx.x; const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy_old(sdx, sdy, features, tid); calc_dx_dy(s_PATCH, sdx, sdy, features, tid);
__syncthreads(); __syncthreads();
sdxabs[tid] = fabs(sdx[tid]); // |dx| array sdxabs[tid] = fabs(sdx[tid]); // |dx| array
...@@ -1151,9 +870,10 @@ namespace cv { namespace gpu { namespace surf ...@@ -1151,9 +870,10 @@ namespace cv { namespace gpu { namespace surf
// Spawn 16 blocks per interest point // Spawn 16 blocks per interest point
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location // - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors128_old(PtrStepf descriptors, const KeyPoint_GPU* features) __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
{ {
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float s_PATCH[6][6];
__shared__ float sdx[25]; __shared__ float sdx[25];
__shared__ float sdy[25]; __shared__ float sdy[25];
...@@ -1165,7 +885,7 @@ namespace cv { namespace gpu { namespace surf ...@@ -1165,7 +885,7 @@ namespace cv { namespace gpu { namespace surf
const int tid = threadIdx.y * blockDim.x + threadIdx.x; const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy_old(sdx, sdy, features, tid); calc_dx_dy(s_PATCH, sdx, sdy, features, tid);
__syncthreads(); __syncthreads();
if (sdy[tid] >= 0) if (sdy[tid] >= 0)
...@@ -1184,7 +904,7 @@ namespace cv { namespace gpu { namespace surf ...@@ -1184,7 +904,7 @@ namespace cv { namespace gpu { namespace surf
} }
__syncthreads(); __syncthreads();
reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads(); __syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3); float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
...@@ -1215,7 +935,7 @@ namespace cv { namespace gpu { namespace surf ...@@ -1215,7 +935,7 @@ namespace cv { namespace gpu { namespace surf
} }
__syncthreads(); __syncthreads();
reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads(); __syncthreads();
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
...@@ -1228,13 +948,56 @@ namespace cv { namespace gpu { namespace surf ...@@ -1228,13 +948,56 @@ namespace cv { namespace gpu { namespace surf
} }
} }
void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures) template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
{
// no need for thread ID
float* descriptor_base = descriptors.ptr(blockIdx.x);
// read in the unnormalized descriptor values (squared)
__shared__ float sqDesc[BLOCK_DIM_X];
const float lookup = descriptor_base[threadIdx.x];
sqDesc[threadIdx.x] = lookup * lookup;
__syncthreads();
if (BLOCK_DIM_X >= 128)
{
if (threadIdx.x < 64)
sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
__syncthreads();
}
// reduction to get total
if (threadIdx.x < 32)
{
volatile float* smem = sqDesc;
smem[threadIdx.x] += smem[threadIdx.x + 32];
smem[threadIdx.x] += smem[threadIdx.x + 16];
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
}
// compute length (square root)
__shared__ float len;
if (threadIdx.x == 0)
{
len = sqrtf(sqDesc[0]);
}
__syncthreads();
// normalize and store in output
descriptor_base[threadIdx.x] = lookup / len;
}
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
{ {
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
if (descriptors.cols == 64) if (descriptors.cols == 64)
{ {
compute_descriptors64_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features); compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -1246,7 +1009,7 @@ namespace cv { namespace gpu { namespace surf ...@@ -1246,7 +1009,7 @@ namespace cv { namespace gpu { namespace surf
} }
else else
{ {
compute_descriptors128_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features); compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
......
...@@ -48,123 +48,93 @@ using namespace std; ...@@ -48,123 +48,93 @@ using namespace std;
#if !defined (HAVE_CUDA) #if !defined (HAVE_CUDA)
cv::gpu::SURF_GPU::SURF_GPU() { throw_nogpu(); }
cv::gpu::SURF_GPU::SURF_GPU(double, int, int, bool, float) { throw_nogpu(); }
int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0;} int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0;}
void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>&, GpuMat&) { throw_nogpu(); } void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); } void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, vector<float>&) { throw_nogpu(); } void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, vector<float>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, bool) { throw_nogpu(); } void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); } void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, GpuMat&, bool, bool) { throw_nogpu(); } void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, vector<float>&, bool, bool) { throw_nogpu(); } void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, vector<float>&, bool) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */ #else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace surf namespace cv { namespace gpu { namespace surf
{ {
dim3 calcBlockSize(int nIntervals); void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);
void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads); void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld); int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter);
int x_size, int y_size, bool use_mask, const dim3& threads);
void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter);
void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter,
KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);
void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures);
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures); void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
}}} }}}
using namespace cv::gpu::surf; using namespace cv::gpu::surf;
namespace namespace
{ {
class SURF_GPU_Invoker : private SURFParams_GPU class SURF_GPU_Invoker : private CvSURFParams
{ {
public: public:
SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) : SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) :
SURFParams_GPU(surf), CvSURFParams(surf),
sum(surf.sum), sumf(surf.sumf), sum(surf.sum), mask1(surf.mask1), maskSum(surf.maskSum), intBuffer(surf.intBuffer), det(surf.det), trace(surf.trace),
mask1(surf.mask1), maskSum(surf.maskSum), maxPosBuffer(surf.maxPosBuffer), featuresBuffer(surf.featuresBuffer), keypointsBuffer(surf.keypointsBuffer),
hessianBuffer(surf.hessianBuffer),
maxPosBuffer(surf.maxPosBuffer),
featuresBuffer(surf.featuresBuffer),
img_cols(img.cols), img_rows(img.rows), img_cols(img.cols), img_rows(img.rows),
use_mask(!mask.empty()), use_mask(!mask.empty())
mask_width(0), mask_height(0),
featureCounter(0), maxCounter(0)
{ {
CV_Assert(!img.empty() && img.type() == CV_8UC1); CV_Assert(!img.empty() && img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1)); CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22); CV_Assert(nOctaves > 0 && nOctaveLayers > 0);
CV_Assert(DeviceInfo().supports(GLOBAL_ATOMICS)); CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
max_features = static_cast<int>(img.size().area() * featuresRatio);
max_candidates = static_cast<int>(1.5 * max_features);
CV_Assert(max_features > 0);
featuresBuffer.create(1, max_features, CV_32FC(6)); maxKeypoints = static_cast<int>(img.size().area() * surf.keypointsRatio);
maxPosBuffer.create(1, max_candidates, CV_32SC4); maxFeatures = static_cast<int>(1.5 * maxKeypoints);
maxCandidates = static_cast<int>(1.5 * maxFeatures);
mask_width = l2 * 0.5f; CV_Assert(maxKeypoints > 0);
mask_height = 1.0f + l1;
cudaSafeCall( cudaMalloc((void**)&d_counters, (nOctaves + 2) * sizeof(unsigned int)) );
cudaSafeCall( cudaMemset(d_counters, 0, (nOctaves + 2) * sizeof(unsigned int)) );
// Dxy gap half-width uploadConstant("cv::gpu::surf::c_max_candidates", maxCandidates);
float dxy_center_offset = 0.5f * (l4 + l3); uploadConstant("cv::gpu::surf::c_max_features", maxFeatures);
// Dxy squares half-width uploadConstant("cv::gpu::surf::c_max_keypoints", maxKeypoints);
float dxy_half_width = 0.5f * l3; uploadConstant("cv::gpu::surf::c_img_rows", img_rows);
uploadConstant("cv::gpu::surf::c_img_cols", img_cols);
uploadConstant("cv::gpu::surf::c_nOctaveLayers", nOctaveLayers);
uploadConstant("cv::gpu::surf::c_hessianThreshold", static_cast<float>(hessianThreshold));
// rescale edge_scale to fit with the filter dimensions bindTexture("cv::gpu::surf::imgTex", (DevMem2D)img);
float dxy_scale = edgeScale * std::pow((2.f + 2.f * l1) * l2 / (4.f * l3 * l3), 2.f);
// Compute border required such that the filters don't overstep the image boundaries
float smax0 = 2.0f * initialScale + 0.5f;
int border0 = static_cast<int>(std::ceil(smax0 * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f)));
int width0 = (img_cols - 2 * border0) / initialStep;
int height0 = (img_rows - 2 * border0) / initialStep;
uploadConstant("cv::gpu::surf::c_max_candidates", max_candidates);
uploadConstant("cv::gpu::surf::c_max_features", max_features);
uploadConstant("cv::gpu::surf::c_nIntervals", nIntervals);
uploadConstant("cv::gpu::surf::c_mask_width", mask_width);
uploadConstant("cv::gpu::surf::c_mask_height", mask_height);
uploadConstant("cv::gpu::surf::c_dxy_center_offset", dxy_center_offset);
uploadConstant("cv::gpu::surf::c_dxy_half_width", dxy_half_width);
uploadConstant("cv::gpu::surf::c_dxy_scale", dxy_scale);
uploadConstant("cv::gpu::surf::c_initialScale", initialScale);
uploadConstant("cv::gpu::surf::c_threshold", threshold);
hessianBuffer.create(height0 * nIntervals, width0, CV_32F);
integral(img, sum); integralBuffered(img, sum, intBuffer);
sum.convertTo(sumf, CV_32F, 1.0 / 255.0); bindTexture("cv::gpu::surf::sumTex", (DevMem2D_<unsigned int>)sum);
bindTexture("cv::gpu::surf::sumTex", (DevMem2Df)sumf);
if (!mask.empty()) if (use_mask)
{ {
min(mask, 1.0, mask1); min(mask, 1.0, mask1);
integral(mask1, maskSum); integralBuffered(mask1, maskSum, intBuffer);
bindTexture("cv::gpu::surf::maskSumTex", (DevMem2Di)maskSum); bindTexture("cv::gpu::surf::maskSumTex", (DevMem2D_<unsigned int>)maskSum);
} }
} }
~SURF_GPU_Invoker() ~SURF_GPU_Invoker()
{ {
cudaSafeCall( cudaFree(d_counters) );
unbindTexture("cv::gpu::surf::imgTex");
unbindTexture("cv::gpu::surf::sumTex"); unbindTexture("cv::gpu::surf::sumTex");
if (use_mask) if (use_mask)
unbindTexture("cv::gpu::surf::maskSumTex"); unbindTexture("cv::gpu::surf::maskSumTex");
...@@ -172,102 +142,115 @@ namespace ...@@ -172,102 +142,115 @@ namespace
void detectKeypoints(GpuMat& keypoints) void detectKeypoints(GpuMat& keypoints)
{ {
typedef void (*fasthessian_t)(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads); ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, det);
const fasthessian_t fasthessian = ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, trace);
DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? fasthessian_gpu : fasthessian_gpu_old;
ensureSizeIsEnough(1, maxCandidates, CV_32SC4, maxPosBuffer);
ensureSizeIsEnough(1, maxFeatures, CV_32FC(6), featuresBuffer);
dim3 threads = calcBlockSize(nIntervals); for (int octave = 0; octave < nOctaves; ++octave)
for(int octave = 0; octave < nOctaves; ++octave)
{ {
int step = initialStep * (1 << octave); const int layer_rows = img_rows >> octave;
const int layer_cols = img_cols >> octave;
// Compute border required such that the filters don't overstep the image boundaries
float d = (initialScale * (1 << octave)) / (nIntervals - 2); uploadConstant("cv::gpu::surf::c_octave", octave);
float smax = initialScale * (1 << octave) + d * (nIntervals - 2.0f) + 0.5f; uploadConstant("cv::gpu::surf::c_layer_rows", layer_rows);
int border = static_cast<int>(std::ceil(smax * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f))); uploadConstant("cv::gpu::surf::c_layer_cols", layer_cols);
int x_size = (img_cols - 2 * border) / step; icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers);
int y_size = (img_rows - 2 * border) / step;
icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr<int4>(), d_counters + 2 + octave,
if (x_size <= 0 || y_size <= 0) img_rows, img_cols, octave, use_mask, nOctaveLayers);
break;
unsigned int maxCounter;
uploadConstant("cv::gpu::surf::c_octave", octave); cudaSafeCall( cudaMemcpy(&maxCounter, d_counters + 2 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
uploadConstant("cv::gpu::surf::c_x_size", x_size); maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));
uploadConstant("cv::gpu::surf::c_y_size", y_size);
uploadConstant("cv::gpu::surf::c_border", border);
uploadConstant("cv::gpu::surf::c_step", step);
fasthessian(hessianBuffer, x_size, y_size, threads);
// Reset the candidate count.
maxCounter = 0;
nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, x_size, y_size, use_mask, threads);
maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));
if (maxCounter > 0) if (maxCounter > 0)
{ {
fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, icvInterpolateKeypoint_gpu(det, maxPosBuffer.ptr<int4>(), maxCounter,
featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter); featuresBuffer.ptr<KeyPoint_GPU>(), d_counters);
featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
} }
} }
unsigned int featureCounter;
cudaSafeCall( cudaMemcpy(&featureCounter, d_counters, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));
if (featureCounter > 0) findOrientation(featuresBuffer.colRange(0, featureCounter), keypoints);
featuresBuffer.colRange(0, featureCounter).copyTo(keypoints);
else
keypoints.release();
} }
void findOrientation(GpuMat& keypoints) void findOrientation(const GpuMat& features, GpuMat& keypoints)
{ {
if (keypoints.cols > 0) if (features.cols > 0)
find_orientation_gpu(keypoints.ptr<KeyPoint_GPU>(), keypoints.cols); {
ensureSizeIsEnough(1, maxKeypoints, CV_32FC(6), keypointsBuffer);
icvCalcOrientation_gpu(features.ptr<KeyPoint_GPU>(), features.cols, keypointsBuffer.ptr<KeyPoint_GPU>(),
d_counters + 1);
unsigned int keypointsCounter;
cudaSafeCall( cudaMemcpy(&keypointsCounter, d_counters + 1, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
keypointsCounter = std::min(keypointsCounter, static_cast<unsigned int>(maxKeypoints));
if (keypointsCounter > 0)
keypointsBuffer.colRange(0, keypointsCounter).copyTo(keypoints);
else
keypoints.release();
}
} }
void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize) void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
{ {
typedef void (*compute_descriptors_t)(const DevMem2Df& descriptors,
const KeyPoint_GPU* features, int nFeatures);
const compute_descriptors_t compute_descriptors = compute_descriptors_gpu_old;
//DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;
if (keypoints.cols > 0) if (keypoints.cols > 0)
{ {
descriptors.create(keypoints.cols, descriptorSize, CV_32F); descriptors.create(keypoints.cols, descriptorSize, CV_32F);
compute_descriptors(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols); compute_descriptors_gpu(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
} }
} }
private: private:
GpuMat& sum; GpuMat& sum;
GpuMat& sumf;
GpuMat& mask1; GpuMat& mask1;
GpuMat& maskSum; GpuMat& maskSum;
GpuMat& intBuffer;
GpuMat& det;
GpuMat& trace;
GpuMat& hessianBuffer;
GpuMat& maxPosBuffer; GpuMat& maxPosBuffer;
GpuMat& featuresBuffer; GpuMat& featuresBuffer;
GpuMat& keypointsBuffer;
int img_cols, img_rows; int img_cols, img_rows;
bool use_mask; bool use_mask;
float mask_width, mask_height;
unsigned int featureCounter; int maxCandidates;
unsigned int maxCounter; int maxFeatures;
int maxKeypoints;
int max_candidates; unsigned int* d_counters;
int max_features;
}; };
} }
cv::gpu::SURF_GPU::SURF_GPU()
{
hessianThreshold = 100;
extended = 1;
nOctaves = 4;
nOctaveLayers = 2;
keypointsRatio = 0.01f;
}
cv::gpu::SURF_GPU::SURF_GPU(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio)
{
hessianThreshold = _threshold;
extended = _extended;
nOctaves = _nOctaves;
nOctaveLayers = _nOctaveLayers;
keypointsRatio = _keypointsRatio;
}
int cv::gpu::SURF_GPU::descriptorSize() const int cv::gpu::SURF_GPU::descriptorSize() const
{ {
return extended ? 128 : 64; return extended ? 128 : 64;
...@@ -281,27 +264,64 @@ void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMa ...@@ -281,27 +264,64 @@ void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMa
{ {
Mat keypointsCPU(1, keypoints.size(), CV_32FC(6)); Mat keypointsCPU(1, keypoints.size(), CV_32FC(6));
const KeyPoint* keypoints_ptr = &keypoints[0]; for (size_t i = 0; i < keypoints.size(); ++i)
KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr<KeyPoint_GPU>();
for (size_t i = 0; i < keypoints.size(); ++i, ++keypoints_ptr, ++keypointsCPU_ptr)
{ {
const KeyPoint& kp = *keypoints_ptr; const KeyPoint& kp = keypoints[i];
KeyPoint_GPU& gkp = *keypointsCPU_ptr; KeyPoint_GPU& gkp = keypointsCPU.ptr<KeyPoint_GPU>()[i];
gkp.x = kp.pt.x; gkp.x = kp.pt.x;
gkp.y = kp.pt.y; gkp.y = kp.pt.y;
gkp.laplacian = 1.0f;
gkp.size = kp.size; gkp.size = kp.size;
gkp.octave = static_cast<float>(kp.octave); gkp.dir = kp.angle;
gkp.angle = kp.angle; gkp.hessian = kp.response;
gkp.response = kp.response;
} }
keypointsGPU.upload(keypointsCPU); keypointsGPU.upload(keypointsCPU);
} }
} }
namespace
{
int calcSize(int octave, int layer)
{
/* Wavelet size at first layer of first octave. */
const int HAAR_SIZE0 = 9;
/* Wavelet size increment between layers. This should be an even number,
such that the wavelet sizes in an octave are either all even or all odd.
This ensures that when looking for the neighbours of a sample, the layers
above and below are aligned correctly. */
const int HAAR_SIZE_INC = 6;
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
int getPointOctave(const KeyPoint_GPU& kpt, const CvSURFParams& params)
{
int best_octave = 0;
float min_diff = numeric_limits<float>::max();
for (int octave = 1; octave < params.nOctaves; ++octave)
{
for (int layer = 0; layer < params.nOctaveLayers; ++layer)
{
float diff = std::abs(kpt.size - (float)calcSize(octave, layer));
if (min_diff > diff)
{
min_diff = diff;
best_octave = octave;
if (min_diff == 0)
return best_octave;
}
}
}
return best_octave;
}
}
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints) void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints)
{ {
if (keypointsGPU.empty()) if (keypointsGPU.empty())
...@@ -313,21 +333,23 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<Key ...@@ -313,21 +333,23 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<Key
Mat keypointsCPU = keypointsGPU; Mat keypointsCPU = keypointsGPU;
keypoints.resize(keypointsGPU.cols); keypoints.resize(keypointsGPU.cols);
KeyPoint* keypoints_ptr = &keypoints[0]; for (int i = 0; i < keypointsGPU.cols; ++i)
const KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr<KeyPoint_GPU>();
for (int i = 0; i < keypointsGPU.cols; ++i, ++keypoints_ptr, ++keypointsCPU_ptr)
{ {
KeyPoint& kp = *keypoints_ptr; KeyPoint& kp = keypoints[i];
const KeyPoint_GPU& gkp = *keypointsCPU_ptr; const KeyPoint_GPU& gkp = keypointsCPU.ptr<KeyPoint_GPU>()[i];
kp.pt.x = gkp.x; kp.pt.x = gkp.x;
kp.pt.y = gkp.y; kp.pt.y = gkp.y;
kp.size = gkp.size; kp.size = gkp.size;
kp.octave = static_cast<int>(gkp.octave); kp.angle = gkp.dir;
kp.angle = gkp.angle;
kp.response = gkp.response; kp.response = gkp.hessian;
kp.octave = getPointOctave(gkp, *this);
kp.class_id = static_cast<int>(gkp.laplacian);
} }
} }
} }
...@@ -353,23 +375,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat ...@@ -353,23 +375,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat
SURF_GPU_Invoker surf(*this, img, mask); SURF_GPU_Invoker surf(*this, img, mask);
surf.detectKeypoints(keypoints); surf.detectKeypoints(keypoints);
surf.findOrientation(keypoints);
} }
} }
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
bool useProvidedKeypoints, bool calcOrientation) bool useProvidedKeypoints)
{ {
if (!img.empty()) if (!img.empty())
{ {
SURF_GPU_Invoker surf(*this, img, mask); SURF_GPU_Invoker surf(*this, img, mask);
if (!useProvidedKeypoints) if (!useProvidedKeypoints)
surf.detectKeypoints(keypoints); surf.detectKeypoints(keypoints);
else
if (calcOrientation) {
surf.findOrientation(keypoints); GpuMat keypointsBuf;
surf.findOrientation(keypoints, keypointsBuf);
keypointsBuf.copyTo(keypoints);
}
surf.computeDescriptors(keypoints, descriptors, descriptorSize()); surf.computeDescriptors(keypoints, descriptors, descriptorSize());
} }
...@@ -385,24 +408,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector ...@@ -385,24 +408,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector
} }
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints, void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints,
GpuMat& descriptors, bool useProvidedKeypoints, bool calcOrientation) GpuMat& descriptors, bool useProvidedKeypoints)
{ {
GpuMat keypointsGPU; GpuMat keypointsGPU;
if (useProvidedKeypoints) if (useProvidedKeypoints)
uploadKeypoints(keypoints, keypointsGPU); uploadKeypoints(keypoints, keypointsGPU);
(*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints, calcOrientation); (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);
downloadKeypoints(keypointsGPU, keypoints); downloadKeypoints(keypointsGPU, keypoints);
} }
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints, void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints,
vector<float>& descriptors, bool useProvidedKeypoints, bool calcOrientation) vector<float>& descriptors, bool useProvidedKeypoints)
{ {
GpuMat descriptorsGPU; GpuMat descriptorsGPU;
(*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints, calcOrientation); (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);
downloadDescriptors(descriptorsGPU, descriptors); downloadDescriptors(descriptorsGPU, descriptors);
} }
......
...@@ -48,7 +48,6 @@ using namespace std; ...@@ -48,7 +48,6 @@ using namespace std;
const string FEATURES2D_DIR = "features2d"; const string FEATURES2D_DIR = "features2d";
const string IMAGE_FILENAME = "aloe.png"; const string IMAGE_FILENAME = "aloe.png";
const string VALID_FILE_NAME = "surf.xml.gz";
class CV_GPU_SURFTest : public cvtest::BaseTest class CV_GPU_SURFTest : public cvtest::BaseTest
{ {
...@@ -59,17 +58,20 @@ public: ...@@ -59,17 +58,20 @@ public:
protected: protected:
bool isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2); bool isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2);
int getValidCount(const vector<KeyPoint>& keypoints1, const vector<KeyPoint>& keypoints2, const vector<DMatch>& matches);
void compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints, void compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints,
const Mat& validDescriptors, const Mat& calcDescriptors); const Mat& validDescriptors, const Mat& calcDescriptors);
void emptyDataTest(SURF_GPU& fdetector); void emptyDataTest();
void regressionTest(SURF_GPU& fdetector); void accuracyTest();
virtual void run(int); virtual void run(int);
}; };
void CV_GPU_SURFTest::emptyDataTest(SURF_GPU& fdetector) void CV_GPU_SURFTest::emptyDataTest()
{ {
SURF_GPU fdetector;
GpuMat image; GpuMat image;
vector<KeyPoint> keypoints; vector<KeyPoint> keypoints;
vector<float> descriptors; vector<float> descriptors;
...@@ -114,116 +116,80 @@ bool CV_GPU_SURFTest::isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2) ...@@ -114,116 +116,80 @@ bool CV_GPU_SURFTest::isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2)
p1.class_id == p2.class_id ); p1.class_id == p2.class_id );
} }
void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints, int CV_GPU_SURFTest::getValidCount(const vector<KeyPoint>& keypoints1, const vector<KeyPoint>& keypoints2,
const Mat& validDescriptors, const Mat& calcDescriptors) const vector<DMatch>& matches)
{ {
if (validKeypoints.size() != calcKeypoints.size()) int count = 0;
for (size_t i = 0; i < matches.size(); ++i)
{ {
ts->printf(cvtest::TS::LOG, "Keypoints sizes doesn't equal (validCount = %d, calcCount = %d).\n", const DMatch& m = matches[i];
validKeypoints.size(), calcKeypoints.size());
ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT); const KeyPoint& kp1 = keypoints1[m.queryIdx];
return; const KeyPoint& kp2 = keypoints2[m.trainIdx];
if (isSimilarKeypoints(kp1, kp2))
++count;
} }
if (validDescriptors.size() != calcDescriptors.size())
return count;
}
void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints,
const Mat& validDescriptors, const Mat& calcDescriptors)
{
BruteForceMatcher< L2<float> > matcher;
vector<DMatch> matches;
matcher.match(validDescriptors, calcDescriptors, matches);
int validCount = getValidCount(validKeypoints, calcKeypoints, matches);
float validRatio = (float)validCount / matches.size();
if (validRatio < 0.5f)
{ {
ts->printf(cvtest::TS::LOG, "Descriptors sizes doesn't equal.\n"); ts->printf(cvtest::TS::LOG, "Bad accuracy - %f.\n", validRatio);
ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT); ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );
return; return;
} }
for (size_t v = 0; v < validKeypoints.size(); v++)
{
int nearestIdx = -1;
float minDist = std::numeric_limits<float>::max();
for (size_t c = 0; c < calcKeypoints.size(); c++)
{
float curDist = (float)norm(calcKeypoints[c].pt - validKeypoints[v].pt);
if (curDist < minDist)
{
minDist = curDist;
nearestIdx = c;
}
}
assert(minDist >= 0);
if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))
{
ts->printf(cvtest::TS::LOG, "Bad keypoints accuracy.\n");
ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );
return;
}
if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.5f)
{
ts->printf(cvtest::TS::LOG, "Bad descriptors accuracy.\n");
ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );
return;
}
}
} }
void CV_GPU_SURFTest::regressionTest(SURF_GPU& fdetector) void CV_GPU_SURFTest::accuracyTest()
{ {
string imgFilename = string(ts->get_data_path()) + FEATURES2D_DIR + "/" + IMAGE_FILENAME; string imgFilename = string(ts->get_data_path()) + FEATURES2D_DIR + "/" + IMAGE_FILENAME;
string resFilename = string(ts->get_data_path()) + FEATURES2D_DIR + "/" + VALID_FILE_NAME;
// Read the test image. // Read the test image.
GpuMat image(imread(imgFilename, 0)); Mat image = imread(imgFilename, 0);
if (image.empty()) if (image.empty())
{ {
ts->printf( cvtest::TS::LOG, "Image %s can not be read.\n", imgFilename.c_str() ); ts->printf( cvtest::TS::LOG, "Image %s can not be read.\n", imgFilename.c_str() );
ts->set_failed_test_info( cvtest::TS::FAIL_INVALID_TEST_DATA ); ts->set_failed_test_info( cvtest::TS::FAIL_INVALID_TEST_DATA );
return; return;
} }
FileStorage fs(resFilename, FileStorage::READ); Mat mask(image.size(), CV_8UC1, Scalar::all(1));
mask(Range(0, image.rows / 2), Range(0, image.cols / 2)).setTo(Scalar::all(0));
// Compute keypoints. // Compute keypoints.
GpuMat mask(image.size(), CV_8UC1, Scalar::all(1));
mask(Range(0, image.rows / 2), Range(0, image.cols / 2)).setTo(Scalar::all(0));
vector<KeyPoint> calcKeypoints; vector<KeyPoint> calcKeypoints;
GpuMat calcDespcriptors; GpuMat calcDescriptors;
fdetector(image, mask, calcKeypoints, calcDespcriptors); SURF_GPU fdetector; fdetector.extended = false;
fdetector(GpuMat(image), GpuMat(mask), calcKeypoints, calcDescriptors);
if (fs.isOpened()) // Compare computed and valid keypoints.
{ // Calc validation keypoints set.
// Read validation keypoints set. vector<KeyPoint> validKeypoints;
vector<KeyPoint> validKeypoints; vector<float> validDescriptors;
Mat validDespcriptors; SURF fdetector_gold; fdetector_gold.extended = false;
read(fs["keypoints"], validKeypoints); fdetector_gold(image, mask, validKeypoints, validDescriptors);
read(fs["descriptors"], validDespcriptors);
if (validKeypoints.empty() || validDespcriptors.empty()) compareKeypointSets(validKeypoints, calcKeypoints,
{ Mat(validKeypoints.size(), fdetector_gold.descriptorSize(), CV_32F, &validDescriptors[0]), calcDescriptors);
ts->printf(cvtest::TS::LOG, "Validation file can not be read.\n");
ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
return;
}
compareKeypointSets(validKeypoints, calcKeypoints, validDespcriptors, calcDespcriptors);
}
else // Write detector parameters and computed keypoints as validation data.
{
fs.open(resFilename, FileStorage::WRITE);
if (!fs.isOpened())
{
ts->printf(cvtest::TS::LOG, "File %s can not be opened to write.\n", resFilename.c_str());
ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
return;
}
else
{
write(fs, "keypoints", calcKeypoints);
write(fs, "descriptors", (Mat)calcDespcriptors);
}
}
} }
void CV_GPU_SURFTest::run( int /*start_from*/ ) void CV_GPU_SURFTest::run( int /*start_from*/ )
{ {
SURF_GPU fdetector; emptyDataTest();
accuracyTest();
emptyDataTest(fdetector);
regressionTest(fdetector);
} }
TEST(SURF, empty_data_and_regression) { CV_GPU_SURFTest test; test.safe_run(); } TEST(SURF, empty_data_and_accuracy) { CV_GPU_SURFTest test; test.safe_run(); }
...@@ -264,10 +264,11 @@ TEST(SURF) ...@@ -264,10 +264,11 @@ TEST(SURF)
SURF surf; SURF surf;
vector<KeyPoint> keypoints1, keypoints2; vector<KeyPoint> keypoints1, keypoints2;
vector<float> descriptors1, descriptors2;
CPU_ON; CPU_ON;
surf(src1, Mat(), keypoints1); surf(src1, Mat(), keypoints1, descriptors1);
surf(src2, Mat(), keypoints2); surf(src2, Mat(), keypoints2, descriptors2);
CPU_OFF; CPU_OFF;
gpu::SURF_GPU d_surf; gpu::SURF_GPU d_surf;
...@@ -275,8 +276,8 @@ TEST(SURF) ...@@ -275,8 +276,8 @@ TEST(SURF)
gpu::GpuMat d_descriptors1, d_descriptors2; gpu::GpuMat d_descriptors1, d_descriptors2;
GPU_ON; GPU_ON;
d_surf(d_src1, gpu::GpuMat(), d_keypoints1); d_surf(d_src1, gpu::GpuMat(), d_keypoints1, d_descriptors1);
d_surf(d_src2, gpu::GpuMat(), d_keypoints2); d_surf(d_src2, gpu::GpuMat(), d_keypoints2, d_descriptors2);
GPU_OFF; GPU_OFF;
} }
......
...@@ -51,10 +51,10 @@ int main(int argc, char* argv[]) ...@@ -51,10 +51,10 @@ int main(int argc, char* argv[])
vector<KeyPoint> keypoints1, keypoints2; vector<KeyPoint> keypoints1, keypoints2;
vector<float> descriptors1, descriptors2; vector<float> descriptors1, descriptors2;
vector<DMatch> matches; vector<DMatch> matches;
SURF_GPU::downloadKeypoints(keypoints1GPU, keypoints1); surf.downloadKeypoints(keypoints1GPU, keypoints1);
SURF_GPU::downloadKeypoints(keypoints2GPU, keypoints2); surf.downloadKeypoints(keypoints2GPU, keypoints2);
SURF_GPU::downloadDescriptors(descriptors1GPU, descriptors1); surf.downloadDescriptors(descriptors1GPU, descriptors1);
SURF_GPU::downloadDescriptors(descriptors2GPU, descriptors2); surf.downloadDescriptors(descriptors2GPU, descriptors2);
BruteForceMatcher_GPU< L2<float> >::matchDownload(trainIdx, distance, matches); BruteForceMatcher_GPU< L2<float> >::matchDownload(trainIdx, distance, matches);
// drawing the results // drawing the results
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment