Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
58f69197
Commit
58f69197
authored
Mar 10, 2011
by
Vladislav Vinogradov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
made GPU version of SURF more consistent with CPU one
parent
c067c633
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
859 additions
and
1134 deletions
+859
-1134
gpu.hpp
modules/gpu/include/opencv2/gpu/gpu.hpp
+20
-48
internal_shared.hpp
modules/gpu/src/cuda/internal_shared.hpp
+6
-6
surf.cu
modules/gpu/src/cuda/surf.cu
+581
-818
surf.cpp
modules/gpu/src/surf.cpp
+187
-164
test_features2d.cpp
modules/gpu/test/test_features2d.cpp
+56
-90
tests.cpp
samples/gpu/performance/tests.cpp
+5
-4
surf_keypoint_matcher.cpp
samples/gpu/surf_keypoint_matcher.cpp
+4
-4
No files found.
modules/gpu/include/opencv2/gpu/gpu.hpp
View file @
58f69197
...
@@ -1537,83 +1537,55 @@ namespace cv
...
@@ -1537,83 +1537,55 @@ namespace cv
};
};
////////////////////////////////// SURF //////////////////////////////////////////
////////////////////////////////// SURF //////////////////////////////////////////
struct
CV_EXPORTS
SURFParams_GPU
{
SURFParams_GPU
()
:
threshold
(
0.1
f
),
nOctaves
(
4
),
nIntervals
(
4
),
initialScale
(
2.
f
),
l1
(
3.
f
/
1.5
f
),
l2
(
5.
f
/
1.5
f
),
l3
(
3.
f
/
1.5
f
),
l4
(
1.
f
/
1.5
f
),
edgeScale
(
0.81
f
),
initialStep
(
1
),
extended
(
true
),
featuresRatio
(
0.01
f
)
{}
//! The interest operator threshold
float
threshold
;
//! The number of octaves to process
int
nOctaves
;
//! The number of intervals in each octave
int
nIntervals
;
//! The scale associated with the first interval of the first octave
float
initialScale
;
//! mask parameter l_1
float
l1
;
//! mask parameter l_2
float
l2
;
//! mask parameter l_3
float
l3
;
//! mask parameter l_4
float
l4
;
//! The amount to scale the edge rejection mask
float
edgeScale
;
//! The initial sampling step in pixels.
int
initialStep
;
//! True, if generate 128-len descriptors, false - 64-len descriptors
bool
extended
;
//! max features = featuresRatio * img.size().srea()
float
featuresRatio
;
};
class
CV_EXPORTS
SURF_GPU
:
public
SURFParams_GPU
class
CV_EXPORTS
SURF_GPU
:
public
CvSURFParams
{
{
public
:
public
:
//! the default constructor
SURF_GPU
();
//! the full constructor taking all the necessary parameters
explicit
SURF_GPU
(
double
_hessianThreshold
,
int
_nOctaves
=
4
,
int
_nOctaveLayers
=
2
,
bool
_extended
=
false
,
float
_keypointsRatio
=
0.01
f
);
//! returns the descriptor size in float's (64 or 128)
//! returns the descriptor size in float's (64 or 128)
int
descriptorSize
()
const
;
int
descriptorSize
()
const
;
//! upload host keypoints to device memory
//! upload host keypoints to device memory
static
void
uploadKeypoints
(
const
vector
<
KeyPoint
>&
keypoints
,
GpuMat
&
keypointsGPU
);
void
uploadKeypoints
(
const
vector
<
KeyPoint
>&
keypoints
,
GpuMat
&
keypointsGPU
);
//! download keypoints from device to host memory
//! download keypoints from device to host memory
static
void
downloadKeypoints
(
const
GpuMat
&
keypointsGPU
,
vector
<
KeyPoint
>&
keypoints
);
void
downloadKeypoints
(
const
GpuMat
&
keypointsGPU
,
vector
<
KeyPoint
>&
keypoints
);
//! download descriptors from device to host memory
//! download descriptors from device to host memory
static
void
downloadDescriptors
(
const
GpuMat
&
descriptorsGPU
,
vector
<
float
>&
descriptors
);
void
downloadDescriptors
(
const
GpuMat
&
descriptorsGPU
,
vector
<
float
>&
descriptors
);
//! finds the keypoints using fast hessian detector used in SURF
//! finds the keypoints using fast hessian detector used in SURF
//! supports CV_8UC1 images
//! supports CV_8UC1 images
//! keypoints will have 1 row and type CV_32FC(6)
//! keypoints will have 1 row and type CV_32FC(6)
//! keypoints.at<float[6]>(1, i) contains i'th keypoint
//! keypoints.at<float[6]>(1, i) contains i'th keypoint
//! format: (x, y,
size, response, angle, octave
)
//! format: (x, y,
laplacian, size, dir, hessian
)
void
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
GpuMat
&
keypoints
);
void
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
GpuMat
&
keypoints
);
//! finds the keypoints and computes their descriptors.
//! finds the keypoints and computes their descriptors.
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
void
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
GpuMat
&
keypoints
,
GpuMat
&
descriptors
,
void
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
GpuMat
&
keypoints
,
GpuMat
&
descriptors
,
bool
useProvidedKeypoints
=
false
,
bool
calcOrientation
=
true
);
bool
useProvidedKeypoints
=
false
);
void
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
std
::
vector
<
KeyPoint
>&
keypoints
);
void
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
std
::
vector
<
KeyPoint
>&
keypoints
);
void
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
std
::
vector
<
KeyPoint
>&
keypoints
,
GpuMat
&
descriptors
,
void
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
std
::
vector
<
KeyPoint
>&
keypoints
,
GpuMat
&
descriptors
,
bool
useProvidedKeypoints
=
false
,
bool
calcOrientation
=
true
);
bool
useProvidedKeypoints
=
false
);
void
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
std
::
vector
<
KeyPoint
>&
keypoints
,
std
::
vector
<
float
>&
descriptors
,
void
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
std
::
vector
<
KeyPoint
>&
keypoints
,
std
::
vector
<
float
>&
descriptors
,
bool
useProvidedKeypoints
=
false
,
bool
calcOrientation
=
true
);
bool
useProvidedKeypoints
=
false
);
//! max keypoints = keypointsRatio * img.size().area()
float
keypointsRatio
;
GpuMat
sum
;
GpuMat
sum
,
mask1
,
maskSum
,
intBuffer
;
GpuMat
sumf
;
GpuMat
mask1
;
GpuMat
det
,
trace
;
GpuMat
maskSum
;
GpuMat
hessianBuffer
;
GpuMat
maxPosBuffer
;
GpuMat
maxPosBuffer
;
GpuMat
featuresBuffer
;
GpuMat
featuresBuffer
;
GpuMat
keypointsBuffer
;
};
};
}
}
...
...
modules/gpu/src/cuda/internal_shared.hpp
View file @
58f69197
...
@@ -111,20 +111,20 @@ namespace cv
...
@@ -111,20 +111,20 @@ namespace cv
{
{
float
x
;
float
x
;
float
y
;
float
y
;
float
laplacian
;
float
size
;
float
size
;
float
response
;
float
dir
;
float
angle
;
float
hessian
;
float
octave
;
};
};
enum
KeypointLayout
enum
KeypointLayout
{
{
SF_X
,
SF_X
,
SF_Y
,
SF_Y
,
SF_LAPLACIAN
,
SF_SIZE
,
SF_SIZE
,
SF_RESPONSE
,
SF_DIR
,
SF_ANGLE
,
SF_HESSIAN
,
SF_OCTAVE
,
SF_FEATURE_STRIDE
SF_FEATURE_STRIDE
};
};
}
}
...
...
modules/gpu/src/cuda/surf.cu
View file @
58f69197
...
@@ -55,52 +55,6 @@ using namespace cv::gpu::device;
...
@@ -55,52 +55,6 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace surf
namespace cv { namespace gpu { namespace surf
{
{
////////////////////////////////////////////////////////////////////////
// Help funcs
// Wrapper for host reference to pass into kernel
template <typename T>
class DeviceReference
{
public:
explicit DeviceReference(T& host_val) : d_ptr(0), h_ptr(&host_val)
{
cudaSafeCall( cudaMalloc((void**)&d_ptr, sizeof(T)) );
cudaSafeCall( cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice) );
}
~DeviceReference()
{
cudaSafeCall( cudaMemcpy(h_ptr, d_ptr, sizeof(T), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaFree(d_ptr) );
}
// Casting to device pointer
operator T*() {return d_ptr;}
operator const T*() const {return d_ptr;}
private:
T* d_ptr;
T* h_ptr;
};
__device__ void clearLastBit(int* f)
{
*f &= ~0x1;
}
__device__ void clearLastBit(float& f)
{
clearLastBit((int*)&f);
}
__device__ void setLastBit(int* f)
{
*f |= 0x1;
}
__device__ void setLastBit(float& f)
{
setLastBit((int*)&f);
}
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// Global parameters
// Global parameters
...
@@ -108,215 +62,126 @@ namespace cv { namespace gpu { namespace surf
...
@@ -108,215 +62,126 @@ namespace cv { namespace gpu { namespace surf
__constant__ int c_max_candidates;
__constant__ int c_max_candidates;
// The maximum number of features that memory is reserved for.
// The maximum number of features that memory is reserved for.
__constant__ int c_max_features;
__constant__ int c_max_features;
// The number of intervals in the octave.
// The maximum number of keypoints that memory is reserved for.
__constant__ int c_nIntervals;
__constant__ int c_max_keypoints;
// Mask sizes derived from the mask parameters
// The image size.
__constant__ float c_mask_width;
__constant__ int c_img_rows;
// Mask sizes derived from the mask parameters
__constant__ int c_img_cols;
__constant__ float c_mask_height;
// The number of layers.
// Mask sizes derived from the mask parameters
__constant__ int c_nOctaveLayers;
__constant__ float c_dxy_center_offset;
// The hessian threshold.
// Mask sizes derived from the mask parameters
__constant__ float c_hessianThreshold;
__constant__ float c_dxy_half_width;
// Mask sizes derived from the mask parameters
// The current octave.
__constant__ float c_dxy_scale;
// The scale associated with the first interval of the first octave
__constant__ float c_initialScale;
// The interest operator threshold
__constant__ float c_threshold;
// Ther octave
__constant__ int c_octave;
__constant__ int c_octave;
// The width of the octave buffer.
// The current layer size.
__constant__ int c_x_size;
__constant__ int c_layer_rows;
// The height of the octave buffer.
__constant__ int c_layer_cols;
__constant__ int c_y_size;
// The size of the octave border in pixels.
__constant__ int c_border;
// The step size used in this octave in pixels.
__constant__ int c_step;
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// Integral image texture
// Integral image texture
t
exture<float, 2, cudaReadModeElementType> sumTex(0, cudaFilterModeLinear, cudaAddressModeClamp)
;
t
ypedef texture<unsigned int, 2, cudaReadModeElementType> IntTex
;
__device__ float iiAreaLookupCDHalfWH(float cx, float cy, float halfWidth, float halfHeight)
IntTex sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
{
float result = 0.f;
result += tex2D(sumTex, cx - halfWidth, cy - halfHeight);
result -= tex2D(sumTex, cx + halfWidth, cy - halfHeight);
result -= tex2D(sumTex, cx - halfWidth, cy + halfHeight);
result += tex2D(sumTex, cx + halfWidth, cy + halfHeight);
return result;
}
////////////////////////////////////////////////////////////////////////
// Hessian
__device__ float evalDyy(float x, float y, float t, float mask_width, float mask_height, float fscale)
template <int N>
__device__ float icvCalcHaarPattern(const IntTex& tex, const float src[][5], int oldSize, int newSize, int y, int x)
{
{
float Dyy = 0.f;
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
typedef double real_t;
Dyy += iiAreaLookupCDHalfWH(x, y, mask_width, mask_height);
#else
Dyy -= t * iiAreaLookupCDHalfWH(x, y, mask_width, fscale);
typedef float real_t;
#endif
Dyy *= 1.0f / (fscale * fscale);
return Dyy;
float ratio = (float)newSize / oldSize;
}
real_t d = 0;
__device__ float evalDxx(float x, float y, float t, float mask_width, float mask_height, float fscale)
#pragma unroll
{
for (int k = 0; k < N; ++k)
float Dxx = 0.f;
{
int dx1 = __float2int_rn(ratio * src[k][0]);
int dy1 = __float2int_rn(ratio * src[k][1]);
int dx2 = __float2int_rn(ratio * src[k][2]);
int dy2 = __float2int_rn(ratio * src[k][3]);
Dxx += iiAreaLookupCDHalfWH(x, y, mask_height, mask_width);
real_t t = 0;
Dxx -= t * iiAreaLookupCDHalfWH(x, y, fscale , mask_width);
t += tex2D(tex, x + dx1, y + dy1);
t -= tex2D(tex, x + dx1, y + dy2);
t -= tex2D(tex, x + dx2, y + dy1);
t += tex2D(tex, x + dx2, y + dy2);
Dxx *= 1.0f / (fscale * fscale);
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
return
Dxx
;
return
(float)d
;
}
}
__device__ float evalDxy(float x, float y, float fscale)
////////////////////////////////////////////////////////////////////////
{
// Hessian
float center_offset = c_dxy_center_offset * fscale;
float half_width = c_dxy_half_width * fscale;
float Dxy = 0.f;
Dxy += iiAreaLookupCDHalfWH(x - center_offset, y - center_offset, half_width, half_width);
Dxy -= iiAreaLookupCDHalfWH(x - center_offset, y + center_offset, half_width, half_width);
Dxy += iiAreaLookupCDHalfWH(x + center_offset, y + center_offset, half_width, half_width);
Dxy -= iiAreaLookupCDHalfWH(x + center_offset, y - center_offset, half_width, half_width);
Dxy *= 1.0f / (fscale * fscale);
return Dxy;
__constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
}
__constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
__constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
__
device__ float calcScale(int hidx_z
)
__
host__ __device__ int calcSize(int octave, int layer
)
{
{
float d = (c_initialScale * (1 << c_octave)) / (c_nIntervals - 2);
/* Wavelet size at first layer of first octave. */
return c_initialScale * (1 << c_octave) + d * (hidx_z - 1.0f) + 0.5f;
const int HAAR_SIZE0 = 9;
}
__global__ void fasthessian(PtrStepf hessianBuffer)
{
// Determine the indices in the Hessian buffer
int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;
int hidx_y = threadIdx.y + blockIdx.y * blockDim.y;
int hidx_z = threadIdx.z;
float fscale = calcScale(hidx_z);
// Compute the lookup location of the mask center
/* Wavelet size increment between layers. This should be an even number,
float x = hidx_x * c_step + c_border;
such that the wavelet sizes in an octave are either all even or all odd.
float y = hidx_y * c_step + c_border;
This ensures that when looking for the neighbours of a sample, the layers
above and below are aligned correctly. */
const int HAAR_SIZE_INC = 6;
// Scale the mask dimensions according to the scale
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
{
float mask_width = c_mask_width * fscale;
float mask_height = c_mask_height * fscale;
// Compute the filter responses
float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxy = evalDxy(x, y, fscale);
// Combine the responses and store the Laplacian sign
float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);
if (Dxx + Dyy > 0.f)
setLastBit(result);
else
clearLastBit(result);
hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
}
}
}
__global__ void
fasthessian_old(PtrStepf hessianBuffer
)
__global__ void
icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace
)
{
{
// Determine the indices in the Hessian buffer
// Determine the indices
int gridDim_y = gridDim.y / c_nIntervals;
const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
int blockIdx_y = blockIdx.y % gridDim_y;
const int blockIdx_y = blockIdx.y % gridDim_y;
int blockIdx_z = blockIdx.y / gridDim_y;
const int blockIdx_z = blockIdx.y / gridDim_y;
int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;
int hidx_y = threadIdx.y + blockIdx_y * blockDim.y;
int hidx_z = blockIdx_z;
float fscale = calcScale(hidx_z);
const int j = threadIdx.x + blockIdx.x * blockDim.x;
const int i = threadIdx.y + blockIdx_y * blockDim.y;
const int layer = blockIdx_z;
// Compute the lookup location of the mask center
const int size = calcSize(c_octave, layer);
float x = hidx_x * c_step + c_border;
float y = hidx_y * c_step + c_border;
// Scale the mask dimensions according to the scale
const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
{
float mask_width = c_mask_width * fscale;
float mask_height = c_mask_height * fscale;
// Compute the filter responses
float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxy = evalDxy(x, y, fscale);
// Combine the responses and store the Laplacian sign
/* Ignore pixels where some of the kernel is outside the image */
float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy)
;
const int margin = (size >> 1) >> c_octave
;
if (Dxx + Dyy > 0.f)
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
setLastBit(result);
{
else
const float dx = icvCalcHaarPattern<3>(sumTex, c_DX , 9, size, i << c_octave, j << c_octave);
clearLastBit(result);
const float dy = icvCalcHaarPattern<3>(sumTex, c_DY , 9, size, i << c_octave, j << c_octave);
const float dxy = icvCalcHaarPattern<4>(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);
hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
}
}
}
}
dim3 calcBlockSize(int nIntervals)
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers)
{
int threadsPerBlock = 512;
dim3 threads;
threads.z = nIntervals;
threadsPerBlock /= nIntervals;
if (threadsPerBlock >= 48)
threads.x = 16;
else
threads.x = 8;
threadsPerBlock /= threads.x;
threads.y = threadsPerBlock;
return threads;
}
void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads)
{
{
dim3 grid;
const int min_size = calcSize(octave, 0);
grid.x = divUp(x_size, threads.x);
const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
grid.y = divUp(y_size, threads.y);
const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
fasthessian<<<grid, threads>>>(hessianBuffer);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld)
{
dim3 threads(16, 16);
dim3 threads(16, 16);
dim3 grid;
dim3 grid;
grid.x = divUp(
x_size
, threads.x);
grid.x = divUp(
max_samples_j
, threads.x);
grid.y = divUp(
y_size, threads.y) * threadsOld.z
;
grid.y = divUp(
max_samples_i, threads.y) * (nOctaveLayers + 2)
;
fasthessian_old<<<grid, threads>>>(hessianBuffer
);
icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace
);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaThreadSynchronize() );
...
@@ -325,109 +190,113 @@ namespace cv { namespace gpu { namespace surf
...
@@ -325,109 +190,113 @@ namespace cv { namespace gpu { namespace surf
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// NONMAX
// NONMAX
texture<int, 2, cudaReadModeElementType>
maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
IntTex
maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
struct WithOutMask
struct WithOutMask
{
{
static __device__ bool check(
float, float, floa
t)
static __device__ bool check(
int, int, in
t)
{
{
return true;
return true;
}
}
};
};
__constant__ float c_DM[1][5] = {{0, 0, 9, 9, 1}};
struct WithMask
struct WithMask
{
{
static __device__ bool check(
float x, float y, float fscal
e)
static __device__ bool check(
int sum_i, int sum_j, int siz
e)
{
{
float half_width = fscale / 2;
float mval = icvCalcHaarPattern<1>(maskSumTex, c_DM , 9, size, sum_i, sum_j);
return (mval >= 0.5);
float result = 0.f;
result += tex2D(maskSumTex, x - half_width, y - half_width);
result -= tex2D(maskSumTex, x + half_width, y - half_width);
result -= tex2D(maskSumTex, x - half_width, y + half_width);
result += tex2D(maskSumTex, x + half_width, y + half_width);
result /= (fscale * fscale);
return (result >= 0.5f);
}
}
};
};
template <typename Mask>
template <typename Mask>
__global__ void
nonmaxonly(PtrStepf hessianBuffer
, int4* maxPosBuffer, unsigned int* maxCounter)
__global__ void
icvFindMaximaInLayer(PtrStepf det, PtrStepf trace
, int4* maxPosBuffer, unsigned int* maxCounter)
{
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
extern __shared__ float
fh_vals
[];
extern __shared__ float
N9
[];
// The hidx variables are the indices to the hessian buffer.
// The hidx variables are the indices to the hessian buffer.
int hidx_x = threadIdx.x + blockIdx.x * (blockDim.x - 2);
const int gridDim_y = gridDim.y / c_nOctaveLayers;
int hidx_y = threadIdx.y + blockIdx.y * (blockDim.y - 2);
const int blockIdx_y = blockIdx.y % gridDim_y;
int hidx_z = threadIdx.z;
const int blockIdx_z = blockIdx.y / gridDim_y;
int localLin = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
// Is this thread within the hessian buffer?
const int layer = blockIdx_z + 1;
if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
{
const int size = calcSize(c_octave, layer);
fh_vals[localLin] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x];
}
__syncthreads();
// Is this location one of the ones being processed for nonmax suppression.
/* Ignore pixels without a 3x3x3 neighbourhood in the layer above */
// Blocks overlap by one so we don't process the border threads.
const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
bool inBounds2 = threadIdx.x > 0 && threadIdx.x < blockDim.x-1 && hidx_x < c_x_size - 1
&& threadIdx.y > 0 && threadIdx.y < blockDim.y-1 && hidx_y < c_y_size - 1
&& threadIdx.z > 0 && threadIdx.z < blockDim.z-1;
float val = fh_vals[localLin];
const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;
const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;
// Compute the lookup location of the mask center
// Is this thread within the hessian buffer?
float x = hidx_x * c_step + c_border;
const int zoff = blockDim.x * blockDim.y;
float y = hidx_y * c_step + c_border;
const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;
float fscale = calcScale(hidx_z);
N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + i)[j];
N9[localLin ] = det.ptr(c_layer_rows * (layer ) + i)[j];
N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + i)[j];
__syncthreads();
if (i
nBounds2 && val >= c_threshold && Mask::check(x, y, fscale)
)
if (i
< c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1
)
{
{
// Check to see if we have a max (in its 26 neighbours)
float val0 = N9[localLin];
int zoff = blockDim.x * blockDim.y;
bool condmax = val > fh_vals[localLin + 1]
if (val0 > c_hessianThreshold)
&& val > fh_vals[localLin - 1]
&& val > fh_vals[localLin - blockDim.x + 1]
&& val > fh_vals[localLin - blockDim.x ]
&& val > fh_vals[localLin - blockDim.x - 1]
&& val > fh_vals[localLin + blockDim.x + 1]
&& val > fh_vals[localLin + blockDim.x ]
&& val > fh_vals[localLin + blockDim.x - 1]
&& val > fh_vals[localLin - zoff + 1]
&& val > fh_vals[localLin - zoff ]
&& val > fh_vals[localLin - zoff - 1]
&& val > fh_vals[localLin - zoff - blockDim.x + 1]
&& val > fh_vals[localLin - zoff - blockDim.x ]
&& val > fh_vals[localLin - zoff - blockDim.x - 1]
&& val > fh_vals[localLin - zoff + blockDim.x + 1]
&& val > fh_vals[localLin - zoff + blockDim.x ]
&& val > fh_vals[localLin - zoff + blockDim.x - 1]
&& val > fh_vals[localLin + zoff + 1]
&& val > fh_vals[localLin + zoff ]
&& val > fh_vals[localLin + zoff - 1]
&& val > fh_vals[localLin + zoff - blockDim.x + 1]
&& val > fh_vals[localLin + zoff - blockDim.x ]
&& val > fh_vals[localLin + zoff - blockDim.x - 1]
&& val > fh_vals[localLin + zoff + blockDim.x + 1]
&& val > fh_vals[localLin + zoff + blockDim.x ]
&& val > fh_vals[localLin + zoff + blockDim.x - 1]
;
if(condmax)
{
{
unsigned int i = atomicInc(maxCounter,(unsigned int) -1);
// Coordinates for the start of the wavelet in the sum image. There
// is some integer division involved, so don't try to simplify this
// (cancel out sampleStep) without checking the result is the same
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (
i < c_max_candidates)
if (
Mask::check(sum_i, sum_j, size))
{
{
int4 f = {hidx_x, hidx_y, threadIdx.z, c_octave};
// Check to see if we have a max (in its 26 neighbours)
maxPosBuffer[i] = f;
const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
&& val0 > N9[localLin - blockDim.x - zoff]
&& val0 > N9[localLin + 1 - blockDim.x - zoff]
&& val0 > N9[localLin - 1 - zoff]
&& val0 > N9[localLin - zoff]
&& val0 > N9[localLin + 1 - zoff]
&& val0 > N9[localLin - 1 + blockDim.x - zoff]
&& val0 > N9[localLin + blockDim.x - zoff]
&& val0 > N9[localLin + 1 + blockDim.x - zoff]
&& val0 > N9[localLin - 1 - blockDim.x]
&& val0 > N9[localLin - blockDim.x]
&& val0 > N9[localLin + 1 - blockDim.x]
&& val0 > N9[localLin - 1 ]
&& val0 > N9[localLin + 1 ]
&& val0 > N9[localLin - 1 + blockDim.x]
&& val0 > N9[localLin + blockDim.x]
&& val0 > N9[localLin + 1 + blockDim.x]
&& val0 > N9[localLin - 1 - blockDim.x + zoff]
&& val0 > N9[localLin - blockDim.x + zoff]
&& val0 > N9[localLin + 1 - blockDim.x + zoff]
&& val0 > N9[localLin - 1 + zoff]
&& val0 > N9[localLin + zoff]
&& val0 > N9[localLin + 1 + zoff]
&& val0 > N9[localLin - 1 + blockDim.x + zoff]
&& val0 > N9[localLin + blockDim.x + zoff]
&& val0 > N9[localLin + 1 + blockDim.x + zoff]
;
if(condmax)
{
unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);
if (ind < c_max_candidates)
{
const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);
maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);
}
}
}
}
}
}
}
}
...
@@ -435,21 +304,26 @@ namespace cv { namespace gpu { namespace surf
...
@@ -435,21 +304,26 @@ namespace cv { namespace gpu { namespace surf
#endif
#endif
}
}
void
nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
void
icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int
x_size, int y_size, bool use_mask, const dim3& thread
s)
int
img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayer
s)
{
{
dim3 grid;
const int layer_rows = img_rows >> octave;
grid.x = divUp(x_size, threads.x - 2);
const int layer_cols = img_cols >> octave;
grid.y = divUp(y_size, threads.y - 2);
int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
dim3 threads(16, 16);
const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);
dim3 grid;
grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);
grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;
DeviceReference<unsigned int> maxCounterWrapper(maxCounter
);
const size_t smem_size = threads.x * threads.y * 3 * sizeof(float
);
if (use_mask)
if (use_mask)
nonmaxonly<WithMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapp
er);
icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCount
er);
else
else
nonmaxonly<WithOutMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapp
er);
icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCount
er);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaGetLastError() );
...
@@ -459,166 +333,117 @@ namespace cv { namespace gpu { namespace surf
...
@@ -459,166 +333,117 @@ namespace cv { namespace gpu { namespace surf
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// INTERPOLATION
// INTERPOLATION
#define MID_IDX 1
__global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
__global__ void fh_interp_extremum(PtrStepf hessianBuffer, const int4* maxPosBuffer,
KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
{
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
int hidx_x = maxPosBuffer[blockIdx.x].x - 1 + threadIdx.x;
const int4 maxPos = maxPosBuffer[blockIdx.x];
int hidx_y = maxPosBuffer[blockIdx.x].y - 1 + threadIdx.y;
int hidx_z = maxPosBuffer[blockIdx.x].z - 1 + threadIdx.z;
const int j = maxPos.x - 1 + threadIdx.x;
const int i = maxPos.y - 1 + threadIdx.y;
const int layer = maxPos.z - 1 + threadIdx.z;
__shared__ float
fh_vals
[3][3][3];
__shared__ float
N9
[3][3][3];
__shared__ KeyPoint_GPU p;
__shared__ KeyPoint_GPU p;
fh_vals[threadIdx.z][threadIdx.y][threadIdx.x] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x
];
N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j
];
__syncthreads();
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
{
{
__shared__ float H[3][3];
__shared__ float dD[3];
//dxx
H[0][0] = fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ]
- 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]
+ fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ];
//dyy
//dx
H[1][1] = fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1]
dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
- 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]
//dy
+ fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1];
dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
//ds
dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
//dss
__shared__ float H[3][3];
H[2][2] = fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ]
- 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]
+ fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ];
//dxx
H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
//dxy
//dxy
H[0][1]= 0.25f*
H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
(fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX + 1] -
fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX + 1] -
fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX - 1] +
fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX - 1]);
//dxs
//dxs
H[0][2]= 0.25f*
H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
(fh_vals[MID_IDX + 1][MID_IDX + 1][MID_IDX ] -
fh_vals[MID_IDX + 1][MID_IDX - 1][MID_IDX ] -
fh_vals[MID_IDX - 1][MID_IDX + 1][MID_IDX ] +
fh_vals[MID_IDX - 1][MID_IDX - 1][MID_IDX ]);
//dys
H[1][2]= 0.25f*
(fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX + 1] -
fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX - 1] -
fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX + 1] +
fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX - 1]);
//dyx = dxy
//dyx = dxy
H[1][0] = H[0][1];
H[1][0] = H[0][1];
//dyy
H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
//dys
H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
//dsx = dxs
//dsx = dxs
H[2][0] = H[0][2];
H[2][0] = H[0][2];
//dsy = dys
//dsy = dys
H[2][1] = H[1][2];
H[2][1] = H[1][2];
//dss
H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
__shared__ float dD[3];
float det = H[0][0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1])
- H[0][1] * (H[1][0] * H[2][2] - H[1][2] * H[2][0])
+ H[0][2] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]);
//dx
if (det != 0.0f)
dD[0] = 0.5f*(fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ] -
fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ]);
//dy
dD[1] = 0.5f*(fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1] -
fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1]);
//ds
dD[2] = 0.5f*(fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ] -
fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ]);
__shared__ float invdet;
invdet = 1.f /
(
H[0][0]*H[1][1]*H[2][2]
+ H[0][1]*H[1][2]*H[2][0]
+ H[0][2]*H[1][0]*H[2][1]
- H[0][0]*H[1][2]*H[2][1]
- H[0][1]*H[1][0]*H[2][2]
- H[0][2]*H[1][1]*H[2][0]
);
// // 1-based entries of a 3x3 inverse
// /* [ |a22 a23| |a12 a13| |a12 a13|] */
// /* [ |a32 a33| -|a32 a33| |a22 a23|] */
// /* [ ] */
// /* [ |a21 a23| |a11 a13| |a11 a13|] */
// /* A^(-1) = [-|a31 a33| |a31 a33| -|a21 a23|] / d */
// /* [ ] */
// /* [ |a21 a22| |a11 a12| |a11 a12|] */
// /* [ |a31 a32| -|a31 a32| |a21 a22|] */
__shared__ float Hinv[3][3];
Hinv[0][0] = invdet*(H[1][1]*H[2][2]-H[1][2]*H[2][1]);
Hinv[0][1] = -invdet*(H[0][1]*H[2][2]-H[0][2]*H[2][1]);
Hinv[0][2] = invdet*(H[0][1]*H[1][2]-H[0][2]*H[1][1]);
Hinv[1][0] = -invdet*(H[1][0]*H[2][2]-H[1][2]*H[2][0]);
Hinv[1][1] = invdet*(H[0][0]*H[2][2]-H[0][2]*H[2][0]);
Hinv[1][2] = -invdet*(H[0][0]*H[1][2]-H[0][2]*H[1][0]);
Hinv[2][0] = invdet*(H[1][0]*H[2][1]-H[1][1]*H[2][0]);
Hinv[2][1] = -invdet*(H[0][0]*H[2][1]-H[0][1]*H[2][0]);
Hinv[2][2] = invdet*(H[0][0]*H[1][1]-H[0][1]*H[1][0]);
__shared__ float x[3];
x[0] = -(Hinv[0][0]*(dD[0]) + Hinv[0][1]*(dD[1]) + Hinv[0][2]*(dD[2]));
x[1] = -(Hinv[1][0]*(dD[0]) + Hinv[1][1]*(dD[1]) + Hinv[1][2]*(dD[2]));
x[2] = -(Hinv[2][0]*(dD[0]) + Hinv[2][1]*(dD[1]) + Hinv[2][2]*(dD[2]));
if (fabs(x[0]) < 1.f && fabs(x[1]) < 1.f && fabs(x[2]) < 1.f)
{
{
// if the step is within the interpolation region, perform it
float invdet = 1.0f / det;
// Get a new feature index.
__shared__ float x[3];
unsigned int i = atomicInc(featureCounter, (unsigned int)-1);
if (i < c_max_features)
x[0] = invdet *
(dD[0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1]) -
H[0][1] * (dD[1] * H[2][2] - H[1][2] * dD[2]) +
H[0][2] * (dD[1] * H[2][1] - H[1][1] * dD[2]));
x[1] = invdet *
(H[0][0] * (dD[1] * H[2][2] - H[1][2] * dD[2]) -
dD[0] * (H[1][0] * H[2][2] - H[1][2] * H[2][0]) +
H[0][2] * (H[1][0] * dD[2] - dD[1] * H[2][0]));
x[2] = invdet *
(H[0][0] * (H[1][1] * dD[2] - dD[1] * H[2][1]) -
H[0][1] * (H[1][0] * dD[2] - dD[1] * H[2][0]) +
dD[0] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]));
if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)
{
{
p.x = ((float)maxPosBuffer[blockIdx.x].x + x[1]) * (float)c_step + c_border;
// if the step is within the interpolation region, perform it
p.y = ((float)maxPosBuffer[blockIdx.x].y + x[0]) * (float)c_step + c_border;
if (x[2] > 0)
// Get a new feature index.
{
unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);
float a = calcScale(maxPosBuffer[blockIdx.x].z);
float b = calcScale(maxPosBuffer[blockIdx.x].z + 1);
p.size = (1.f - x[2]) * a + x[2] * b;
if (ind < c_max_features)
}
else
{
{
float a = calcScale(maxPosBuffer[blockIdx.x].z);
const int size = calcSize(c_octave, maxPos.z);
float b = calcScale(maxPosBuffer[blockIdx.x].z - 1);
p.size = (1.f + x[2]) * a - x[2] * b;
const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
}
const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
const float center_i = sum_i + (float)(size - 1) / 2;
const float center_j = sum_j + (float)(size - 1) / 2;
p.octave = c_octave;
p.x = center_j + x[0] * (1 << c_octave);
p.y = center_i + x[1] * (1 << c_octave);
p.response = fh_vals[MID_IDX][MID_IDX][MID_IDX];
int ds = size - calcSize(c_octave, maxPos.z - 1);
p.size = roundf(size + x[2] * ds);
// Should we split up this transfer over many threads?
p.laplacian = maxPos.w;
featuresBuffer[i] = p;
p.dir = 0.0f;
}
p.hessian = N9[1][1][1];
} // If the subpixel interpolation worked
// Should we split up this transfer over many threads?
featuresBuffer[ind] = p;
}
} // If the subpixel interpolation worked
}
} // If this is thread 0.
} // If this is thread 0.
#endif
#endif
}
}
#undef MID_IDX
void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter,
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter)
{
{
dim3 threads;
dim3 threads;
threads.x = 3;
threads.x = 3;
...
@@ -628,9 +453,7 @@ namespace cv { namespace gpu { namespace surf
...
@@ -628,9 +453,7 @@ namespace cv { namespace gpu { namespace surf
dim3 grid;
dim3 grid;
grid.x = maxCounter;
grid.x = maxCounter;
DeviceReference<unsigned int> featureCounterWrapper(featureCounter);
icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featuresBuffer, featureCounter);
fh_interp_extremum<<<grid, threads>>>(hessianBuffer, maxPosBuffer, featuresBuffer, featureCounterWrapper);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaThreadSynchronize() );
...
@@ -639,139 +462,217 @@ namespace cv { namespace gpu { namespace surf
...
@@ -639,139 +462,217 @@ namespace cv { namespace gpu { namespace surf
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// Orientation
// Orientation
// precomputed values for a Gaussian with a standard deviation of 2
#define ORI_SEARCH_INC 5
__constant__ float c_gauss1D[13] =
#define ORI_WIN 60
{
#define ORI_SAMPLES 113
0.002215924206f, 0.008764150247f, 0.026995483257f, 0.064758797833f,
0.120985362260f, 0.176032663382f, 0.199471140201f, 0.176032663382f,
0.120985362260f, 0.064758797833f, 0.026995483257f, 0.008764150247f,
0.002215924206f
};
__global__ void find_orientation(KeyPoint_GPU* features)
__constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
{
__constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
int tid = threadIdx.y * 17 + threadIdx.x;
__constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};
int tid2 = numeric_limits_gpu<int>::max();
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
if (threadIdx.x < 13 && threadIdx.y < 13)
__global__ void icvCalcOrientation(const KeyPoint_GPU* featureBuffer, KeyPoint_GPU* keypoints, unsigned int* keypointCounter)
{
{
tid2 = threadIdx.y * 13 + threadIdx.x;
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
}
__shared__ float texLookups[17][17];
__shared__ float s_X[128];
__shared__ float s_Y[128];
__shared__ float s_angle[128];
__shared__ float Edx[13*13];
__shared__ float s_sumx[64 * 4];
__shared__ float Edy[13*13];
__shared__ float s_sumy[64 * 4];
__shared__ float xys[3];
// Read my x, y, size.
__shared__ float s_feature[6];
if (tid < 3)
{
if (threadIdx.x < 6 && threadIdx.y == 0)
xys[tid] = ((float*)(&features[blockIdx.x]))[tid];
s_feature[threadIdx.x] = ((float*)(&featureBuffer[blockIdx.x]))[threadIdx.x];
}
__syncthreads();
__syncthreads();
// Read all texture locations into memory
// Maybe I should use __mul24 here?
texLookups[threadIdx.x][threadIdx.y] = tex2D(sumTex, xys[SF_X] + ((int)threadIdx.x - 8) * xys[SF_SIZE],
xys[SF_Y] + ((int)threadIdx.y - 8) * xys[SF_SIZE]);
__syncthreads();
/* The sampling intervals and wavelet sized for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = s_feature[SF_SIZE] * 1.2f / 9.0f;
float dx = 0.f;
/* To find the dominant orientation, the gradients in x and y are
float dy = 0.f;
sampled in a circle of radius 6s using wavelets of size 4s.
We ensure the gradient wavelet size is even to ensure the
wavelet pattern is balanced and symmetric around its center */
const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
// Computes lookups for all points in a 13x13 lattice.
// check when grad_wav_size is too big
// - SURF says to only use a circle, but the branching logic would slow it down
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
// - Gaussian weighting should reduce the effects of the outer points anyway
if (tid2 < 169)
{
{
dx -= texLookups[threadIdx.x ][threadIdx.y ];
// Calc X, Y, angle and store it to shared memory
dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ];
{
dx -= texLookups[threadIdx.x + 4][threadIdx.y ];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
dx += texLookups[threadIdx.x ][threadIdx.y + 4];
dx -= 2.f*texLookups[threadIdx.x + 2][threadIdx.y + 4];
dx += texLookups[threadIdx.x + 4][threadIdx.y + 4];
dy -= texLookups[threadIdx.x ][threadIdx.y ];
dy += 2.f*texLookups[threadIdx.x ][threadIdx.y + 2];
dy -= texLookups[threadIdx.x ][threadIdx.y + 4];
dy += texLookups[threadIdx.x + 4][threadIdx.y ];
dy -= 2.f*texLookups[threadIdx.x + 4][threadIdx.y + 2];
dy += texLookups[threadIdx.x + 4][threadIdx.y + 4];
float g = c_gauss1D[threadIdx.x] * c_gauss1D[threadIdx.y];
Edx[tid2] = dx * g;
Edy[tid2] = dy * g;
}
__syncthreads()
;
float X = 0.0f, Y = 0.0f, angle = 0.0f
;
// This is a scan to get the summed dx, dy values.
if (tid < ORI_SAMPLES)
// Gets 128-168
{
if (tid < 41)
const float margin = (float)(grad_wav_size - 1) / 2.0f;
{
const int x = __float2int_rn(s_feature[SF_X] + c_aptX[tid] * s - margin);
Edx[tid] += Edx[tid + 128];
const int y = __float2int_rn(s_feature[SF_Y] + c_aptY[tid] * s - margin);
}
__syncthreads();
if (tid < 64)
{
Edx[tid] += Edx[tid + 64];
}
__syncthreads();
if (tid < 32)
{
volatile float* smem = Edx;
smem[tid] += smem[tid + 32];
smem[tid] += smem[tid + 16];
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
}
// Gets 128-168
if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))
if (tid < 41)
{
{
X = c_aptW[tid] * icvCalcHaarPattern<2>(sumTex, c_NX, 4, grad_wav_size, y, x);
Edy[tid] += Edy[tid + 128];
Y = c_aptW[tid] * icvCalcHaarPattern<2>(sumTex, c_NY, 4, grad_wav_size, y, x);
}
__syncthreads();
angle = atan2f(Y, X);
if (tid < 64)
if (angle < 0)
{
angle += 2.0f * CV_PI;
Edy[tid] += Edy[tid + 64];
angle *= 180.0f / CV_PI;
}
}
__syncthreads();
}
if (tid < 32)
if (tid < 128)
{
{
volatile float* smem = Edy;
s_X[tid] = X;
s_Y[tid] = Y;
smem[tid] += smem[tid + 32];
s_angle[tid] = angle;
smem[tid] += smem[tid + 16];
}
smem[tid] += smem[tid + 8];
}
smem[tid] += smem[tid + 4];
__syncthreads();
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
}
// Thread 0 saves back the result.
float bestx = 0, besty = 0, best_mod = 0;
if (tid == 0)
{
#pragma unroll
features[blockIdx.x].angle = -atan2(Edy[0], Edx[0]) * (180.0f / CV_PI);
for (int i = 0; i < 18; ++i)
{
const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
float sumx = 0.0f, sumy = 0.0f;
int d = abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx = s_X[threadIdx.x];
sumy = s_Y[threadIdx.x];
}
d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[threadIdx.x + 64];
sumy += s_Y[threadIdx.x + 64];
}
float* s_sumx_row = s_sumx + threadIdx.y * 64;
float* s_sumy_row = s_sumy + threadIdx.y * 64;
s_sumx_row[threadIdx.x] = sumx;
s_sumy_row[threadIdx.x] = sumy;
__syncthreads();
if (threadIdx.x < 32)
{
volatile float* v_sumx_row = s_sumx_row;
volatile float* v_sumy_row = s_sumy_row;
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 32];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 32];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 16];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 16];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 8];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 8];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 4];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 4];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 2];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 2];
v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 1];
v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 1];
}
const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod)
{
best_mod = temp_mod;
bestx = sumx;
besty = sumy;
}
__syncthreads();
}
if (threadIdx.x == 0)
{
s_X[threadIdx.y] = bestx;
s_Y[threadIdx.y] = besty;
s_angle[threadIdx.y] = best_mod;
}
__syncthreads();
if (threadIdx.x < 2 && threadIdx.y == 0)
{
volatile float* v_x = s_X;
volatile float* v_y = s_Y;
volatile float* v_mod = s_angle;
bestx = v_x[threadIdx.x];
besty = v_y[threadIdx.x];
best_mod = v_mod[threadIdx.x];
float temp_mod = v_mod[threadIdx.x + 2];
if (temp_mod > best_mod)
{
v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2];
v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2];
v_mod[threadIdx.x] = best_mod = temp_mod;
}
temp_mod = v_mod[threadIdx.x + 1];
if (temp_mod > best_mod)
{
v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1];
v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1];
}
}
if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)
{
// Get a new feature index.
unsigned int ind = atomicInc(keypointCounter, (unsigned int)-1);
if (ind < c_max_keypoints)
{
float kp_dir = atan2f(besty, bestx);
if (kp_dir < 0)
kp_dir += 2.0f * CV_PI;
kp_dir *= 180.0f / CV_PI;
__shared__ KeyPoint_GPU kp;
kp.x = s_feature[SF_X];
kp.y = s_feature[SF_Y];
kp.laplacian = s_feature[SF_LAPLACIAN];
kp.size = s_feature[SF_SIZE];
kp.dir = kp_dir;
kp.hessian = s_feature[SF_HESSIAN];
keypoints[ind] = kp;
}
}
}
}
#endif
}
}
void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures)
#undef ORI_SEARCH_INC
#undef ORI_WIN
#undef ORI_SAMPLES
void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter)
{
{
dim3 threads;
dim3 threads;
threads.x =
17
;
threads.x =
64
;
threads.y =
17
;
threads.y =
4
;
dim3 grid;
dim3 grid;
grid.x = nFeatures;
grid.x = nFeatures;
find_orientation<<<grid, threads>>>(features
);
icvCalcOrientation<<<grid, threads>>>(featureBuffer, keypoints, keypointCounter
);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaThreadSynchronize() );
...
@@ -780,117 +681,119 @@ namespace cv { namespace gpu { namespace surf
...
@@ -780,117 +681,119 @@ namespace cv { namespace gpu { namespace surf
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// Descriptors
// Descriptors
// precomputed values for a Gaussian with a standard deviation of 3.3
#define PATCH_SZ 20
// - it appears SURF uses a different value, but not sure what it is
__constant__ float c_3p3gauss1D[20] =
texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
{
0.001917811039f, 0.004382549939f, 0.009136246641f, 0.017375153068f, 0.030144587513f,
__constant__ float c_DW[PATCH_SZ * PATCH_SZ] =
0.047710056854f, 0.068885910797f, 0.090734146446f, 0.109026229640f, 0.119511889092f,
{
0.119511889092f, 0.109026229640f, 0.090734146446f, 0.068885910797f, 0.047710056854f,
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
0.030144587513f, 0.017375153068f, 0.009136246641f, 0.004382549939f, 0.001917811039f
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
};
};
template <int BLOCK_DIM_X>
__device__ void calcPATCH(float s_PATCH[6][6], float s_pt[5], int i1, int j1, int i2, int j2)
__global__ void normalize_descriptors(PtrStepf descriptors)
{
{
// no need for thread ID
const float centerX = s_pt[SF_X];
float* descriptor_base = descriptors.ptr(blockIdx.x);
const float centerY = s_pt[SF_Y];
const float size = s_pt[SF_SIZE];
const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180);
// read in the unnormalized descriptor values (squared)
/* The sampling intervals and wavelet sized for selecting an orientation
__shared__ float sqDesc[BLOCK_DIM_X];
and building the keypoint descriptor are defined relative to 's' */
const float lookup = descriptor_base[threadIdx.x];
const float s = size * 1.2f / 9.0f;
sqDesc[threadIdx.x] = lookup * lookup;
__syncthreads();
if (BLOCK_DIM_X >= 128)
/* Extract a window of pixels around the keypoint of size 20s */
{
const int win_size = (int)((PATCH_SZ + 1) * s);
if (threadIdx.x < 64)
sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
__syncthreads();
}
// reduction to get total
float sin_dir;
if (threadIdx.x < 32)
float cos_dir;
{
sincosf(descriptor_dir, &sin_dir, &cos_dir);
volatile float* smem = sqDesc;
smem[threadIdx.x] += smem[threadIdx.x + 32];
/* Nearest neighbour version (faster) */
smem[threadIdx.x] += smem[threadIdx.x + 16];
const float win_offset = -(float)(win_size - 1) / 2;
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
}
// compute length (square root)
/* Scale the window to size PATCH_SZ so each pixel's size is s. This
__shared__ float len;
makes calculating the gradients with wavelets of size 2s easy */
if (threadIdx.x == 0)
const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size;
{
const float jcoo = ((float)j1 / (PATCH_SZ + 1)) * win_size;
len = sqrtf(sqDesc[0]);
}
__syncthreads();
// normalize and store in output
const int i = __float2int_rd(icoo);
descriptor_base[threadIdx.x] = lookup / len;
const int j = __float2int_rd(jcoo);
}
__device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const float* ipt,
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
int xIndex, int yIndex, int tid)
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
{
float sin_theta, cos_theta;
float res = tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (j + 1 - jcoo);
sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);
pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i + 1) * sin_dir;
// Compute rotated sampling points
pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i + 1) * cos_dir;
// (clockwise rotation since we are rotating the lattice)
// (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)
res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (j + 1 - jcoo);
const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+ sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i + 1) * sin_dir;
const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i + 1) * cos_dir;
+ cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (jcoo - j);
// gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)
// a b c
pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i) * sin_dir;
// d f
pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i) * cos_dir;
// g h i
const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
res += tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (jcoo - j);
const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]);
const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
s_PATCH[i2][j2] = (unsigned char)res;
const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y);
const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y);
const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]);
const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
// compute axis-aligned HaarX, HaarY
// (could group the additions together into multiplications)
const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular)
const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx
const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy
// rotate responses (store all dxs then all dys)
// - counterclockwise rotation to rotate back to zero orientation
sdx_bin[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx
sdy_bin[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy
}
}
__device__ void calc_dx_dy(float
* sdx_bin, float* sdy_bin, const KeyPoint_GPU* features)//(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features
)
__device__ void calc_dx_dy(float
s_PATCH[6][6], float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid
)
{
{
// get the interest point parameters (x, y, size, response, angle)
// get the interest point parameters (x, y, size, response, angle)
__shared__ float
i
pt[5];
__shared__ float
s_
pt[5];
if (t
hreadIdx.x < 5 && threadIdx.y == 0
)
if (t
id < 5
)
{
{
ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x
];
s_pt[tid] = ((float*)(&keypoints[blockIdx.x]))[tid
];
}
}
__syncthreads();
__syncthreads();
// Compute sampling points
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
// since grids are 2D, need to compute xBlock and yBlock indices
const int xBlock = (threadIdx.y & 3); // threadIdx.y % 4
const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
const int yBlock = (threadIdx.y >> 2); // floor(threadIdx.y / 4)
const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
const int xIndex = (xBlock * 5) + (threadIdx.x % 5);
const int xIndex = xBlock * blockDim.x + threadIdx.x;
const int yIndex = (yBlock * 5) + (threadIdx.x / 5);
const int yIndex = yBlock * blockDim.y + threadIdx.y;
calcPATCH(s_PATCH, s_pt, yIndex, xIndex, threadIdx.y, threadIdx.x);
if (threadIdx.x == 0)
calcPATCH(s_PATCH, s_pt, yIndex, xBlock * blockDim.x + 5, threadIdx.y, 5);
if (threadIdx.y == 0)
calcPATCH(s_PATCH, s_pt, yBlock * blockDim.y + 5, xIndex, 5, threadIdx.x);
if (threadIdx.x == 0 && threadIdx.y == 0)
calcPATCH(s_PATCH, s_pt, xBlock * blockDim.x + 5, yBlock * blockDim.y + 5, 5, 5);
__syncthreads();
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = (s_PATCH[threadIdx.y ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x ]) * dw;
const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y ][threadIdx.x + 1]) * dw;
calc_dx_dy(sdx_bin, sdy_bin, ipt, xIndex, yIndex, threadIdx.x);
s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy;
}
}
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2,
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2,
...
@@ -933,193 +836,9 @@ namespace cv { namespace gpu { namespace surf
...
@@ -933,193 +836,9 @@ namespace cv { namespace gpu { namespace surf
// Spawn 16 blocks per interest point
// Spawn 16 blocks per interest point
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)
__global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)
{
// 2 floats (dx, dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx [16 * 25];
__shared__ float sdy [16 * 25];
__shared__ float sdxabs[16 * 25];
__shared__ float sdyabs[16 * 25];
__shared__ float sdesc[64];
float* sdx_bin = sdx + (threadIdx.y * 25);
float* sdy_bin = sdy + (threadIdx.y * 25);
float* sdxabs_bin = sdxabs + (threadIdx.y * 25);
float* sdyabs_bin = sdyabs + (threadIdx.y * 25);
calc_dx_dy(sdx_bin, sdy_bin, features);
__syncthreads();
sdxabs_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]); // |dx| array
sdyabs_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); // |dy| array
__syncthreads();
reduce_sum25(sdx_bin, sdy_bin, sdxabs_bin, sdyabs_bin, threadIdx.x);
__syncthreads();
float* sdesc_bin = sdesc + (threadIdx.y << 2);
// write dx, dy, |dx|, |dy|
if (threadIdx.x == 0)
{
sdesc_bin[0] = sdx_bin[0];
sdesc_bin[1] = sdy_bin[0];
sdesc_bin[2] = sdxabs_bin[0];
sdesc_bin[3] = sdyabs_bin[0];
}
__syncthreads();
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < 64)
descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
}
// Spawn 16 blocks per interest point
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[16 * 25];
__shared__ float sdy[16 * 25];
// sum (reduce) 5x5 area response
__shared__ float sd1[16 * 25];
__shared__ float sd2[16 * 25];
__shared__ float sdabs1[16 * 25];
__shared__ float sdabs2[16 * 25];
__shared__ float sdesc[128];
float* sdx_bin = sdx + (threadIdx.y * 25);
float* sdy_bin = sdy + (threadIdx.y * 25);
float* sd1_bin = sd1 + (threadIdx.y * 25);
float* sd2_bin = sd2 + (threadIdx.y * 25);
float* sdabs1_bin = sdabs1 + (threadIdx.y * 25);
float* sdabs2_bin = sdabs2 + (threadIdx.y * 25);
calc_dx_dy(sdx_bin, sdy_bin, features);
__syncthreads();
if (sdy_bin[threadIdx.x] >= 0)
{
sd1_bin[threadIdx.x] = sdx_bin[threadIdx.x];
sdabs1_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]);
sd2_bin[threadIdx.x] = 0;
sdabs2_bin[threadIdx.x] = 0;
}
else
{
sd1_bin[threadIdx.x] = 0;
sdabs1_bin[threadIdx.x] = 0;
sd2_bin[threadIdx.x] = sdx_bin[threadIdx.x];
sdabs2_bin[threadIdx.x] = fabs(sdx[threadIdx.x]);
}
__syncthreads();
reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
__syncthreads();
float* sdesc_bin = sdesc + (threadIdx.y << 3);
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (threadIdx.x == 0)
{
sdesc_bin[0] = sd1_bin[0];
sdesc_bin[1] = sdabs1_bin[0];
sdesc_bin[2] = sd2_bin[0];
sdesc_bin[3] = sdabs2_bin[0];
}
__syncthreads();
if (sdx_bin[threadIdx.x] >= 0)
{
sd1_bin[threadIdx.x] = sdy_bin[threadIdx.x];
sdabs1_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);
sd2_bin[threadIdx.x] = 0;
sdabs2_bin[threadIdx.x] = 0;
}
else
{
sd1_bin[threadIdx.x] = 0;
sdabs1_bin[threadIdx.x] = 0;
sd2_bin[threadIdx.x] = sdy_bin[threadIdx.x];
sdabs2_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);
}
__syncthreads();
reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
__syncthreads();
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (threadIdx.x == 0)
{
sdesc_bin[4] = sd1_bin[0];
sdesc_bin[5] = sdabs1_bin[0];
sdesc_bin[6] = sd2_bin[0];
sdesc_bin[7] = sdabs2_bin[0];
}
__syncthreads();
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < 128)
descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
}
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
if (descriptors.cols == 64)
{
compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
}
__device__ void calc_dx_dy_old(float sdx[25], float sdy[25], const KeyPoint_GPU* features, int tid)
{
// get the interest point parameters (x, y, scale, strength, theta)
__shared__ float ipt[5];
if (tid < 5)
{
ipt[tid] = ((float*)&features[blockIdx.x])[tid];
}
__syncthreads();
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
const int xIndex = xBlock * blockDim.x + threadIdx.x;
const int yIndex = yBlock * blockDim.y + threadIdx.y;
calc_dx_dy(sdx, sdy, ipt, xIndex, yIndex, tid);
}
// Spawn 16 blocks per interest point
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features)
{
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float s_PATCH[6][6];
__shared__ float sdx[25];
__shared__ float sdx[25];
__shared__ float sdy[25];
__shared__ float sdy[25];
__shared__ float sdxabs[25];
__shared__ float sdxabs[25];
...
@@ -1127,7 +846,7 @@ namespace cv { namespace gpu { namespace surf
...
@@ -1127,7 +846,7 @@ namespace cv { namespace gpu { namespace surf
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy
_old(
sdx, sdy, features, tid);
calc_dx_dy
(s_PATCH,
sdx, sdy, features, tid);
__syncthreads();
__syncthreads();
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
...
@@ -1151,9 +870,10 @@ namespace cv { namespace gpu { namespace surf
...
@@ -1151,9 +870,10 @@ namespace cv { namespace gpu { namespace surf
// Spawn 16 blocks per interest point
// Spawn 16 blocks per interest point
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors128
_old
(PtrStepf descriptors, const KeyPoint_GPU* features)
__global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
{
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float s_PATCH[6][6];
__shared__ float sdx[25];
__shared__ float sdx[25];
__shared__ float sdy[25];
__shared__ float sdy[25];
...
@@ -1165,7 +885,7 @@ namespace cv { namespace gpu { namespace surf
...
@@ -1165,7 +885,7 @@ namespace cv { namespace gpu { namespace surf
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy
_old(
sdx, sdy, features, tid);
calc_dx_dy
(s_PATCH,
sdx, sdy, features, tid);
__syncthreads();
__syncthreads();
if (sdy[tid] >= 0)
if (sdy[tid] >= 0)
...
@@ -1184,7 +904,7 @@ namespace cv { namespace gpu { namespace surf
...
@@ -1184,7 +904,7 @@ namespace cv { namespace gpu { namespace surf
}
}
__syncthreads();
__syncthreads();
reduce_sum25(sd1, sd
1
, sdabs1, sdabs2, tid);
reduce_sum25(sd1, sd
2
, sdabs1, sdabs2, tid);
__syncthreads();
__syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
...
@@ -1215,7 +935,7 @@ namespace cv { namespace gpu { namespace surf
...
@@ -1215,7 +935,7 @@ namespace cv { namespace gpu { namespace surf
}
}
__syncthreads();
__syncthreads();
reduce_sum25(sd1, sd
1
, sdabs1, sdabs2, tid);
reduce_sum25(sd1, sd
2
, sdabs1, sdabs2, tid);
__syncthreads();
__syncthreads();
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
...
@@ -1228,13 +948,56 @@ namespace cv { namespace gpu { namespace surf
...
@@ -1228,13 +948,56 @@ namespace cv { namespace gpu { namespace surf
}
}
}
}
void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
{
// no need for thread ID
float* descriptor_base = descriptors.ptr(blockIdx.x);
// read in the unnormalized descriptor values (squared)
__shared__ float sqDesc[BLOCK_DIM_X];
const float lookup = descriptor_base[threadIdx.x];
sqDesc[threadIdx.x] = lookup * lookup;
__syncthreads();
if (BLOCK_DIM_X >= 128)
{
if (threadIdx.x < 64)
sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
__syncthreads();
}
// reduction to get total
if (threadIdx.x < 32)
{
volatile float* smem = sqDesc;
smem[threadIdx.x] += smem[threadIdx.x + 32];
smem[threadIdx.x] += smem[threadIdx.x + 16];
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
}
// compute length (square root)
__shared__ float len;
if (threadIdx.x == 0)
{
len = sqrtf(sqDesc[0]);
}
__syncthreads();
// normalize and store in output
descriptor_base[threadIdx.x] = lookup / len;
}
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
{
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
if (descriptors.cols == 64)
if (descriptors.cols == 64)
{
{
compute_descriptors64
_old
<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaThreadSynchronize() );
...
@@ -1246,7 +1009,7 @@ namespace cv { namespace gpu { namespace surf
...
@@ -1246,7 +1009,7 @@ namespace cv { namespace gpu { namespace surf
}
}
else
else
{
{
compute_descriptors128
_old
<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaThreadSynchronize() );
...
...
modules/gpu/src/surf.cpp
View file @
58f69197
...
@@ -48,123 +48,93 @@ using namespace std;
...
@@ -48,123 +48,93 @@ using namespace std;
#if !defined (HAVE_CUDA)
#if !defined (HAVE_CUDA)
cv
::
gpu
::
SURF_GPU
::
SURF_GPU
()
{
throw_nogpu
();
}
cv
::
gpu
::
SURF_GPU
::
SURF_GPU
(
double
,
int
,
int
,
bool
,
float
)
{
throw_nogpu
();
}
int
cv
::
gpu
::
SURF_GPU
::
descriptorSize
()
const
{
throw_nogpu
();
return
0
;}
int
cv
::
gpu
::
SURF_GPU
::
descriptorSize
()
const
{
throw_nogpu
();
return
0
;}
void
cv
::
gpu
::
SURF_GPU
::
uploadKeypoints
(
const
vector
<
KeyPoint
>&
,
GpuMat
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
uploadKeypoints
(
const
vector
<
KeyPoint
>&
,
GpuMat
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
downloadKeypoints
(
const
GpuMat
&
,
vector
<
KeyPoint
>&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
downloadKeypoints
(
const
GpuMat
&
,
vector
<
KeyPoint
>&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
downloadDescriptors
(
const
GpuMat
&
,
vector
<
float
>&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
downloadDescriptors
(
const
GpuMat
&
,
vector
<
float
>&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
,
GpuMat
&
,
bool
,
bool
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
,
GpuMat
&
,
bool
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
,
const
GpuMat
&
,
vector
<
KeyPoint
>&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
,
const
GpuMat
&
,
vector
<
KeyPoint
>&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
,
const
GpuMat
&
,
vector
<
KeyPoint
>&
,
GpuMat
&
,
bool
,
bool
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
,
const
GpuMat
&
,
vector
<
KeyPoint
>&
,
GpuMat
&
,
bool
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
,
const
GpuMat
&
,
vector
<
KeyPoint
>&
,
vector
<
float
>&
,
bool
,
bool
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
,
const
GpuMat
&
,
vector
<
KeyPoint
>&
,
vector
<
float
>&
,
bool
)
{
throw_nogpu
();
}
#else
/* !defined (HAVE_CUDA) */
#else
/* !defined (HAVE_CUDA) */
namespace
cv
{
namespace
gpu
{
namespace
surf
namespace
cv
{
namespace
gpu
{
namespace
surf
{
{
dim3
calcBlockSize
(
int
nIntervals
);
void
icvCalcLayerDetAndTrace_gpu
(
const
PtrStepf
&
det
,
const
PtrStepf
&
trace
,
int
img_rows
,
int
img_cols
,
int
octave
,
int
nOctaveLayers
);
void
fasthessian_gpu
(
PtrStepf
hessianBuffer
,
int
x_size
,
int
y_size
,
const
dim3
&
threads
);
void
icvFindMaximaInLayer_gpu
(
const
PtrStepf
&
det
,
const
PtrStepf
&
trace
,
int4
*
maxPosBuffer
,
unsigned
int
*
maxCounter
,
void
fasthessian_gpu_old
(
PtrStepf
hessianBuffer
,
int
x_size
,
int
y_size
,
const
dim3
&
threadsOld
);
int
img_rows
,
int
img_cols
,
int
octave
,
bool
use_mask
,
int
nLayers
);
void
nonmaxonly_gpu
(
PtrStepf
hessianBuffer
,
int4
*
maxPosBuffer
,
unsigned
int
&
maxCounter
,
void
icvInterpolateKeypoint_gpu
(
const
PtrStepf
&
det
,
const
int4
*
maxPosBuffer
,
unsigned
int
maxCounter
,
KeyPoint_GPU
*
featuresBuffer
,
unsigned
int
*
featureCounter
);
int
x_size
,
int
y_size
,
bool
use_mask
,
const
dim3
&
threads
);
void
icvCalcOrientation_gpu
(
const
KeyPoint_GPU
*
featureBuffer
,
int
nFeatures
,
KeyPoint_GPU
*
keypoints
,
unsigned
int
*
keypointCounter
);
void
fh_interp_extremum_gpu
(
PtrStepf
hessianBuffer
,
const
int4
*
maxPosBuffer
,
unsigned
int
maxCounter
,
KeyPoint_GPU
*
featuresBuffer
,
unsigned
int
&
featureCounter
);
void
find_orientation_gpu
(
KeyPoint_GPU
*
features
,
int
nFeatures
);
void
compute_descriptors_gpu
(
const
DevMem2Df
&
descriptors
,
const
KeyPoint_GPU
*
features
,
int
nFeatures
);
void
compute_descriptors_gpu
(
const
DevMem2Df
&
descriptors
,
const
KeyPoint_GPU
*
features
,
int
nFeatures
);
void
compute_descriptors_gpu_old
(
const
DevMem2Df
&
descriptors
,
const
KeyPoint_GPU
*
features
,
int
nFeatures
);
}}}
}}}
using
namespace
cv
::
gpu
::
surf
;
using
namespace
cv
::
gpu
::
surf
;
namespace
namespace
{
{
class
SURF_GPU_Invoker
:
private
SURFParams_GPU
class
SURF_GPU_Invoker
:
private
CvSURFParams
{
{
public
:
public
:
SURF_GPU_Invoker
(
SURF_GPU
&
surf
,
const
GpuMat
&
img
,
const
GpuMat
&
mask
)
:
SURF_GPU_Invoker
(
SURF_GPU
&
surf
,
const
GpuMat
&
img
,
const
GpuMat
&
mask
)
:
SURFParams_GPU
(
surf
),
CvSURFParams
(
surf
),
sum
(
surf
.
sum
),
sumf
(
surf
.
sumf
),
sum
(
surf
.
sum
),
mask1
(
surf
.
mask1
),
maskSum
(
surf
.
maskSum
),
intBuffer
(
surf
.
intBuffer
),
det
(
surf
.
det
),
trace
(
surf
.
trace
),
mask1
(
surf
.
mask1
),
maskSum
(
surf
.
maskSum
),
maxPosBuffer
(
surf
.
maxPosBuffer
),
featuresBuffer
(
surf
.
featuresBuffer
),
keypointsBuffer
(
surf
.
keypointsBuffer
),
hessianBuffer
(
surf
.
hessianBuffer
),
maxPosBuffer
(
surf
.
maxPosBuffer
),
featuresBuffer
(
surf
.
featuresBuffer
),
img_cols
(
img
.
cols
),
img_rows
(
img
.
rows
),
img_cols
(
img
.
cols
),
img_rows
(
img
.
rows
),
use_mask
(
!
mask
.
empty
()),
use_mask
(
!
mask
.
empty
())
mask_width
(
0
),
mask_height
(
0
),
featureCounter
(
0
),
maxCounter
(
0
)
{
{
CV_Assert
(
!
img
.
empty
()
&&
img
.
type
()
==
CV_8UC1
);
CV_Assert
(
!
img
.
empty
()
&&
img
.
type
()
==
CV_8UC1
);
CV_Assert
(
mask
.
empty
()
||
(
mask
.
size
()
==
img
.
size
()
&&
mask
.
type
()
==
CV_8UC1
));
CV_Assert
(
mask
.
empty
()
||
(
mask
.
size
()
==
img
.
size
()
&&
mask
.
type
()
==
CV_8UC1
));
CV_Assert
(
nOctaves
>
0
&&
nIntervals
>
2
&&
nIntervals
<
22
);
CV_Assert
(
nOctaves
>
0
&&
nOctaveLayers
>
0
);
CV_Assert
(
DeviceInfo
().
supports
(
GLOBAL_ATOMICS
));
CV_Assert
(
TargetArchs
::
builtWith
(
GLOBAL_ATOMICS
)
&&
DeviceInfo
().
supports
(
GLOBAL_ATOMICS
));
max_features
=
static_cast
<
int
>
(
img
.
size
().
area
()
*
featuresRatio
);
max_candidates
=
static_cast
<
int
>
(
1.5
*
max_features
);
CV_Assert
(
max_features
>
0
);
featuresBuffer
.
create
(
1
,
max_features
,
CV_32FC
(
6
));
maxKeypoints
=
static_cast
<
int
>
(
img
.
size
().
area
()
*
surf
.
keypointsRatio
);
maxPosBuffer
.
create
(
1
,
max_candidates
,
CV_32SC4
);
maxFeatures
=
static_cast
<
int
>
(
1.5
*
maxKeypoints
);
maxCandidates
=
static_cast
<
int
>
(
1.5
*
maxFeatures
);
mask_width
=
l2
*
0.5
f
;
CV_Assert
(
maxKeypoints
>
0
);
mask_height
=
1.0
f
+
l1
;
cudaSafeCall
(
cudaMalloc
((
void
**
)
&
d_counters
,
(
nOctaves
+
2
)
*
sizeof
(
unsigned
int
))
);
cudaSafeCall
(
cudaMemset
(
d_counters
,
0
,
(
nOctaves
+
2
)
*
sizeof
(
unsigned
int
))
);
// Dxy gap half-width
uploadConstant
(
"cv::gpu::surf::c_max_candidates"
,
maxCandidates
);
float
dxy_center_offset
=
0.5
f
*
(
l4
+
l3
);
uploadConstant
(
"cv::gpu::surf::c_max_features"
,
maxFeatures
);
// Dxy squares half-width
uploadConstant
(
"cv::gpu::surf::c_max_keypoints"
,
maxKeypoints
);
float
dxy_half_width
=
0.5
f
*
l3
;
uploadConstant
(
"cv::gpu::surf::c_img_rows"
,
img_rows
);
uploadConstant
(
"cv::gpu::surf::c_img_cols"
,
img_cols
);
uploadConstant
(
"cv::gpu::surf::c_nOctaveLayers"
,
nOctaveLayers
);
uploadConstant
(
"cv::gpu::surf::c_hessianThreshold"
,
static_cast
<
float
>
(
hessianThreshold
));
// rescale edge_scale to fit with the filter dimensions
bindTexture
(
"cv::gpu::surf::imgTex"
,
(
DevMem2D
)
img
);
float
dxy_scale
=
edgeScale
*
std
::
pow
((
2.
f
+
2.
f
*
l1
)
*
l2
/
(
4.
f
*
l3
*
l3
),
2.
f
);
// Compute border required such that the filters don't overstep the image boundaries
float
smax0
=
2.0
f
*
initialScale
+
0.5
f
;
int
border0
=
static_cast
<
int
>
(
std
::
ceil
(
smax0
*
std
::
max
(
std
::
max
(
mask_width
,
mask_height
),
l3
+
l4
*
0.5
f
)));
int
width0
=
(
img_cols
-
2
*
border0
)
/
initialStep
;
int
height0
=
(
img_rows
-
2
*
border0
)
/
initialStep
;
uploadConstant
(
"cv::gpu::surf::c_max_candidates"
,
max_candidates
);
uploadConstant
(
"cv::gpu::surf::c_max_features"
,
max_features
);
uploadConstant
(
"cv::gpu::surf::c_nIntervals"
,
nIntervals
);
uploadConstant
(
"cv::gpu::surf::c_mask_width"
,
mask_width
);
uploadConstant
(
"cv::gpu::surf::c_mask_height"
,
mask_height
);
uploadConstant
(
"cv::gpu::surf::c_dxy_center_offset"
,
dxy_center_offset
);
uploadConstant
(
"cv::gpu::surf::c_dxy_half_width"
,
dxy_half_width
);
uploadConstant
(
"cv::gpu::surf::c_dxy_scale"
,
dxy_scale
);
uploadConstant
(
"cv::gpu::surf::c_initialScale"
,
initialScale
);
uploadConstant
(
"cv::gpu::surf::c_threshold"
,
threshold
);
hessianBuffer
.
create
(
height0
*
nIntervals
,
width0
,
CV_32F
);
integral
(
img
,
sum
);
integralBuffered
(
img
,
sum
,
intBuffer
);
sum
.
convertTo
(
sumf
,
CV_32F
,
1.0
/
255.0
);
bindTexture
(
"cv::gpu::surf::sumTex"
,
(
DevMem2D_
<
unsigned
int
>
)
sum
);
bindTexture
(
"cv::gpu::surf::sumTex"
,
(
DevMem2Df
)
sumf
);
if
(
!
mask
.
empty
()
)
if
(
use_mask
)
{
{
min
(
mask
,
1.0
,
mask1
);
min
(
mask
,
1.0
,
mask1
);
integral
(
mask1
,
maskSum
);
integral
Buffered
(
mask1
,
maskSum
,
intBuffer
);
bindTexture
(
"cv::gpu::surf::maskSumTex"
,
(
DevMem2D
i
)
maskSum
);
bindTexture
(
"cv::gpu::surf::maskSumTex"
,
(
DevMem2D
_
<
unsigned
int
>
)
maskSum
);
}
}
}
}
~
SURF_GPU_Invoker
()
~
SURF_GPU_Invoker
()
{
{
cudaSafeCall
(
cudaFree
(
d_counters
)
);
unbindTexture
(
"cv::gpu::surf::imgTex"
);
unbindTexture
(
"cv::gpu::surf::sumTex"
);
unbindTexture
(
"cv::gpu::surf::sumTex"
);
if
(
use_mask
)
if
(
use_mask
)
unbindTexture
(
"cv::gpu::surf::maskSumTex"
);
unbindTexture
(
"cv::gpu::surf::maskSumTex"
);
...
@@ -172,102 +142,115 @@ namespace
...
@@ -172,102 +142,115 @@ namespace
void
detectKeypoints
(
GpuMat
&
keypoints
)
void
detectKeypoints
(
GpuMat
&
keypoints
)
{
{
typedef
void
(
*
fasthessian_t
)(
PtrStepf
hessianBuffer
,
int
x_size
,
int
y_size
,
const
dim3
&
threads
);
ensureSizeIsEnough
(
img_rows
*
(
nOctaveLayers
+
2
),
img_cols
,
CV_32FC1
,
det
);
const
fasthessian_t
fasthessian
=
ensureSizeIsEnough
(
img_rows
*
(
nOctaveLayers
+
2
),
img_cols
,
CV_32FC1
,
trace
);
DeviceInfo
().
supports
(
FEATURE_SET_COMPUTE_13
)
?
fasthessian_gpu
:
fasthessian_gpu_old
;
ensureSizeIsEnough
(
1
,
maxCandidates
,
CV_32SC4
,
maxPosBuffer
);
ensureSizeIsEnough
(
1
,
maxFeatures
,
CV_32FC
(
6
),
featuresBuffer
);
dim3
threads
=
calcBlockSize
(
nIntervals
);
for
(
int
octave
=
0
;
octave
<
nOctaves
;
++
octave
)
for
(
int
octave
=
0
;
octave
<
nOctaves
;
++
octave
)
{
{
int
step
=
initialStep
*
(
1
<<
octave
);
const
int
layer_rows
=
img_rows
>>
octave
;
const
int
layer_cols
=
img_cols
>>
octave
;
// Compute border required such that the filters don't overstep the image boundaries
float
d
=
(
initialScale
*
(
1
<<
octave
))
/
(
nIntervals
-
2
);
uploadConstant
(
"cv::gpu::surf::c_octave"
,
octave
);
float
smax
=
initialScale
*
(
1
<<
octave
)
+
d
*
(
nIntervals
-
2.0
f
)
+
0.5
f
;
uploadConstant
(
"cv::gpu::surf::c_layer_rows"
,
layer_rows
);
int
border
=
static_cast
<
int
>
(
std
::
ceil
(
smax
*
std
::
max
(
std
::
max
(
mask_width
,
mask_height
),
l3
+
l4
*
0.5
f
)));
uploadConstant
(
"cv::gpu::surf::c_layer_cols"
,
layer_cols
);
int
x_size
=
(
img_cols
-
2
*
border
)
/
step
;
icvCalcLayerDetAndTrace_gpu
(
det
,
trace
,
img_rows
,
img_cols
,
octave
,
nOctaveLayers
);
int
y_size
=
(
img_rows
-
2
*
border
)
/
step
;
icvFindMaximaInLayer_gpu
(
det
,
trace
,
maxPosBuffer
.
ptr
<
int4
>
(),
d_counters
+
2
+
octave
,
if
(
x_size
<=
0
||
y_size
<=
0
)
img_rows
,
img_cols
,
octave
,
use_mask
,
nOctaveLayers
);
break
;
unsigned
int
maxCounter
;
uploadConstant
(
"cv::gpu::surf::c_octave"
,
octave
);
cudaSafeCall
(
cudaMemcpy
(
&
maxCounter
,
d_counters
+
2
+
octave
,
sizeof
(
unsigned
int
),
cudaMemcpyDeviceToHost
)
);
uploadConstant
(
"cv::gpu::surf::c_x_size"
,
x_size
);
maxCounter
=
std
::
min
(
maxCounter
,
static_cast
<
unsigned
int
>
(
maxCandidates
));
uploadConstant
(
"cv::gpu::surf::c_y_size"
,
y_size
);
uploadConstant
(
"cv::gpu::surf::c_border"
,
border
);
uploadConstant
(
"cv::gpu::surf::c_step"
,
step
);
fasthessian
(
hessianBuffer
,
x_size
,
y_size
,
threads
);
// Reset the candidate count.
maxCounter
=
0
;
nonmaxonly_gpu
(
hessianBuffer
,
maxPosBuffer
.
ptr
<
int4
>
(),
maxCounter
,
x_size
,
y_size
,
use_mask
,
threads
);
maxCounter
=
std
::
min
(
maxCounter
,
static_cast
<
unsigned
int
>
(
max_candidates
));
if
(
maxCounter
>
0
)
if
(
maxCounter
>
0
)
{
{
fh_interp_extremum_gpu
(
hessianBuffer
,
maxPosBuffer
.
ptr
<
int4
>
(),
maxCounter
,
icvInterpolateKeypoint_gpu
(
det
,
maxPosBuffer
.
ptr
<
int4
>
(),
maxCounter
,
featuresBuffer
.
ptr
<
KeyPoint_GPU
>
(),
featureCounter
);
featuresBuffer
.
ptr
<
KeyPoint_GPU
>
(),
d_counters
);
featureCounter
=
std
::
min
(
featureCounter
,
static_cast
<
unsigned
int
>
(
max_features
));
}
}
}
}
unsigned
int
featureCounter
;
cudaSafeCall
(
cudaMemcpy
(
&
featureCounter
,
d_counters
,
sizeof
(
unsigned
int
),
cudaMemcpyDeviceToHost
)
);
featureCounter
=
std
::
min
(
featureCounter
,
static_cast
<
unsigned
int
>
(
maxFeatures
));
if
(
featureCounter
>
0
)
findOrientation
(
featuresBuffer
.
colRange
(
0
,
featureCounter
),
keypoints
);
featuresBuffer
.
colRange
(
0
,
featureCounter
).
copyTo
(
keypoints
);
else
keypoints
.
release
();
}
}
void
findOrientation
(
GpuMat
&
keypoints
)
void
findOrientation
(
const
GpuMat
&
features
,
GpuMat
&
keypoints
)
{
{
if
(
keypoints
.
cols
>
0
)
if
(
features
.
cols
>
0
)
find_orientation_gpu
(
keypoints
.
ptr
<
KeyPoint_GPU
>
(),
keypoints
.
cols
);
{
ensureSizeIsEnough
(
1
,
maxKeypoints
,
CV_32FC
(
6
),
keypointsBuffer
);
icvCalcOrientation_gpu
(
features
.
ptr
<
KeyPoint_GPU
>
(),
features
.
cols
,
keypointsBuffer
.
ptr
<
KeyPoint_GPU
>
(),
d_counters
+
1
);
unsigned
int
keypointsCounter
;
cudaSafeCall
(
cudaMemcpy
(
&
keypointsCounter
,
d_counters
+
1
,
sizeof
(
unsigned
int
),
cudaMemcpyDeviceToHost
)
);
keypointsCounter
=
std
::
min
(
keypointsCounter
,
static_cast
<
unsigned
int
>
(
maxKeypoints
));
if
(
keypointsCounter
>
0
)
keypointsBuffer
.
colRange
(
0
,
keypointsCounter
).
copyTo
(
keypoints
);
else
keypoints
.
release
();
}
}
}
void
computeDescriptors
(
const
GpuMat
&
keypoints
,
GpuMat
&
descriptors
,
int
descriptorSize
)
void
computeDescriptors
(
const
GpuMat
&
keypoints
,
GpuMat
&
descriptors
,
int
descriptorSize
)
{
{
typedef
void
(
*
compute_descriptors_t
)(
const
DevMem2Df
&
descriptors
,
const
KeyPoint_GPU
*
features
,
int
nFeatures
);
const
compute_descriptors_t
compute_descriptors
=
compute_descriptors_gpu_old
;
//DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;
if
(
keypoints
.
cols
>
0
)
if
(
keypoints
.
cols
>
0
)
{
{
descriptors
.
create
(
keypoints
.
cols
,
descriptorSize
,
CV_32F
);
descriptors
.
create
(
keypoints
.
cols
,
descriptorSize
,
CV_32F
);
compute_descriptors
(
descriptors
,
keypoints
.
ptr
<
KeyPoint_GPU
>
(),
keypoints
.
cols
);
compute_descriptors
_gpu
(
descriptors
,
keypoints
.
ptr
<
KeyPoint_GPU
>
(),
keypoints
.
cols
);
}
}
}
}
private
:
private
:
GpuMat
&
sum
;
GpuMat
&
sum
;
GpuMat
&
sumf
;
GpuMat
&
mask1
;
GpuMat
&
mask1
;
GpuMat
&
maskSum
;
GpuMat
&
maskSum
;
GpuMat
&
intBuffer
;
GpuMat
&
det
;
GpuMat
&
trace
;
GpuMat
&
hessianBuffer
;
GpuMat
&
maxPosBuffer
;
GpuMat
&
maxPosBuffer
;
GpuMat
&
featuresBuffer
;
GpuMat
&
featuresBuffer
;
GpuMat
&
keypointsBuffer
;
int
img_cols
,
img_rows
;
int
img_cols
,
img_rows
;
bool
use_mask
;
bool
use_mask
;
float
mask_width
,
mask_height
;
unsigned
int
featureCounter
;
int
maxCandidates
;
unsigned
int
maxCounter
;
int
maxFeatures
;
int
maxKeypoints
;
int
max_candidates
;
unsigned
int
*
d_counters
;
int
max_features
;
};
};
}
}
cv
::
gpu
::
SURF_GPU
::
SURF_GPU
()
{
hessianThreshold
=
100
;
extended
=
1
;
nOctaves
=
4
;
nOctaveLayers
=
2
;
keypointsRatio
=
0.01
f
;
}
cv
::
gpu
::
SURF_GPU
::
SURF_GPU
(
double
_threshold
,
int
_nOctaves
,
int
_nOctaveLayers
,
bool
_extended
,
float
_keypointsRatio
)
{
hessianThreshold
=
_threshold
;
extended
=
_extended
;
nOctaves
=
_nOctaves
;
nOctaveLayers
=
_nOctaveLayers
;
keypointsRatio
=
_keypointsRatio
;
}
int
cv
::
gpu
::
SURF_GPU
::
descriptorSize
()
const
int
cv
::
gpu
::
SURF_GPU
::
descriptorSize
()
const
{
{
return
extended
?
128
:
64
;
return
extended
?
128
:
64
;
...
@@ -281,27 +264,64 @@ void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMa
...
@@ -281,27 +264,64 @@ void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMa
{
{
Mat
keypointsCPU
(
1
,
keypoints
.
size
(),
CV_32FC
(
6
));
Mat
keypointsCPU
(
1
,
keypoints
.
size
(),
CV_32FC
(
6
));
const
KeyPoint
*
keypoints_ptr
=
&
keypoints
[
0
];
for
(
size_t
i
=
0
;
i
<
keypoints
.
size
();
++
i
)
KeyPoint_GPU
*
keypointsCPU_ptr
=
keypointsCPU
.
ptr
<
KeyPoint_GPU
>
();
for
(
size_t
i
=
0
;
i
<
keypoints
.
size
();
++
i
,
++
keypoints_ptr
,
++
keypointsCPU_ptr
)
{
{
const
KeyPoint
&
kp
=
*
keypoints_ptr
;
const
KeyPoint
&
kp
=
keypoints
[
i
]
;
KeyPoint_GPU
&
gkp
=
*
keypointsCPU_ptr
;
KeyPoint_GPU
&
gkp
=
keypointsCPU
.
ptr
<
KeyPoint_GPU
>
()[
i
]
;
gkp
.
x
=
kp
.
pt
.
x
;
gkp
.
x
=
kp
.
pt
.
x
;
gkp
.
y
=
kp
.
pt
.
y
;
gkp
.
y
=
kp
.
pt
.
y
;
gkp
.
laplacian
=
1.0
f
;
gkp
.
size
=
kp
.
size
;
gkp
.
size
=
kp
.
size
;
gkp
.
octave
=
static_cast
<
float
>
(
kp
.
octave
);
gkp
.
dir
=
kp
.
angle
;
gkp
.
angle
=
kp
.
angle
;
gkp
.
hessian
=
kp
.
response
;
gkp
.
response
=
kp
.
response
;
}
}
keypointsGPU
.
upload
(
keypointsCPU
);
keypointsGPU
.
upload
(
keypointsCPU
);
}
}
}
}
namespace
{
int
calcSize
(
int
octave
,
int
layer
)
{
/* Wavelet size at first layer of first octave. */
const
int
HAAR_SIZE0
=
9
;
/* Wavelet size increment between layers. This should be an even number,
such that the wavelet sizes in an octave are either all even or all odd.
This ensures that when looking for the neighbours of a sample, the layers
above and below are aligned correctly. */
const
int
HAAR_SIZE_INC
=
6
;
return
(
HAAR_SIZE0
+
HAAR_SIZE_INC
*
layer
)
<<
octave
;
}
int
getPointOctave
(
const
KeyPoint_GPU
&
kpt
,
const
CvSURFParams
&
params
)
{
int
best_octave
=
0
;
float
min_diff
=
numeric_limits
<
float
>::
max
();
for
(
int
octave
=
1
;
octave
<
params
.
nOctaves
;
++
octave
)
{
for
(
int
layer
=
0
;
layer
<
params
.
nOctaveLayers
;
++
layer
)
{
float
diff
=
std
::
abs
(
kpt
.
size
-
(
float
)
calcSize
(
octave
,
layer
));
if
(
min_diff
>
diff
)
{
min_diff
=
diff
;
best_octave
=
octave
;
if
(
min_diff
==
0
)
return
best_octave
;
}
}
}
return
best_octave
;
}
}
void
cv
::
gpu
::
SURF_GPU
::
downloadKeypoints
(
const
GpuMat
&
keypointsGPU
,
vector
<
KeyPoint
>&
keypoints
)
void
cv
::
gpu
::
SURF_GPU
::
downloadKeypoints
(
const
GpuMat
&
keypointsGPU
,
vector
<
KeyPoint
>&
keypoints
)
{
{
if
(
keypointsGPU
.
empty
())
if
(
keypointsGPU
.
empty
())
...
@@ -313,21 +333,23 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<Key
...
@@ -313,21 +333,23 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<Key
Mat
keypointsCPU
=
keypointsGPU
;
Mat
keypointsCPU
=
keypointsGPU
;
keypoints
.
resize
(
keypointsGPU
.
cols
);
keypoints
.
resize
(
keypointsGPU
.
cols
);
KeyPoint
*
keypoints_ptr
=
&
keypoints
[
0
];
for
(
int
i
=
0
;
i
<
keypointsGPU
.
cols
;
++
i
)
const
KeyPoint_GPU
*
keypointsCPU_ptr
=
keypointsCPU
.
ptr
<
KeyPoint_GPU
>
();
for
(
int
i
=
0
;
i
<
keypointsGPU
.
cols
;
++
i
,
++
keypoints_ptr
,
++
keypointsCPU_ptr
)
{
{
KeyPoint
&
kp
=
*
keypoints_ptr
;
KeyPoint
&
kp
=
keypoints
[
i
]
;
const
KeyPoint_GPU
&
gkp
=
*
keypointsCPU_ptr
;
const
KeyPoint_GPU
&
gkp
=
keypointsCPU
.
ptr
<
KeyPoint_GPU
>
()[
i
]
;
kp
.
pt
.
x
=
gkp
.
x
;
kp
.
pt
.
x
=
gkp
.
x
;
kp
.
pt
.
y
=
gkp
.
y
;
kp
.
pt
.
y
=
gkp
.
y
;
kp
.
size
=
gkp
.
size
;
kp
.
size
=
gkp
.
size
;
kp
.
octave
=
static_cast
<
int
>
(
gkp
.
octave
);
kp
.
angle
=
gkp
.
dir
;
kp
.
angle
=
gkp
.
angle
;
kp
.
response
=
gkp
.
response
;
kp
.
response
=
gkp
.
hessian
;
kp
.
octave
=
getPointOctave
(
gkp
,
*
this
);
kp
.
class_id
=
static_cast
<
int
>
(
gkp
.
laplacian
);
}
}
}
}
}
}
...
@@ -353,23 +375,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat
...
@@ -353,23 +375,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat
SURF_GPU_Invoker
surf
(
*
this
,
img
,
mask
);
SURF_GPU_Invoker
surf
(
*
this
,
img
,
mask
);
surf
.
detectKeypoints
(
keypoints
);
surf
.
detectKeypoints
(
keypoints
);
surf
.
findOrientation
(
keypoints
);
}
}
}
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
GpuMat
&
keypoints
,
GpuMat
&
descriptors
,
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
GpuMat
&
keypoints
,
GpuMat
&
descriptors
,
bool
useProvidedKeypoints
,
bool
calcOrientation
)
bool
useProvidedKeypoints
)
{
{
if
(
!
img
.
empty
())
if
(
!
img
.
empty
())
{
{
SURF_GPU_Invoker
surf
(
*
this
,
img
,
mask
);
SURF_GPU_Invoker
surf
(
*
this
,
img
,
mask
);
if
(
!
useProvidedKeypoints
)
if
(
!
useProvidedKeypoints
)
surf
.
detectKeypoints
(
keypoints
);
surf
.
detectKeypoints
(
keypoints
);
else
if
(
calcOrientation
)
{
surf
.
findOrientation
(
keypoints
);
GpuMat
keypointsBuf
;
surf
.
findOrientation
(
keypoints
,
keypointsBuf
);
keypointsBuf
.
copyTo
(
keypoints
);
}
surf
.
computeDescriptors
(
keypoints
,
descriptors
,
descriptorSize
());
surf
.
computeDescriptors
(
keypoints
,
descriptors
,
descriptorSize
());
}
}
...
@@ -385,24 +408,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector
...
@@ -385,24 +408,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector
}
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
vector
<
KeyPoint
>&
keypoints
,
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
vector
<
KeyPoint
>&
keypoints
,
GpuMat
&
descriptors
,
bool
useProvidedKeypoints
,
bool
calcOrientation
)
GpuMat
&
descriptors
,
bool
useProvidedKeypoints
)
{
{
GpuMat
keypointsGPU
;
GpuMat
keypointsGPU
;
if
(
useProvidedKeypoints
)
if
(
useProvidedKeypoints
)
uploadKeypoints
(
keypoints
,
keypointsGPU
);
uploadKeypoints
(
keypoints
,
keypointsGPU
);
(
*
this
)(
img
,
mask
,
keypointsGPU
,
descriptors
,
useProvidedKeypoints
,
calcOrientation
);
(
*
this
)(
img
,
mask
,
keypointsGPU
,
descriptors
,
useProvidedKeypoints
);
downloadKeypoints
(
keypointsGPU
,
keypoints
);
downloadKeypoints
(
keypointsGPU
,
keypoints
);
}
}
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
vector
<
KeyPoint
>&
keypoints
,
void
cv
::
gpu
::
SURF_GPU
::
operator
()(
const
GpuMat
&
img
,
const
GpuMat
&
mask
,
vector
<
KeyPoint
>&
keypoints
,
vector
<
float
>&
descriptors
,
bool
useProvidedKeypoints
,
bool
calcOrientation
)
vector
<
float
>&
descriptors
,
bool
useProvidedKeypoints
)
{
{
GpuMat
descriptorsGPU
;
GpuMat
descriptorsGPU
;
(
*
this
)(
img
,
mask
,
keypoints
,
descriptorsGPU
,
useProvidedKeypoints
,
calcOrientation
);
(
*
this
)(
img
,
mask
,
keypoints
,
descriptorsGPU
,
useProvidedKeypoints
);
downloadDescriptors
(
descriptorsGPU
,
descriptors
);
downloadDescriptors
(
descriptorsGPU
,
descriptors
);
}
}
...
...
modules/gpu/test/test_features2d.cpp
View file @
58f69197
...
@@ -48,7 +48,6 @@ using namespace std;
...
@@ -48,7 +48,6 @@ using namespace std;
const
string
FEATURES2D_DIR
=
"features2d"
;
const
string
FEATURES2D_DIR
=
"features2d"
;
const
string
IMAGE_FILENAME
=
"aloe.png"
;
const
string
IMAGE_FILENAME
=
"aloe.png"
;
const
string
VALID_FILE_NAME
=
"surf.xml.gz"
;
class
CV_GPU_SURFTest
:
public
cvtest
::
BaseTest
class
CV_GPU_SURFTest
:
public
cvtest
::
BaseTest
{
{
...
@@ -59,17 +58,20 @@ public:
...
@@ -59,17 +58,20 @@ public:
protected
:
protected
:
bool
isSimilarKeypoints
(
const
KeyPoint
&
p1
,
const
KeyPoint
&
p2
);
bool
isSimilarKeypoints
(
const
KeyPoint
&
p1
,
const
KeyPoint
&
p2
);
int
getValidCount
(
const
vector
<
KeyPoint
>&
keypoints1
,
const
vector
<
KeyPoint
>&
keypoints2
,
const
vector
<
DMatch
>&
matches
);
void
compareKeypointSets
(
const
vector
<
KeyPoint
>&
validKeypoints
,
const
vector
<
KeyPoint
>&
calcKeypoints
,
void
compareKeypointSets
(
const
vector
<
KeyPoint
>&
validKeypoints
,
const
vector
<
KeyPoint
>&
calcKeypoints
,
const
Mat
&
validDescriptors
,
const
Mat
&
calcDescriptors
);
const
Mat
&
validDescriptors
,
const
Mat
&
calcDescriptors
);
void
emptyDataTest
(
SURF_GPU
&
fdetector
);
void
emptyDataTest
();
void
regressionTest
(
SURF_GPU
&
fdetector
);
void
accuracyTest
(
);
virtual
void
run
(
int
);
virtual
void
run
(
int
);
};
};
void
CV_GPU_SURFTest
::
emptyDataTest
(
SURF_GPU
&
fdetector
)
void
CV_GPU_SURFTest
::
emptyDataTest
()
{
{
SURF_GPU
fdetector
;
GpuMat
image
;
GpuMat
image
;
vector
<
KeyPoint
>
keypoints
;
vector
<
KeyPoint
>
keypoints
;
vector
<
float
>
descriptors
;
vector
<
float
>
descriptors
;
...
@@ -114,116 +116,80 @@ bool CV_GPU_SURFTest::isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2)
...
@@ -114,116 +116,80 @@ bool CV_GPU_SURFTest::isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2)
p1
.
class_id
==
p2
.
class_id
);
p1
.
class_id
==
p2
.
class_id
);
}
}
void
CV_GPU_SURFTest
::
compareKeypointSets
(
const
vector
<
KeyPoint
>&
validKeypoints
,
const
vector
<
KeyPoint
>&
calcKeypoints
,
int
CV_GPU_SURFTest
::
getValidCount
(
const
vector
<
KeyPoint
>&
keypoints1
,
const
vector
<
KeyPoint
>&
keypoints2
,
const
Mat
&
validDescriptors
,
const
Mat
&
calcDescriptor
s
)
const
vector
<
DMatch
>&
matche
s
)
{
{
if
(
validKeypoints
.
size
()
!=
calcKeypoints
.
size
())
int
count
=
0
;
for
(
size_t
i
=
0
;
i
<
matches
.
size
();
++
i
)
{
{
ts
->
printf
(
cvtest
::
TS
::
LOG
,
"Keypoints sizes doesn't equal (validCount = %d, calcCount = %d).
\n
"
,
const
DMatch
&
m
=
matches
[
i
];
validKeypoints
.
size
(),
calcKeypoints
.
size
());
ts
->
set_failed_test_info
(
cvtest
::
TS
::
FAIL_INVALID_OUTPUT
);
const
KeyPoint
&
kp1
=
keypoints1
[
m
.
queryIdx
];
return
;
const
KeyPoint
&
kp2
=
keypoints2
[
m
.
trainIdx
];
if
(
isSimilarKeypoints
(
kp1
,
kp2
))
++
count
;
}
}
if
(
validDescriptors
.
size
()
!=
calcDescriptors
.
size
())
return
count
;
}
void
CV_GPU_SURFTest
::
compareKeypointSets
(
const
vector
<
KeyPoint
>&
validKeypoints
,
const
vector
<
KeyPoint
>&
calcKeypoints
,
const
Mat
&
validDescriptors
,
const
Mat
&
calcDescriptors
)
{
BruteForceMatcher
<
L2
<
float
>
>
matcher
;
vector
<
DMatch
>
matches
;
matcher
.
match
(
validDescriptors
,
calcDescriptors
,
matches
);
int
validCount
=
getValidCount
(
validKeypoints
,
calcKeypoints
,
matches
);
float
validRatio
=
(
float
)
validCount
/
matches
.
size
();
if
(
validRatio
<
0.5
f
)
{
{
ts
->
printf
(
cvtest
::
TS
::
LOG
,
"
Descriptors sizes doesn't equal.
\n
"
);
ts
->
printf
(
cvtest
::
TS
::
LOG
,
"
Bad accuracy - %f.
\n
"
,
validRatio
);
ts
->
set_failed_test_info
(
cvtest
::
TS
::
FAIL_INVALID_OUTPUT
);
ts
->
set_failed_test_info
(
cvtest
::
TS
::
FAIL_BAD_ACCURACY
);
return
;
return
;
}
}
for
(
size_t
v
=
0
;
v
<
validKeypoints
.
size
();
v
++
)
{
int
nearestIdx
=
-
1
;
float
minDist
=
std
::
numeric_limits
<
float
>::
max
();
for
(
size_t
c
=
0
;
c
<
calcKeypoints
.
size
();
c
++
)
{
float
curDist
=
(
float
)
norm
(
calcKeypoints
[
c
].
pt
-
validKeypoints
[
v
].
pt
);
if
(
curDist
<
minDist
)
{
minDist
=
curDist
;
nearestIdx
=
c
;
}
}
assert
(
minDist
>=
0
);
if
(
!
isSimilarKeypoints
(
validKeypoints
[
v
],
calcKeypoints
[
nearestIdx
]))
{
ts
->
printf
(
cvtest
::
TS
::
LOG
,
"Bad keypoints accuracy.
\n
"
);
ts
->
set_failed_test_info
(
cvtest
::
TS
::
FAIL_BAD_ACCURACY
);
return
;
}
if
(
norm
(
validDescriptors
.
row
(
v
),
calcDescriptors
.
row
(
nearestIdx
),
NORM_L2
)
>
1.5
f
)
{
ts
->
printf
(
cvtest
::
TS
::
LOG
,
"Bad descriptors accuracy.
\n
"
);
ts
->
set_failed_test_info
(
cvtest
::
TS
::
FAIL_BAD_ACCURACY
);
return
;
}
}
}
}
void
CV_GPU_SURFTest
::
regressionTest
(
SURF_GPU
&
fdetector
)
void
CV_GPU_SURFTest
::
accuracyTest
(
)
{
{
string
imgFilename
=
string
(
ts
->
get_data_path
())
+
FEATURES2D_DIR
+
"/"
+
IMAGE_FILENAME
;
string
imgFilename
=
string
(
ts
->
get_data_path
())
+
FEATURES2D_DIR
+
"/"
+
IMAGE_FILENAME
;
string
resFilename
=
string
(
ts
->
get_data_path
())
+
FEATURES2D_DIR
+
"/"
+
VALID_FILE_NAME
;
// Read the test image.
// Read the test image.
GpuMat
image
(
imread
(
imgFilename
,
0
)
);
Mat
image
=
imread
(
imgFilename
,
0
);
if
(
image
.
empty
())
if
(
image
.
empty
())
{
{
ts
->
printf
(
cvtest
::
TS
::
LOG
,
"Image %s can not be read.
\n
"
,
imgFilename
.
c_str
()
);
ts
->
printf
(
cvtest
::
TS
::
LOG
,
"Image %s can not be read.
\n
"
,
imgFilename
.
c_str
()
);
ts
->
set_failed_test_info
(
cvtest
::
TS
::
FAIL_INVALID_TEST_DATA
);
ts
->
set_failed_test_info
(
cvtest
::
TS
::
FAIL_INVALID_TEST_DATA
);
return
;
return
;
}
}
FileStorage
fs
(
resFilename
,
FileStorage
::
READ
);
Mat
mask
(
image
.
size
(),
CV_8UC1
,
Scalar
::
all
(
1
));
mask
(
Range
(
0
,
image
.
rows
/
2
),
Range
(
0
,
image
.
cols
/
2
)).
setTo
(
Scalar
::
all
(
0
));
// Compute keypoints.
// Compute keypoints.
GpuMat
mask
(
image
.
size
(),
CV_8UC1
,
Scalar
::
all
(
1
));
mask
(
Range
(
0
,
image
.
rows
/
2
),
Range
(
0
,
image
.
cols
/
2
)).
setTo
(
Scalar
::
all
(
0
));
vector
<
KeyPoint
>
calcKeypoints
;
vector
<
KeyPoint
>
calcKeypoints
;
GpuMat
calcDespcriptors
;
GpuMat
calcDescriptors
;
fdetector
(
image
,
mask
,
calcKeypoints
,
calcDespcriptors
);
SURF_GPU
fdetector
;
fdetector
.
extended
=
false
;
fdetector
(
GpuMat
(
image
),
GpuMat
(
mask
),
calcKeypoints
,
calcDescriptors
);
if
(
fs
.
isOpened
())
// Compare computed and valid keypoints.
{
// Calc validation keypoints set.
// Read validation keypoints set.
vector
<
KeyPoint
>
validKeypoints
;
vector
<
KeyPoint
>
validKeypoints
;
vector
<
float
>
validDescriptors
;
Mat
validDespcriptors
;
SURF
fdetector_gold
;
fdetector_gold
.
extended
=
false
;
read
(
fs
[
"keypoints"
],
validKeypoints
);
fdetector_gold
(
image
,
mask
,
validKeypoints
,
validDescriptors
);
read
(
fs
[
"descriptors"
],
validDespcriptors
);
if
(
validKeypoints
.
empty
()
||
validDespcriptors
.
empty
())
compareKeypointSets
(
validKeypoints
,
calcKeypoints
,
{
Mat
(
validKeypoints
.
size
(),
fdetector_gold
.
descriptorSize
(),
CV_32F
,
&
validDescriptors
[
0
]),
calcDescriptors
);
ts
->
printf
(
cvtest
::
TS
::
LOG
,
"Validation file can not be read.
\n
"
);
ts
->
set_failed_test_info
(
cvtest
::
TS
::
FAIL_INVALID_TEST_DATA
);
return
;
}
compareKeypointSets
(
validKeypoints
,
calcKeypoints
,
validDespcriptors
,
calcDespcriptors
);
}
else
// Write detector parameters and computed keypoints as validation data.
{
fs
.
open
(
resFilename
,
FileStorage
::
WRITE
);
if
(
!
fs
.
isOpened
())
{
ts
->
printf
(
cvtest
::
TS
::
LOG
,
"File %s can not be opened to write.
\n
"
,
resFilename
.
c_str
());
ts
->
set_failed_test_info
(
cvtest
::
TS
::
FAIL_INVALID_TEST_DATA
);
return
;
}
else
{
write
(
fs
,
"keypoints"
,
calcKeypoints
);
write
(
fs
,
"descriptors"
,
(
Mat
)
calcDespcriptors
);
}
}
}
}
void
CV_GPU_SURFTest
::
run
(
int
/*start_from*/
)
void
CV_GPU_SURFTest
::
run
(
int
/*start_from*/
)
{
{
SURF_GPU
fdetector
;
emptyDataTest
();
accuracyTest
();
emptyDataTest
(
fdetector
);
regressionTest
(
fdetector
);
}
}
TEST
(
SURF
,
empty_data_and_
regression
)
{
CV_GPU_SURFTest
test
;
test
.
safe_run
();
}
TEST
(
SURF
,
empty_data_and_
accuracy
)
{
CV_GPU_SURFTest
test
;
test
.
safe_run
();
}
samples/gpu/performance/tests.cpp
View file @
58f69197
...
@@ -264,10 +264,11 @@ TEST(SURF)
...
@@ -264,10 +264,11 @@ TEST(SURF)
SURF
surf
;
SURF
surf
;
vector
<
KeyPoint
>
keypoints1
,
keypoints2
;
vector
<
KeyPoint
>
keypoints1
,
keypoints2
;
vector
<
float
>
descriptors1
,
descriptors2
;
CPU_ON
;
CPU_ON
;
surf
(
src1
,
Mat
(),
keypoints1
);
surf
(
src1
,
Mat
(),
keypoints1
,
descriptors1
);
surf
(
src2
,
Mat
(),
keypoints2
);
surf
(
src2
,
Mat
(),
keypoints2
,
descriptors2
);
CPU_OFF
;
CPU_OFF
;
gpu
::
SURF_GPU
d_surf
;
gpu
::
SURF_GPU
d_surf
;
...
@@ -275,8 +276,8 @@ TEST(SURF)
...
@@ -275,8 +276,8 @@ TEST(SURF)
gpu
::
GpuMat
d_descriptors1
,
d_descriptors2
;
gpu
::
GpuMat
d_descriptors1
,
d_descriptors2
;
GPU_ON
;
GPU_ON
;
d_surf
(
d_src1
,
gpu
::
GpuMat
(),
d_keypoints1
);
d_surf
(
d_src1
,
gpu
::
GpuMat
(),
d_keypoints1
,
d_descriptors1
);
d_surf
(
d_src2
,
gpu
::
GpuMat
(),
d_keypoints2
);
d_surf
(
d_src2
,
gpu
::
GpuMat
(),
d_keypoints2
,
d_descriptors2
);
GPU_OFF
;
GPU_OFF
;
}
}
...
...
samples/gpu/surf_keypoint_matcher.cpp
View file @
58f69197
...
@@ -51,10 +51,10 @@ int main(int argc, char* argv[])
...
@@ -51,10 +51,10 @@ int main(int argc, char* argv[])
vector
<
KeyPoint
>
keypoints1
,
keypoints2
;
vector
<
KeyPoint
>
keypoints1
,
keypoints2
;
vector
<
float
>
descriptors1
,
descriptors2
;
vector
<
float
>
descriptors1
,
descriptors2
;
vector
<
DMatch
>
matches
;
vector
<
DMatch
>
matches
;
SURF_GPU
::
downloadKeypoints
(
keypoints1GPU
,
keypoints1
);
surf
.
downloadKeypoints
(
keypoints1GPU
,
keypoints1
);
SURF_GPU
::
downloadKeypoints
(
keypoints2GPU
,
keypoints2
);
surf
.
downloadKeypoints
(
keypoints2GPU
,
keypoints2
);
SURF_GPU
::
downloadDescriptors
(
descriptors1GPU
,
descriptors1
);
surf
.
downloadDescriptors
(
descriptors1GPU
,
descriptors1
);
SURF_GPU
::
downloadDescriptors
(
descriptors2GPU
,
descriptors2
);
surf
.
downloadDescriptors
(
descriptors2GPU
,
descriptors2
);
BruteForceMatcher_GPU
<
L2
<
float
>
>::
matchDownload
(
trainIdx
,
distance
,
matches
);
BruteForceMatcher_GPU
<
L2
<
float
>
>::
matchDownload
(
trainIdx
,
distance
,
matches
);
// drawing the results
// drawing the results
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment