Commit 68d04d28 authored by marina.kolpakova's avatar marina.kolpakova

replace offsets in surf to simple copy for better speed

parent be0c20b7
...@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device ...@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
return true; return true;
} }
static __device__ __forceinline__ bool check(int, int, int, uint offset = 0) static __device__ __forceinline__ bool check(int, int, int)
{ {
return true; return true;
} }
......
...@@ -177,7 +177,7 @@ namespace cv { namespace gpu { namespace device ...@@ -177,7 +177,7 @@ namespace cv { namespace gpu { namespace device
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
} }
__global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace, uint sumOffset) __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
{ {
// Determine the indices // Determine the indices
const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2); const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
...@@ -198,9 +198,9 @@ namespace cv { namespace gpu { namespace device ...@@ -198,9 +198,9 @@ namespace cv { namespace gpu { namespace device
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
{ {
const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), sumOffset + (j << c_octave)); const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), sumOffset + (j << c_octave)); const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), sumOffset + (j << c_octave)); const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));
det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy; det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy; trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
...@@ -208,7 +208,7 @@ namespace cv { namespace gpu { namespace device ...@@ -208,7 +208,7 @@ namespace cv { namespace gpu { namespace device
} }
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
int octave, int nOctaveLayers, const size_t sumOffset) int octave, int nOctaveLayers)
{ {
const int min_size = calcSize(octave, 0); const int min_size = calcSize(octave, 0);
const int max_samples_i = 1 + ((img_rows - min_size) >> octave); const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
...@@ -220,7 +220,7 @@ namespace cv { namespace gpu { namespace device ...@@ -220,7 +220,7 @@ namespace cv { namespace gpu { namespace device
grid.x = divUp(max_samples_j, threads.x); grid.x = divUp(max_samples_j, threads.x);
grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2); grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace, (uint)sumOffset); icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
...@@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace device ...@@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace device
struct WithMask struct WithMask
{ {
static __device__ bool check(int sum_i, int sum_j, int size, const uint offset) static __device__ bool check(int sum_i, int sum_j, int size)
{ {
float ratio = (float)size / 9.0f; float ratio = (float)size / 9.0f;
...@@ -245,10 +245,10 @@ namespace cv { namespace gpu { namespace device ...@@ -245,10 +245,10 @@ namespace cv { namespace gpu { namespace device
int dy2 = __float2int_rn(ratio * c_DM[3]); int dy2 = __float2int_rn(ratio * c_DM[3]);
float t = 0; float t = 0;
t += tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy1); t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
t -= tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy2); t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
t -= tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy1); t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
t += tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy2); t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1)); d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
...@@ -258,7 +258,7 @@ namespace cv { namespace gpu { namespace device ...@@ -258,7 +258,7 @@ namespace cv { namespace gpu { namespace device
template <typename Mask> template <typename Mask>
__global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
unsigned int* maxCounter, const uint maskOffset) unsigned int* maxCounter)
{ {
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110 #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
...@@ -299,7 +299,7 @@ namespace cv { namespace gpu { namespace device ...@@ -299,7 +299,7 @@ namespace cv { namespace gpu { namespace device
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave; const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave; const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (Mask::check(sum_i, sum_j, size, maskOffset)) if (Mask::check(sum_i, sum_j, size))
{ {
// Check to see if we have a max (in its 26 neighbours) // Check to see if we have a max (in its 26 neighbours)
const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff] const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
...@@ -351,7 +351,7 @@ namespace cv { namespace gpu { namespace device ...@@ -351,7 +351,7 @@ namespace cv { namespace gpu { namespace device
} }
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers, const size_t maskOffset) int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
{ {
const int layer_rows = img_rows >> octave; const int layer_rows = img_rows >> octave;
const int layer_cols = img_cols >> octave; const int layer_cols = img_cols >> octave;
...@@ -367,9 +367,9 @@ namespace cv { namespace gpu { namespace device ...@@ -367,9 +367,9 @@ namespace cv { namespace gpu { namespace device
const size_t smem_size = threads.x * threads.y * 3 * sizeof(float); const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
if (use_mask) if (use_mask)
icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter, (uint)maskOffset); icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
else else
icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter, 0); icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
......
...@@ -75,10 +75,10 @@ namespace cv { namespace gpu { namespace device ...@@ -75,10 +75,10 @@ namespace cv { namespace gpu { namespace device
size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum); size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
int octave, int nOctaveLayers, const size_t sumOffset); int octave, int nOctaveLayer);
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nLayers, const size_t maskOffset); int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian, float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
...@@ -146,14 +146,17 @@ namespace ...@@ -146,14 +146,17 @@ namespace
loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold)); loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
bindImgTex(img); bindImgTex(img);
integralBuffered(img, surf_.sum, surf_.intBuffer);
integralBuffered(img, tmpSum, surf_.intBuffer);
tmpSum.copyTo(surf_.sum);
sumOffset = bindSumTex(surf_.sum); sumOffset = bindSumTex(surf_.sum);
if (use_mask) if (use_mask)
{ {
min(mask, 1.0, surf_.mask1); min(mask, 1.0, surf_.mask1);
integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer); integralBuffered(surf_.mask1, tmpMaskSum, surf_.intBuffer);
tmpMaskSum.copyTo(surf_.maskSum);
maskOffset = bindMaskSumTex(surf_.maskSum); maskOffset = bindMaskSumTex(surf_.maskSum);
} }
} }
...@@ -174,10 +177,10 @@ namespace ...@@ -174,10 +177,10 @@ namespace
loadOctaveConstants(octave, layer_rows, layer_cols); loadOctaveConstants(octave, layer_rows, layer_cols);
icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers, sumOffset); icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave, icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers, maskOffset); img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers);
unsigned int maxCounter; unsigned int maxCounter;
cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr<unsigned int>() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr<unsigned int>() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
...@@ -228,6 +231,9 @@ namespace ...@@ -228,6 +231,9 @@ namespace
private: private:
SURF_GPU& surf_; SURF_GPU& surf_;
cv::gpu::GpuMat tmpSum;
cv::gpu::GpuMat tmpMaskSum;
int img_cols, img_rows; int img_cols, img_rows;
bool use_mask; bool use_mask;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment