Commit 32a2fde8 authored by Vladislav Vinogradov

temporarily disabled the compute-descriptor kernels for new cards (some problems with thread synchronization); the old version of the kernels is used instead.
parent 5b3d786e
@@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace surf
__constant__ float c_dxy_scale;
// The scale associated with the first interval of the first octave
__constant__ float c_initialScale;
//! The interest operator threshold
// The interest operator threshold
__constant__ float c_threshold;
// The octave
@@ -685,7 +685,6 @@ namespace cv { namespace gpu { namespace surf
// - SURF says to only use a circle, but the branching logic would slow it down
// - Gaussian weighting should reduce the effects of the outer points anyway
if (tid2 < 169)
{
dx -= texLookups[threadIdx.x ][threadIdx.y ];
dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ];
@@ -835,24 +834,12 @@ namespace cv { namespace gpu { namespace surf
descriptor_base[threadIdx.x] = lookup / len;
}
__device__ void calc_dx_dy(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features)
__device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const float* ipt,
int xIndex, int yIndex, int tid)
{
// get the interest point parameters (x, y, size, response, angle)
__shared__ float ipt[5];
if (threadIdx.x < 5 && threadIdx.y == 0 && threadIdx.z == 0)
{
ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x];
}
__syncthreads();
float sin_theta, cos_theta;
sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
const int xIndex = threadIdx.y * 5 + threadIdx.x % 5;
const int yIndex = threadIdx.z * 5 + threadIdx.x / 5;
// Compute rotated sampling points
// (clockwise rotation since we are rotating the lattice)
// (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)
@@ -865,7 +852,6 @@ namespace cv { namespace gpu { namespace surf
// a b c
// d f
// g h i
const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]);
const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
@@ -883,53 +869,64 @@ namespace cv { namespace gpu { namespace surf
// rotate responses (store all dxs then all dys)
// - counterclockwise rotation to rotate back to zero orientation
sdx[threadIdx.z][threadIdx.y][threadIdx.x] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx
sdy[threadIdx.z][threadIdx.y][threadIdx.x] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy
sdx_bin[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx
sdy_bin[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy
}
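For reference, the two rotations in calc_dx_dy are inverses of one another: the lattice offsets are rotated clockwise into image space, and the Haar responses are rotated counterclockwise back to the keypoint's canonical orientation. In the notation of the code (a gloss on the kernel above, with u = xIndex - 9.5, v = yIndex - 9.5, s = ipt[SF_SIZE], and theta = ipt[SF_ANGLE] converted to radians):

R(\theta) = \begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix}, \qquad
\begin{pmatrix} \text{sample\_x} \\ \text{sample\_y} \end{pmatrix} = \begin{pmatrix} x \\ y \end{pmatrix} + s\, R(-\theta) \begin{pmatrix} u \\ v \end{pmatrix}, \qquad
\begin{pmatrix} dx_{\text{rot}} \\ dy_{\text{rot}} \end{pmatrix} = R(\theta) \begin{pmatrix} dx \\ dy \end{pmatrix}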
__device__ void reduce_sum(float sdata1[4][4][25], float sdata2[4][4][25], float sdata3[4][4][25],
float sdata4[4][4][25])
__device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const KeyPoint_GPU* features)//(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features)
{
// first step is to reduce from 25 to 16
if (threadIdx.x < 9) // use 9 threads
// get the interest point parameters (x, y, size, response, angle)
__shared__ float ipt[5];
if (threadIdx.x < 5 && threadIdx.y == 0)
{
sdata1[threadIdx.z][threadIdx.y][threadIdx.x] += sdata1[threadIdx.z][threadIdx.y][threadIdx.x + 16];
sdata2[threadIdx.z][threadIdx.y][threadIdx.x] += sdata2[threadIdx.z][threadIdx.y][threadIdx.x + 16];
sdata3[threadIdx.z][threadIdx.y][threadIdx.x] += sdata3[threadIdx.z][threadIdx.y][threadIdx.x + 16];
sdata4[threadIdx.z][threadIdx.y][threadIdx.x] += sdata4[threadIdx.z][threadIdx.y][threadIdx.x + 16];
ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x];
}
__syncthreads();
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
if (threadIdx.x < 16)
{
volatile float* smem = sdata1[threadIdx.z][threadIdx.y];
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
const int xBlock = (threadIdx.y & 3); // threadIdx.y % 4
const int yBlock = (threadIdx.y >> 2); // floor(threadIdx.y / 4)
const int xIndex = (xBlock * 5) + (threadIdx.x % 5);
const int yIndex = (yBlock * 5) + (threadIdx.x / 5);
smem = sdata2[threadIdx.z][threadIdx.y];
calc_dx_dy(sdx_bin, sdy_bin, ipt, xIndex, yIndex, threadIdx.x);
}
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2,
volatile float* sdata3, volatile float* sdata4, int tid)
{
// first step is to reduce from 25 to 16
if (tid < 9) // use 9 threads
{
sdata1[tid] += sdata1[tid + 16];
sdata2[tid] += sdata2[tid + 16];
sdata3[tid] += sdata3[tid + 16];
sdata4[tid] += sdata4[tid + 16];
}
smem = sdata3[threadIdx.z][threadIdx.y];
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
if (tid < 16)
{
sdata1[tid] += sdata1[tid + 8];
sdata1[tid] += sdata1[tid + 4];
sdata1[tid] += sdata1[tid + 2];
sdata1[tid] += sdata1[tid + 1];
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
sdata2[tid] += sdata2[tid + 8];
sdata2[tid] += sdata2[tid + 4];
sdata2[tid] += sdata2[tid + 2];
sdata2[tid] += sdata2[tid + 1];
smem = sdata4[threadIdx.z][threadIdx.y];
sdata3[tid] += sdata3[tid + 8];
sdata3[tid] += sdata3[tid + 4];
sdata3[tid] += sdata3[tid + 2];
sdata3[tid] += sdata3[tid + 1];
smem[threadIdx.x] += smem[threadIdx.x + 8];
smem[threadIdx.x] += smem[threadIdx.x + 4];
smem[threadIdx.x] += smem[threadIdx.x + 2];
smem[threadIdx.x] += smem[threadIdx.x + 1];
sdata4[tid] += sdata4[tid + 8];
sdata4[tid] += sdata4[tid + 4];
sdata4[tid] += sdata4[tid + 2];
sdata4[tid] += sdata4[tid + 1];
}
}
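One plausible reading of the "problems with threads synchronization" from the commit message (an inference on my part, not stated in the diff): with the new dim3(25, 16, 1) launch, the 25 threads of a bin straddle a 32-thread warp boundary, so the unrolled 16-to-1 steps in reduce_sum25 are no longer warp-synchronous, and there is no barrier between the 25-to-16 fold and the reads of its result. A defensively synchronized sketch of the same reduction (a hypothetical variant, not code from the commit; like reduce_sum25 it must be reached by every thread of the block, since __syncthreads() is a block-wide barrier):

__device__ void reduce_sum25_sync(volatile float* s1, volatile float* s2,
                                  volatile float* s3, volatile float* s4, int tid)
{
    // fold 25 values down to 16 (elements 16..24 onto elements 0..8)
    if (tid < 9)
    {
        s1[tid] += s1[tid + 16];
        s2[tid] += s2[tid + 16];
        s3[tid] += s3[tid + 16];
        s4[tid] += s4[tid + 16];
    }
    __syncthreads(); // bins cross warp boundaries, so a full barrier is required

    // tree-reduce 16 values to 1 with a barrier after every step
    for (int d = 8; d >= 1; d >>= 1)
    {
        if (tid < d)
        {
            s1[tid] += s1[tid + d];
            s2[tid] += s2[tid + d];
            s3[tid] += s3[tid + d];
            s4[tid] += s4[tid + d];
        }
        __syncthreads();
    }
}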
@@ -938,31 +935,43 @@ namespace cv { namespace gpu { namespace surf
__global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)
{
// 2 floats (dx, dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[4][4][25];
__shared__ float sdy[4][4][25];
__shared__ float sdx [16 * 25];
__shared__ float sdy [16 * 25];
__shared__ float sdxabs[16 * 25];
__shared__ float sdyabs[16 * 25];
calc_dx_dy(sdx, sdy, features);
__syncthreads();
__shared__ float sdesc[64];
__shared__ float sdxabs[4][4][25];
__shared__ float sdyabs[4][4][25];
float* sdx_bin = sdx + (threadIdx.y * 25);
float* sdy_bin = sdy + (threadIdx.y * 25);
float* sdxabs_bin = sdxabs + (threadIdx.y * 25);
float* sdyabs_bin = sdyabs + (threadIdx.y * 25);
sdxabs[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]); // |dx| array
sdyabs[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]); // |dy| array
calc_dx_dy(sdx_bin, sdy_bin, features);
__syncthreads();
reduce_sum(sdx, sdy, sdxabs, sdyabs);
sdxabs_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]); // |dx| array
sdyabs_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); // |dy| array
__syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.z * 16 + threadIdx.y * 4;
reduce_sum25(sdx_bin, sdy_bin, sdxabs_bin, sdyabs_bin, threadIdx.x);
__syncthreads();
float* sdesc_bin = sdesc + (threadIdx.y << 2);
// write dx, dy, |dx|, |dy|
if (threadIdx.x == 0)
{
descriptors_block[0] = sdx[threadIdx.z][threadIdx.y][0];
descriptors_block[1] = sdy[threadIdx.z][threadIdx.y][0];
descriptors_block[2] = sdxabs[threadIdx.z][threadIdx.y][0];
descriptors_block[3] = sdyabs[threadIdx.z][threadIdx.y][0];
sdesc_bin[0] = sdx_bin[0];
sdesc_bin[1] = sdy_bin[0];
sdesc_bin[2] = sdxabs_bin[0];
sdesc_bin[3] = sdyabs_bin[0];
}
__syncthreads();
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < 64)
descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
}
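The block-shape change (visible in the launcher below: dim3(25, 4, 4) becomes dim3(25, 16, 1)) keeps 400 threads per feature but flattens the 4x4 grid of sub-regions into a single bin index. A sketch of the correspondence (my summary of the indexing, not text from the commit):

// old layout: bin = (threadIdx.z, threadIdx.y), 4 * 4 = 16 bins of 25 threads
//   descriptor offset = threadIdx.z * 16 + threadIdx.y * 4
// new layout: bin = threadIdx.y alone, 16 bins of 25 threads
//   shared-memory bin:   sdx_bin   = sdx   + threadIdx.y * 25
//   descriptor channels: sdesc_bin = sdesc + (threadIdx.y << 2)  // * 4
// after the reduction, the first 64 of the 400 threads copy sdesc to global
// memory in one coalesced row write: if (tid < 64) desc[tid] = sdesc[tid];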
// Spawn 16 blocks per interest point
@@ -970,74 +979,90 @@ namespace cv { namespace gpu { namespace surf
__global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
{
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[4][4][25];
__shared__ float sdy[4][4][25];
calc_dx_dy(sdx, sdy, features);
__syncthreads();
__shared__ float sdx[16 * 25];
__shared__ float sdy[16 * 25];
// sum (reduce) 5x5 area response
__shared__ float sd1[4][4][25];
__shared__ float sd2[4][4][25];
__shared__ float sdabs1[4][4][25];
__shared__ float sdabs2[4][4][25];
__shared__ float sd1[16 * 25];
__shared__ float sd2[16 * 25];
__shared__ float sdabs1[16 * 25];
__shared__ float sdabs2[16 * 25];
__shared__ float sdesc[128];
float* sdx_bin = sdx + (threadIdx.y * 25);
float* sdy_bin = sdy + (threadIdx.y * 25);
float* sd1_bin = sd1 + (threadIdx.y * 25);
float* sd2_bin = sd2 + (threadIdx.y * 25);
float* sdabs1_bin = sdabs1 + (threadIdx.y * 25);
float* sdabs2_bin = sdabs2 + (threadIdx.y * 25);
calc_dx_dy(sdx_bin, sdy_bin, features);
__syncthreads();
if (sdy[threadIdx.z][threadIdx.y][threadIdx.x] >= 0)
if (sdy_bin[threadIdx.x] >= 0)
{
sd1[threadIdx.z][threadIdx.y][threadIdx.x] = sdx[threadIdx.z][threadIdx.y][threadIdx.x];
sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]);
sd2[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sd1_bin[threadIdx.x] = sdx_bin[threadIdx.x];
sdabs1_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]);
sd2_bin[threadIdx.x] = 0;
sdabs2_bin[threadIdx.x] = 0;
}
else
{
sd1[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sd2[threadIdx.z][threadIdx.y][threadIdx.x] = sdx[threadIdx.z][threadIdx.y][threadIdx.x];
sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]);
sd1_bin[threadIdx.x] = 0;
sdabs1_bin[threadIdx.x] = 0;
sd2_bin[threadIdx.x] = sdx_bin[threadIdx.x];
sdabs2_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]);
}
__syncthreads();
reduce_sum(sd1, sd2, sdabs1, sdabs2);
reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
__syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.z * 32 + threadIdx.y * 8;
float* sdesc_bin = sdesc + (threadIdx.y << 3);
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (threadIdx.x == 0)
{
descriptors_block[0] = sd1[threadIdx.z][threadIdx.y][0];
descriptors_block[1] = sdabs1[threadIdx.z][threadIdx.y][0];
descriptors_block[2] = sd2[threadIdx.z][threadIdx.y][0];
descriptors_block[3] = sdabs2[threadIdx.z][threadIdx.y][0];
sdesc_bin[0] = sd1_bin[0];
sdesc_bin[1] = sdabs1_bin[0];
sdesc_bin[2] = sd2_bin[0];
sdesc_bin[3] = sdabs2_bin[0];
}
__syncthreads();
if (sdx[threadIdx.z][threadIdx.y][threadIdx.x] >= 0)
if (sdx_bin[threadIdx.x] >= 0)
{
sd1[threadIdx.z][threadIdx.y][threadIdx.x] = sdy[threadIdx.z][threadIdx.y][threadIdx.x];
sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]);
sd2[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sd1_bin[threadIdx.x] = sdy_bin[threadIdx.x];
sdabs1_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);
sd2_bin[threadIdx.x] = 0;
sdabs2_bin[threadIdx.x] = 0;
}
else
{
sd1[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = 0;
sd2[threadIdx.z][threadIdx.y][threadIdx.x] = sdy[threadIdx.z][threadIdx.y][threadIdx.x];
sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]);
sd1_bin[threadIdx.x] = 0;
sdabs1_bin[threadIdx.x] = 0;
sd2_bin[threadIdx.x] = sdy_bin[threadIdx.x];
sdabs2_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);
}
__syncthreads();
reduce_sum(sd1, sd2, sdabs1, sdabs2);
reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
__syncthreads();
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (threadIdx.x == 0)
{
descriptors_block[4] = sd1[threadIdx.z][threadIdx.y][0];
descriptors_block[5] = sdabs1[threadIdx.z][threadIdx.y][0];
descriptors_block[6] = sd2[threadIdx.z][threadIdx.y][0];
descriptors_block[7] = sdabs2[threadIdx.z][threadIdx.y][0];
sdesc_bin[4] = sd1_bin[0];
sdesc_bin[5] = sdabs1_bin[0];
sdesc_bin[6] = sd2_bin[0];
sdesc_bin[7] = sdabs2_bin[0];
}
__syncthreads();
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < 128)
descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
}
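For contrast with the 64-dimensional kernel (my summary of the two passes above): each bin accumulates eight sums instead of four, splitting dx and |dx| by the sign of dy, and dy and |dy| by the sign of dx, hence 16 bins * 8 channels = 128 floats:

// per-bin channel layout of the 128-d descriptor (16 bins * 8 channels):
//   [0] sum dx  (dy >= 0)    [1] sum |dx| (dy >= 0)
//   [2] sum dx  (dy <  0)    [3] sum |dx| (dy <  0)
//   [4] sum dy  (dx >= 0)    [5] sum |dy| (dx >= 0)
//   [6] sum dy  (dx <  0)    [7] sum |dy| (dx <  0)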
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
@@ -1046,7 +1071,7 @@ namespace cv { namespace gpu { namespace surf
if (descriptors.cols == 64)
{
compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);
compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
@@ -1058,7 +1083,7 @@ namespace cv { namespace gpu { namespace surf
}
else
{
compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);
compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
@@ -1080,9 +1105,6 @@ namespace cv { namespace gpu { namespace surf
}
__syncthreads();
float sin_theta, cos_theta;
sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
@@ -1090,100 +1112,40 @@ namespace cv { namespace gpu { namespace surf
const int xIndex = xBlock * blockDim.x + threadIdx.x;
const int yIndex = yBlock * blockDim.y + threadIdx.y;
// Compute rotated sampling points
// (clockwise rotation since we are rotating the lattice)
// (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)
const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+ sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+ cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
// gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)
// a b c
// d f
// g h i
const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]);
const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y);
const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y);
const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]);
const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
// compute axis-aligned HaarX, HaarY
// (could group the additions together into multiplications)
const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular)
const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx
const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy
// rotate responses (store all dxs then all dys)
// - counterclockwise rotation to rotate back to zero orientation
sdx[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx
sdy[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy
}
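The nine lookups a..i and the aa_dx/aa_dy expressions are the standard integral-image box sums (a gloss on the code, not text from the commit). For an integral image I, the sum over the rectangle with corners (x_1, y_1) and (x_2, y_2) is

\mathrm{sum} = I(x_2, y_2) - I(x_1, y_2) - I(x_2, y_1) + I(x_1, y_1)

so (a - b - g + h) is the left half-box, (b - c - h + i) the right half-box, and aa_dx is right minus left; likewise (a - c - d + f) is the top half-box, (d - f - g + i) the bottom, and aa_dy is bottom minus top, each weighted by the separable Gaussian gauss.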
__device__ void reduce_sum_old(float sdata[25], int tid)
{
// first step is to reduce from 25 to 16
if (tid < 9) // use 9 threads
sdata[tid] += sdata[tid + 16];
__syncthreads();
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
if (tid < 16)
{
volatile float* smem = sdata;
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
}
calc_dx_dy(sdx, sdy, ipt, xIndex, yIndex, tid);
}
// Spawn 16 blocks per interest point
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features)
{
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25];
__shared__ float sdy[25];
__shared__ float sdxabs[25];
__shared__ float sdyabs[25];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy_old(sdx, sdy, features, tid);
__syncthreads();
__shared__ float sabs[25];
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
sdyabs[tid] = fabs(sdy[tid]); // |dy| array
__syncthreads();
sabs[tid] = fabs(sdx[tid]); // |dx| array
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
__syncthreads();
reduce_sum_old(sdx, tid);
reduce_sum_old(sdy, tid);
reduce_sum_old(sabs, tid);
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);
// write dx, dy, |dx|
// write dx, dy, |dx|, |dy|
if (tid == 0)
{
descriptors_block[0] = sdx[0];
descriptors_block[1] = sdy[0];
descriptors_block[2] = sabs[0];
}
__syncthreads();
sabs[tid] = fabs(sdy[tid]); // |dy| array
__syncthreads();
reduce_sum_old(sabs, tid);
// write |dy|
if (tid == 0)
{
descriptors_block[3] = sabs[0];
descriptors_block[2] = sdxabs[0];
descriptors_block[3] = sdyabs[0];
}
}
@@ -1191,23 +1153,21 @@ namespace cv { namespace gpu { namespace surf
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors128_old(PtrStepf descriptors, const KeyPoint_GPU* features)
{
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25];
__shared__ float sdy[25];
calc_dx_dy_old(sdx, sdy, features, tid);
__syncthreads();
// sum (reduce) 5x5 area response
__shared__ float sd1[25];
__shared__ float sd2[25];
__shared__ float sdabs1[25];
__shared__ float sdabs2[25];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
calc_dx_dy_old(sdx, sdy, features, tid);
__syncthreads();
if (sdy[tid] >= 0)
{
sd1[tid] = sdx[tid];
@@ -1224,10 +1184,10 @@ namespace cv { namespace gpu { namespace surf
}
__syncthreads();
reduce_sum_old(sd1, tid);
reduce_sum_old(sd2, tid);
reduce_sum_old(sdabs1, tid);
reduce_sum_old(sdabs2, tid);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads();
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (tid == 0)
@@ -1255,10 +1215,8 @@ namespace cv { namespace gpu { namespace surf
}
__syncthreads();
reduce_sum_old(sd1, tid);
reduce_sum_old(sd2, tid);
reduce_sum_old(sdabs1, tid);
reduce_sum_old(sdabs2, tid);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
__syncthreads();
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (tid == 0)
......
@@ -233,8 +233,8 @@ namespace
typedef void (*compute_descriptors_t)(const DevMem2Df& descriptors,
const KeyPoint_GPU* features, int nFeatures);
const compute_descriptors_t compute_descriptors =
DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;
const compute_descriptors_t compute_descriptors = compute_descriptors_gpu_old;
//DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;
if (keypoints.cols > 0)
{
......
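The launcher for the old kernels is not shown in this diff; a sketch of what it presumably looks like, with the grid and block shapes inferred from blockIdx.y and the 5x5 tid arithmetic in the kernels above (an assumption, not code from the commit; the follow-up normalization kernels are omitted):

void compute_descriptors_gpu_old(const DevMem2Df& descriptors,
                                 const KeyPoint_GPU* features, int nFeatures)
{
    dim3 grid(nFeatures, 16, 1); // 16 blocks (sub-regions) per interest point
    dim3 threads(5, 5, 1);       // 5x5 sample points per sub-region

    if (descriptors.cols == 64)
        compute_descriptors64_old<<<grid, threads>>>(descriptors, features);
    else
        compute_descriptors128_old<<<grid, threads>>>(descriptors, features);

    cudaSafeCall( cudaGetLastError() );
    cudaSafeCall( cudaThreadSynchronize() );
}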