Commit e6f3c9b0 authored by Andrey Pavlenko's avatar Andrey Pavlenko Committed by OpenCV Buildbot

Merge pull request #2281 from vpisarev:ocl_surf

parents 055f41c9 3e0c72a8
...@@ -235,7 +235,7 @@ public: ...@@ -235,7 +235,7 @@ public:
// Compute the BRISK features and descriptors on an image // Compute the BRISK features and descriptors on an image
void operator()( InputArray image, InputArray mask, std::vector<KeyPoint>& keypoints, void operator()( InputArray image, InputArray mask, std::vector<KeyPoint>& keypoints,
OutputArray descriptors, bool useProvidedKeypoints=false ) const; OutputArray descriptors, bool useProvidedKeypoints=false ) const;
AlgorithmInfo* info() const; AlgorithmInfo* info() const;
......
...@@ -246,105 +246,3 @@ The class ``SURF_CUDA`` uses some buffers and provides access to it. All buffers ...@@ -246,105 +246,3 @@ The class ``SURF_CUDA`` uses some buffers and provides access to it. All buffers
.. note:: .. note::
* An example for using the SURF keypoint matcher on GPU can be found at opencv_source_code/samples/gpu/surf_keypoint_matcher.cpp * An example for using the SURF keypoint matcher on GPU can be found at opencv_source_code/samples/gpu/surf_keypoint_matcher.cpp
ocl::SURF_OCL
-------------
.. ocv:class:: ocl::SURF_OCL
Class used for extracting Speeded Up Robust Features (SURF) from an image. ::
class SURF_OCL
{
public:
enum KeypointLayout
{
X_ROW = 0,
Y_ROW,
LAPLACIAN_ROW,
OCTAVE_ROW,
SIZE_ROW,
ANGLE_ROW,
HESSIAN_ROW,
ROWS_COUNT
};
//! the default constructor
SURF_OCL();
//! the full constructor taking all the necessary parameters
explicit SURF_OCL(double _hessianThreshold, int _nOctaves=4,
int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
//! returns the descriptor size in float's (64 or 128)
int descriptorSize() const;
//! upload host keypoints to device memory
void uploadKeypoints(const vector<KeyPoint>& keypoints,
oclMat& keypointsocl);
//! download keypoints from device to host memory
void downloadKeypoints(const oclMat& keypointsocl,
vector<KeyPoint>& keypoints);
//! download descriptors from device to host memory
void downloadDescriptors(const oclMat& descriptorsocl,
vector<float>& descriptors);
void operator()(const oclMat& img, const oclMat& mask,
oclMat& keypoints);
void operator()(const oclMat& img, const oclMat& mask,
oclMat& keypoints, oclMat& descriptors,
bool useProvidedKeypoints = false);
void operator()(const oclMat& img, const oclMat& mask,
std::vector<KeyPoint>& keypoints);
void operator()(const oclMat& img, const oclMat& mask,
std::vector<KeyPoint>& keypoints, oclMat& descriptors,
bool useProvidedKeypoints = false);
void operator()(const oclMat& img, const oclMat& mask,
std::vector<KeyPoint>& keypoints,
std::vector<float>& descriptors,
bool useProvidedKeypoints = false);
void releaseMemory();
// SURF parameters
double hessianThreshold;
int nOctaves;
int nOctaveLayers;
bool extended;
bool upright;
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
float keypointsRatio;
oclMat sum, mask1, maskSum, intBuffer;
oclMat det, trace;
oclMat maxPosBuffer;
};
The class ``SURF_OCL`` implements Speeded Up Robust Features descriptor. There is a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option). But the descriptors can also be computed for the user-specified keypoints. Only 8-bit grayscale images are supported.
The class ``SURF_OCL`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``oclMat``. The ``keypoints`` matrix is :math:`\texttt{nFeatures} \times 7` matrix with the ``CV_32FC1`` type.
* ``keypoints.ptr<float>(X_ROW)[i]`` contains x coordinate of the i-th feature.
* ``keypoints.ptr<float>(Y_ROW)[i]`` contains y coordinate of the i-th feature.
* ``keypoints.ptr<float>(LAPLACIAN_ROW)[i]`` contains the laplacian sign of the i-th feature.
* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i-th feature.
* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i-th feature.
* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contain orientation of the i-th feature.
* ``keypoints.ptr<float>(HESSIAN_ROW)[i]`` contains the response of the i-th feature.
The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type.
The class ``SURF_OCL`` uses some buffers and provides access to it. All buffers can be safely released between function calls.
.. seealso:: :ocv:class:`SURF`
.. note::
* OCL : An example of the SURF detector can be found at opencv_source_code/samples/ocl/surf_matcher.cpp
...@@ -142,7 +142,6 @@ public: ...@@ -142,7 +142,6 @@ public:
CV_PROP_RW bool upright; CV_PROP_RW bool upright;
protected: protected:
void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask = noArray() ) const; void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask = noArray() ) const;
void computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const; void computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const;
}; };
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_NONFREE_OCL_HPP__
#define __OPENCV_NONFREE_OCL_HPP__
#include "opencv2/ocl.hpp"
namespace cv
{
namespace ocl
{
//! Speeded up robust features, port from CUDA module.
////////////////////////////////// SURF //////////////////////////////////////////
class CV_EXPORTS SURF_OCL
{
public:
enum KeypointLayout
{
X_ROW = 0,
Y_ROW,
LAPLACIAN_ROW,
OCTAVE_ROW,
SIZE_ROW,
ANGLE_ROW,
HESSIAN_ROW,
ROWS_COUNT
};
//! the default constructor
SURF_OCL();
//! the full constructor taking all the necessary parameters
explicit SURF_OCL(double _hessianThreshold, int _nOctaves = 4,
int _nOctaveLayers = 2, bool _extended = false, float _keypointsRatio = 0.01f, bool _upright = false);
//! returns the descriptor size in float's (64 or 128)
int descriptorSize() const;
//! returns the default norm type
int defaultNorm() const;
//! upload host keypoints to device memory
void uploadKeypoints(const std::vector<cv::KeyPoint> &keypoints, oclMat &keypointsocl);
//! download keypoints from device to host memory
void downloadKeypoints(const oclMat &keypointsocl, std::vector<KeyPoint> &keypoints);
//! download descriptors from device to host memory
void downloadDescriptors(const oclMat &descriptorsocl, std::vector<float> &descriptors);
//! finds the keypoints using fast hessian detector used in SURF
//! supports CV_8UC1 images
//! keypoints will have nFeature cols and 6 rows
//! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
//! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
//! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
//! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
//! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
//! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
//! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints);
//! finds the keypoints and computes their descriptors.
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
bool useProvidedKeypoints = false);
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints);
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, oclMat &descriptors,
bool useProvidedKeypoints = false);
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, std::vector<float> &descriptors,
bool useProvidedKeypoints = false);
void releaseMemory();
// SURF parameters
float hessianThreshold;
int nOctaves;
int nOctaveLayers;
bool extended;
bool upright;
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
float keypointsRatio;
oclMat sum, mask1, maskSum, intBuffer;
oclMat det, trace;
oclMat maxPosBuffer;
};
}
}
#endif //__OPENCV_NONFREE_OCL_HPP__
...@@ -45,36 +45,59 @@ ...@@ -45,36 +45,59 @@
// //
//M*/ //M*/
// The number of degrees between orientation samples in calcOrientation
#define ORI_SEARCH_INC 5
// The local size of the calcOrientation kernel
#define ORI_LOCAL_SIZE (360 / ORI_SEARCH_INC)
// specialized for non-image2d_t supported platform, intel HD4000, for example // specialized for non-image2d_t supported platform, intel HD4000, for example
#ifdef DISABLE_IMAGE2D #ifndef HAVE_IMAGE2D
#define IMAGE_INT32 __global uint * __inline uint read_sumTex_(__global uint* sumTex, int sum_step, int img_rows, int img_cols, int2 coord)
#define IMAGE_INT8 __global uchar * {
#else int x = clamp(coord.x, 0, img_cols);
#define IMAGE_INT32 image2d_t int y = clamp(coord.y, 0, img_rows);
#define IMAGE_INT8 image2d_t return sumTex[sum_step * y + x];
#endif }
uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow) __inline uchar read_imgTex_(__global uchar* imgTex, int img_step, int img_rows, int img_cols, float2 coord)
{ {
#ifdef DISABLE_IMAGE2D int x = clamp(convert_int_rte(coord.x), 0, img_cols-1);
int x = clamp(coord.x, 0, cols); int y = clamp(convert_int_rte(coord.y), 0, img_rows-1);
int y = clamp(coord.y, 0, rows); return imgTex[img_step * y + x];
return img[elemPerRow * y + x]; }
#define read_sumTex(coord) read_sumTex_(sumTex, sum_step, img_rows, img_cols, coord)
#define read_imgTex(coord) read_imgTex_(imgTex, img_step, img_rows, img_cols, coord)
#define __PARAM_sumTex__ __global uint* sumTex, int sum_step, int sum_offset
#define __PARAM_imgTex__ __global uchar* imgTex, int img_step, int img_offset
#define __PASS_sumTex__ sumTex, sum_step, sum_offset
#define __PASS_imgTex__ imgTex, img_step, img_offset
#else #else
return read_imageui(img, sam, coord).x; __inline uint read_sumTex_(image2d_t sumTex, sampler_t sam, int2 coord)
#endif {
return read_imageui(sumTex, sam, coord).x;
} }
uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
__inline uchar read_imgTex_(image2d_t imgTex, sampler_t sam, float2 coord)
{ {
#ifdef DISABLE_IMAGE2D return (uchar)read_imageui(imgTex, sam, coord).x;
int x = clamp(round(coord.x), 0, cols - 1);
int y = clamp(round(coord.y), 0, rows - 1);
return img[elemPerRow * y + x];
#else
return (uchar)read_imageui(img, sam, coord).x;
#endif
} }
#define read_sumTex(coord) read_sumTex_(sumTex, sampler, coord)
#define read_imgTex(coord) read_imgTex_(imgTex, sampler, coord)
#define __PARAM_sumTex__ image2d_t sumTex
#define __PARAM_imgTex__ image2d_t imgTex
#define __PASS_sumTex__ sumTex
#define __PASS_imgTex__ imgTex
#endif
// dynamically change the precision used for floating type // dynamically change the precision used for floating type
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
...@@ -89,7 +112,7 @@ uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int col ...@@ -89,7 +112,7 @@ uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int col
#endif #endif
// Image read mode // Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
#ifndef FLT_EPSILON #ifndef FLT_EPSILON
#define FLT_EPSILON (1e-15) #define FLT_EPSILON (1e-15)
...@@ -99,45 +122,6 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM ...@@ -99,45 +122,6 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
#define CV_PI_F 3.14159265f #define CV_PI_F 3.14159265f
#endif #endif
// Use integral image to calculate haar wavelets.
// N = 2
// for simple haar paatern
float icvCalcHaarPatternSum_2(
IMAGE_INT32 sumTex,
__constant float2 *src,
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
F d = 0;
int2 dx1 = convert_int2(round(ratio * src[0]));
int2 dy1 = convert_int2(round(ratio * src[1]));
int2 dx2 = convert_int2(round(ratio * src[2]));
int2 dy2 = convert_int2(round(ratio * src[3]));
F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
return (float)d;
}
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Hessian // Hessian
...@@ -175,23 +159,21 @@ F calcAxisAlignedDerivative( ...@@ -175,23 +159,21 @@ F calcAxisAlignedDerivative(
} }
//calculate targeted layer per-pixel determinant and trace with an integral image //calculate targeted layer per-pixel determinant and trace with an integral image
__kernel void icvCalcLayerDetAndTrace( __kernel void SURF_calcLayerDetAndTrace(
IMAGE_INT32 sumTex, // input integral image __PARAM_sumTex__, // input integral image
__global float * det, // output Determinant int img_rows, int img_cols,
int c_nOctaveLayers, int c_octave, int c_layer_rows,
__global float * det, // output determinant
int det_step, int det_offset,
__global float * trace, // output trace __global float * trace, // output trace
int det_step, // the step of det in bytes int trace_step, int trace_offset)
int trace_step, // the step of trace in bytes
int c_img_rows,
int c_img_cols,
int c_nOctaveLayers,
int c_octave,
int c_layer_rows,
int sumTex_step
)
{ {
det_step /= sizeof(*det); det_step /= sizeof(*det);
trace_step /= sizeof(*trace); trace_step /= sizeof(*trace);
sumTex_step/= sizeof(uint); #ifndef HAVE_IMAGE2D
sum_step/= sizeof(uint);
#endif
// Determine the indices // Determine the indices
const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2); const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2);
const int blockIdx_y = get_group_id(1) % gridDim_y; const int blockIdx_y = get_group_id(1) % gridDim_y;
...@@ -203,13 +185,13 @@ __kernel void icvCalcLayerDetAndTrace( ...@@ -203,13 +185,13 @@ __kernel void icvCalcLayerDetAndTrace(
const int size = calcSize(c_octave, layer); const int size = calcSize(c_octave, layer);
const int samples_i = 1 + ((c_img_rows - size) >> c_octave); const int samples_i = 1 + ((img_rows - size) >> c_octave);
const int samples_j = 1 + ((c_img_cols - size) >> c_octave); const int samples_j = 1 + ((img_cols - size) >> c_octave);
// Ignore pixels where some of the kernel is outside the image // Ignore pixels where some of the kernel is outside the image
const int margin = (size >> 1) >> c_octave; const int margin = (size >> 1) >> c_octave;
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) if (size <= img_rows && size <= img_cols && i < samples_i && j < samples_j)
{ {
int x = j << c_octave; int x = j << c_octave;
int y = i << c_octave; int y = i << c_octave;
...@@ -233,14 +215,14 @@ __kernel void icvCalcLayerDetAndTrace( ...@@ -233,14 +215,14 @@ __kernel void icvCalcLayerDetAndTrace(
{ {
// Some of the pixels needed to compute the derivative are // Some of the pixels needed to compute the derivative are
// repeated, so we only don't duplicate the fetch here. // repeated, so we only don't duplicate the fetch here.
int t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sumTex_step ); int t02 = read_sumTex( (int2)(x, y + r2));
int t07 = read_sumTex( sumTex, sampler, (int2)(x, y + r7), c_img_rows, c_img_cols, sumTex_step ); int t07 = read_sumTex( (int2)(x, y + r7));
int t32 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r2), c_img_rows, c_img_cols, sumTex_step ); int t32 = read_sumTex( (int2)(x + r3, y + r2));
int t37 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r7), c_img_rows, c_img_cols, sumTex_step ); int t37 = read_sumTex( (int2)(x + r3, y + r7));
int t62 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r2), c_img_rows, c_img_cols, sumTex_step ); int t62 = read_sumTex( (int2)(x + r6, y + r2));
int t67 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r7), c_img_rows, c_img_cols, sumTex_step ); int t67 = read_sumTex( (int2)(x + r6, y + r7));
int t92 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r2), c_img_rows, c_img_cols, sumTex_step ); int t92 = read_sumTex( (int2)(x + r9, y + r2));
int t97 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r7), c_img_rows, c_img_cols, sumTex_step ); int t97 = read_sumTex( (int2)(x + r9, y + r7));
d = calcAxisAlignedDerivative(t02, t07, t32, t37, (r3) * (r7 - r2), d = calcAxisAlignedDerivative(t02, t07, t32, t37, (r3) * (r7 - r2),
t62, t67, t92, t97, (r9 - r6) * (r7 - r2), t62, t67, t92, t97, (r9 - r6) * (r7 - r2),
...@@ -253,14 +235,14 @@ __kernel void icvCalcLayerDetAndTrace( ...@@ -253,14 +235,14 @@ __kernel void icvCalcLayerDetAndTrace(
{ {
// Some of the pixels needed to compute the derivative are // Some of the pixels needed to compute the derivative are
// repeated, so we only don't duplicate the fetch here. // repeated, so we only don't duplicate the fetch here.
int t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sumTex_step ); int t20 = read_sumTex( (int2)(x + r2, y) );
int t23 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r3), c_img_rows, c_img_cols, sumTex_step ); int t23 = read_sumTex( (int2)(x + r2, y + r3) );
int t70 = read_sumTex( sumTex, sampler, (int2)(x + r7, y), c_img_rows, c_img_cols, sumTex_step ); int t70 = read_sumTex( (int2)(x + r7, y) );
int t73 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r3), c_img_rows, c_img_cols, sumTex_step ); int t73 = read_sumTex( (int2)(x + r7, y + r3) );
int t26 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r6), c_img_rows, c_img_cols, sumTex_step ); int t26 = read_sumTex( (int2)(x + r2, y + r6) );
int t76 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r6), c_img_rows, c_img_cols, sumTex_step ); int t76 = read_sumTex( (int2)(x + r7, y + r6) );
int t29 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r9), c_img_rows, c_img_cols, sumTex_step ); int t29 = read_sumTex( (int2)(x + r2, y + r9) );
int t79 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r9), c_img_rows, c_img_cols, sumTex_step ); int t79 = read_sumTex( (int2)(x + r7, y + r9) );
d = calcAxisAlignedDerivative(t20, t23, t70, t73, (r7 - r2) * (r3), d = calcAxisAlignedDerivative(t20, t23, t70, t73, (r7 - r2) * (r3),
t26, t29, t76, t79, (r7 - r2) * (r9 - r6), t26, t29, t76, t79, (r7 - r2) * (r9 - r6),
...@@ -274,31 +256,31 @@ __kernel void icvCalcLayerDetAndTrace( ...@@ -274,31 +256,31 @@ __kernel void icvCalcLayerDetAndTrace(
// There's no saving us here, we just have to get all of the pixels in // There's no saving us here, we just have to get all of the pixels in
// separate fetches // separate fetches
F t = 0; F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r1), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r1, y + r1) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r4), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r1, y + r4) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r1), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r4, y + r1) );
t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r4, y + r4) );
d += t / ((r4 - r1) * (r4 - r1)); d += t / ((r4 - r1) * (r4 - r1));
t = 0; t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r1), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r5, y + r1) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r4), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r5, y + r4) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r1), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r8, y + r1) );
t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r4), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r8, y + r4) );
d -= t / ((r8 - r5) * (r4 - r1)); d -= t / ((r8 - r5) * (r4 - r1));
t = 0; t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r5), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r1, y + r5) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r8), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r1, y + r8) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r5), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r4, y + r5) );
t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r8), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r4, y + r8) );
d -= t / ((r4 - r1) * (r8 - r5)); d -= t / ((r4 - r1) * (r8 - r5));
t = 0; t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r5), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r5, y + r5) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r8), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r5, y + r8) );
t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r5), c_img_rows, c_img_cols, sumTex_step ); t -= read_sumTex( (int2)(x + r8, y + r5) );
t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r8), c_img_rows, c_img_cols, sumTex_step ); t += read_sumTex( (int2)(x + r8, y + r8) );
d += t / ((r8 - r5) * (r8 - r5)); d += t / ((r8 - r5) * (r8 - r5));
} }
const float dxy = (float)d; const float dxy = (float)d;
...@@ -311,171 +293,17 @@ __kernel void icvCalcLayerDetAndTrace( ...@@ -311,171 +293,17 @@ __kernel void icvCalcLayerDetAndTrace(
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// NONMAX // NONMAX
__constant float c_DM[5] = {0, 0, 9, 9, 1};
bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step)
{
float ratio = (float)size / 9.0f;
float d = 0;
int dx1 = round(ratio * c_DM[0]);
int dy1 = round(ratio * c_DM[1]);
int dx2 = round(ratio * c_DM[2]);
int dy2 = round(ratio * c_DM[3]);
float t = 0;
t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step);
t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step);
t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step);
t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step);
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
return (d >= 0.5f);
}
// Non-maximal suppression to further filtering the candidates from previous step
__kernel __kernel
void icvFindMaximaInLayer_withmask( void SURF_findMaximaInLayer(
__global const float * det,
__global const float * trace,
__global int4 * maxPosBuffer,
volatile __global int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
int trace_step, // the step of trace in bytes
int c_img_rows,
int c_img_cols,
int c_nOctaveLayers,
int c_octave,
int c_layer_rows,
int c_layer_cols,
int c_max_candidates,
float c_hessianThreshold,
IMAGE_INT32 maskSumTex,
int mask_step
)
{
volatile __local float N9[768]; // threads.x * threads.y * 3
det_step /= sizeof(*det);
trace_step /= sizeof(*trace);
maxCounter += counter_offset;
mask_step /= sizeof(uint);
// Determine the indices
const int gridDim_y = get_num_groups(1) / c_nOctaveLayers;
const int blockIdx_y = get_group_id(1) % gridDim_y;
const int blockIdx_z = get_group_id(1) / gridDim_y;
const int layer = blockIdx_z + 1;
const int size = calcSize(c_octave, layer);
// Ignore pixels without a 3x3x3 neighbourhood in the layer above
const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
const int j = get_local_id(0) + get_group_id(0) * (get_local_size(0) - 2) + margin - 1;
const int i = get_local_id(1) + blockIdx_y * (get_local_size(1) - 2) + margin - 1;
// Is this thread within the hessian buffer?
const int zoff = get_local_size(0) * get_local_size(1);
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
N9[localLin - zoff] =
det[det_step *
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin ] =
det[det_step *
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin + zoff] =
det[det_step *
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
barrier(CLK_LOCAL_MEM_FENCE);
if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
&& get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
&& get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
float val0 = N9[localLin];
if (val0 > c_hessianThreshold)
{
// Coordinates for the start of the wavelet in the sum image. There
// is some integer division involved, so don't try to simplify this
// (cancel out sampleStep) without checking the result is the same
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step))
{
// Check to see if we have a max (in its 26 neighbours)
const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
&& val0 > N9[localLin - get_local_size(0) - zoff]
&& val0 > N9[localLin + 1 - get_local_size(0) - zoff]
&& val0 > N9[localLin - 1 - zoff]
&& val0 > N9[localLin - zoff]
&& val0 > N9[localLin + 1 - zoff]
&& val0 > N9[localLin - 1 + get_local_size(0) - zoff]
&& val0 > N9[localLin + get_local_size(0) - zoff]
&& val0 > N9[localLin + 1 + get_local_size(0) - zoff]
&& val0 > N9[localLin - 1 - get_local_size(0)]
&& val0 > N9[localLin - get_local_size(0)]
&& val0 > N9[localLin + 1 - get_local_size(0)]
&& val0 > N9[localLin - 1 ]
&& val0 > N9[localLin + 1 ]
&& val0 > N9[localLin - 1 + get_local_size(0)]
&& val0 > N9[localLin + get_local_size(0)]
&& val0 > N9[localLin + 1 + get_local_size(0)]
&& val0 > N9[localLin - 1 - get_local_size(0) + zoff]
&& val0 > N9[localLin - get_local_size(0) + zoff]
&& val0 > N9[localLin + 1 - get_local_size(0) + zoff]
&& val0 > N9[localLin - 1 + zoff]
&& val0 > N9[localLin + zoff]
&& val0 > N9[localLin + 1 + zoff]
&& val0 > N9[localLin - 1 + get_local_size(0) + zoff]
&& val0 > N9[localLin + get_local_size(0) + zoff]
&& val0 > N9[localLin + 1 + get_local_size(0) + zoff]
;
if(condmax)
{
int ind = atomic_inc(maxCounter);
if (ind < c_max_candidates)
{
const int laplacian = (int) copysign(1.0f, trace[trace_step* (layer * c_layer_rows + i) + j]);
maxPosBuffer[ind] = (int4)(j, i, layer, laplacian);
}
}
}
}
}
}
__kernel
void icvFindMaximaInLayer(
__global float * det, __global float * det,
int det_step, int det_offset,
__global float * trace, __global float * trace,
int trace_step, int trace_offset,
__global int4 * maxPosBuffer, __global int4 * maxPosBuffer,
volatile __global int* maxCounter, volatile __global int* maxCounter,
int counter_offset, int counter_offset,
int det_step, // the step of det in bytes int img_rows,
int trace_step, // the step of trace in bytes int img_cols,
int c_img_rows,
int c_img_cols,
int c_nOctaveLayers, int c_nOctaveLayers,
int c_octave, int c_octave,
int c_layer_rows, int c_layer_rows,
...@@ -509,8 +337,8 @@ void icvFindMaximaInLayer( ...@@ -509,8 +337,8 @@ void icvFindMaximaInLayer(
const int zoff = get_local_size(0) * get_local_size(1); const int zoff = get_local_size(0) * get_local_size(1);
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff; const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
int l_x = min(max(j, 0), c_img_cols - 1); int l_x = min(max(j, 0), img_cols - 1);
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1); int l_y = c_layer_rows * layer + min(max(i, 0), img_rows - 1);
N9[localLin - zoff] = N9[localLin - zoff] =
det[det_step * (l_y - c_layer_rows) + l_x]; det[det_step * (l_y - c_layer_rows) + l_x];
...@@ -590,7 +418,7 @@ inline bool solve3x3_float(const float4 *A, const float *b, float *x) ...@@ -590,7 +418,7 @@ inline bool solve3x3_float(const float4 *A, const float *b, float *x)
if (det != 0) if (det != 0)
{ {
F invdet = 1.0 / det; F invdet = 1.0f / det;
x[0] = invdet * x[0] = invdet *
(b[0] * (A[1].y * A[2].z - A[1].z * A[2].y) - (b[0] * (A[1].y * A[2].z - A[1].z * A[2].y) -
...@@ -624,15 +452,15 @@ inline bool solve3x3_float(const float4 *A, const float *b, float *x) ...@@ -624,15 +452,15 @@ inline bool solve3x3_float(const float4 *A, const float *b, float *x)
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// INTERPOLATION // INTERPOLATION
__kernel __kernel
void icvInterpolateKeypoint( void SURF_interpolateKeypoint(
__global const float * det, __global const float * det,
int det_step, int det_offset,
__global const int4 * maxPosBuffer, __global const int4 * maxPosBuffer,
__global float * keypoints, __global float * keypoints,
volatile __global int * featureCounter, int keypoints_step, int keypoints_offset,
int det_step, volatile __global int* featureCounter,
int keypoints_step, int img_rows,
int c_img_rows, int img_cols,
int c_img_cols,
int c_octave, int c_octave,
int c_layer_rows, int c_layer_rows,
int c_max_features int c_max_features
...@@ -724,7 +552,7 @@ void icvInterpolateKeypoint( ...@@ -724,7 +552,7 @@ void icvInterpolateKeypoint(
const int grad_wav_size = 2 * round(2.0f * s); const int grad_wav_size = 2 * round(2.0f * s);
// check when grad_wav_size is too big // check when grad_wav_size is too big
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size) if ((img_rows + 1) >= grad_wav_size && (img_cols + 1) >= grad_wav_size)
{ {
// Get a new feature index. // Get a new feature index.
int ind = atomic_inc(featureCounter); int ind = atomic_inc(featureCounter);
...@@ -829,23 +657,19 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc ...@@ -829,23 +657,19 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc
} }
__kernel __kernel
void icvCalcOrientation( void SURF_calcOrientation(
IMAGE_INT32 sumTex, __PARAM_sumTex__, int img_rows, int img_cols,
__global float * keypoints, __global float * keypoints, int keypoints_step, int keypoints_offset )
int keypoints_step,
int c_img_rows,
int c_img_cols,
int sum_step
)
{ {
keypoints_step /= sizeof(*keypoints); keypoints_step /= sizeof(*keypoints);
#ifndef HAVE_IMAGE2D
sum_step /= sizeof(uint); sum_step /= sizeof(uint);
#endif
__global float* featureX = keypoints + X_ROW * keypoints_step; __global float* featureX = keypoints + X_ROW * keypoints_step;
__global float* featureY = keypoints + Y_ROW * keypoints_step; __global float* featureY = keypoints + Y_ROW * keypoints_step;
__global float* featureSize = keypoints + SIZE_ROW * keypoints_step; __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
__global float* featureDir = keypoints + ANGLE_ROW * keypoints_step; __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
__local float s_X[ORI_SAMPLES]; __local float s_X[ORI_SAMPLES];
__local float s_Y[ORI_SAMPLES]; __local float s_Y[ORI_SAMPLES];
__local float s_angle[ORI_SAMPLES]; __local float s_angle[ORI_SAMPLES];
...@@ -860,7 +684,6 @@ void icvCalcOrientation( ...@@ -860,7 +684,6 @@ void icvCalcOrientation(
and building the keypoint descriptor are defined relative to 's' */ and building the keypoint descriptor are defined relative to 's' */
const float s = featureSize[get_group_id(0)] * 1.2f / 9.0f; const float s = featureSize[get_group_id(0)] * 1.2f / 9.0f;
/* To find the dominant orientation, the gradients in x and y are /* To find the dominant orientation, the gradients in x and y are
sampled in a circle of radius 6s using wavelets of size 4s. sampled in a circle of radius 6s using wavelets of size 4s.
We ensure the gradient wavelet size is even to ensure the We ensure the gradient wavelet size is even to ensure the
...@@ -868,7 +691,7 @@ void icvCalcOrientation( ...@@ -868,7 +691,7 @@ void icvCalcOrientation(
const int grad_wav_size = 2 * round(2.0f * s); const int grad_wav_size = 2 * round(2.0f * s);
// check when grad_wav_size is too big // check when grad_wav_size is too big
if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size) if ((img_rows + 1) < grad_wav_size || (img_cols + 1) < grad_wav_size)
return; return;
// Calc X, Y, angle and store it to shared memory // Calc X, Y, angle and store it to shared memory
...@@ -880,8 +703,8 @@ void icvCalcOrientation( ...@@ -880,8 +703,8 @@ void icvCalcOrientation(
float ratio = (float)grad_wav_size / 4; float ratio = (float)grad_wav_size / 4;
int r2 = round(ratio * 2.0); int r2 = round(ratio * 2.0f);
int r4 = round(ratio * 4.0); int r4 = round(ratio * 4.0f);
for (int i = tid; i < ORI_SAMPLES; i += ORI_LOCAL_SIZE ) for (int i = tid; i < ORI_SAMPLES; i += ORI_LOCAL_SIZE )
{ {
float X = 0.0f, Y = 0.0f, angle = 0.0f; float X = 0.0f, Y = 0.0f, angle = 0.0f;
...@@ -889,21 +712,20 @@ void icvCalcOrientation( ...@@ -889,21 +712,20 @@ void icvCalcOrientation(
const int x = round(featureX[get_group_id(0)] + c_aptX[i] * s - margin); const int x = round(featureX[get_group_id(0)] + c_aptX[i] * s - margin);
const int y = round(featureY[get_group_id(0)] + c_aptY[i] * s - margin); const int y = round(featureY[get_group_id(0)] + c_aptY[i] * s - margin);
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size && if (y >= 0 && y < (img_rows + 1) - grad_wav_size &&
x >= 0 && x < (c_img_cols + 1) - grad_wav_size) x >= 0 && x < (img_cols + 1) - grad_wav_size)
{ {
float apt = c_aptW[i]; float apt = c_aptW[i];
// Compute the haar sum without fetching duplicate pixels. // Compute the haar sum without fetching duplicate pixels.
float t00 = read_sumTex( sumTex, sampler, (int2)(x, y), c_img_rows, c_img_cols, sum_step); float t00 = read_sumTex( (int2)(x, y));
float t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sum_step); float t02 = read_sumTex( (int2)(x, y + r2));
float t04 = read_sumTex( sumTex, sampler, (int2)(x, y + r4), c_img_rows, c_img_cols, sum_step); float t04 = read_sumTex( (int2)(x, y + r4));
float t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sum_step); float t20 = read_sumTex( (int2)(x + r2, y));
float t24 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r4), c_img_rows, c_img_cols, sum_step); float t24 = read_sumTex( (int2)(x + r2, y + r4));
float t40 = read_sumTex( sumTex, sampler, (int2)(x + r4, y), c_img_rows, c_img_cols, sum_step); float t40 = read_sumTex( (int2)(x + r4, y));
float t42 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r2), c_img_rows, c_img_cols, sum_step); float t42 = read_sumTex( (int2)(x + r4, y + r2));
float t44 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sum_step); float t44 = read_sumTex( (int2)(x + r4, y + r4));
F t = t00 - t04 - t20 + t24; F t = t00 - t04 - t20 + t24;
X -= t / ((r2) * (r4)); X -= t / ((r2) * (r4));
...@@ -995,18 +817,17 @@ void icvCalcOrientation( ...@@ -995,18 +817,17 @@ void icvCalcOrientation(
} }
__kernel __kernel
void icvSetUpright( void SURF_setUpRight(
__global float * keypoints, __global float * keypoints,
int keypoints_step, int keypoints_step, int keypoints_offset,
int nFeatures int rows, int cols )
)
{ {
int i = get_global_id(0);
keypoints_step /= sizeof(*keypoints); keypoints_step /= sizeof(*keypoints);
__global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
if(get_global_id(0) <= nFeatures) if(i < cols)
{ {
featureDir[get_global_id(0)] = 270.0f; keypoints[mad24(keypoints_step, ANGLE_ROW, i)] = 270.f;
} }
} }
...@@ -1045,22 +866,14 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] = ...@@ -1045,22 +866,14 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
}; };
// utility for linear filter // utility for linear filter
inline uchar readerGet( #define readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, i, j) \
IMAGE_INT8 src, read_imgTex((float2)(centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir, \
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir))
int i, int j, int rows, int cols, int elemPerRow
)
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow);
}
inline float linearFilter( inline float linearFilter(
IMAGE_INT8 src, __PARAM_imgTex__, int img_rows, int img_cols,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, float centerX, float centerY, float win_offset,
float y, float x, int rows, int cols, int elemPerRow float cos_dir, float sin_dir, float y, float x )
)
{ {
x -= 0.5f; x -= 0.5f;
y -= 0.5f; y -= 0.5f;
...@@ -1072,34 +885,31 @@ inline float linearFilter( ...@@ -1072,34 +885,31 @@ inline float linearFilter(
const int x2 = x1 + 1; const int x2 = x1 + 1;
const int y2 = y1 + 1; const int y2 = y1 + 1;
uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow); uchar src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1);
out = out + src_reg * ((x2 - x) * (y2 - y)); out = out + src_reg * ((x2 - x) * (y2 - y));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow); src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2);
out = out + src_reg * ((x - x1) * (y2 - y)); out = out + src_reg * ((x - x1) * (y2 - y));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow); src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1);
out = out + src_reg * ((x2 - x) * (y - y1)); out = out + src_reg * ((x2 - x) * (y - y1));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow); src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2);
out = out + src_reg * ((x - x1) * (y - y1)); out = out + src_reg * ((x - x1) * (y - y1));
return out; return out;
} }
void calc_dx_dy( void calc_dx_dy(
IMAGE_INT8 imgTex, __PARAM_imgTex__,
int img_rows, int img_cols,
volatile __local float *s_dx_bin, volatile __local float *s_dx_bin,
volatile __local float *s_dy_bin, volatile __local float *s_dy_bin,
volatile __local float *s_PATCH, volatile __local float *s_PATCH,
__global const float* featureX, __global const float* featureX,
__global const float* featureY, __global const float* featureY,
__global const float* featureSize, __global const float* featureSize,
__global const float* featureDir, __global const float* featureDir )
int rows,
int cols,
int elemPerRow
)
{ {
const float centerX = featureX[get_group_id(0)]; const float centerX = featureX[get_group_id(0)];
const float centerY = featureY[get_group_id(0)]; const float centerY = featureY[get_group_id(0)];
...@@ -1136,7 +946,9 @@ void calc_dx_dy( ...@@ -1136,7 +946,9 @@ void calc_dx_dy(
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size; const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size; const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
s_PATCH[get_local_id(1) * 6 + get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow); s_PATCH[get_local_id(1) * 6 + get_local_id(0)] =
linearFilter(__PASS_imgTex__, img_rows, img_cols, centerX, centerY,
win_offset, cos_dir, sin_dir, icoo, jcoo);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -1162,6 +974,7 @@ void calc_dx_dy( ...@@ -1162,6 +974,7 @@ void calc_dx_dy(
s_dy_bin[tid] = vy; s_dy_bin[tid] = vy;
} }
} }
void reduce_sum25( void reduce_sum25(
volatile __local float* sdata1, volatile __local float* sdata1,
volatile __local float* sdata2, volatile __local float* sdata2,
...@@ -1225,16 +1038,13 @@ void reduce_sum25( ...@@ -1225,16 +1038,13 @@ void reduce_sum25(
} }
__kernel __kernel
void compute_descriptors64( void SURF_computeDescriptors64(
IMAGE_INT8 imgTex, __PARAM_imgTex__,
int img_rows, int img_cols,
__global const float* keypoints,
int keypoints_step, int keypoints_offset,
__global float * descriptors, __global float * descriptors,
__global const float * keypoints, int descriptors_step, int descriptors_offset)
int descriptors_step,
int keypoints_step,
int rows,
int cols,
int img_step
)
{ {
descriptors_step /= sizeof(float); descriptors_step /= sizeof(float);
keypoints_step /= sizeof(float); keypoints_step /= sizeof(float);
...@@ -1250,7 +1060,7 @@ void compute_descriptors64( ...@@ -1250,7 +1060,7 @@ void compute_descriptors64(
volatile __local float sdyabs[25]; volatile __local float sdyabs[25];
volatile __local float s_PATCH[6*6]; volatile __local float s_PATCH[6*6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); calc_dx_dy(__PASS_imgTex__, img_rows, img_cols, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
...@@ -1279,17 +1089,15 @@ void compute_descriptors64( ...@@ -1279,17 +1089,15 @@ void compute_descriptors64(
} }
} }
} }
__kernel __kernel
void compute_descriptors128( void SURF_computeDescriptors128(
IMAGE_INT8 imgTex, __PARAM_imgTex__,
__global float * descriptors, int img_rows, int img_cols,
__global float * keypoints, __global const float* keypoints,
int descriptors_step, int keypoints_step, int keypoints_offset,
int keypoints_step, __global float* descriptors,
int rows, int descriptors_step, int descriptors_offset)
int cols,
int img_step
)
{ {
descriptors_step /= sizeof(*descriptors); descriptors_step /= sizeof(*descriptors);
keypoints_step /= sizeof(*keypoints); keypoints_step /= sizeof(*keypoints);
...@@ -1310,7 +1118,7 @@ void compute_descriptors128( ...@@ -1310,7 +1118,7 @@ void compute_descriptors128(
volatile __local float sdabs2[25]; volatile __local float sdabs2[25];
volatile __local float s_PATCH[6*6]; volatile __local float s_PATCH[6*6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step); calc_dx_dy(__PASS_imgTex__, img_rows, img_cols, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
...@@ -1483,7 +1291,7 @@ void reduce_sum64(volatile __local float* smem, int tid) ...@@ -1483,7 +1291,7 @@ void reduce_sum64(volatile __local float* smem, int tid)
} }
__kernel __kernel
void normalize_descriptors128(__global float * descriptors, int descriptors_step) void SURF_normalizeDescriptors128(__global float * descriptors, int descriptors_step, int descriptors_offset)
{ {
descriptors_step /= sizeof(*descriptors); descriptors_step /= sizeof(*descriptors);
// no need for thread ID // no need for thread ID
...@@ -1509,8 +1317,9 @@ void normalize_descriptors128(__global float * descriptors, int descriptors_step ...@@ -1509,8 +1317,9 @@ void normalize_descriptors128(__global float * descriptors, int descriptors_step
// normalize and store in output // normalize and store in output
descriptor_base[get_local_id(0)] = lookup / len; descriptor_base[get_local_id(0)] = lookup / len;
} }
__kernel __kernel
void normalize_descriptors64(__global float * descriptors, int descriptors_step) void SURF_normalizeDescriptors64(__global float * descriptors, int descriptors_step, int descriptors_offset)
{ {
descriptors_step /= sizeof(*descriptors); descriptors_step /= sizeof(*descriptors);
// no need for thread ID // no need for thread ID
......
...@@ -60,11 +60,6 @@ ...@@ -60,11 +60,6 @@
# include "opencv2/cudaarithm.hpp" # include "opencv2/cudaarithm.hpp"
#endif #endif
#ifdef HAVE_OPENCV_OCL
# include "opencv2/nonfree/ocl.hpp"
# include "opencv2/ocl/private/util.hpp"
#endif
#include "opencv2/core/private.hpp" #include "opencv2/core/private.hpp"
#endif #endif
...@@ -108,6 +108,7 @@ Modifications by Ian Mahon ...@@ -108,6 +108,7 @@ Modifications by Ian Mahon
*/ */
#include "precomp.hpp" #include "precomp.hpp"
#include "surf.hpp"
namespace cv namespace cv
{ {
...@@ -897,11 +898,42 @@ void SURF::operator()(InputArray _img, InputArray _mask, ...@@ -897,11 +898,42 @@ void SURF::operator()(InputArray _img, InputArray _mask,
OutputArray _descriptors, OutputArray _descriptors,
bool useProvidedKeypoints) const bool useProvidedKeypoints) const
{ {
Mat img = _img.getMat(), mask = _mask.getMat(), mask1, sum, msum; int imgtype = _img.type(), imgcn = CV_MAT_CN(imgtype);
bool doDescriptors = _descriptors.needed(); bool doDescriptors = _descriptors.needed();
CV_Assert(!img.empty() && img.depth() == CV_8U); CV_Assert(!_img.empty() && CV_MAT_DEPTH(imgtype) == CV_8U && (imgcn == 1 || imgcn == 3 || imgcn == 4));
if( img.channels() > 1 ) CV_Assert(_descriptors.needed() || !useProvidedKeypoints);
if( ocl::useOpenCL() )
{
SURF_OCL ocl_surf;
UMat gpu_kpt;
bool ok = ocl_surf.init(this);
if( ok )
{
if( !_descriptors.needed() )
{
ok = ocl_surf.detect(_img, _mask, gpu_kpt);
}
else
{
if(useProvidedKeypoints)
ocl_surf.uploadKeypoints(keypoints, gpu_kpt);
ok = ocl_surf.detectAndCompute(_img, _mask, gpu_kpt, _descriptors, useProvidedKeypoints);
}
}
if( ok )
{
if(!useProvidedKeypoints)
ocl_surf.downloadKeypoints(gpu_kpt, keypoints);
return;
}
}
Mat img = _img.getMat(), mask = _mask.getMat(), mask1, sum, msum;
if( imgcn > 1 )
cvtColor(img, img, COLOR_BGR2GRAY); cvtColor(img, img, COLOR_BGR2GRAY);
CV_Assert(mask.empty() || (mask.type() == CV_8U && mask.size() == img.size())); CV_Assert(mask.empty() || (mask.type() == CV_8U && mask.size() == img.size()));
......
///////////// see LICENSE.txt in the OpenCV root directory //////////////
#ifndef __OPENCV_NONFREE_SURF_HPP__
#define __OPENCV_NONFREE_SURF_HPP__
namespace cv
{
//! Speeded up robust features, port from CUDA module.
////////////////////////////////// SURF //////////////////////////////////////////
class SURF_OCL
{
public:
enum KeypointLayout
{
X_ROW = 0,
Y_ROW,
LAPLACIAN_ROW,
OCTAVE_ROW,
SIZE_ROW,
ANGLE_ROW,
HESSIAN_ROW,
ROWS_COUNT
};
//! the full constructor taking all the necessary parameters
SURF_OCL();
bool init(const SURF* params);
//! returns the descriptor size in float's (64 or 128)
int descriptorSize() const { return params->extended ? 128 : 64; }
void uploadKeypoints(const std::vector<KeyPoint> &keypoints, UMat &keypointsGPU);
void downloadKeypoints(const UMat &keypointsGPU, std::vector<KeyPoint> &keypoints);
//! finds the keypoints using fast hessian detector used in SURF
//! supports CV_8UC1 images
//! keypoints will have nFeature cols and 6 rows
//! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
//! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
//! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
//! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
//! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
//! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
//! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
bool detect(InputArray img, InputArray mask, UMat& keypoints);
//! finds the keypoints and computes their descriptors.
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
bool detectAndCompute(InputArray img, InputArray mask, UMat& keypoints,
OutputArray descriptors, bool useProvidedKeypoints = false);
protected:
bool setImage(InputArray img, InputArray mask);
// kernel callers declarations
bool calcLayerDetAndTrace(int octave, int layer_rows);
bool findMaximaInLayer(int counterOffset, int octave, int layer_rows, int layer_cols);
bool interpolateKeypoint(int maxCounter, UMat &keypoints, int octave, int layer_rows, int maxFeatures);
bool calcOrientation(UMat &keypoints);
bool setUpRight(UMat &keypoints);
bool computeDescriptors(const UMat &keypoints, OutputArray descriptors);
bool detectKeypoints(UMat &keypoints);
const SURF* params;
int refcount;
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
UMat sum, intBuffer;
UMat det, trace;
UMat maxPosBuffer;
int img_cols, img_rows;
int maxCandidates;
int maxFeatures;
UMat img, counters;
// texture buffers
ocl::Image2D imgTex, sumTex;
bool haveImageSupport;
String kerOpts;
int status;
};
/*
template<typename _Tp> void copyVectorToUMat(const std::vector<_Tp>& v, UMat& um)
{
if(v.empty())
um.release();
else
Mat(1, (int)(v.size()*sizeof(v[0])), CV_8U, (void*)&v[0]).copyTo(um);
}
template<typename _Tp> void copyUMatToVector(const UMat& um, std::vector<_Tp>& v)
{
if(um.empty())
v.clear();
else
{
size_t sz = um.total()*um.elemSize();
CV_Assert(um.isContinuous() && (sz % sizeof(_Tp) == 0));
v.resize(sz/sizeof(_Tp));
Mat m(um.size(), um.type(), &v[0]);
um.copyTo(m);
}
}*/
}
#endif
...@@ -43,42 +43,16 @@ ...@@ -43,42 +43,16 @@
// //
//M*/ //M*/
#include "precomp.hpp" #include "precomp.hpp"
#include "surf.hpp"
#ifdef HAVE_OPENCV_OCL
#include <cstdio> #include <cstdio>
#include <sstream> #include <sstream>
#include "opencl_kernels.hpp" #include "opencl_kernels.hpp"
using namespace cv;
using namespace cv::ocl;
static ProgramEntry surfprog = cv::ocl::nonfree::surf;
namespace cv namespace cv
{ {
namespace ocl
{
// The number of degrees between orientation samples in calcOrientation
const static int ORI_SEARCH_INC = 5;
// The local size of the calcOrientation kernel
const static int ORI_LOCAL_SIZE = (360 / ORI_SEARCH_INC);
static void openCLExecuteKernelSURF(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], enum { ORI_SEARCH_INC=5, ORI_LOCAL_SIZE=(360 / ORI_SEARCH_INC) };
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
{
std::stringstream optsStr;
optsStr << "-D ORI_LOCAL_SIZE=" << ORI_LOCAL_SIZE << " ";
optsStr << "-D ORI_SEARCH_INC=" << ORI_SEARCH_INC << " ";
cl_kernel kernel;
kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optsStr.str().c_str());
size_t wave_size = queryWaveFrontSize(kernel);
CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS);
optsStr << "-D WAVE_SIZE=" << wave_size;
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optsStr.str().c_str());
}
}
}
static inline int calcSize(int octave, int layer) static inline int calcSize(int octave, int layer)
{ {
...@@ -96,223 +70,208 @@ static inline int calcSize(int octave, int layer) ...@@ -96,223 +70,208 @@ static inline int calcSize(int octave, int layer)
} }
class SURF_OCL_Invoker SURF_OCL::SURF_OCL()
{ {
public: img_cols = img_rows = maxCandidates = maxFeatures = 0;
// facilities haveImageSupport = false;
void bindImgTex(const oclMat &img, cl_mem &texture); status = -1;
}
//void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
//void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
// kernel callers declarations
void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows);
void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols);
void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter, bool SURF_OCL::init(const SURF* p)
oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures); {
params = p;
if(status < 0)
{
status = 0;
if(ocl::haveOpenCL())
{
const ocl::Device& dev = ocl::Device::getDefault();
if( dev.type() == ocl::Device::TYPE_CPU || dev.doubleFPConfig() == 0 )
return false;
haveImageSupport = false;//dev.imageSupport();
kerOpts = haveImageSupport ? "-D HAVE_IMAGE2D -D DOUBLE_SUPPORT" : "";
status = 1;
}
}
return status > 0;
}
void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);
void icvSetUpright_gpu(const oclMat &keypoints, int nFeatures); bool SURF_OCL::setImage(InputArray _img, InputArray _mask)
{
if( status <= 0 )
return false;
if( !_mask.empty())
return false;
int imgtype = _img.type();
CV_Assert(!_img.empty());
CV_Assert(params && params->nOctaves > 0 && params->nOctaveLayers > 0);
int min_size = calcSize(params->nOctaves - 1, 0);
Size sz = _img.size();
img_cols = sz.width;
img_rows = sz.height;
CV_Assert(img_rows >= min_size && img_cols >= min_size);
const int layer_rows = img_rows >> (params->nOctaves - 1);
const int layer_cols = img_cols >> (params->nOctaves - 1);
const int min_margin = ((calcSize((params->nOctaves - 1), 2) >> 1) >> (params->nOctaves - 1)) + 1;
CV_Assert(layer_rows - 2 * min_margin > 0);
CV_Assert(layer_cols - 2 * min_margin > 0);
maxFeatures = std::min(static_cast<int>(img_cols*img_rows * 0.01f), 65535);
maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535);
CV_Assert(maxFeatures > 0);
counters.create(1, params->nOctaves + 1, CV_32SC1);
counters.setTo(Scalar::all(0));
img.release();
if(_img.isUMat() && imgtype == CV_8UC1)
img = _img.getUMat();
else if( imgtype == CV_8UC1 )
_img.copyTo(img);
else
cvtColor(_img, img, COLOR_BGR2GRAY);
void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures); integral(img, sum);
// end of kernel callers declarations
SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) : if(haveImageSupport)
surf_(surf),
img_cols(img.cols), img_rows(img.rows),
use_mask(!mask.empty()), counters(oclMat()),
imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img)
{ {
CV_Assert(!img.empty() && img.type() == CV_8UC1); imgTex = ocl::Image2D(img);
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1)); sumTex = ocl::Image2D(sum);
CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0); }
const int min_size = calcSize(surf_.nOctaves - 1, 0); return true;
CV_Assert(img_rows - min_size >= 0); }
CV_Assert(img_cols - min_size >= 0);
const int layer_rows = img_rows >> (surf_.nOctaves - 1);
const int layer_cols = img_cols >> (surf_.nOctaves - 1);
const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
CV_Assert(layer_rows - 2 * min_margin > 0);
CV_Assert(layer_cols - 2 * min_margin > 0);
maxFeatures = std::min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535); bool SURF_OCL::detectKeypoints(UMat &keypoints)
maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535); {
// create image pyramid buffers
// different layers have same sized buffers, but they are sampled from Gaussian kernel.
det.create(img_rows * (params->nOctaveLayers + 2), img_cols, CV_32F);
trace.create(img_rows * (params->nOctaveLayers + 2), img_cols, CV_32FC1);
CV_Assert(maxFeatures > 0); maxPosBuffer.create(1, maxCandidates, CV_32SC4);
keypoints.create(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32F);
keypoints.setTo(Scalar::all(0));
Mat cpuCounters;
counters.create(1, surf_.nOctaves + 1, CV_32SC1); for (int octave = 0; octave < params->nOctaves; ++octave)
counters.setTo(Scalar::all(0)); {
const int layer_rows = img_rows >> octave;
const int layer_cols = img_cols >> octave;
integral(img, surf_.sum); if(!calcLayerDetAndTrace(octave, layer_rows))
return false;
bindImgTex(img, imgTex); if(!findMaximaInLayer(1 + octave, octave, layer_rows, layer_cols))
bindImgTex(surf_.sum, sumTex); return false;
finish();
maskSumTex = 0; cpuCounters = counters.getMat(ACCESS_READ);
int maxCounter = cpuCounters.at<int>(1 + octave);
maxCounter = std::min(maxCounter, maxCandidates);
cpuCounters.release();
if (use_mask) if (maxCounter > 0)
{ {
CV_Error(Error::StsBadFunc, "Masked SURF detector is not implemented yet"); if(!interpolateKeypoint(maxCounter, keypoints, octave, layer_rows, maxFeatures))
//!FIXME return false;
// temp fix for missing min overload
//oclMat temp(mask.size(), mask.type());
//temp.setTo(Scalar::all(1.0));
////cv::ocl::min(mask, temp, surf_.mask1); ///////// disable this
//integral(surf_.mask1, surf_.maskSum);
//bindImgTex(surf_.maskSum, maskSumTex);
} }
} }
void detectKeypoints(oclMat &keypoints) cpuCounters = counters.getMat(ACCESS_READ);
{ int featureCounter = cpuCounters.at<int>(0);
// create image pyramid buffers featureCounter = std::min(featureCounter, maxFeatures);
// different layers have same sized buffers, but they are sampled from Gaussian kernel. cpuCounters.release();
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
ensureSizeIsEnough(1, maxCandidates, CV_32SC4, surf_.maxPosBuffer);
ensureSizeIsEnough(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32FC1, keypoints);
keypoints.setTo(Scalar::all(0));
for (int octave = 0; octave < surf_.nOctaves; ++octave)
{
const int layer_rows = img_rows >> octave;
const int layer_cols = img_cols >> octave;
//loadOctaveConstants(octave, layer_rows, layer_cols); keypoints = UMat(keypoints, Rect(0, 0, featureCounter, keypoints.rows));
icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, octave, surf_.nOctaveLayers, layer_rows); if (params->upright)
return setUpRight(keypoints);
icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave, else
octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols); return calcOrientation(keypoints);
}
int maxCounter = ((Mat)counters).at<int>(1 + octave);
maxCounter = std::min(maxCounter, static_cast<int>(maxCandidates));
if (maxCounter > 0)
{
icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer, maxCounter,
keypoints, counters, octave, layer_rows, maxFeatures);
}
}
int featureCounter = Mat(counters).at<int>(0);
featureCounter = std::min(featureCounter, static_cast<int>(maxFeatures));
keypoints.cols = featureCounter; bool SURF_OCL::setUpRight(UMat &keypoints)
{
int nFeatures = keypoints.cols;
if( nFeatures == 0 )
return true;
if (surf_.upright) size_t globalThreads[3] = {nFeatures, 1};
{ ocl::Kernel kerUpRight("SURF_setUpRight", ocl::nonfree::surf_oclsrc, kerOpts);
//keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0)); return kerUpRight.args(ocl::KernelArg::ReadWrite(keypoints)).run(2, globalThreads, 0, true);
setUpright(keypoints); }
}
else
{
findOrientation(keypoints);
}
}
void setUpright(oclMat &keypoints) bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptors)
{
int dsize = params->descriptorSize();
int nFeatures = keypoints.cols;
if (nFeatures == 0)
{ {
const int nFeatures = keypoints.cols; _descriptors.release();
if(nFeatures > 0) return true;
{
icvSetUpright_gpu(keypoints, keypoints.cols);
}
} }
_descriptors.create(nFeatures, dsize, CV_32F);
UMat descriptors;
if( _descriptors.isUMat() )
descriptors = _descriptors.getUMat();
else
descriptors.create(nFeatures, dsize, CV_32F);
void findOrientation(oclMat &keypoints) ocl::Kernel kerCalcDesc, kerNormDesc;
{
const int nFeatures = keypoints.cols;
if (nFeatures > 0)
{
icvCalcOrientation_gpu(keypoints, nFeatures);
}
}
void computeDescriptors(const oclMat &keypoints, oclMat &descriptors, int descriptorSize) if( dsize == 64 )
{ {
const int nFeatures = keypoints.cols; kerCalcDesc.create("SURF_computeDescriptors64", ocl::nonfree::surf_oclsrc, kerOpts);
if (nFeatures > 0) kerNormDesc.create("SURF_normalizeDescriptors64", ocl::nonfree::surf_oclsrc, kerOpts);
{
ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
compute_descriptors_gpu(descriptors, keypoints, nFeatures);
}
} }
else
~SURF_OCL_Invoker()
{ {
if(imgTex) CV_Assert(dsize == 128);
openCLFree(imgTex); kerCalcDesc.create("SURF_computeDescriptors128", ocl::nonfree::surf_oclsrc, kerOpts);
if(sumTex) kerNormDesc.create("SURF_normalizeDescriptors128", ocl::nonfree::surf_oclsrc, kerOpts);
openCLFree(sumTex);
if(maskSumTex)
openCLFree(maskSumTex);
} }
private: size_t localThreads[] = {6, 6};
SURF_OCL &surf_; size_t globalThreads[] = {nFeatures*localThreads[0], localThreads[1]};
int img_cols, img_rows;
bool use_mask; if(haveImageSupport)
int maxCandidates;
int maxFeatures;
oclMat counters;
// texture buffers
cl_mem imgTex;
cl_mem sumTex;
cl_mem maskSumTex;
const oclMat _img; // make a copy for non-image2d_t supported platform
SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right)
{ {
(*this) = right; kerCalcDesc.args(imgTex,
return *this; img_rows, img_cols,
} // remove warning C4512 ocl::KernelArg::ReadOnlyNoSize(keypoints),
}; ocl::KernelArg::WriteOnlyNoSize(descriptors));
}
else
{
kerCalcDesc.args(ocl::KernelArg::ReadOnlyNoSize(img),
img_rows, img_cols,
ocl::KernelArg::ReadOnlyNoSize(keypoints),
ocl::KernelArg::WriteOnlyNoSize(descriptors));
}
cv::ocl::SURF_OCL::SURF_OCL() if(!kerCalcDesc.run(2, globalThreads, localThreads, true))
{ return false;
hessianThreshold = 100.0f;
extended = true;
nOctaves = 4;
nOctaveLayers = 2;
keypointsRatio = 0.01f;
upright = false;
}
cv::ocl::SURF_OCL::SURF_OCL(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright) size_t localThreads_n[] = {dsize, 1};
{ size_t globalThreads_n[] = {nFeatures*localThreads_n[0], localThreads_n[1]};
hessianThreshold = saturate_cast<float>(_threshold);
extended = _extended;
nOctaves = _nOctaves;
nOctaveLayers = _nOctaveLayers;
keypointsRatio = _keypointsRatio;
upright = _upright;
}
int cv::ocl::SURF_OCL::descriptorSize() const globalThreads[0] = nFeatures * localThreads[0];
{ globalThreads[1] = localThreads[1];
return extended ? 128 : 64; bool ok = kerNormDesc.args(ocl::KernelArg::ReadWriteNoSize(descriptors)).
run(2, globalThreads_n, localThreads_n, true);
if(ok && !_descriptors.isUMat())
descriptors.copyTo(_descriptors);
return ok;
} }
int cv::ocl::SURF_OCL::defaultNorm() const
{
return NORM_L2;
}
void cv::ocl::SURF_OCL::uploadKeypoints(const std::vector<KeyPoint> &keypoints, oclMat &keypointsGPU) void SURF_OCL::uploadKeypoints(const std::vector<KeyPoint> &keypoints, UMat &keypointsGPU)
{ {
if (keypoints.empty()) if (keypoints.empty())
keypointsGPU.release(); keypointsGPU.release();
...@@ -340,11 +299,11 @@ void cv::ocl::SURF_OCL::uploadKeypoints(const std::vector<KeyPoint> &keypoints, ...@@ -340,11 +299,11 @@ void cv::ocl::SURF_OCL::uploadKeypoints(const std::vector<KeyPoint> &keypoints,
kp_laplacian[i] = 1; kp_laplacian[i] = 1;
} }
keypointsGPU.upload(keypointsCPU); keypointsCPU.copyTo(keypointsGPU);
} }
} }
void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, std::vector<KeyPoint> &keypoints) void SURF_OCL::downloadKeypoints(const UMat &keypointsGPU, std::vector<KeyPoint> &keypoints)
{ {
const int nFeatures = keypointsGPU.cols; const int nFeatures = keypointsGPU.cols;
...@@ -354,8 +313,7 @@ void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, std::vecto ...@@ -354,8 +313,7 @@ void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, std::vecto
{ {
CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT); CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT);
Mat keypointsCPU(keypointsGPU); Mat keypointsCPU = keypointsGPU.getMat(ACCESS_READ);
keypoints.resize(nFeatures); keypoints.resize(nFeatures);
float *kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW); float *kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW);
...@@ -380,354 +338,122 @@ void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, std::vecto ...@@ -380,354 +338,122 @@ void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, std::vecto
} }
} }
void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat &descriptorsGPU, std::vector<float> &descriptors) bool SURF_OCL::detect(InputArray _img, InputArray _mask, UMat& keypoints)
{
if (descriptorsGPU.empty())
descriptors.clear();
else
{
CV_Assert(descriptorsGPU.type() == CV_32F);
descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
descriptorsGPU.download(descriptorsCPU);
}
}
void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints)
{
if (!img.empty())
{
SURF_OCL_Invoker surf(*this, img, mask);
surf.detectKeypoints(keypoints);
}
}
void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
bool useProvidedKeypoints)
{
if (!img.empty())
{
SURF_OCL_Invoker surf(*this, img, mask);
if (!useProvidedKeypoints)
surf.detectKeypoints(keypoints);
else if (!upright)
{
surf.findOrientation(keypoints);
}
surf.computeDescriptors(keypoints, descriptors, descriptorSize());
}
}
void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints)
{ {
oclMat keypointsGPU; if( !setImage(_img, _mask) )
return false;
(*this)(img, mask, keypointsGPU);
downloadKeypoints(keypointsGPU, keypoints); return detectKeypoints(keypoints);
} }
void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints,
oclMat &descriptors, bool useProvidedKeypoints)
{
oclMat keypointsGPU;
if (useProvidedKeypoints)
uploadKeypoints(keypoints, keypointsGPU);
(*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints); bool SURF_OCL::detectAndCompute(InputArray _img, InputArray _mask, UMat& keypoints,
OutputArray _descriptors, bool useProvidedKeypoints )
downloadKeypoints(keypointsGPU, keypoints);
}
void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints,
std::vector<float> &descriptors, bool useProvidedKeypoints)
{ {
oclMat descriptorsGPU; if( !setImage(_img, _mask) )
return false;
(*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints); if( !useProvidedKeypoints && !detectKeypoints(keypoints) )
return false;
downloadDescriptors(descriptorsGPU, descriptors); return computeDescriptors(keypoints, _descriptors);
}
void cv::ocl::SURF_OCL::releaseMemory()
{
sum.release();
mask1.release();
maskSum.release();
intBuffer.release();
det.release();
trace.release();
maxPosBuffer.release();
} }
inline int divUp(int a, int b) { return (a + b-1)/b; }
// bind source buffer to image oject.
void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture)
{
if(texture)
{
openCLFree(texture);
}
texture = bindTexture(img);
}
//////////////////////////// ////////////////////////////
// kernel caller definitions // kernel caller definitions
void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int c_layer_rows) bool SURF_OCL::calcLayerDetAndTrace(int octave, int c_layer_rows)
{ {
int nOctaveLayers = params->nOctaveLayers;
const int min_size = calcSize(octave, 0); const int min_size = calcSize(octave, 0);
const int max_samples_i = 1 + ((img_rows - min_size) >> octave); const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
const int max_samples_j = 1 + ((img_cols - min_size) >> octave); const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
Context *clCxt = det.clCxt; size_t localThreads[] = {16, 16};
String kernelName = "icvCalcLayerDetAndTrace"; size_t globalThreads[] =
std::vector< std::pair<size_t, const void *> > args; {
divUp(max_samples_j, (int)localThreads[0]) * localThreads[0],
if(sumTex) divUp(max_samples_i, (int)localThreads[1]) * localThreads[1] * (nOctaveLayers + 2)
};
ocl::Kernel kerCalcDetTrace("SURF_calcLayerDetAndTrace", ocl::nonfree::surf_oclsrc, kerOpts);
if(haveImageSupport)
{ {
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex)); kerCalcDetTrace.args(sumTex,
img_rows, img_cols, nOctaveLayers,
octave, c_layer_rows,
ocl::KernelArg::WriteOnlyNoSize(det),
ocl::KernelArg::WriteOnlyNoSize(trace));
} }
else else
{ {
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported kerCalcDetTrace.args(ocl::KernelArg::ReadOnlyNoSize(sum),
img_rows, img_cols, nOctaveLayers,
octave, c_layer_rows,
ocl::KernelArg::WriteOnlyNoSize(det),
ocl::KernelArg::WriteOnlyNoSize(trace));
} }
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data)); return kerCalcDetTrace.run(2, globalThreads, localThreads, true);
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&trace.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&c_layer_rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3] =
{
divUp(max_samples_j, localThreads[0]) *localThreads[0],
divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
1
};
openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
} }
void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset, bool SURF_OCL::findMaximaInLayer(int counterOffset, int octave,
int octave, bool useMask, int nLayers, int layer_rows, int layer_cols) int layer_rows, int layer_cols)
{ {
const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1; const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
int nOctaveLayers = params->nOctaveLayers;
Context *clCxt = det.clCxt; size_t localThreads[3] = {16, 16};
String kernelName = use_mask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer"; size_t globalThreads[3] =
std::vector< std::pair<size_t, const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxCounter.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&counterOffset));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&trace.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&nLayers));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&maxCandidates));
args.push_back( std::make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));
if(useMask)
{ {
if(maskSumTex) divUp(layer_cols - 2 * min_margin, (int)localThreads[0] - 2) * localThreads[0],
{ divUp(layer_rows - 2 * min_margin, (int)localThreads[1] - 2) * nOctaveLayers * localThreads[1]
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maskSumTex)); };
}
else
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
}
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step));
}
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0],
divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1],
1
};
openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1); ocl::Kernel kerFindMaxima("SURF_findMaximaInLayer", ocl::nonfree::surf_oclsrc, kerOpts);
return kerFindMaxima.args(ocl::KernelArg::ReadOnlyNoSize(det),
ocl::KernelArg::ReadOnlyNoSize(trace),
ocl::KernelArg::PtrReadWrite(maxPosBuffer),
ocl::KernelArg::PtrReadWrite(counters),
counterOffset, img_rows, img_cols,
octave, nOctaveLayers,
layer_rows, layer_cols,
maxCandidates,
(float)params->hessianThreshold).run(2, globalThreads, localThreads, true);
} }
void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter, bool SURF_OCL::interpolateKeypoint(int maxCounter, UMat &keypoints, int octave, int layer_rows, int max_features)
oclMat &keypoints, oclMat &counters_, int octave, int layer_rows, int max_features)
{ {
Context *clCxt = det.clCxt;
String kernelName = "icvInterpolateKeypoint";
std::vector< std::pair<size_t, const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counters_.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&max_features));
size_t localThreads[3] = {3, 3, 3}; size_t localThreads[3] = {3, 3, 3};
size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1}; size_t globalThreads[3] = {maxCounter*localThreads[0], localThreads[1], 3};
openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
}
void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures) ocl::Kernel kerInterp("SURF_interpolateKeypoint", ocl::nonfree::surf_oclsrc, kerOpts);
{
Context *clCxt = counters.clCxt;
String kernelName = "icvCalcOrientation";
std::vector< std::pair<size_t, const void *> > args;
if(sumTex) return kerInterp.args(ocl::KernelArg::ReadOnlyNoSize(det),
{ ocl::KernelArg::PtrReadOnly(maxPosBuffer),
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex)); ocl::KernelArg::ReadWriteNoSize(keypoints),
} ocl::KernelArg::PtrReadWrite(counters),
else img_rows, img_cols, octave, layer_rows, max_features).
{ run(3, globalThreads, localThreads, true);
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
}
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
size_t localThreads[3] = {ORI_LOCAL_SIZE, 1, 1};
size_t globalThreads[3] = {nFeatures * localThreads[0], 1, 1};
openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
} }
void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures) bool SURF_OCL::calcOrientation(UMat &keypoints)
{ {
Context *clCxt = counters.clCxt; int nFeatures = keypoints.cols;
String kernelName = "icvSetUpright"; if( nFeatures == 0 )
return true;
std::vector< std::pair<size_t, const void *> > args; ocl::Kernel kerOri("SURF_calcOrientation", ocl::nonfree::surf_oclsrc, kerOpts);
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data)); if( haveImageSupport )
args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step)); kerOri.args(sumTex, img_rows, img_cols,
args.push_back( std::make_pair( sizeof(cl_int), (void *)&nFeatures)); ocl::KernelArg::ReadWriteNoSize(keypoints));
size_t localThreads[3] = {256, 1, 1};
size_t globalThreads[3] = {saturate_cast<size_t>(nFeatures), 1, 1};
openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
}
void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
Context *clCxt = descriptors.clCxt;
String kernelName;
std::vector< std::pair<size_t, const void *> > args;
size_t localThreads[3] = {1, 1, 1};
size_t globalThreads[3] = {1, 1, 1};
if(descriptors.cols == 64)
{
kernelName = "compute_descriptors64";
localThreads[0] = 6;
localThreads[1] = 6;
globalThreads[0] = nFeatures * localThreads[0];
globalThreads[1] = 16 * localThreads[1];
args.clear();
if(imgTex)
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
}
else
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
}
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
kernelName = "normalize_descriptors64";
localThreads[0] = 64;
localThreads[1] = 1;
globalThreads[0] = nFeatures * localThreads[0];
globalThreads[1] = localThreads[1];
args.clear();
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
}
else else
{ kerOri.args(ocl::KernelArg::ReadOnlyNoSize(sum),
kernelName = "compute_descriptors128"; img_rows, img_cols,
ocl::KernelArg::ReadWriteNoSize(keypoints));
localThreads[0] = 6;
localThreads[1] = 6;
globalThreads[0] = nFeatures * localThreads[0];
globalThreads[1] = 16 * localThreads[1];
args.clear(); size_t localThreads[3] = {ORI_LOCAL_SIZE, 1};
if(imgTex) size_t globalThreads[3] = {nFeatures * localThreads[0], 1};
{ return kerOri.run(2, globalThreads, localThreads, true);
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
}
else
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
}
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
kernelName = "normalize_descriptors128";
localThreads[0] = 128;
localThreads[1] = 1;
globalThreads[0] = nFeatures * localThreads[0];
globalThreads[1] = localThreads[1];
args.clear();
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
}
} }
#endif //HAVE_OPENCV_OCL }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment