Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
ea433cc4
Commit
ea433cc4
authored
Feb 26, 2013
by
yao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
remove the device specific logic
parent
620c6994
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
360 additions
and
236 deletions
+360
-236
nonfree_surf.cl
modules/ocl/src/kernels/nonfree_surf.cl
+195
-133
surf.cpp
modules/ocl/src/surf.cpp
+165
-103
No files found.
modules/ocl/src/kernels/nonfree_surf.cl
View file @
ea433cc4
...
...
@@ -43,10 +43,39 @@
//
//M*/
#
pragma
OPENCL
EXTENSION
cl_amd_printf
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_local_int32_base_atomics
:
enable
//
specialized
for
non-image2d_t
supported
platform,
intel
HD4000,
for
example
#
ifdef
DISABLE_IMAGE2D
#
define
IMAGE_INT32
__global
uint
*
#
define
IMAGE_INT8
__global
uchar
*
#
else
#
define
IMAGE_INT32
image2d_t
#
define
IMAGE_INT8
image2d_t
#
endif
uint
read_sumTex
(
IMAGE_INT32
img,
sampler_t
sam,
int2
coord,
int
rows,
int
cols,
int
elemPerRow
)
{
#
ifdef
DISABLE_IMAGE2D
int
x
=
clamp
(
coord.x,
0
,
cols
)
;
int
y
=
clamp
(
coord.y,
0
,
rows
)
;
return
img[elemPerRow
*
y
+
x]
;
#
else
return
read_imageui
(
img,
sam,
coord
)
.
x
;
#
endif
}
uchar
read_imgTex
(
IMAGE_INT8
img,
sampler_t
sam,
float2
coord,
int
rows,
int
cols,
int
elemPerRow
)
{
#
ifdef
DISABLE_IMAGE2D
int
x
=
clamp
(
convert_int_rte
(
coord.x
)
,
0
,
cols
-
1
)
;
int
y
=
clamp
(
convert_int_rte
(
coord.y
)
,
0
,
rows
-
1
)
;
return
img[elemPerRow
*
y
+
x]
;
#
else
return
(
uchar
)
read_imageui
(
img,
sam,
coord
)
.
x
;
#
endif
}
//
dynamically
change
the
precision
used
for
floating
type
#
if
defined
(
__ATI__
)
|
| defined (__NVIDIA__)
...
...
@@ -58,14 +87,24 @@
// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP_TO_EDGE
| CLK_FILTER_NEAREST;
#ifndef FLT_EPSILON
#define FLT_EPSILON (1e-15)
#
define CV_PI_F 3.14159265
f
#
endi
f
#ifndef CV_PI_F
#define CV_PI_F 3.14159265f
#endif
// Use integral image to calculate haar wavelets.
// N = 2
// for simple haar paatern
float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int oldSize, int newSize, int y, int x)
float icvCalcHaarPatternSum_2(
IMAGE_INT32 sumTex,
__constant float src[2][5],
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
...
...
@@ -81,11 +120,10 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0;
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
...
...
@@ -93,7 +131,13 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int
}
// N = 3
float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int oldSize, int newSize, int y, int x)
float icvCalcHaarPatternSum_3(
IMAGE_INT32 sumTex,
__constant float src[2][5],
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
...
...
@@ -109,11 +153,10 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0;
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
...
...
@@ -121,7 +164,13 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int
}
// N = 4
float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int oldSize, int newSize, int y, int x)
float icvCalcHaarPatternSum_4(
IMAGE_INT32 sumTex,
__constant float src[2][5],
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
...
...
@@ -137,11 +186,10 @@ float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int
int dy2 = convert_int_rte(ratio * src[k][3]);
F t = 0;
t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
...
...
@@ -172,7 +220,7 @@ __inline int calcSize(int octave, int layer)
//calculate targeted layer per-pixel determinant and trace with an integral image
__kernel void icvCalcLayerDetAndTrace(
image2d_t
sumTex, // input integral image
IMAGE_INT32
sumTex, // input integral image
__global float * det, // output Determinant
__global float * trace, // output trace
int det_step, // the step of det in bytes
...
...
@@ -181,11 +229,13 @@ __kernel void icvCalcLayerDetAndTrace(
int c_img_cols,
int c_nOctaveLayers,
int c_octave,
int c_layer_rows
int c_layer_rows,
int sumTex_step
)
{
det_step /= sizeof(*det);
trace_step /= sizeof(*trace);
sumTex_step/= sizeof(uint);
// Determine the indices
const int gridDim_y = get_num_groups(1) / (c_nOctaveLayers + 2);
const int blockIdx_y = get_group_id(1) % gridDim_y;
...
...
@@ -205,12 +255,12 @@ __kernel void icvCalcLayerDetAndTrace(
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
{
const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave);
const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave);
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);
const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave
, c_img_rows, c_img_cols, sumTex_step
);
const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave
, c_img_rows, c_img_cols, sumTex_step
);
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave
, c_img_rows, c_img_cols, sumTex_step
);
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
}
}
...
...
@@ -220,7 +270,7 @@ __kernel void icvCalcLayerDetAndTrace(
__constant float c_DM[5] = {0, 0, 9, 9, 1};
bool within_check(
image2d_t maskSumTex, int sum_i, int sum_j, int size
)
bool within_check(
IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step
)
{
float ratio = (float)size / 9.0f;
...
...
@@ -233,10 +283,10 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
float t = 0;
t += read_
imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1)).x
;
t -= read_
imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2)).x
;
t -= read_
imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1)).x
;
t += read_
imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2)).x
;
t += read_
sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step)
;
t -= read_
sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step)
;
t -= read_
sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step)
;
t += read_
sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step)
;
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
...
...
@@ -246,9 +296,9 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
// Non-maximal suppression to further filtering the candidates from previous step
__kernel
void icvFindMaximaInLayer_withmask(
__global const float * det,
__global const float * trace,
__global int4 * maxPosBuffer,
__global const float * det,
__global const float * trace,
__global int4 * maxPosBuffer,
volatile __global unsigned int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
...
...
@@ -261,7 +311,8 @@ __kernel
int c_layer_cols,
int c_max_candidates,
float c_hessianThreshold,
image2d_t maskSumTex
IMAGE_INT32 maskSumTex,
int mask_step
)
{
volatile __local float N9[768]; // threads.x * threads.y * 3
...
...
@@ -269,6 +320,7 @@ __kernel
det_step /= sizeof(*det);
trace_step /= sizeof(*trace);
maxCounter += counter_offset;
mask_step /= sizeof(uint);
// Determine the indices
const int gridDim_y = get_num_groups(1) / c_nOctaveLayers;
...
...
@@ -288,26 +340,26 @@ __kernel
// Is this thread within the hessian buffer?
const int zoff = get_local_size(0) * get_local_size(1);
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
N9[localLin - zoff] =
det[det_step *
N9[localLin - zoff] =
det[det_step *
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin ] =
det[det_step *
N9[localLin ] =
det[det_step *
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin + zoff] =
det[det_step *
N9[localLin + zoff] =
det[det_step *
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
barrier(CLK_LOCAL_MEM_FENCE);
if (i < c_layer_rows - margin
if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
&& get_local_id(0) > 0
&& get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
&& get_local_id(1) > 0
&& get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
...
...
@@ -321,7 +373,7 @@ __kernel
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (within_check(maskSumTex, sum_i, sum_j, size))
if (within_check(maskSumTex, sum_i, sum_j, size
, c_img_rows, c_img_cols, mask_step
))
{
// Check to see if we have a max (in its 26 neighbours)
const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
...
...
@@ -372,9 +424,9 @@ __kernel
__kernel
void icvFindMaximaInLayer(
__global float * det,
__global float * trace,
__global int4 * maxPosBuffer,
__global float * det,
__global float * trace,
__global int4 * maxPosBuffer,
volatile __global unsigned int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
...
...
@@ -417,19 +469,19 @@ __kernel
int l_x = min(max(j, 0), c_img_cols - 1);
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
N9[localLin - zoff] =
N9[localLin - zoff] =
det[det_step * (l_y - c_layer_rows) + l_x];
N9[localLin ] =
N9[localLin ] =
det[det_step * (l_y ) + l_x];
N9[localLin + zoff] =
N9[localLin + zoff] =
det[det_step * (l_y + c_layer_rows) + l_x];
barrier(CLK_LOCAL_MEM_FENCE);
if (i < c_layer_rows - margin
if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
&& get_local_id(0) > 0
&& get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
&& get_local_id(1) > 0
&& get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
...
...
@@ -497,17 +549,17 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
{
F invdet = 1.0 / det;
x[0] = invdet *
x[0] = invdet *
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] ));
x[1] = invdet *
x[1] = invdet *
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0]));
x[2] = invdet *
x[2] = invdet *
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
...
...
@@ -528,9 +580,9 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
////////////////////////////////////////////////////////////////////////
// INTERPOLATION
__kernel
__kernel
void icvInterpolateKeypoint(
__global const float * det,
__global const float * det,
__global const int4 * maxPosBuffer,
__global float * keypoints,
volatile __global unsigned int * featureCounter,
...
...
@@ -560,7 +612,7 @@ __kernel
volatile __local float N9[3][3][3];
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
det[det_step * (c_layer_rows * layer + i) + j];
barrier(CLK_LOCAL_MEM_FENCE);
...
...
@@ -658,27 +710,27 @@ __kernel
__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
0.001707611023448408f, 0.001455130288377404f};
...
...
@@ -691,27 +743,29 @@ void reduce_32_sum(volatile __local float * data, float partial_reduction, int
data[tid] = partial_reduction;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
if (tid < 16)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
}
#undef op
}
__kernel
void icvCalcOrientation(
image2d_t
sumTex,
IMAGE_INT32
sumTex,
__global float * keypoints,
int keypoints_step,
int c_img_rows,
int c_img_cols
int c_img_cols,
int sum_step
)
{
keypoints_step /= sizeof(*keypoints);
sum_step /= sizeof(uint);
__global float* featureX = keypoints + X_ROW * keypoints_step;
__global float* featureY = keypoints + Y_ROW * keypoints_step;
__global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
...
...
@@ -754,8 +808,8 @@ __kernel
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
{
X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x);
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x);
X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x
, c_img_rows, c_img_cols, sum_step
);
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x
, c_img_rows, c_img_cols, sum_step
);
angle = atan2(Y, X);
...
...
@@ -881,20 +935,20 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
// utility for linear filter
inline uchar readerGet(
image2d_t src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
int i, int j
IMAGE_INT8 src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
int i, int j
, int rows, int cols, int elemPerRow
)
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
return
(uchar)read_imageui(src, sampler, (float2)(pixel_x, pixel_y)).x
;
return
read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow)
;
}
inline float linearFilter(
image2d_t src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
float y, float x
IMAGE_INT8 src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
float y, float x
, int rows, int cols, int elemPerRow
)
{
x -= 0.5f;
...
...
@@ -907,30 +961,33 @@ inline float linearFilter(
const int x2 = x1 + 1;
const int y2 = y1 + 1;
uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1);
uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1
, rows, cols, elemPerRow
);
out = out + src_reg * ((x2 - x) * (y2 - y));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2);
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2
, rows, cols, elemPerRow
);
out = out + src_reg * ((x - x1) * (y2 - y));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1);
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1
, rows, cols, elemPerRow
);
out = out + src_reg * ((x2 - x) * (y - y1));
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2);
src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2
, rows, cols, elemPerRow
);
out = out + src_reg * ((x - x1) * (y - y1));
return out;
}
void calc_dx_dy(
image2d_t
imgTex,
IMAGE_INT8
imgTex,
volatile __local float s_dx_bin[25],
volatile __local float s_dy_bin[25],
volatile __local float s_PATCH[6][6],
__global const float* featureX,
__global const float* featureY,
__global const float* featureSize,
__global const float* featureDir
__global const float* featureX,
__global const float* featureY,
__global const float* featureSize,
__global const float* featureDir,
int rows,
int cols,
int elemPerRow
)
{
const float centerX = featureX[get_group_id(0)];
...
...
@@ -965,7 +1022,7 @@ void calc_dx_dy(
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo);
s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo
, rows, cols, elemPerRow
);
barrier(CLK_LOCAL_MEM_FENCE);
...
...
@@ -976,26 +1033,26 @@ void calc_dx_dy(
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = (
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
* dw;
const float vy = (
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
* dw;
s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy;
}
}
void reduce_sum25(
volatile __local float* sdata1,
volatile __local float* sdata2,
volatile __local float* sdata3,
volatile __local float* sdata4,
volatile __local float* sdata1,
volatile __local float* sdata2,
volatile __local float* sdata3,
volatile __local float* sdata4,
int tid
)
{
...
...
@@ -1033,18 +1090,20 @@ void reduce_sum25(
}
}
__kernel
__kernel
void compute_descriptors64(
image2d_t
imgTex,
volatile __global float * descriptors,
IMAGE_INT8
imgTex,
volatile __global float * descriptors,
__global const float * keypoints,
int descriptors_step,
int keypoints_step
int keypoints_step,
int rows,
int cols,
int img_step
)
{
descriptors_step /= sizeof(float);
keypoints_step /= sizeof(float);
__global const float * featureX = keypoints + X_ROW * keypoints_step;
__global const float * featureY = keypoints + Y_ROW * keypoints_step;
__global const float * featureSize = keypoints + SIZE_ROW * keypoints_step;
...
...
@@ -1057,7 +1116,7 @@ __kernel
volatile __local float sdyabs[25];
volatile __local float s_PATCH[6][6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir
, rows, cols, img_step
);
barrier(CLK_LOCAL_MEM_FENCE);
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
...
...
@@ -1066,10 +1125,10 @@ __kernel
{
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
sdyabs[tid] = fabs(sdy[tid]); // |dy| array
barrier(CLK_LOCAL_MEM_FENCE);
//
barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
barrier(CLK_LOCAL_MEM_FENCE);
//
barrier(CLK_LOCAL_MEM_FENCE);
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
...
...
@@ -1083,13 +1142,16 @@ __kernel
}
}
}
__kernel
__kernel
void compute_descriptors128(
image2d_t
imgTex,
__global volatile float * descriptors,
IMAGE_INT8
imgTex,
__global volatile float * descriptors,
__global float * keypoints,
int descriptors_step,
int keypoints_step
int keypoints_step,
int rows,
int cols,
int img_step
)
{
descriptors_step /= sizeof(*descriptors);
...
...
@@ -1111,7 +1173,7 @@ __kernel
volatile __local float sdabs2[25];
volatile __local float s_PATCH[6][6];
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir
, rows, cols, img_step
);
barrier(CLK_LOCAL_MEM_FENCE);
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
...
...
@@ -1132,10 +1194,10 @@ __kernel
sd2[tid] = sdx[tid];
sdabs2[tid] = fabs(sdx[tid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
//
barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
barrier(CLK_LOCAL_MEM_FENCE);
//
barrier(CLK_LOCAL_MEM_FENCE);
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
...
...
@@ -1162,10 +1224,10 @@ __kernel
sd2[tid] = sdy[tid];
sdabs2[tid] = fabs(sdy[tid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
//
barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
barrier(CLK_LOCAL_MEM_FENCE);
//
barrier(CLK_LOCAL_MEM_FENCE);
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy
|
(
dx
<
0
)
if
(
tid
==
0
)
...
...
@@ -1178,7 +1240,7 @@ __kernel
}
}
__kernel
__kernel
void
normalize_descriptors128
(
__global
float
*
descriptors,
int
descriptors_step
)
{
descriptors_step
/=
sizeof
(
*descriptors
)
;
...
...
@@ -1219,7 +1281,7 @@ __kernel
//
normalize
and
store
in
output
descriptor_base[get_local_id
(
0
)
]
=
lookup
/
len
;
}
__kernel
__kernel
void
normalize_descriptors64
(
__global
float
*
descriptors,
int
descriptors_step
)
{
descriptors_step
/=
sizeof
(
*descriptors
)
;
...
...
modules/ocl/src/surf.cpp
View file @
ea433cc4
/*M///////////////////////////////////////////////////////////////////////////////////////
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
...
...
@@ -44,6 +44,7 @@
//M*/
#include <iomanip>
#include "precomp.hpp"
#include "mcwutil.hpp"
//#include "opencv2/highgui/highgui.hpp"
using
namespace
cv
;
...
...
@@ -71,7 +72,7 @@ static inline int calcSize(int octave, int layer)
/* Wavelet size increment between layers. This should be an even number,
such that the wavelet sizes in an octave are either all even or all odd.
This ensures that when looking for the neighbo
u
rs of a sample, the layers
This ensures that when looking for the neighbors of a sample, the layers
above and below are aligned correctly. */
const
int
HAAR_SIZE_INC
=
6
;
...
...
@@ -79,6 +80,41 @@ static inline int calcSize(int octave, int layer)
return
(
HAAR_SIZE0
+
HAAR_SIZE_INC
*
layer
)
<<
octave
;
}
namespace
{
const
char
*
noImage2dOption
=
"-D DISABLE_IMAGE2D"
;
// default kernel name can be any kernel in nonfree_surf.cl
bool
support_image2d
(
const
char
*
kernel_name
=
"icvCalcLayerDetAndTrace"
)
{
static
bool
_isTested
=
false
;
static
bool
_support
=
false
;
if
(
_isTested
)
{
return
_support
;
}
try
{
cv
::
ocl
::
openCLGetKernelFromSource
(
Context
::
getContext
(),
&
nonfree_surf
,
kernel_name
);
_support
=
true
;
}
catch
(
cv
::
Exception
&
e
)
{
if
(
e
.
code
==
-
217
)
{
_support
=
false
;
}
else
{
// throw e once again
cv
::
error
(
e
);
}
}
_isTested
=
true
;
return
_support
;
}
}
class
SURF_OCL_Invoker
{
public
:
...
...
@@ -88,7 +124,7 @@ public:
//void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
//void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
// kernel callers decl
e
arations
// kernel callers declarations
void
icvCalcLayerDetAndTrace_gpu
(
oclMat
&
det
,
oclMat
&
trace
,
int
octave
,
int
nOctaveLayers
,
int
layer_rows
);
void
icvFindMaximaInLayer_gpu
(
const
oclMat
&
det
,
const
oclMat
&
trace
,
oclMat
&
maxPosBuffer
,
oclMat
&
maxCounter
,
int
counterOffset
,
...
...
@@ -100,14 +136,14 @@ public:
void
icvCalcOrientation_gpu
(
const
oclMat
&
keypoints
,
int
nFeatures
);
void
compute_descriptors_gpu
(
const
oclMat
&
descriptors
,
const
oclMat
&
keypoints
,
int
nFeatures
);
// end of kernel callers decl
e
arations
// end of kernel callers declarations
SURF_OCL_Invoker
(
SURF_OCL
&
surf
,
const
oclMat
&
img
,
const
oclMat
&
mask
)
:
surf_
(
surf
),
img_cols
(
img
.
cols
),
img_rows
(
img
.
rows
),
use_mask
(
!
mask
.
empty
()),
imgTex
(
NULL
),
sumTex
(
NULL
),
maskSumTex
(
NULL
)
use_mask
(
!
mask
.
empty
()),
counters
(
oclMat
()),
imgTex
(
NULL
),
sumTex
(
NULL
),
maskSumTex
(
NULL
)
,
_img
(
img
)
{
CV_Assert
(
!
img
.
empty
()
&&
img
.
type
()
==
CV_8UC1
);
CV_Assert
(
mask
.
empty
()
||
(
mask
.
size
()
==
img
.
size
()
&&
mask
.
type
()
==
CV_8UC1
));
...
...
@@ -131,12 +167,13 @@ public:
counters
.
create
(
1
,
surf_
.
nOctaves
+
1
,
CV_32SC1
);
counters
.
setTo
(
Scalar
::
all
(
0
));
//loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
bindImgTex
(
img
,
imgTex
);
integral
(
img
,
surf_
.
sum
);
// the two argumented integral version is incorrect
integral
(
img
,
surf_
.
sum
);
if
(
support_image2d
())
{
bindImgTex
(
img
,
imgTex
);
bindImgTex
(
surf_
.
sum
,
sumTex
);
}
bindImgTex
(
surf_
.
sum
,
sumTex
);
maskSumTex
=
0
;
if
(
use_mask
)
...
...
@@ -155,7 +192,7 @@ public:
void
detectKeypoints
(
oclMat
&
keypoints
)
{
// create image pyramid buffers
// different layers have same sized buffers, but they are sampled from
gaussi
n kernel.
// different layers have same sized buffers, but they are sampled from
Gaussia
n kernel.
ensureSizeIsEnough
(
img_rows
*
(
surf_
.
nOctaveLayers
+
2
),
img_cols
,
CV_32FC1
,
surf_
.
det
);
ensureSizeIsEnough
(
img_rows
*
(
surf_
.
nOctaveLayers
+
2
),
img_cols
,
CV_32FC1
,
surf_
.
trace
);
...
...
@@ -222,7 +259,6 @@ public:
openCLFree
(
sumTex
);
if
(
maskSumTex
)
openCLFree
(
maskSumTex
);
additioalParamBuffer
.
release
();
}
private
:
...
...
@@ -236,13 +272,13 @@ private:
int
maxFeatures
;
oclMat
counters
;
// texture buffers
cl_mem
imgTex
;
cl_mem
sumTex
;
cl_mem
maskSumTex
;
oclMat
additioalParamBuffer
;
const
oclMat
_img
;
// make a copy for non-image2d_t supported platform
SURF_OCL_Invoker
&
operator
=
(
const
SURF_OCL_Invoker
&
right
)
{
...
...
@@ -362,11 +398,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat
{
if
(
!
img
.
empty
())
{
if
(
img
.
clCxt
->
impl
->
devName
.
find
(
"Intel(R) HD Graphics"
)
!=
string
::
npos
)
{
cout
<<
" Intel HD GPU device unsupported "
<<
endl
;
return
;
}
SURF_OCL_Invoker
surf
(
*
this
,
img
,
mask
);
surf
.
detectKeypoints
(
keypoints
);
...
...
@@ -378,11 +409,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat
{
if
(
!
img
.
empty
())
{
if
(
img
.
clCxt
->
impl
->
devName
.
find
(
"Intel(R) HD Graphics"
)
!=
string
::
npos
)
{
cout
<<
" Intel HD GPU device unsupported "
<<
endl
;
return
;
}
SURF_OCL_Invoker
surf
(
*
this
,
img
,
mask
);
if
(
!
useProvidedKeypoints
)
...
...
@@ -443,74 +469,11 @@ void cv::ocl::SURF_OCL::releaseMemory()
// bind source buffer to image oject.
void
SURF_OCL_Invoker
::
bindImgTex
(
const
oclMat
&
img
,
cl_mem
&
texture
)
{
cl_image_format
format
;
int
err
;
int
depth
=
img
.
depth
();
int
channels
=
img
.
channels
();
switch
(
depth
)
{
case
CV_8U
:
format
.
image_channel_data_type
=
CL_UNSIGNED_INT8
;
break
;
case
CV_32S
:
format
.
image_channel_data_type
=
CL_UNSIGNED_INT32
;
break
;
case
CV_32F
:
format
.
image_channel_data_type
=
CL_FLOAT
;
break
;
default
:
throw
std
::
exception
();
break
;
}
switch
(
channels
)
{
case
1
:
format
.
image_channel_order
=
CL_R
;
break
;
case
3
:
format
.
image_channel_order
=
CL_RGB
;
break
;
case
4
:
format
.
image_channel_order
=
CL_RGBA
;
break
;
default
:
throw
std
::
exception
();
break
;
}
if
(
texture
)
{
openCLFree
(
texture
);
}
#ifdef CL_VERSION_1_2
cl_image_desc
desc
;
desc
.
image_type
=
CL_MEM_OBJECT_IMAGE2D
;
desc
.
image_width
=
img
.
step
/
img
.
elemSize
();
desc
.
image_height
=
img
.
rows
;
desc
.
image_depth
=
0
;
desc
.
image_array_size
=
1
;
desc
.
image_row_pitch
=
0
;
desc
.
image_slice_pitch
=
0
;
desc
.
buffer
=
NULL
;
desc
.
num_mip_levels
=
0
;
desc
.
num_samples
=
0
;
texture
=
clCreateImage
(
Context
::
getContext
()
->
impl
->
clContext
,
CL_MEM_READ_WRITE
,
&
format
,
&
desc
,
NULL
,
&
err
);
#else
texture
=
clCreateImage2D
(
Context
::
getContext
()
->
impl
->
clContext
,
CL_MEM_READ_WRITE
,
&
format
,
img
.
step
/
img
.
elemSize
(),
img
.
rows
,
0
,
NULL
,
&
err
);
#endif
size_t
origin
[]
=
{
0
,
0
,
0
};
size_t
region
[]
=
{
img
.
step
/
img
.
elemSize
(),
img
.
rows
,
1
};
clEnqueueCopyBufferToImage
(
img
.
clCxt
->
impl
->
clCmdQueue
,
(
cl_mem
)
img
.
data
,
texture
,
0
,
origin
,
region
,
0
,
NULL
,
0
);
openCLSafeCall
(
err
);
texture
=
bindTexture
(
img
);
}
////////////////////////////
...
...
@@ -525,7 +488,14 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
string
kernelName
=
"icvCalcLayerDetAndTrace"
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
sumTex
));
if
(
sumTex
)
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
sumTex
));
}
else
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
surf_
.
sum
.
data
));
// if image2d is not supported
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
det
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
trace
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
det
.
step
));
...
...
@@ -535,6 +505,7 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
nOctaveLayers
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
octave
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
c_layer_rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
surf_
.
sum
.
step
));
size_t
localThreads
[
3
]
=
{
16
,
16
,
1
};
size_t
globalThreads
[
3
]
=
...
...
@@ -543,7 +514,14 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
divUp
(
max_samples_i
,
localThreads
[
1
])
*
localThreads
[
1
]
*
(
nOctaveLayers
+
2
),
1
};
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
if
(
support_image2d
())
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
else
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
noImage2dOption
);
}
}
void
SURF_OCL_Invoker
::
icvFindMaximaInLayer_gpu
(
const
oclMat
&
det
,
const
oclMat
&
trace
,
oclMat
&
maxPosBuffer
,
oclMat
&
maxCounter
,
int
counterOffset
,
...
...
@@ -573,16 +551,30 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
if
(
use_mask
)
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
maskSumTex
));
if
(
maskSumTex
)
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
maskSumTex
));
}
else
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
surf_
.
maskSum
.
data
));
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
surf_
.
maskSum
.
step
));
}
size_t
localThreads
[
3
]
=
{
16
,
16
,
1
};
size_t
globalThreads
[
3
]
=
{
divUp
(
layer_cols
-
2
*
min_margin
,
localThreads
[
0
]
-
2
)
*
localThreads
[
0
],
divUp
(
layer_rows
-
2
*
min_margin
,
localThreads
[
1
]
-
2
)
*
nLayers
*
localThreads
[
1
],
1
};
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
if
(
support_image2d
())
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
else
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
noImage2dOption
);
}
}
void
SURF_OCL_Invoker
::
icvInterpolateKeypoint_gpu
(
const
oclMat
&
det
,
const
oclMat
&
maxPosBuffer
,
unsigned
int
maxCounter
,
...
...
@@ -607,7 +599,14 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
size_t
localThreads
[
3
]
=
{
3
,
3
,
3
};
size_t
globalThreads
[
3
]
=
{
maxCounter
*
localThreads
[
0
],
localThreads
[
1
],
1
};
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
if
(
support_image2d
())
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
else
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
noImage2dOption
);
}
}
void
SURF_OCL_Invoker
::
icvCalcOrientation_gpu
(
const
oclMat
&
keypoints
,
int
nFeatures
)
...
...
@@ -617,16 +616,31 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
sumTex
));
if
(
sumTex
)
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
sumTex
));
}
else
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
surf_
.
sum
.
data
));
// if image2d is not supported
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
keypoints
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
keypoints
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
img_rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
img_cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
surf_
.
sum
.
step
));
size_t
localThreads
[
3
]
=
{
32
,
4
,
1
};
size_t
globalThreads
[
3
]
=
{
nFeatures
*
localThreads
[
0
],
localThreads
[
1
],
1
};
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
if
(
support_image2d
())
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
else
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
noImage2dOption
);
}
}
void
SURF_OCL_Invoker
::
compute_descriptors_gpu
(
const
oclMat
&
descriptors
,
const
oclMat
&
keypoints
,
int
nFeatures
)
...
...
@@ -649,12 +663,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
globalThreads
[
1
]
=
16
*
localThreads
[
1
];
args
.
clear
();
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
imgTex
));
if
(
imgTex
)
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
imgTex
));
}
else
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
_img
.
data
));
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
descriptors
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
keypoints
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
descriptors
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
keypoints
.
step
));
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
_img
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
_img
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
_img
.
step
));
if
(
support_image2d
())
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
else
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
noImage2dOption
);
}
kernelName
=
"normalize_descriptors64"
;
...
...
@@ -667,7 +698,14 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args
.
clear
();
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
descriptors
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
descriptors
.
step
));
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
if
(
support_image2d
())
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
else
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
noImage2dOption
);
}
}
else
{
...
...
@@ -680,12 +718,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
globalThreads
[
1
]
=
16
*
localThreads
[
1
];
args
.
clear
();
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
imgTex
));
if
(
imgTex
)
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
imgTex
));
}
else
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
_img
.
data
));
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
descriptors
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
keypoints
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
descriptors
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
keypoints
.
step
));
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
_img
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
_img
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
_img
.
step
));
if
(
support_image2d
())
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
else
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
noImage2dOption
);
}
kernelName
=
"normalize_descriptors128"
;
...
...
@@ -698,7 +753,14 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args
.
clear
();
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
descriptors
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
descriptors
.
step
));
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
if
(
support_image2d
())
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
else
{
openCLExecuteKernel
(
clCxt
,
&
nonfree_surf
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
noImage2dOption
);
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment