1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
// OpenCL port of the ORB feature detector and descriptor extractor
// Copyright (C) 2014, Itseez Inc. See the license at http://opencv.org
//
// The original code has been contributed by Peter Andreas Entschev, peter@entschev.com
#define LAYERINFO_SIZE 1
#define LAYERINFO_OFS 0
#define KEYPOINT_SIZE 3
#define ORIENTED_KEYPOINT_SIZE 4
#define KEYPOINT_X 0
#define KEYPOINT_Y 1
#define KEYPOINT_Z 2
#define KEYPOINT_ANGLE 3
/////////////////////////////////////////////////////////////
#ifdef ORB_RESPONSES
__kernel void
ORB_HarrisResponses(__global const uchar* imgbuf, int imgstep, int imgoffset0,
__global const int* layerinfo, __global const int* keypoints,
__global float* responses, int nkeypoints )
{
int idx = get_global_id(0);
if( idx < nkeypoints )
{
__global const int* kpt = keypoints + idx*KEYPOINT_SIZE;
__global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;
__global const uchar* img = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +
(kpt[KEYPOINT_Y] - blockSize/2)*imgstep + (kpt[KEYPOINT_X] - blockSize/2);
int i, j;
int a = 0, b = 0, c = 0;
for( i = 0; i < blockSize; i++, img += imgstep-blockSize )
{
for( j = 0; j < blockSize; j++, img++ )
{
int Ix = (img[1] - img[-1])*2 + img[-imgstep+1] - img[-imgstep-1] + img[imgstep+1] - img[imgstep-1];
int Iy = (img[imgstep] - img[-imgstep])*2 + img[imgstep-1] - img[-imgstep-1] + img[imgstep+1] - img[-imgstep+1];
a += Ix*Ix;
b += Iy*Iy;
c += Ix*Iy;
}
}
responses[idx] = ((float)a * b - (float)c * c - HARRIS_K * (float)(a + b) * (a + b))*scale_sq_sq;
}
}
#endif
/////////////////////////////////////////////////////////////
#ifdef ORB_ANGLES
#define _DBL_EPSILON 2.2204460492503131e-16f
#define atan2_p1 (0.9997878412794807f*57.29577951308232f)
#define atan2_p3 (-0.3258083974640975f*57.29577951308232f)
#define atan2_p5 (0.1555786518463281f*57.29577951308232f)
#define atan2_p7 (-0.04432655554792128f*57.29577951308232f)
inline float fastAtan2( float y, float x )
{
float ax = fabs(x), ay = fabs(y);
float a, c, c2;
if( ax >= ay )
{
c = ay/(ax + _DBL_EPSILON);
c2 = c*c;
a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
}
else
{
c = ax/(ay + _DBL_EPSILON);
c2 = c*c;
a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
}
if( x < 0 )
a = 180.f - a;
if( y < 0 )
a = 360.f - a;
return a;
}
__kernel void
ORB_ICAngle(__global const uchar* imgbuf, int imgstep, int imgoffset0,
__global const int* layerinfo, __global const int* keypoints,
__global float* responses, const __global int* u_max,
int nkeypoints, int half_k )
{
int idx = get_global_id(0);
if( idx < nkeypoints )
{
__global const int* kpt = keypoints + idx*KEYPOINT_SIZE;
__global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;
__global const uchar* center = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +
kpt[KEYPOINT_Y]*imgstep + kpt[KEYPOINT_X];
int u, v, m_01 = 0, m_10 = 0;
// Treat the center line differently, v=0
for( u = -half_k; u <= half_k; u++ )
m_10 += u * center[u];
// Go line by line in the circular patch
for( v = 1; v <= half_k; v++ )
{
// Proceed over the two lines
int v_sum = 0;
int d = u_max[v];
for( u = -d; u <= d; u++ )
{
int val_plus = center[u + v*imgstep], val_minus = center[u - v*imgstep];
v_sum += (val_plus - val_minus);
m_10 += u * (val_plus + val_minus);
}
m_01 += v * v_sum;
}
// we do not use OpenCL's atan2 intrinsic,
// because we want to get _exactly_ the same results as the CPU version
responses[idx] = fastAtan2((float)m_01, (float)m_10);
}
}
#endif
/////////////////////////////////////////////////////////////
#ifdef ORB_DESCRIPTORS
__kernel void
ORB_computeDescriptor(__global const uchar* imgbuf, int imgstep, int imgoffset0,
__global const int* layerinfo, __global const int* keypoints,
__global uchar* _desc, const __global int* pattern,
int nkeypoints, int dsize )
{
int idx = get_global_id(0);
if( idx < nkeypoints )
{
int i;
__global const int* kpt = keypoints + idx*ORIENTED_KEYPOINT_SIZE;
__global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;
__global const uchar* center = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +
kpt[KEYPOINT_Y]*imgstep + kpt[KEYPOINT_X];
float angle = as_float(kpt[KEYPOINT_ANGLE]);
angle *= 0.01745329251994329547f;
float cosa;
float sina = sincos(angle, &cosa);
__global uchar* desc = _desc + idx*dsize;
#define GET_VALUE(idx) \
center[mad24(convert_int_rte(pattern[(idx)*2] * sina + pattern[(idx)*2+1] * cosa), imgstep, \
convert_int_rte(pattern[(idx)*2] * cosa - pattern[(idx)*2+1] * sina))]
for( i = 0; i < dsize; i++ )
{
int val;
#if WTA_K == 2
int t0, t1;
t0 = GET_VALUE(0); t1 = GET_VALUE(1);
val = t0 < t1;
t0 = GET_VALUE(2); t1 = GET_VALUE(3);
val |= (t0 < t1) << 1;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
val |= (t0 < t1) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7);
val |= (t0 < t1) << 3;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
val |= (t0 < t1) << 4;
t0 = GET_VALUE(10); t1 = GET_VALUE(11);
val |= (t0 < t1) << 5;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
val |= (t0 < t1) << 6;
t0 = GET_VALUE(14); t1 = GET_VALUE(15);
val |= (t0 < t1) << 7;
pattern += 16*2;
#elif WTA_K == 3
int t0, t1, t2;
t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);
t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;
t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;
pattern += 12*2;
#elif WTA_K == 4
int t0, t1, t2, t3, k;
int a, b;
t0 = GET_VALUE(0); t1 = GET_VALUE(1);
t2 = GET_VALUE(2); t3 = GET_VALUE(3);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val = k;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
t2 = GET_VALUE(6); t3 = GET_VALUE(7);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 2;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
t2 = GET_VALUE(10); t3 = GET_VALUE(11);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 4;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
t2 = GET_VALUE(14); t3 = GET_VALUE(15);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 6;
pattern += 16*2;
#else
#error "unknown/undefined WTA_K value; should be 2, 3 or 4"
#endif
desc[i] = (uchar)val;
}
}
}
#endif