Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
56d62118
Commit
56d62118
authored
Mar 29, 2013
by
Andrey Kamaev
Committed by
OpenCV Buildbot
Mar 29, 2013
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #707 from pengx17:2.4_surf
parents
b5dd26e4
f2ecf4f9
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
263 additions
and
61 deletions
+263
-61
surf.cl
modules/nonfree/src/opencl/surf.cl
+175
-51
surf.ocl.cpp
modules/nonfree/src/surf.ocl.cpp
+33
-10
util.hpp
modules/ocl/include/opencv2/ocl/private/util.hpp
+10
-0
initialization.cpp
modules/ocl/src/initialization.cpp
+45
-0
No files found.
modules/nonfree/src/opencl/surf.cl
View file @
56d62118
...
...
@@ -747,21 +747,42 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc
#
define
op
(
A,
B
)
(
*A
)
+
(
B
)
data[tid]
=
*partial_reduction
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
ifndef
WAVE_SIZE
#
define
WAVE_SIZE
1
#
endif
if
(
tid
<
16
)
{
data[tid]
=
*partial_reduction
=
op
(
partial_reduction,
data[tid
+
16]
)
;
#
if
WAVE_SIZE
<
16
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
8
)
{
#
endif
data[tid]
=
*partial_reduction
=
op
(
partial_reduction,
data[tid
+
8
]
)
;
#
if
WAVE_SIZE
<
8
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
4
)
{
#
endif
data[tid]
=
*partial_reduction
=
op
(
partial_reduction,
data[tid
+
4
]
)
;
#
if
WAVE_SIZE
<
4
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
2
)
{
#
endif
data[tid]
=
*partial_reduction
=
op
(
partial_reduction,
data[tid
+
2
]
)
;
#
if
WAVE_SIZE
<
2
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
1
)
{
#
endif
data[tid]
=
*partial_reduction
=
op
(
partial_reduction,
data[tid
+
1
]
)
;
}
#
undef
WAVE_SIZE
#
undef
op
}
...
...
@@ -1087,44 +1108,67 @@ void reduce_sum25(
int tid
)
{
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
#endif
// first step is to reduce from 25 to 16
if (tid < 9)
// use 9 threads
if (tid < 9)
{
sdata1[tid] += sdata1[tid + 16];
sdata2[tid] += sdata2[tid + 16];
sdata3[tid] += sdata3[tid + 16];
sdata4[tid] += sdata4[tid + 16];
#if WAVE_SIZE < 16
}
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
#endif
sdata1[tid] += sdata1[tid + 8];
sdata1[tid] += sdata1[tid + 4];
sdata1[tid] += sdata1[tid + 2];
sdata1[tid] += sdata1[tid + 1];
sdata2[tid] += sdata2[tid + 8];
sdata2[tid] += sdata2[tid + 4];
sdata2[tid] += sdata2[tid + 2];
sdata2[tid] += sdata2[tid + 1];
sdata3[tid] += sdata3[tid + 8];
sdata3[tid] += sdata3[tid + 4];
sdata3[tid] += sdata3[tid + 2];
sdata3[tid] += sdata3[tid + 1];
sdata4[tid] += sdata4[tid + 8];
#if WAVE_SIZE < 8
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
#endif
sdata1[tid] += sdata1[tid + 4];
sdata2[tid] += sdata2[tid + 4];
sdata3[tid] += sdata3[tid + 4];
sdata4[tid] += sdata4[tid + 4];
#if WAVE_SIZE < 4
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
#endif
sdata1[tid] += sdata1[tid + 2];
sdata2[tid] += sdata2[tid + 2];
sdata3[tid] += sdata3[tid + 2];
sdata4[tid] += sdata4[tid + 2];
#if WAVE_SIZE < 2
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
#endif
sdata1[tid] += sdata1[tid + 1];
sdata2[tid] += sdata2[tid + 1];
sdata3[tid] += sdata3[tid + 1];
sdata4[tid] += sdata4[tid + 1];
}
#undef WAVE_SIZE
}
__kernel
void compute_descriptors64(
IMAGE_INT8 imgTex,
volatile
__global float * descriptors,
__global float * descriptors,
__global const float * keypoints,
int descriptors_step,
int keypoints_step,
...
...
@@ -1158,14 +1202,13 @@ __kernel
sdyabs[tid] = fabs(sdy[tid]); // |dy| array
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
{
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
{
volatile
__global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
__global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
// write dx, dy, |dx|, |dy|
if (tid == 0)
...
...
@@ -1180,7 +1223,7 @@ __kernel
__kernel
void compute_descriptors128(
IMAGE_INT8 imgTex,
__global
volatile
float * descriptors,
__global float * descriptors,
__global float * keypoints,
int descriptors_step,
int keypoints_step,
...
...
@@ -1229,13 +1272,15 @@ __kernel
sd2[tid] = sdx[tid];
sdabs2[tid] = fabs(sdx[tid]);
}
//barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
//barrier(CLK_LOCAL_MEM_FENCE);
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
barrier(CLK_LOCAL_MEM_FENCE);
__global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
if (tid < 25)
{
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (tid == 0)
{
...
...
@@ -1259,11 +1304,14 @@ __kernel
sd2[tid] = sdy[tid];
sdabs2[tid] = fabs(sdy[tid]);
}
//barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
//
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
{
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy
|
(
dx
<
0
)
if
(
tid
==
0
)
{
...
...
@@ -1274,6 +1322,103 @@ __kernel
}
}
}
void
reduce_sum128
(
volatile
__local
float*
smem,
int
tid
)
{
#
ifndef
WAVE_SIZE
#
define
WAVE_SIZE
1
#
endif
if
(
tid
<
64
)
{
smem[tid]
+=
smem[tid
+
64]
;
#
if
WAVE_SIZE
<
64
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
32
)
{
#
endif
smem[tid]
+=
smem[tid
+
32]
;
#
if
WAVE_SIZE
<
32
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
16
)
{
#
endif
smem[tid]
+=
smem[tid
+
16]
;
#
if
WAVE_SIZE
<
16
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
8
)
{
#
endif
smem[tid]
+=
smem[tid
+
8]
;
#
if
WAVE_SIZE
<
8
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
4
)
{
#
endif
smem[tid]
+=
smem[tid
+
4]
;
#
if
WAVE_SIZE
<
4
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
2
)
{
#
endif
smem[tid]
+=
smem[tid
+
2]
;
#
if
WAVE_SIZE
<
2
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
1
)
{
#
endif
smem[tid]
+=
smem[tid
+
1]
;
}
}
void
reduce_sum64
(
volatile
__local
float*
smem,
int
tid
)
{
#
ifndef
WAVE_SIZE
#
define
WAVE_SIZE
1
#
endif
if
(
tid
<
32
)
{
smem[tid]
+=
smem[tid
+
32]
;
#
if
WAVE_SIZE
<
32
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
16
)
{
#
endif
smem[tid]
+=
smem[tid
+
16]
;
#
if
WAVE_SIZE
<
16
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
8
)
{
#
endif
smem[tid]
+=
smem[tid
+
8]
;
#
if
WAVE_SIZE
<
8
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
4
)
{
#
endif
smem[tid]
+=
smem[tid
+
4]
;
#
if
WAVE_SIZE
<
4
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
2
)
{
#
endif
smem[tid]
+=
smem[tid
+
2]
;
#
if
WAVE_SIZE
<
2
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
1
)
{
#
endif
smem[tid]
+=
smem[tid
+
1]
;
}
}
__kernel
void
normalize_descriptors128
(
__global
float
*
descriptors,
int
descriptors_step
)
...
...
@@ -1288,22 +1433,10 @@ __kernel
sqDesc[get_local_id
(
0
)
]
=
lookup
*
lookup
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
64
)
sqDesc[get_local_id
(
0
)
]
+=
sqDesc[get_local_id
(
0
)
+
64]
;
reduce_sum128
(
sqDesc,
get_local_id
(
0
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
reduction
to
get
total
if
(
get_local_id
(
0
)
<
32
)
{
volatile
__local
float*
smem
=
sqDesc
;
smem[get_local_id
(
0
)
]
+=
smem[get_local_id
(
0
)
+
32]
;
smem[get_local_id
(
0
)
]
+=
smem[get_local_id
(
0
)
+
16]
;
smem[get_local_id
(
0
)
]
+=
smem[get_local_id
(
0
)
+
8]
;
smem[get_local_id
(
0
)
]
+=
smem[get_local_id
(
0
)
+
4]
;
smem[get_local_id
(
0
)
]
+=
smem[get_local_id
(
0
)
+
2]
;
smem[get_local_id
(
0
)
]
+=
smem[get_local_id
(
0
)
+
1]
;
}
//
compute
length
(
square
root
)
volatile
__local
float
len
;
...
...
@@ -1329,18 +1462,9 @@ __kernel
sqDesc[get_local_id
(
0
)
]
=
lookup
*
lookup
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
reduction
to
get
total
if
(
get_local_id
(
0
)
<
32
)
{
volatile
__local
float*
smem
=
sqDesc
;
smem[get_local_id
(
0
)
]
+=
smem[get_local_id
(
0
)
+
32]
;
smem[get_local_id
(
0
)
]
+=
smem[get_local_id
(
0
)
+
16]
;
smem[get_local_id
(
0
)
]
+=
smem[get_local_id
(
0
)
+
8]
;
smem[get_local_id
(
0
)
]
+=
smem[get_local_id
(
0
)
+
4]
;
smem[get_local_id
(
0
)
]
+=
smem[get_local_id
(
0
)
+
2]
;
smem[get_local_id
(
0
)
]
+=
smem[get_local_id
(
0
)
+
1]
;
}
reduce_sum64
(
sqDesc,
get_local_id
(
0
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
compute
length
(
square
root
)
volatile
__local
float
len
;
...
...
modules/nonfree/src/surf.ocl.cpp
View file @
56d62118
...
...
@@ -43,6 +43,7 @@
//
//M*/
#include "precomp.hpp"
#include <cstdio>
#ifdef HAVE_OPENCV_OCL
...
...
@@ -57,25 +58,35 @@ namespace cv
///////////////////////////OpenCL kernel strings///////////////////////////
extern
const
char
*
surf
;
const
char
*
noImage2dOption
=
"-D DISABLE_IMAGE2D"
;
const
char
noImage2dOption
[]
=
"-D DISABLE_IMAGE2D"
;
static
char
SURF_OPTIONS
[
1024
]
=
""
;
static
bool
USE_IMAGE2d
=
false
;
static
void
openCLExecuteKernelSURF
(
Context
*
clCxt
,
const
char
**
source
,
string
kernelName
,
size_t
globalThreads
[
3
],
size_t
localThreads
[
3
],
vector
<
pair
<
size_t
,
const
void
*>
>
&
args
,
int
channels
,
int
depth
)
{
if
(
support_image2d
())
char
*
pSURF_OPTIONS
=
SURF_OPTIONS
;
static
bool
OPTION_INIT
=
false
;
if
(
!
OPTION_INIT
)
{
openCLExecuteKernel
(
clCxt
,
source
,
kernelName
,
globalThreads
,
localThreads
,
args
,
channels
,
depth
);
}
else
{
openCLExecuteKernel
(
clCxt
,
source
,
kernelName
,
globalThreads
,
localThreads
,
args
,
channels
,
depth
,
noImage2dOption
);
if
(
!
USE_IMAGE2d
)
{
strcat
(
pSURF_OPTIONS
,
noImage2dOption
);
pSURF_OPTIONS
+=
strlen
(
noImage2dOption
);
}
size_t
wave_size
=
0
;
queryDeviceInfo
(
WAVEFRONT_SIZE
,
&
wave_size
);
std
::
sprintf
(
pSURF_OPTIONS
,
" -D WAVE_SIZE=%d"
,
static_cast
<
int
>
(
wave_size
));
OPTION_INIT
=
true
;
}
openCLExecuteKernel
(
clCxt
,
source
,
kernelName
,
globalThreads
,
localThreads
,
args
,
channels
,
depth
,
SURF_OPTIONS
);
}
}
}
static
inline
in
t
divUp
(
size_t
total
,
size_t
grain
)
static
inline
size_
t
divUp
(
size_t
total
,
size_t
grain
)
{
return
(
total
+
grain
-
1
)
/
grain
;
}
...
...
@@ -152,8 +163,20 @@ public:
integral
(
img
,
surf_
.
sum
);
if
(
support_image2d
())
{
bindImgTex
(
img
,
imgTex
);
bindImgTex
(
surf_
.
sum
,
sumTex
);
try
{
bindImgTex
(
img
,
imgTex
);
bindImgTex
(
surf_
.
sum
,
sumTex
);
USE_IMAGE2d
=
true
;
}
catch
(
const
cv
::
Exception
&
e
)
{
USE_IMAGE2d
=
false
;
if
(
e
.
code
!=
CL_IMAGE_FORMAT_NOT_SUPPORTED
&&
e
.
code
!=
-
217
)
{
throw
e
;
}
}
}
maskSumTex
=
0
;
...
...
modules/ocl/include/opencv2/ocl/private/util.hpp
View file @
56d62118
...
...
@@ -123,6 +123,16 @@ namespace cv
// returns whether the current context supports image2d_t format or not
bool
CV_EXPORTS
support_image2d
(
Context
*
clCxt
=
Context
::
getContext
());
// the enums are used to query device information
// currently only support wavefront size queries
enum
DEVICE_INFO
{
WAVEFRONT_SIZE
,
//in AMD speak
WARP_SIZE
=
WAVEFRONT_SIZE
//in nvidia speak
};
//info should have been pre-allocated
void
CV_EXPORTS
queryDeviceInfo
(
DEVICE_INFO
info_type
,
void
*
info
);
}
//namespace ocl
}
//namespace cv
...
...
modules/ocl/src/initialization.cpp
View file @
56d62118
...
...
@@ -353,6 +353,51 @@ namespace cv
{
return
&
(
Context
::
getContext
()
->
impl
->
clCmdQueue
);
}
void
queryDeviceInfo
(
DEVICE_INFO
info_type
,
void
*
info
)
{
static
Info
::
Impl
*
impl
=
Context
::
getContext
()
->
impl
;
switch
(
info_type
)
{
case
WAVEFRONT_SIZE
:
{
#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
try
{
openCLSafeCall
(
clGetDeviceInfo
(
Context
::
getContext
()
->
impl
->
devices
[
0
],
CL_DEVICE_WAVEFRONT_WIDTH_AMD
,
sizeof
(
size_t
),
info
,
0
));
}
catch
(
const
cv
::
Exception
&
)
#elif defined (CL_DEVICE_WARP_SIZE_NV)
const
int
EXT_LEN
=
4096
+
1
;
char
extends_set
[
EXT_LEN
];
size_t
extends_size
;
openCLSafeCall
(
clGetDeviceInfo
(
impl
->
devices
[
impl
->
devnum
],
CL_DEVICE_EXTENSIONS
,
EXT_LEN
,
(
void
*
)
extends_set
,
&
extends_size
));
extends_set
[
EXT_LEN
-
1
]
=
0
;
if
(
std
::
string
(
extends_set
).
find
(
"cl_nv_device_attribute_query"
)
!=
std
::
string
::
npos
)
{
openCLSafeCall
(
clGetDeviceInfo
(
Context
::
getContext
()
->
impl
->
devices
[
0
],
CL_DEVICE_WARP_SIZE_NV
,
sizeof
(
size_t
),
info
,
0
));
}
else
#endif
{
// if no way left for us to query the warp size, we can get it from kernel group info
static
const
char
*
_kernel_string
=
"__kernel void test_func() {}"
;
cl_kernel
kernel
;
kernel
=
openCLGetKernelFromSource
(
Context
::
getContext
(),
&
_kernel_string
,
"test_func"
);
openCLSafeCall
(
clGetKernelWorkGroupInfo
(
kernel
,
impl
->
devices
[
impl
->
devnum
],
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
,
sizeof
(
size_t
),
info
,
NULL
));
}
}
break
;
default
:
CV_Error
(
-
1
,
"Invalid device info type"
);
break
;
}
}
void
openCLReadBuffer
(
Context
*
clCxt
,
cl_mem
dst_buffer
,
void
*
host_buffer
,
size_t
size
)
{
cl_int
status
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment