Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
cb63bbf0
Commit
cb63bbf0
authored
Apr 03, 2013
by
yao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix hog on some CPU device running ocl
parent
fd4a6f0a
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
185 additions
and
83 deletions
+185
-83
hog.cpp
modules/ocl/src/hog.cpp
+40
-21
objdetect_hog.cl
modules/ocl/src/opencl/objdetect_hog.cl
+145
-62
No files found.
modules/ocl/src/hog.cpp
View file @
cb63bbf0
...
...
@@ -44,7 +44,6 @@
//M*/
#include "precomp.hpp"
using
namespace
cv
;
using
namespace
cv
::
ocl
;
using
namespace
std
;
...
...
@@ -230,7 +229,6 @@ void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oc
}
}
void
cv
::
ocl
::
HOGDescriptor
::
computeBlockHistograms
(
const
oclMat
&
img
)
{
computeGradient
(
img
,
grad
,
qangle
);
...
...
@@ -1571,6 +1569,27 @@ void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int b
cdescr_size
=
descr_size
;
}
static
inline
int
divUp
(
int
total
,
int
grain
)
{
return
(
total
+
grain
-
1
)
/
grain
;
}
static
void
openCLExecuteKernel_hog
(
Context
*
clCxt
,
const
char
**
source
,
string
kernelName
,
size_t
globalThreads
[
3
],
size_t
localThreads
[
3
],
vector
<
pair
<
size_t
,
const
void
*>
>
&
args
)
{
size_t
wave_size
=
0
;
queryDeviceInfo
(
WAVEFRONT_SIZE
,
&
wave_size
);
if
(
wave_size
<=
16
)
{
char
build_options
[
64
];
sprintf
(
build_options
,
(
wave_size
==
16
)
?
"-D WAVE_SIZE_16"
:
"-D WAVE_SIZE_1"
);
openCLExecuteKernel
(
clCxt
,
source
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
}
else
openCLExecuteKernel
(
clCxt
,
source
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
compute_hists
(
int
nbins
,
int
block_stride_x
,
int
block_stride_y
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
grad
,
const
cv
::
ocl
::
oclMat
&
qangle
,
float
sigma
,
cv
::
ocl
::
oclMat
&
block_hists
)
...
...
@@ -1582,8 +1601,10 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
int
img_block_height
=
(
height
-
CELLS_PER_BLOCK_Y
*
CELL_HEIGHT
+
block_stride_y
)
/
block_stride_y
;
size_t
globalThreads
[
3
]
=
{
img_block_width
*
32
,
img_block_height
*
2
,
1
};
size_t
localThreads
[
3
]
=
{
32
,
2
,
1
};
int
blocks_total
=
img_block_width
*
img_block_height
;
int
blocks_in_group
=
4
;
size_t
localThreads
[
3
]
=
{
blocks_in_group
*
24
,
2
,
1
};
size_t
globalThreads
[
3
]
=
{
divUp
(
blocks_total
,
blocks_in_group
)
*
localThreads
[
0
],
2
,
1
};
int
grad_quadstep
=
grad
.
step
>>
2
;
int
qangle_step
=
qangle
.
step
;
...
...
@@ -1593,14 +1614,15 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
int
hists_size
=
(
nbins
*
CELLS_PER_BLOCK_X
*
CELLS_PER_BLOCK_Y
*
12
)
*
sizeof
(
float
);
int
final_hists_size
=
(
nbins
*
CELLS_PER_BLOCK_X
*
CELLS_PER_BLOCK_Y
)
*
sizeof
(
float
);
int
smem
=
hists_size
+
final_hists_size
;
int
smem
=
(
hists_size
+
final_hists_size
)
*
blocks_in_group
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
width
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cblock_stride_x
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cblock_stride_y
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cnbins
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cblock_hist_size
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
img_block_width
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
blocks_in_group
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
blocks_total
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
grad_quadstep
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
qangle_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
grad
.
data
));
...
...
@@ -1609,7 +1631,7 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
block_hists
.
data
));
args
.
push_back
(
make_pair
(
smem
,
(
void
*
)
NULL
));
openCLExecuteKernel
2
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
_hog
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
);
}
void
cv
::
ocl
::
device
::
hog
::
normalize_hists
(
int
nbins
,
int
block_stride_x
,
int
block_stride_y
,
...
...
@@ -1637,7 +1659,7 @@ void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int bl
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
threshold
));
args
.
push_back
(
make_pair
(
nthreads
*
sizeof
(
float
),
(
void
*
)
NULL
));
openCLExecuteKernel
2
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
_hog
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
);
}
void
cv
::
ocl
::
device
::
hog
::
classify_hists
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
...
...
@@ -1671,7 +1693,7 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int blo
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
threshold
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
labels
.
data
));
openCLExecuteKernel
2
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
_hog
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
);
}
void
cv
::
ocl
::
device
::
hog
::
extract_descrs_by_rows
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
...
...
@@ -1702,7 +1724,7 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
block_hists
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
descriptors
.
data
));
openCLExecuteKernel
2
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
extract_descrs_by_cols
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
...
...
@@ -1734,12 +1756,7 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
block_hists
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
descriptors
.
data
));
openCLExecuteKernel2
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
static
inline
int
divUp
(
int
total
,
int
grain
)
{
return
(
total
+
grain
-
1
)
/
grain
;
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
compute_gradients_8UC1
(
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
img
,
...
...
@@ -1768,7 +1785,7 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const c
args
.
push_back
(
make_pair
(
sizeof
(
cl_char
),
(
void
*
)
&
correctGamma
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cnbins
));
openCLExecuteKernel
2
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
compute_gradients_8UC4
(
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
img
,
...
...
@@ -1798,7 +1815,7 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c
args
.
push_back
(
make_pair
(
sizeof
(
cl_char
),
(
void
*
)
&
correctGamma
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cnbins
));
openCLExecuteKernel
2
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
resize
(
const
oclMat
&
src
,
oclMat
&
dst
,
const
Size
sz
)
...
...
@@ -1815,14 +1832,16 @@ void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz
float
ifx
=
(
float
)
src
.
cols
/
sz
.
width
;
float
ify
=
(
float
)
src
.
rows
/
sz
.
height
;
int
src_step
=
static_cast
<
int
>
(
src
.
step
);
int
dst_step
=
static_cast
<
int
>
(
dst
.
step
);
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
_
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
_
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
sz
.
width
));
...
...
@@ -1830,5 +1849,5 @@ void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
ifx
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
ify
));
openCLExecuteKernel
2
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
modules/ocl/src/opencl/objdetect_hog.cl
View file @
cb63bbf0
...
...
@@ -53,76 +53,96 @@
//----------------------------------------------------------------------------
//
Histogram
computation
__kernel
void
compute_hists_kernel
(
const
int
width,
const
int
cblock_stride_x,
const
int
cblock_stride_y,
const
int
cnbins,
const
int
cblock_hist_size,
const
int
img_block_width,
const
int
grad_quadstep,
const
int
qangle_step,
__global
const
float*
grad,
__global
const
uchar*
qangle,
const
float
scale,
__global
float*
block_hists,
__local
float*
smem
)
//
12
threads
for
a
cell,
12x4
threads
per
block
__kernel
void
compute_hists_kernel
(
const
int
cblock_stride_x,
const
int
cblock_stride_y,
const
int
cnbins,
const
int
cblock_hist_size,
const
int
img_block_width,
const
int
blocks_in_group,
const
int
blocks_total,
const
int
grad_quadstep,
const
int
qangle_step,
__global
const
float*
grad,
__global
const
uchar*
qangle,
const
float
scale,
__global
float*
block_hists,
__local
float*
smem
)
{
const
int
lidX
=
get_local_id
(
0
)
;
const
int
lx
=
get_local_id
(
0
)
;
const
int
lp
=
lx
/
24
; /* local group id */
const
int
gid
=
get_group_id
(
0
)
*
blocks_in_group
+
lp
;/* global group id */
const
int
gidY
=
gid
/
img_block_width
;
const
int
gidX
=
gid
-
gidY
*
img_block_width
;
const
int
lidX
=
lx
-
lp
*
24
;
const
int
lidY
=
get_local_id
(
1
)
;
const
int
gidX
=
get_group_id
(
0
)
;
const
int
gidY
=
get_group_id
(
1
)
;
const
int
cell_x
=
lidX
/
1
6
;
const
int
cell_x
=
lidX
/
1
2
;
const
int
cell_y
=
lidY
;
const
int
cell_thread_x
=
lidX
&
0xF
;
const
int
cell_thread_x
=
lidX
-
cell_x
*
12
;
__local
float*
hists
=
smem
;
__local
float*
final_hist
=
smem
+
cnbins
*
48
;
__local
float*
hists
=
smem
+
lp
*
cnbins
*
(
CELLS_PER_BLOCK_X
*
CELLS_PER_BLOCK_Y
*
12
+
CELLS_PER_BLOCK_X
*
CELLS_PER_BLOCK_Y
)
;
__local
float*
final_hist
=
hists
+
cnbins
*
(
CELLS_PER_BLOCK_X
*
CELLS_PER_BLOCK_Y
*
12
)
;
const
int
offset_x
=
gidX
*
cblock_stride_x
+
(
cell_x
<<
2
)
+
cell_thread_x
;
const
int
offset_y
=
gidY
*
cblock_stride_y
+
(
cell_y
<<
2
)
;
__global
const
float*
grad_ptr
=
grad
+
offset_y
*
grad_quadstep
+
(
offset_x
<<
1
)
;
__global
const
uchar*
qangle_ptr
=
qangle
+
offset_y
*
qangle_step
+
(
offset_x
<<
1
)
;
//
12
means
that
12
pixels
affect
on
block
's
cell
(
in
one
row
)
if
(
cell_thread_x
<
12
)
{
__local
float*
hist
=
hists
+
12
*
(
cell_y
*
CELLS_PER_BLOCK_Y
+
cell_x
)
+
cell_thread_x
;
for
(
int
bin_id
=
0
; bin_id < cnbins; ++bin_id)
hist[bin_id
*
48]
=
0.f
;
__global
const
float*
grad_ptr
=
(
gid
<
blocks_total
)
?
grad
+
offset_y
*
grad_quadstep
+
(
offset_x
<<
1
)
:
grad
;
__global
const
uchar*
qangle_ptr
=
(
gid
<
blocks_total
)
?
qangle
+
offset_y
*
qangle_step
+
(
offset_x
<<
1
)
:
qangle
;
const
int
dist_x
=
-4
+
cell_thread_x
-
4
*
cell_x
;
__local
float*
hist
=
hists
+
12
*
(
cell_y
*
CELLS_PER_BLOCK_Y
+
cell_x
)
+
cell_thread_x
;
for
(
int
bin_id
=
0
; bin_id < cnbins; ++bin_id)
hist[bin_id
*
48]
=
0.f
;
const
int
dist_y_begin
=
-4
-
4
*
lidY
;
for
(
int
dist_y
=
dist_y_begin
; dist_y < dist_y_begin + 12; ++dist_y)
{
float2
vote
=
(
float2
)
(
grad_ptr[0],
grad_ptr[1]
)
;
uchar2
bin
=
(
uchar2
)
(
qangle_ptr[0],
qangle_ptr[1]
)
;
const
int
dist_x
=
-4
+
cell_thread_x
-
4
*
cell_x
;
const
int
dist_center_x
=
dist_x
-
4
*
(
1
-
2
*
cell_x
)
;
grad_ptr
+=
grad_quadstep
;
qangle_ptr
+=
qangle_step
;
const
int
dist_y_begin
=
-4
-
4
*
lidY
;
for
(
int
dist_y
=
dist_y_begin
; dist_y < dist_y_begin + 12; ++dist_y)
{
float2
vote
=
(
float2
)
(
grad_ptr[0],
grad_ptr[1]
)
;
uchar2
bin
=
(
uchar2
)
(
qangle_ptr[0],
qangle_ptr[1]
)
;
int
dist_center_y
=
dist_y
-
4
*
(
1
-
2
*
cell_y
)
;
int
dist_center_x
=
dist_x
-
4
*
(
1
-
2
*
cell_x
)
;
grad_ptr
+=
grad_quadstep
;
qangle_ptr
+=
qangle_step
;
float
gaussian
=
exp
(
-
(
dist_center_y
*
dist_center_y
+
dist_center_x
*
dist_center_x
)
*
scale
)
;
float
interp_weight
=
(
8.f
-
fabs
(
dist_y
+
0.5f
))
*
(
8.f
-
fabs
(
dist_x
+
0.5f
))
/
64.f
;
int
dist_center_y
=
dist_y
-
4
*
(
1
-
2
*
cell_y
)
;
hist[bin.x
*
48]
+=
gaussian
*
interp_weight
*
vote.x
;
hist[bin.y
*
48]
+=
gaussian
*
interp_weight
*
vote.y
;
}
float
gaussian
=
exp
(
-
(
dist_center_y
*
dist_center_y
+
dist_center_x
*
dist_center_x
)
*
scale
)
;
float
interp_weight
=
(
8.f
-
fabs
(
dist_y
+
0.5f
))
*
(
8.f
-
fabs
(
dist_x
+
0.5f
))
/
64.f
;
volatile
__local
float*
hist_
=
hist
;
for
(
int
bin_id
=
0
; bin_id < cnbins; ++bin_id, hist_ += 48)
{
if
(
cell_thread_x
<
6
)
hist_[0]
+=
hist_[6]
;
if
(
cell_thread_x
<
3
)
hist_[0]
+=
hist_[3]
;
if
(
cell_thread_x
==
0
)
final_hist[
(
cell_x
*
2
+
cell_y
)
*
cnbins
+
bin_id]
=
hist_[0]
+
hist_[1]
+
hist_[2]
;
}
hist[bin.x
*
48]
+=
gaussian
*
interp_weight
*
vote.x
;
hist[bin.y
*
48]
+=
gaussian
*
interp_weight
*
vote.y
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
__global
float*
block_hist
=
block_hists
+
(
gidY
*
img_block_width
+
gidX
)
*
cblock_hist_size
;
volatile
__local
float*
hist_
=
hist
;
for
(
int
bin_id
=
0
; bin_id < cnbins; ++bin_id, hist_ += 48)
{
if
(
cell_thread_x
<
6
)
hist_[0]
+=
hist_[6]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
cell_thread_x
<
3
)
hist_[0]
+=
hist_[3]
;
#
ifdef
WAVE_SIZE_1
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
if
(
cell_thread_x
==
0
)
final_hist[
(
cell_x
*
2
+
cell_y
)
*
cnbins
+
bin_id]
=
hist_[0]
+
hist_[1]
+
hist_[2]
;
}
#
ifdef
WAVE_SIZE_1
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
int
tid
=
(
cell_y
*
CELLS_PER_BLOCK_Y
+
cell_x
)
*
16
+
cell_thread_x
;
if
(
tid
<
cblock_hist_size
)
int
tid
=
(
cell_y
*
CELLS_PER_BLOCK_Y
+
cell_x
)
*
12
+
cell_thread_x
;
if
((
tid
<
cblock_hist_size
)
&&
(
gid
<
blocks_total
))
{
__global
float*
block_hist
=
block_hists
+
(
gidY
*
img_block_width
+
gidX
)
*
cblock_hist_size
;
block_hist[tid]
=
final_hist[tid]
;
}
}
//-------------------------------------------------------------
...
...
@@ -133,21 +153,59 @@ float reduce_smem(volatile __local float* smem, int size)
unsigned
int
tid
=
get_local_id
(
0
)
;
float
sum
=
smem[tid]
;
if
(
size
>=
512
)
{
if
(
tid
<
256
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
256]
; barrier(CLK_LOCAL_MEM_FENCE); }
if
(
size
>=
256
)
{
if
(
tid
<
128
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
128]
; barrier(CLK_LOCAL_MEM_FENCE); }
if
(
size
>=
128
)
{
if
(
tid
<
64
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
64]
; barrier(CLK_LOCAL_MEM_FENCE); }
if
(
size
>=
512
)
{
if
(
tid
<
256
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
256]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
size
>=
256
)
{
if
(
tid
<
128
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
128]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
size
>=
128
)
{
if
(
tid
<
64
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
64]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
tid
<
32
)
{
if
(
size
>=
64
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
32]
;
#
if
defined
(
WAVE_SIZE_16
)
|
| defined(WAVE_SIZE_1)
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
#endif
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
#endif
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
#endif
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
#endif
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
#endif
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
}
...
...
@@ -224,19 +282,44 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
if (tid < 64) products[tid] = product = product + products[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* smem = products;
if (tid < 32)
{
volatile
__local
float*
smem
=
products
;
smem[tid] = product = product + smem[tid + 32];
#if defined(WAVE_SIZE_16) |
|
defined
(
WAVE_SIZE_1
)
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
16
)
{
volatile
__local
float*
smem
=
products
;
#
endif
smem[tid]
=
product
=
product
+
smem[tid
+
16]
;
#
ifdef
WAVE_SIZE_1
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
8
)
{
#
endif
smem[tid]
=
product
=
product
+
smem[tid
+
8]
;
#
ifdef
WAVE_SIZE_1
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
4
)
{
#
endif
smem[tid]
=
product
=
product
+
smem[tid
+
4]
;
#
ifdef
WAVE_SIZE_1
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
2
)
{
#
endif
smem[tid]
=
product
=
product
+
smem[tid
+
2]
;
#
ifdef
WAVE_SIZE_1
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
1
)
{
#
endif
smem[tid]
=
product
=
product
+
smem[tid
+
1]
;
}
...
...
@@ -248,8 +331,8 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
//
Extract
descriptors
__kernel
void
extract_descrs_by_rows_kernel
(
const
int
cblock_hist_size,
const
int
descriptors_quadstep,
const
int
cdescr_size,
const
int
cdescr_width,
const
int
img_block_width,
const
int
win_block_stride_x,
const
int
win_block_stride_y,
__global
const
float*
block_hists,
__global
float*
descriptors
)
const
int
img_block_width,
const
int
win_block_stride_x,
const
int
win_block_stride_y,
__global
const
float*
block_hists,
__global
float*
descriptors
)
{
int
tid
=
get_local_id
(
0
)
;
int
gidX
=
get_group_id
(
0
)
;
...
...
@@ -271,8 +354,8 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
}
__kernel
void
extract_descrs_by_cols_kernel
(
const
int
cblock_hist_size,
const
int
descriptors_quadstep,
const
int
cdescr_size,
const
int
cnblocks_win_x,
const
int
cnblocks_win_y,
const
int
img_block_width,
const
int
win_block_stride_x,
const
int
win_block_stride_y,
__global
const
float*
block_hists,
__global
float*
descriptors
)
const
int
cnblocks_win_x,
const
int
cnblocks_win_y,
const
int
img_block_width,
const
int
win_block_stride_x,
const
int
win_block_stride_y,
__global
const
float*
block_hists,
__global
float*
descriptors
)
{
int
tid
=
get_local_id
(
0
)
;
int
gidX
=
get_group_id
(
0
)
;
...
...
@@ -301,8 +384,8 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in
//
Gradients
computation
__kernel
void
compute_gradients_8UC4_kernel
(
const
int
height,
const
int
width,
const
int
img_step,
const
int
grad_quadstep,
const
int
qangle_step,
const
__global
uchar4
*
img,
__global
float
*
grad,
__global
uchar
*
qangle,
const
float
angle_scale,
const
char
correct_gamma,
const
int
cnbins
)
const
__global
uchar4
*
img,
__global
float
*
grad,
__global
uchar
*
qangle,
const
float
angle_scale,
const
char
correct_gamma,
const
int
cnbins
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
tid
=
get_local_id
(
0
)
;
...
...
@@ -400,8 +483,8 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
}
__kernel
void
compute_gradients_8UC1_kernel
(
const
int
height,
const
int
width,
const
int
img_step,
const
int
grad_quadstep,
const
int
qangle_step,
__global
const
uchar
*
img,
__global
float
*
grad,
__global
uchar
*
qangle,
const
float
angle_scale,
const
char
correct_gamma,
const
int
cnbins
)
__global
const
uchar
*
img,
__global
float
*
grad,
__global
uchar
*
qangle,
const
float
angle_scale,
const
char
correct_gamma,
const
int
cnbins
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
tid
=
get_local_id
(
0
)
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment