Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
26c24614
Commit
26c24614
authored
Jun 19, 2013
by
yao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
optimize hog
parent
843094a0
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
710 additions
and
324 deletions
+710
-324
hog.cpp
modules/ocl/src/hog.cpp
+343
-170
objdetect_hog.cl
modules/ocl/src/opencl/objdetect_hog.cl
+367
-154
No files found.
modules/ocl/src/hog.cpp
View file @
26c24614
...
...
@@ -15,7 +15,7 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
//
Wenju He, wenju@multicorewareinc.com
//
Wenju He, wenju@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
...
...
@@ -48,13 +48,107 @@ using namespace cv;
using
namespace
cv
::
ocl
;
using
namespace
std
;
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2
#define NTHREADS 256
static
oclMat
gauss_w_lut
;
static
bool
hog_device_cpu
;
/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */
static
const
float
gaussian_interp_lut
[]
=
{
/* gaussian lut */
0.01831564
f
,
0.02926831
f
,
0.04393693
f
,
0.06196101
f
,
0.08208500
f
,
0.10215643
f
,
0.11943297
f
,
0.13117145
f
,
0.13533528
f
,
0.13117145
f
,
0.11943297
f
,
0.10215643
f
,
0.08208500
f
,
0.06196101
f
,
0.04393693
f
,
0.02926831
f
,
0.02926831
f
,
0.04677062
f
,
0.07021102
f
,
0.09901341
f
,
0.13117145
f
,
0.16324551
f
,
0.19085334
f
,
0.20961139
f
,
0.21626517
f
,
0.20961139
f
,
0.19085334
f
,
0.16324551
f
,
0.13117145
f
,
0.09901341
f
,
0.07021102
f
,
0.04677062
f
,
0.04393693
f
,
0.07021102
f
,
0.10539922
f
,
0.14863673
f
,
0.19691168
f
,
0.24506053
f
,
0.28650481
f
,
0.31466395
f
,
0.32465246
f
,
0.31466395
f
,
0.28650481
f
,
0.24506053
f
,
0.19691168
f
,
0.14863673
f
,
0.10539922
f
,
0.07021102
f
,
0.06196101
f
,
0.09901341
f
,
0.14863673
f
,
0.20961139
f
,
0.27768996
f
,
0.34559074
f
,
0.40403652
f
,
0.44374731
f
,
0.45783335
f
,
0.44374731
f
,
0.40403652
f
,
0.34559074
f
,
0.27768996
f
,
0.20961139
f
,
0.14863673
f
,
0.09901341
f
,
0.08208500
f
,
0.13117145
f
,
0.19691168
f
,
0.27768996
f
,
0.36787945
f
,
0.45783335
f
,
0.53526145
f
,
0.58786964
f
,
0.60653067
f
,
0.58786964
f
,
0.53526145
f
,
0.45783335
f
,
0.36787945
f
,
0.27768996
f
,
0.19691168
f
,
0.13117145
f
,
0.10215643
f
,
0.16324551
f
,
0.24506053
f
,
0.34559074
f
,
0.45783335
f
,
0.56978285
f
,
0.66614360
f
,
0.73161560
f
,
0.75483960
f
,
0.73161560
f
,
0.66614360
f
,
0.56978285
f
,
0.45783335
f
,
0.34559074
f
,
0.24506053
f
,
0.16324551
f
,
0.11943297
f
,
0.19085334
f
,
0.28650481
f
,
0.40403652
f
,
0.53526145
f
,
0.66614360
f
,
0.77880079
f
,
0.85534531
f
,
0.88249689
f
,
0.85534531
f
,
0.77880079
f
,
0.66614360
f
,
0.53526145
f
,
0.40403652
f
,
0.28650481
f
,
0.19085334
f
,
0.13117145
f
,
0.20961139
f
,
0.31466395
f
,
0.44374731
f
,
0.58786964
f
,
0.73161560
f
,
0.85534531
f
,
0.93941307
f
,
0.96923321
f
,
0.93941307
f
,
0.85534531
f
,
0.73161560
f
,
0.58786964
f
,
0.44374731
f
,
0.31466395
f
,
0.20961139
f
,
0.13533528
f
,
0.21626517
f
,
0.32465246
f
,
0.45783335
f
,
0.60653067
f
,
0.75483960
f
,
0.88249689
f
,
0.96923321
f
,
1.00000000
f
,
0.96923321
f
,
0.88249689
f
,
0.75483960
f
,
0.60653067
f
,
0.45783335
f
,
0.32465246
f
,
0.21626517
f
,
0.13117145
f
,
0.20961139
f
,
0.31466395
f
,
0.44374731
f
,
0.58786964
f
,
0.73161560
f
,
0.85534531
f
,
0.93941307
f
,
0.96923321
f
,
0.93941307
f
,
0.85534531
f
,
0.73161560
f
,
0.58786964
f
,
0.44374731
f
,
0.31466395
f
,
0.20961139
f
,
0.11943297
f
,
0.19085334
f
,
0.28650481
f
,
0.40403652
f
,
0.53526145
f
,
0.66614360
f
,
0.77880079
f
,
0.85534531
f
,
0.88249689
f
,
0.85534531
f
,
0.77880079
f
,
0.66614360
f
,
0.53526145
f
,
0.40403652
f
,
0.28650481
f
,
0.19085334
f
,
0.10215643
f
,
0.16324551
f
,
0.24506053
f
,
0.34559074
f
,
0.45783335
f
,
0.56978285
f
,
0.66614360
f
,
0.73161560
f
,
0.75483960
f
,
0.73161560
f
,
0.66614360
f
,
0.56978285
f
,
0.45783335
f
,
0.34559074
f
,
0.24506053
f
,
0.16324551
f
,
0.08208500
f
,
0.13117145
f
,
0.19691168
f
,
0.27768996
f
,
0.36787945
f
,
0.45783335
f
,
0.53526145
f
,
0.58786964
f
,
0.60653067
f
,
0.58786964
f
,
0.53526145
f
,
0.45783335
f
,
0.36787945
f
,
0.27768996
f
,
0.19691168
f
,
0.13117145
f
,
0.06196101
f
,
0.09901341
f
,
0.14863673
f
,
0.20961139
f
,
0.27768996
f
,
0.34559074
f
,
0.40403652
f
,
0.44374731
f
,
0.45783335
f
,
0.44374731
f
,
0.40403652
f
,
0.34559074
f
,
0.27768996
f
,
0.20961139
f
,
0.14863673
f
,
0.09901341
f
,
0.04393693
f
,
0.07021102
f
,
0.10539922
f
,
0.14863673
f
,
0.19691168
f
,
0.24506053
f
,
0.28650481
f
,
0.31466395
f
,
0.32465246
f
,
0.31466395
f
,
0.28650481
f
,
0.24506053
f
,
0.19691168
f
,
0.14863673
f
,
0.10539922
f
,
0.07021102
f
,
0.02926831
f
,
0.04677062
f
,
0.07021102
f
,
0.09901341
f
,
0.13117145
f
,
0.16324551
f
,
0.19085334
f
,
0.20961139
f
,
0.21626517
f
,
0.20961139
f
,
0.19085334
f
,
0.16324551
f
,
0.13117145
f
,
0.09901341
f
,
0.07021102
f
,
0.04677062
f
,
/* interp_weight lut */
0.00390625
f
,
0.01171875
f
,
0.01953125
f
,
0.02734375
f
,
0.03515625
f
,
0.04296875
f
,
0.05078125
f
,
0.05859375
f
,
0.05859375
f
,
0.05078125
f
,
0.04296875
f
,
0.03515625
f
,
0.02734375
f
,
0.01953125
f
,
0.01171875
f
,
0.00390625
f
,
0.01171875
f
,
0.03515625
f
,
0.05859375
f
,
0.08203125
f
,
0.10546875
f
,
0.12890625
f
,
0.15234375
f
,
0.17578125
f
,
0.17578125
f
,
0.15234375
f
,
0.12890625
f
,
0.10546875
f
,
0.08203125
f
,
0.05859375
f
,
0.03515625
f
,
0.01171875
f
,
0.01953125
f
,
0.05859375
f
,
0.09765625
f
,
0.13671875
f
,
0.17578125
f
,
0.21484375
f
,
0.25390625
f
,
0.29296875
f
,
0.29296875
f
,
0.25390625
f
,
0.21484375
f
,
0.17578125
f
,
0.13671875
f
,
0.09765625
f
,
0.05859375
f
,
0.01953125
f
,
0.02734375
f
,
0.08203125
f
,
0.13671875
f
,
0.19140625
f
,
0.24609375
f
,
0.30078125
f
,
0.35546875
f
,
0.41015625
f
,
0.41015625
f
,
0.35546875
f
,
0.30078125
f
,
0.24609375
f
,
0.19140625
f
,
0.13671875
f
,
0.08203125
f
,
0.02734375
f
,
0.03515625
f
,
0.10546875
f
,
0.17578125
f
,
0.24609375
f
,
0.31640625
f
,
0.38671875
f
,
0.45703125
f
,
0.52734375
f
,
0.52734375
f
,
0.45703125
f
,
0.38671875
f
,
0.31640625
f
,
0.24609375
f
,
0.17578125
f
,
0.10546875
f
,
0.03515625
f
,
0.04296875
f
,
0.12890625
f
,
0.21484375
f
,
0.30078125
f
,
0.38671875
f
,
0.47265625
f
,
0.55859375
f
,
0.64453125
f
,
0.64453125
f
,
0.55859375
f
,
0.47265625
f
,
0.38671875
f
,
0.30078125
f
,
0.21484375
f
,
0.12890625
f
,
0.04296875
f
,
0.05078125
f
,
0.15234375
f
,
0.25390625
f
,
0.35546875
f
,
0.45703125
f
,
0.55859375
f
,
0.66015625
f
,
0.76171875
f
,
0.76171875
f
,
0.66015625
f
,
0.55859375
f
,
0.45703125
f
,
0.35546875
f
,
0.25390625
f
,
0.15234375
f
,
0.05078125
f
,
0.05859375
f
,
0.17578125
f
,
0.29296875
f
,
0.41015625
f
,
0.52734375
f
,
0.64453125
f
,
0.76171875
f
,
0.87890625
f
,
0.87890625
f
,
0.76171875
f
,
0.64453125
f
,
0.52734375
f
,
0.41015625
f
,
0.29296875
f
,
0.17578125
f
,
0.05859375
f
,
0.05859375
f
,
0.17578125
f
,
0.29296875
f
,
0.41015625
f
,
0.52734375
f
,
0.64453125
f
,
0.76171875
f
,
0.87890625
f
,
0.87890625
f
,
0.76171875
f
,
0.64453125
f
,
0.52734375
f
,
0.41015625
f
,
0.29296875
f
,
0.17578125
f
,
0.05859375
f
,
0.05078125
f
,
0.15234375
f
,
0.25390625
f
,
0.35546875
f
,
0.45703125
f
,
0.55859375
f
,
0.66015625
f
,
0.76171875
f
,
0.76171875
f
,
0.66015625
f
,
0.55859375
f
,
0.45703125
f
,
0.35546875
f
,
0.25390625
f
,
0.15234375
f
,
0.05078125
f
,
0.04296875
f
,
0.12890625
f
,
0.21484375
f
,
0.30078125
f
,
0.38671875
f
,
0.47265625
f
,
0.55859375
f
,
0.64453125
f
,
0.64453125
f
,
0.55859375
f
,
0.47265625
f
,
0.38671875
f
,
0.30078125
f
,
0.21484375
f
,
0.12890625
f
,
0.04296875
f
,
0.03515625
f
,
0.10546875
f
,
0.17578125
f
,
0.24609375
f
,
0.31640625
f
,
0.38671875
f
,
0.45703125
f
,
0.52734375
f
,
0.52734375
f
,
0.45703125
f
,
0.38671875
f
,
0.31640625
f
,
0.24609375
f
,
0.17578125
f
,
0.10546875
f
,
0.03515625
f
,
0.02734375
f
,
0.08203125
f
,
0.13671875
f
,
0.19140625
f
,
0.24609375
f
,
0.30078125
f
,
0.35546875
f
,
0.41015625
f
,
0.41015625
f
,
0.35546875
f
,
0.30078125
f
,
0.24609375
f
,
0.19140625
f
,
0.13671875
f
,
0.08203125
f
,
0.02734375
f
,
0.01953125
f
,
0.05859375
f
,
0.09765625
f
,
0.13671875
f
,
0.17578125
f
,
0.21484375
f
,
0.25390625
f
,
0.29296875
f
,
0.29296875
f
,
0.25390625
f
,
0.21484375
f
,
0.17578125
f
,
0.13671875
f
,
0.09765625
f
,
0.05859375
f
,
0.01953125
f
,
0.01171875
f
,
0.03515625
f
,
0.05859375
f
,
0.08203125
f
,
0.10546875
f
,
0.12890625
f
,
0.15234375
f
,
0.17578125
f
,
0.17578125
f
,
0.15234375
f
,
0.12890625
f
,
0.10546875
f
,
0.08203125
f
,
0.05859375
f
,
0.03515625
f
,
0.01171875
f
,
0.00390625
f
,
0.01171875
f
,
0.01953125
f
,
0.02734375
f
,
0.03515625
f
,
0.04296875
f
,
0.05078125
f
,
0.05859375
f
,
0.05859375
f
,
0.05078125
f
,
0.04296875
f
,
0.03515625
f
,
0.02734375
f
,
0.01953125
f
,
0.01171875
f
,
0.00390625
f
};
namespace
cv
{
namespace
ocl
...
...
@@ -78,38 +172,43 @@ namespace cv
int
cnblocks_win_x
;
int
cnblocks_win_y
;
int
cblock_hist_size
;
int
cblock_hist_size_2up
;
int
cdescr_size
;
int
cdescr_width
;
int
cdescr_height
;
void
set_up_constants
(
int
nbins
,
int
block_stride_x
,
int
block_stride_y
,
int
nblocks_win_x
,
int
nblocks_win_y
);
void
compute_hists
(
int
nbins
,
int
block_stride_x
,
int
blovck_stride_y
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
grad
,
const
cv
::
ocl
::
oclMat
&
qangle
,
float
sigma
,
cv
::
ocl
::
oclMat
&
block_hists
);
int
height
,
int
width
,
float
sigma
,
const
cv
::
ocl
::
oclMat
&
grad
,
const
cv
::
ocl
::
oclMat
&
qangle
,
const
cv
::
ocl
::
oclMat
&
gauss_w_lut
,
cv
::
ocl
::
oclMat
&
block_hists
);
void
normalize_hists
(
int
nbins
,
int
block_stride_x
,
int
block_stride_y
,
int
height
,
int
width
,
cv
::
ocl
::
oclMat
&
block_hists
,
float
threshold
);
int
height
,
int
width
,
cv
::
ocl
::
oclMat
&
block_hists
,
float
threshold
);
void
classify_hists
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
block_hists
,
const
cv
::
ocl
::
oclMat
&
coefs
,
float
free_coef
,
int
block_stride_x
,
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
block_hists
,
const
cv
::
ocl
::
oclMat
&
coefs
,
float
free_coef
,
float
threshold
,
cv
::
ocl
::
oclMat
&
labels
);
void
extract_descrs_by_rows
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
block_hists
,
void
extract_descrs_by_rows
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
block_hists
,
cv
::
ocl
::
oclMat
&
descriptors
);
void
extract_descrs_by_cols
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
block_hists
,
void
extract_descrs_by_cols
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
block_hists
,
cv
::
ocl
::
oclMat
&
descriptors
);
void
compute_gradients_8UC1
(
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
img
,
float
angle_scale
,
cv
::
ocl
::
oclMat
&
grad
,
cv
::
ocl
::
oclMat
&
qangle
,
bool
correct_gamma
);
float
angle_scale
,
cv
::
ocl
::
oclMat
&
grad
,
cv
::
ocl
::
oclMat
&
qangle
,
bool
correct_gamma
);
void
compute_gradients_8UC4
(
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
img
,
float
angle_scale
,
cv
::
ocl
::
oclMat
&
grad
,
cv
::
ocl
::
oclMat
&
qangle
,
bool
correct_gamma
);
void
resize
(
const
oclMat
&
src
,
oclMat
&
dst
,
const
Size
sz
);
float
angle_scale
,
cv
::
ocl
::
oclMat
&
grad
,
cv
::
ocl
::
oclMat
&
qangle
,
bool
correct_gamma
);
}
}
}
...
...
@@ -117,8 +216,14 @@ namespace cv
using
namespace
::
cv
::
ocl
::
device
;
cv
::
ocl
::
HOGDescriptor
::
HOGDescriptor
(
Size
win_size_
,
Size
block_size_
,
Size
block_stride_
,
Size
cell_size_
,
int
nbins_
,
double
win_sigma_
,
double
threshold_L2hys_
,
bool
gamma_correction_
,
int
nlevels_
)
static
inline
int
divUp
(
int
total
,
int
grain
)
{
return
(
total
+
grain
-
1
)
/
grain
;
}
cv
::
ocl
::
HOGDescriptor
::
HOGDescriptor
(
Size
win_size_
,
Size
block_size_
,
Size
block_stride_
,
Size
cell_size_
,
int
nbins_
,
double
win_sigma_
,
double
threshold_L2hys_
,
bool
gamma_correction_
,
int
nlevels_
)
:
win_size
(
win_size_
),
block_size
(
block_size_
),
block_stride
(
block_stride_
),
...
...
@@ -132,19 +237,27 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
CV_Assert
((
win_size
.
width
-
block_size
.
width
)
%
block_stride
.
width
==
0
&&
(
win_size
.
height
-
block_size
.
height
)
%
block_stride
.
height
==
0
);
CV_Assert
(
block_size
.
width
%
cell_size
.
width
==
0
&&
block_size
.
height
%
cell_size
.
height
==
0
);
CV_Assert
(
block_size
.
width
%
cell_size
.
width
==
0
&&
block_size
.
height
%
cell_size
.
height
==
0
);
CV_Assert
(
block_stride
==
cell_size
);
CV_Assert
(
cell_size
==
Size
(
8
,
8
));
Size
cells_per_block
=
Size
(
block_size
.
width
/
cell_size
.
width
,
block_size
.
height
/
cell_size
.
height
);
Size
cells_per_block
(
block_size
.
width
/
cell_size
.
width
,
block_size
.
height
/
cell_size
.
height
);
CV_Assert
(
cells_per_block
==
Size
(
2
,
2
));
cv
::
Size
blocks_per_win
=
numPartsWithin
(
win_size
,
block_size
,
block_stride
);
hog
::
set_up_constants
(
nbins
,
block_stride
.
width
,
block_stride
.
height
,
blocks_per_win
.
width
,
blocks_per_win
.
height
);
hog
::
set_up_constants
(
nbins
,
block_stride
.
width
,
block_stride
.
height
,
blocks_per_win
.
width
,
blocks_per_win
.
height
);
effect_size
=
Size
(
0
,
0
);
if
(
queryDeviceInfo
<
IS_CPU_DEVICE
,
bool
>
())
hog_device_cpu
=
true
;
else
hog_device_cpu
=
false
;
}
size_t
cv
::
ocl
::
HOGDescriptor
::
getDescriptorSize
()
const
...
...
@@ -154,7 +267,8 @@ size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
size_t
cv
::
ocl
::
HOGDescriptor
::
getBlockHistogramSize
()
const
{
Size
cells_per_block
=
Size
(
block_size
.
width
/
cell_size
.
width
,
block_size
.
height
/
cell_size
.
height
);
Size
cells_per_block
=
Size
(
block_size
.
width
/
cell_size
.
width
,
block_size
.
height
/
cell_size
.
height
);
return
(
size_t
)(
nbins
*
cells_per_block
.
area
());
}
...
...
@@ -167,7 +281,8 @@ bool cv::ocl::HOGDescriptor::checkDetectorSize() const
{
size_t
detector_size
=
detector
.
rows
*
detector
.
cols
;
size_t
descriptor_size
=
getDescriptorSize
();
return
detector_size
==
0
||
detector_size
==
descriptor_size
||
detector_size
==
descriptor_size
+
1
;
return
detector_size
==
0
||
detector_size
==
descriptor_size
||
detector_size
==
descriptor_size
+
1
;
}
void
cv
::
ocl
::
HOGDescriptor
::
setSVMDetector
(
const
vector
<
float
>
&
_detector
)
...
...
@@ -207,10 +322,16 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
const
size_t
block_hist_size
=
getBlockHistogramSize
();
const
Size
blocks_per_img
=
numPartsWithin
(
img
.
size
(),
block_size
,
block_stride
);
block_hists
.
create
(
1
,
static_cast
<
int
>
(
block_hist_size
*
blocks_per_img
.
area
()),
CV_32F
);
block_hists
.
create
(
1
,
static_cast
<
int
>
(
block_hist_size
*
blocks_per_img
.
area
())
+
256
,
CV_32F
);
Size
wins_per_img
=
numPartsWithin
(
img
.
size
(),
win_size
,
win_stride
);
labels
.
create
(
1
,
wins_per_img
.
area
(),
CV_8U
);
vector
<
float
>
v_lut
=
vector
<
float
>
(
gaussian_interp_lut
,
gaussian_interp_lut
+
sizeof
(
gaussian_interp_lut
)
/
sizeof
(
gaussian_interp_lut
[
0
]));
Mat
m_lut
(
v_lut
);
gauss_w_lut
.
upload
(
m_lut
.
reshape
(
1
,
1
));
}
void
cv
::
ocl
::
HOGDescriptor
::
computeGradient
(
const
oclMat
&
img
,
oclMat
&
grad
,
oclMat
&
qangle
)
...
...
@@ -221,29 +342,34 @@ void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oc
switch
(
img
.
type
())
{
case
CV_8UC1
:
hog
::
compute_gradients_8UC1
(
effect_size
.
height
,
effect_size
.
width
,
img
,
angleScale
,
grad
,
qangle
,
gamma_correction
);
hog
::
compute_gradients_8UC1
(
effect_size
.
height
,
effect_size
.
width
,
img
,
angleScale
,
grad
,
qangle
,
gamma_correction
);
break
;
case
CV_8UC4
:
hog
::
compute_gradients_8UC4
(
effect_size
.
height
,
effect_size
.
width
,
img
,
angleScale
,
grad
,
qangle
,
gamma_correction
);
hog
::
compute_gradients_8UC4
(
effect_size
.
height
,
effect_size
.
width
,
img
,
angleScale
,
grad
,
qangle
,
gamma_correction
);
break
;
}
}
void
cv
::
ocl
::
HOGDescriptor
::
computeBlockHistograms
(
const
oclMat
&
img
)
{
computeGradient
(
img
,
grad
,
qangle
);
computeGradient
(
img
,
this
->
grad
,
this
->
qangle
);
hog
::
compute_hists
(
nbins
,
block_stride
.
width
,
block_stride
.
height
,
effect_size
.
height
,
effect_size
.
width
,
grad
,
qangle
,
(
float
)
getWinSigma
()
,
block_hists
);
hog
::
compute_hists
(
nbins
,
block_stride
.
width
,
block_stride
.
height
,
effect_size
.
height
,
effect_size
.
width
,
(
float
)
getWinSigma
(),
grad
,
qangle
,
gauss_w_lut
,
block_hists
);
hog
::
normalize_hists
(
nbins
,
block_stride
.
width
,
block_stride
.
height
,
effect_size
.
height
,
effect_size
.
width
,
block_hists
,
(
float
)
threshold_L2hys
);
hog
::
normalize_hists
(
nbins
,
block_stride
.
width
,
block_stride
.
height
,
effect_size
.
height
,
effect_size
.
width
,
block_hists
,
(
float
)
threshold_L2hys
);
}
void
cv
::
ocl
::
HOGDescriptor
::
getDescriptors
(
const
oclMat
&
img
,
Size
win_stride
,
oclMat
&
descriptors
,
int
descr_format
)
void
cv
::
ocl
::
HOGDescriptor
::
getDescriptors
(
const
oclMat
&
img
,
Size
win_stride
,
oclMat
&
descriptors
,
int
descr_format
)
{
CV_Assert
(
win_stride
.
width
%
block_stride
.
width
==
0
&&
win_stride
.
height
%
block_stride
.
height
==
0
);
CV_Assert
(
win_stride
.
width
%
block_stride
.
width
==
0
&&
win_stride
.
height
%
block_stride
.
height
==
0
);
init_buffer
(
img
,
win_stride
);
...
...
@@ -253,17 +379,20 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
Size
blocks_per_win
=
numPartsWithin
(
win_size
,
block_size
,
block_stride
);
Size
wins_per_img
=
numPartsWithin
(
effect_size
,
win_size
,
win_stride
);
descriptors
.
create
(
wins_per_img
.
area
(),
static_cast
<
int
>
(
blocks_per_win
.
area
()
*
block_hist_size
),
CV_32F
);
descriptors
.
create
(
wins_per_img
.
area
(),
static_cast
<
int
>
(
blocks_per_win
.
area
()
*
block_hist_size
),
CV_32F
);
switch
(
descr_format
)
{
case
DESCR_FORMAT_ROW_BY_ROW
:
hog
::
extract_descrs_by_rows
(
win_size
.
height
,
win_size
.
width
,
block_stride
.
height
,
block_stride
.
width
,
win_stride
.
height
,
win_stride
.
width
,
effect_size
.
height
,
effect_size
.
width
,
block_hists
,
descriptors
);
hog
::
extract_descrs_by_rows
(
win_size
.
height
,
win_size
.
width
,
block_stride
.
height
,
block_stride
.
width
,
win_stride
.
height
,
win_stride
.
width
,
effect_size
.
height
,
effect_size
.
width
,
block_hists
,
descriptors
);
break
;
case
DESCR_FORMAT_COL_BY_COL
:
hog
::
extract_descrs_by_cols
(
win_size
.
height
,
win_size
.
width
,
block_stride
.
height
,
block_stride
.
width
,
win_stride
.
height
,
win_stride
.
width
,
effect_size
.
height
,
effect_size
.
width
,
block_hists
,
descriptors
);
hog
::
extract_descrs_by_cols
(
win_size
.
height
,
win_size
.
width
,
block_stride
.
height
,
block_stride
.
width
,
win_stride
.
height
,
win_stride
.
width
,
effect_size
.
height
,
effect_size
.
width
,
block_hists
,
descriptors
);
break
;
default
:
CV_Error
(
CV_StsBadArg
,
"Unknown descriptor format"
);
...
...
@@ -271,7 +400,8 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
}
void
cv
::
ocl
::
HOGDescriptor
::
detect
(
const
oclMat
&
img
,
vector
<
Point
>
&
hits
,
double
hit_threshold
,
Size
win_stride
,
Size
padding
)
void
cv
::
ocl
::
HOGDescriptor
::
detect
(
const
oclMat
&
img
,
vector
<
Point
>
&
hits
,
double
hit_threshold
,
Size
win_stride
,
Size
padding
)
{
CV_Assert
(
img
.
type
()
==
CV_8UC1
||
img
.
type
()
==
CV_8UC4
);
CV_Assert
(
padding
==
Size
(
0
,
0
));
...
...
@@ -283,14 +413,16 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, doub
if
(
win_stride
==
Size
())
win_stride
=
block_stride
;
else
CV_Assert
(
win_stride
.
width
%
block_stride
.
width
==
0
&&
win_stride
.
height
%
block_stride
.
height
==
0
);
CV_Assert
(
win_stride
.
width
%
block_stride
.
width
==
0
&&
win_stride
.
height
%
block_stride
.
height
==
0
);
init_buffer
(
img
,
win_stride
);
computeBlockHistograms
(
img
);
hog
::
classify_hists
(
win_size
.
height
,
win_size
.
width
,
block_stride
.
height
,
block_stride
.
width
,
win_stride
.
height
,
win_stride
.
width
,
effect_size
.
height
,
effect_size
.
width
,
block_hists
,
detector
,
(
float
)
free_coef
,
(
float
)
hit_threshold
,
labels
);
hog
::
classify_hists
(
win_size
.
height
,
win_size
.
width
,
block_stride
.
height
,
block_stride
.
width
,
win_stride
.
height
,
win_stride
.
width
,
effect_size
.
height
,
effect_size
.
width
,
block_hists
,
detector
,
(
float
)
free_coef
,
(
float
)
hit_threshold
,
labels
);
labels
.
download
(
labels_host
);
unsigned
char
*
vec
=
labels_host
.
ptr
();
...
...
@@ -306,8 +438,9 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, doub
void
cv
::
ocl
::
HOGDescriptor
::
detectMultiScale
(
const
oclMat
&
img
,
vector
<
Rect
>
&
found_locations
,
double
hit_threshold
,
Size
win_stride
,
Size
padding
,
double
scale0
,
int
group_threshold
)
void
cv
::
ocl
::
HOGDescriptor
::
detectMultiScale
(
const
oclMat
&
img
,
vector
<
Rect
>
&
found_locations
,
double
hit_threshold
,
Size
win_stride
,
Size
padding
,
double
scale0
,
int
group_threshold
)
{
CV_Assert
(
img
.
type
()
==
CV_8UC1
||
img
.
type
()
==
CV_8UC4
);
CV_Assert
(
scale0
>
1
);
...
...
@@ -333,7 +466,8 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &f
if
(
win_stride
==
Size
())
win_stride
=
block_stride
;
else
CV_Assert
(
win_stride
.
width
%
block_stride
.
width
==
0
&&
win_stride
.
height
%
block_stride
.
height
==
0
);
CV_Assert
(
win_stride
.
width
%
block_stride
.
width
==
0
&&
win_stride
.
height
%
block_stride
.
height
==
0
);
init_buffer
(
img
,
win_stride
);
image_scale
.
create
(
img
.
size
(),
img
.
type
());
...
...
@@ -347,16 +481,18 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &f
}
else
{
hog
::
resize
(
img
,
image_scale
,
effect_size
);
resize
(
img
,
image_scale
,
effect_size
);
detect
(
image_scale
,
locations
,
hit_threshold
,
win_stride
,
padding
);
}
Size
scaled_win_size
(
cvRound
(
win_size
.
width
*
scale
),
cvRound
(
win_size
.
height
*
scale
));
Size
scaled_win_size
(
cvRound
(
win_size
.
width
*
scale
),
cvRound
(
win_size
.
height
*
scale
));
for
(
size_t
j
=
0
;
j
<
locations
.
size
();
j
++
)
all_candidates
.
push_back
(
Rect
(
Point2d
((
CvPoint
)
locations
[
j
])
*
scale
,
scaled_win_size
));
all_candidates
.
push_back
(
Rect
(
Point2d
((
CvPoint
)
locations
[
j
])
*
scale
,
scaled_win_size
));
}
found_locations
.
assign
(
all_candidates
.
begin
(),
all_candidates
.
end
());
groupRectangles
(
found_locations
,
group_threshold
,
0.2
/*magic number copied from CPU version*/
);
groupRectangles
(
found_locations
,
group_threshold
,
0.2
);
}
int
cv
::
ocl
::
HOGDescriptor
::
numPartsWithin
(
int
size
,
int
part_size
,
int
stride
)
...
...
@@ -364,9 +500,11 @@ int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
return
(
size
-
part_size
+
stride
)
/
stride
;
}
cv
::
Size
cv
::
ocl
::
HOGDescriptor
::
numPartsWithin
(
cv
::
Size
size
,
cv
::
Size
part_size
,
cv
::
Size
stride
)
cv
::
Size
cv
::
ocl
::
HOGDescriptor
::
numPartsWithin
(
cv
::
Size
size
,
cv
::
Size
part_size
,
cv
::
Size
stride
)
{
return
Size
(
numPartsWithin
(
size
.
width
,
part_size
.
width
,
stride
.
width
),
numPartsWithin
(
size
.
height
,
part_size
.
height
,
stride
.
height
));
return
Size
(
numPartsWithin
(
size
.
width
,
part_size
.
width
,
stride
.
width
),
numPartsWithin
(
size
.
height
,
part_size
.
height
,
stride
.
height
));
}
std
::
vector
<
float
>
cv
::
ocl
::
HOGDescriptor
::
getDefaultPeopleDetector
()
...
...
@@ -1547,8 +1685,9 @@ static int power_2up(unsigned int n)
return
-
1
;
// Input is too big
}
void
cv
::
ocl
::
device
::
hog
::
set_up_constants
(
int
nbins
,
int
block_stride_x
,
int
block_stride_y
,
int
nblocks_win_x
,
int
nblocks_win_y
)
void
cv
::
ocl
::
device
::
hog
::
set_up_constants
(
int
nbins
,
int
block_stride_x
,
int
block_stride_y
,
int
nblocks_win_x
,
int
nblocks_win_y
)
{
cnbins
=
nbins
;
cblock_stride_x
=
block_stride_x
;
...
...
@@ -1559,53 +1698,32 @@ void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int b
int
block_hist_size
=
nbins
*
CELLS_PER_BLOCK_X
*
CELLS_PER_BLOCK_Y
;
cblock_hist_size
=
block_hist_size
;
int
block_hist_size_2up
=
power_2up
(
block_hist_size
);
cblock_hist_size_2up
=
block_hist_size_2up
;
int
descr_width
=
nblocks_win_x
*
block_hist_size
;
cdescr_width
=
descr_width
;
cdescr_height
=
nblocks_win_y
;
int
descr_size
=
descr_width
*
nblocks_win_y
;
cdescr_size
=
descr_size
;
}
static
inline
int
divUp
(
int
total
,
int
grain
)
{
return
(
total
+
grain
-
1
)
/
grain
;
}
static
void
openCLExecuteKernel_hog
(
Context
*
clCxt
,
const
char
**
source
,
string
kernelName
,
size_t
globalThreads
[
3
],
size_t
localThreads
[
3
],
vector
<
pair
<
size_t
,
const
void
*>
>
&
args
)
{
cl_kernel
kernel
=
openCLGetKernelFromSource
(
clCxt
,
source
,
kernelName
);
size_t
wave_size
=
queryDeviceInfo
<
WAVEFRONT_SIZE
,
size_t
>
(
kernel
);
openCLSafeCall
(
clReleaseKernel
(
kernel
));
if
(
wave_size
<=
16
)
{
char
build_options
[
64
];
sprintf
(
build_options
,
(
wave_size
==
16
)
?
"-D WAVE_SIZE_16"
:
"-D WAVE_SIZE_1"
);
openCLExecuteKernel
(
clCxt
,
source
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
}
else
openCLExecuteKernel
(
clCxt
,
source
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
compute_hists
(
int
nbins
,
int
block_stride_x
,
int
block_stride_y
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
grad
,
const
cv
::
ocl
::
oclMat
&
qangle
,
float
sigma
,
cv
::
ocl
::
oclMat
&
block_hists
)
void
cv
::
ocl
::
device
::
hog
::
compute_hists
(
int
nbins
,
int
block_stride_x
,
int
block_stride_y
,
int
height
,
int
width
,
float
sigma
,
const
cv
::
ocl
::
oclMat
&
grad
,
const
cv
::
ocl
::
oclMat
&
qangle
,
const
cv
::
ocl
::
oclMat
&
gauss_w_lut
,
cv
::
ocl
::
oclMat
&
block_hists
)
{
Context
*
clCxt
=
Context
::
getContext
();
string
kernelName
=
"compute_hists_kernel"
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
string
kernelName
=
(
sigma
==
4.0
f
)
?
"compute_hists_lut_kernel"
:
"compute_hists_kernel"
;
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
int
img_block_height
=
(
height
-
CELLS_PER_BLOCK_Y
*
CELL_HEIGHT
+
block_stride_y
)
/
block_stride_y
;
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
int
img_block_height
=
(
height
-
CELLS_PER_BLOCK_Y
*
CELL_HEIGHT
+
block_stride_y
)
/
block_stride_y
;
int
blocks_total
=
img_block_width
*
img_block_height
;
int
blocks_in_group
=
4
;
size_t
localThreads
[
3
]
=
{
blocks_in_group
*
24
,
2
,
1
};
size_t
globalThreads
[
3
]
=
{
divUp
(
blocks_total
,
blocks_in_group
)
*
localThreads
[
0
],
2
,
1
};
int
grad_quadstep
=
grad
.
step
>>
2
;
int
qangle_step
=
qangle
.
step
;
...
...
@@ -1613,6 +1731,11 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
// Precompute gaussian spatial window parameter
float
scale
=
1.
f
/
(
2.
f
*
sigma
*
sigma
);
int
blocks_in_group
=
4
;
size_t
localThreads
[
3
]
=
{
blocks_in_group
*
24
,
2
,
1
};
size_t
globalThreads
[
3
]
=
{
divUp
(
img_block_width
*
img_block_height
,
blocks_in_group
)
*
localThreads
[
0
],
2
,
1
};
int
hists_size
=
(
nbins
*
CELLS_PER_BLOCK_X
*
CELLS_PER_BLOCK_Y
*
12
)
*
sizeof
(
float
);
int
final_hists_size
=
(
nbins
*
CELLS_PER_BLOCK_X
*
CELLS_PER_BLOCK_Y
)
*
sizeof
(
float
);
int
smem
=
(
hists_size
+
final_hists_size
)
*
blocks_in_group
;
...
...
@@ -1628,62 +1751,120 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
qangle_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
grad
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
qangle
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
scale
));
if
(
kernelName
.
compare
(
"compute_hists_lut_kernel"
)
==
0
)
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
gauss_w_lut
.
data
));
else
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
scale
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
block_hists
.
data
));
args
.
push_back
(
make_pair
(
smem
,
(
void
*
)
NULL
));
openCLExecuteKernel_hog
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
);
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
normalize_hists
(
int
nbins
,
int
block_stride_x
,
int
block_stride_y
,
int
height
,
int
width
,
cv
::
ocl
::
oclMat
&
block_hists
,
float
threshold
)
void
cv
::
ocl
::
device
::
hog
::
normalize_hists
(
int
nbins
,
int
block_stride_x
,
int
block_stride_y
,
int
height
,
int
width
,
cv
::
ocl
::
oclMat
&
block_hists
,
float
threshold
)
{
Context
*
clCxt
=
Context
::
getContext
();
string
kernelName
=
"normalize_hists_kernel"
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
string
kernelName
;
int
block_hist_size
=
nbins
*
CELLS_PER_BLOCK_X
*
CELLS_PER_BLOCK_Y
;
int
nthreads
=
power_2up
(
block_hist_size
);
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
int
img_block_height
=
(
height
-
CELLS_PER_BLOCK_Y
*
CELL_HEIGHT
+
block_stride_y
)
/
block_stride_y
;
size_t
globalThreads
[
3
]
=
{
img_block_width
*
nthreads
,
img_block_height
,
1
};
size_t
localThreads
[
3
]
=
{
nthreads
,
1
,
1
};
if
((
nthreads
<
32
)
||
(
nthreads
>
512
)
)
cv
::
ocl
::
error
(
"normalize_hists: histogram's size is too small or too big"
,
__FILE__
,
__LINE__
,
"normalize_hists"
);
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
int
img_block_height
=
(
height
-
CELLS_PER_BLOCK_Y
*
CELL_HEIGHT
+
block_stride_y
)
/
block_stride_y
;
int
nthreads
;
size_t
globalThreads
[
3
]
=
{
1
,
1
,
1
};
size_t
localThreads
[
3
]
=
{
1
,
1
,
1
};
if
(
nbins
==
9
)
{
/* optimized for the case of 9 bins */
kernelName
=
"normalize_hists_36_kernel"
;
int
blocks_in_group
=
NTHREADS
/
block_hist_size
;
nthreads
=
blocks_in_group
*
block_hist_size
;
int
num_groups
=
divUp
(
img_block_width
*
img_block_height
,
blocks_in_group
);
globalThreads
[
0
]
=
nthreads
*
num_groups
;
localThreads
[
0
]
=
nthreads
;
}
else
{
kernelName
=
"normalize_hists_kernel"
;
nthreads
=
power_2up
(
block_hist_size
);
globalThreads
[
0
]
=
img_block_width
*
nthreads
;
globalThreads
[
1
]
=
img_block_height
;
localThreads
[
0
]
=
nthreads
;
if
((
nthreads
<
32
)
||
(
nthreads
>
512
)
)
cv
::
ocl
::
error
(
"normalize_hists: histogram's size is too small or too big"
,
__FILE__
,
__LINE__
,
"normalize_hists"
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
nthreads
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
block_hist_size
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
img_block_width
));
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
nthreads
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
block_hist_size
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
img_block_width
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
block_hists
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
threshold
));
args
.
push_back
(
make_pair
(
nthreads
*
sizeof
(
float
),
(
void
*
)
NULL
));
openCLExecuteKernel_hog
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
);
if
(
hog_device_cpu
)
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
"-D CPU"
);
else
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
classify_hists
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
block_hists
,
const
cv
::
ocl
::
oclMat
&
coefs
,
float
free_coef
,
float
threshold
,
cv
::
ocl
::
oclMat
&
labels
)
void
cv
::
ocl
::
device
::
hog
::
classify_hists
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
block_hists
,
const
cv
::
ocl
::
oclMat
&
coefs
,
float
free_coef
,
float
threshold
,
cv
::
ocl
::
oclMat
&
labels
)
{
Context
*
clCxt
=
Context
::
getContext
();
string
kernelName
=
"classify_hists_kernel"
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
int
nthreads
;
string
kernelName
;
switch
(
cdescr_width
)
{
case
180
:
nthreads
=
180
;
kernelName
=
"classify_hists_180_kernel"
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cdescr_width
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cdescr_height
));
break
;
case
252
:
nthreads
=
256
;
kernelName
=
"classify_hists_252_kernel"
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cdescr_width
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cdescr_height
));
break
;
default
:
nthreads
=
256
;
kernelName
=
"classify_hists_kernel"
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cdescr_size
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cdescr_width
));
}
int
win_block_stride_x
=
win_stride_x
/
block_stride_x
;
int
win_block_stride_y
=
win_stride_y
/
block_stride_y
;
int
img_win_width
=
(
width
-
win_width
+
win_stride_x
)
/
win_stride_x
;
int
img_win_height
=
(
height
-
win_height
+
win_stride_y
)
/
win_stride_y
;
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
size_t
globalThreads
[
3
]
=
{
img_win_width
*
NTHREADS
,
img_win_height
,
1
};
size_t
localThreads
[
3
]
=
{
NTHREADS
,
1
,
1
};
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
size_t
globalThreads
[
3
]
=
{
img_win_width
*
nthreads
,
img_win_height
,
1
};
size_t
localThreads
[
3
]
=
{
nthreads
,
1
,
1
};
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cblock_hist_size
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cdescr_size
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cdescr_width
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
img_win_width
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
img_block_width
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
win_block_stride_x
));
...
...
@@ -1694,12 +1875,20 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int blo
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
threshold
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
labels
.
data
));
openCLExecuteKernel_hog
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
);
if
(
hog_device_cpu
)
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
"-D CPU"
);
else
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
extract_descrs_by_rows
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
block_hists
,
cv
::
ocl
::
oclMat
&
descriptors
)
void
cv
::
ocl
::
device
::
hog
::
extract_descrs_by_rows
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
block_hists
,
cv
::
ocl
::
oclMat
&
descriptors
)
{
Context
*
clCxt
=
Context
::
getContext
();
string
kernelName
=
"extract_descrs_by_rows_kernel"
;
...
...
@@ -1709,7 +1898,8 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
int
win_block_stride_y
=
win_stride_y
/
block_stride_y
;
int
img_win_width
=
(
width
-
win_width
+
win_stride_x
)
/
win_stride_x
;
int
img_win_height
=
(
height
-
win_height
+
win_stride_y
)
/
win_stride_y
;
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
int
descriptors_quadstep
=
descriptors
.
step
>>
2
;
size_t
globalThreads
[
3
]
=
{
img_win_width
*
NTHREADS
,
img_win_height
,
1
};
...
...
@@ -1725,12 +1915,16 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
block_hists
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
descriptors
.
data
));
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
extract_descrs_by_cols
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
block_hists
,
cv
::
ocl
::
oclMat
&
descriptors
)
void
cv
::
ocl
::
device
::
hog
::
extract_descrs_by_cols
(
int
win_height
,
int
win_width
,
int
block_stride_y
,
int
block_stride_x
,
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
block_hists
,
cv
::
ocl
::
oclMat
&
descriptors
)
{
Context
*
clCxt
=
Context
::
getContext
();
string
kernelName
=
"extract_descrs_by_cols_kernel"
;
...
...
@@ -1740,7 +1934,8 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
int
win_block_stride_y
=
win_stride_y
/
block_stride_y
;
int
img_win_width
=
(
width
-
win_width
+
win_stride_x
)
/
win_stride_x
;
int
img_win_height
=
(
height
-
win_height
+
win_stride_y
)
/
win_stride_y
;
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
int
descriptors_quadstep
=
descriptors
.
step
>>
2
;
size_t
globalThreads
[
3
]
=
{
img_win_width
*
NTHREADS
,
img_win_height
,
1
};
...
...
@@ -1757,11 +1952,16 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
block_hists
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
descriptors
.
data
));
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
compute_gradients_8UC1
(
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
img
,
float
angle_scale
,
cv
::
ocl
::
oclMat
&
grad
,
cv
::
ocl
::
oclMat
&
qangle
,
bool
correct_gamma
)
void
cv
::
ocl
::
device
::
hog
::
compute_gradients_8UC1
(
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
img
,
float
angle_scale
,
cv
::
ocl
::
oclMat
&
grad
,
cv
::
ocl
::
oclMat
&
qangle
,
bool
correct_gamma
)
{
Context
*
clCxt
=
Context
::
getContext
();
string
kernelName
=
"compute_gradients_8UC1_kernel"
;
...
...
@@ -1786,11 +1986,16 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const c
args
.
push_back
(
make_pair
(
sizeof
(
cl_char
),
(
void
*
)
&
correctGamma
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cnbins
));
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
compute_gradients_8UC4
(
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
img
,
float
angle_scale
,
cv
::
ocl
::
oclMat
&
grad
,
cv
::
ocl
::
oclMat
&
qangle
,
bool
correct_gamma
)
void
cv
::
ocl
::
device
::
hog
::
compute_gradients_8UC4
(
int
height
,
int
width
,
const
cv
::
ocl
::
oclMat
&
img
,
float
angle_scale
,
cv
::
ocl
::
oclMat
&
grad
,
cv
::
ocl
::
oclMat
&
qangle
,
bool
correct_gamma
)
{
Context
*
clCxt
=
Context
::
getContext
();
string
kernelName
=
"compute_gradients_8UC4_kernel"
;
...
...
@@ -1816,39 +2021,6 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c
args
.
push_back
(
make_pair
(
sizeof
(
cl_char
),
(
void
*
)
&
correctGamma
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cnbins
));
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
void
cv
::
ocl
::
device
::
hog
::
resize
(
const
oclMat
&
src
,
oclMat
&
dst
,
const
Size
sz
)
{
CV_Assert
(
(
src
.
channels
()
==
dst
.
channels
())
);
Context
*
clCxt
=
Context
::
getContext
();
string
kernelName
=
(
src
.
type
()
==
CV_8UC1
)
?
"resize_8UC1_kernel"
:
"resize_8UC4_kernel"
;
size_t
blkSizeX
=
16
,
blkSizeY
=
16
;
size_t
glbSizeX
=
sz
.
width
%
blkSizeX
==
0
?
sz
.
width
:
(
sz
.
width
/
blkSizeX
+
1
)
*
blkSizeX
;
size_t
glbSizeY
=
sz
.
height
%
blkSizeY
==
0
?
sz
.
height
:
(
sz
.
height
/
blkSizeY
+
1
)
*
blkSizeY
;
size_t
globalThreads
[
3
]
=
{
glbSizeX
,
glbSizeY
,
1
};
size_t
localThreads
[
3
]
=
{
blkSizeX
,
blkSizeY
,
1
};
float
ifx
=
(
float
)
src
.
cols
/
sz
.
width
;
float
ify
=
(
float
)
src
.
rows
/
sz
.
height
;
int
src_step
=
static_cast
<
int
>
(
src
.
step
);
int
dst_step
=
static_cast
<
int
>
(
dst
.
step
);
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
sz
.
width
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
sz
.
height
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
ifx
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
ify
));
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
openCLExecuteKernel
(
clCxt
,
&
objdetect_hog
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
\ No newline at end of file
modules/ocl/src/opencl/objdetect_hog.cl
View file @
26c24614
...
...
@@ -43,7 +43,6 @@
//
//M*/
#
define
CELL_WIDTH
8
#
define
CELL_HEIGHT
8
#
define
CELLS_PER_BLOCK_X
2
...
...
@@ -51,6 +50,100 @@
#
define
NTHREADS
256
#
define
CV_PI_F
3.1415926535897932384626433832795f
//----------------------------------------------------------------------------
//
Histogram
computation
//
12
threads
for
a
cell,
12x4
threads
per
block
//
Use
pre-computed
gaussian
and
interp_weight
lookup
tables
if
sigma
is
4.0f
__kernel
void
compute_hists_lut_kernel
(
const
int
cblock_stride_x,
const
int
cblock_stride_y,
const
int
cnbins,
const
int
cblock_hist_size,
const
int
img_block_width,
const
int
blocks_in_group,
const
int
blocks_total,
const
int
grad_quadstep,
const
int
qangle_step,
__global
const
float*
grad,
__global
const
uchar*
qangle,
__global
const
float*
gauss_w_lut,
__global
float*
block_hists,
__local
float*
smem
)
{
const
int
lx
=
get_local_id
(
0
)
;
const
int
lp
=
lx
/
24
; /* local group id */
const
int
gid
=
get_group_id
(
0
)
*
blocks_in_group
+
lp
;/* global group id */
const
int
gidY
=
gid
/
img_block_width
;
const
int
gidX
=
gid
-
gidY
*
img_block_width
;
const
int
lidX
=
lx
-
lp
*
24
;
const
int
lidY
=
get_local_id
(
1
)
;
const
int
cell_x
=
lidX
/
12
;
const
int
cell_y
=
lidY
;
const
int
cell_thread_x
=
lidX
-
cell_x
*
12
;
__local
float*
hists
=
smem
+
lp
*
cnbins
*
(
CELLS_PER_BLOCK_X
*
CELLS_PER_BLOCK_Y
*
12
+
CELLS_PER_BLOCK_X
*
CELLS_PER_BLOCK_Y
)
;
__local
float*
final_hist
=
hists
+
cnbins
*
(
CELLS_PER_BLOCK_X
*
CELLS_PER_BLOCK_Y
*
12
)
;
const
int
offset_x
=
gidX
*
cblock_stride_x
+
(
cell_x
<<
2
)
+
cell_thread_x
;
const
int
offset_y
=
gidY
*
cblock_stride_y
+
(
cell_y
<<
2
)
;
__global
const
float*
grad_ptr
=
(
gid
<
blocks_total
)
?
grad
+
offset_y
*
grad_quadstep
+
(
offset_x
<<
1
)
:
grad
;
__global
const
uchar*
qangle_ptr
=
(
gid
<
blocks_total
)
?
qangle
+
offset_y
*
qangle_step
+
(
offset_x
<<
1
)
:
qangle
;
__local
float*
hist
=
hists
+
12
*
(
cell_y
*
CELLS_PER_BLOCK_Y
+
cell_x
)
+
cell_thread_x
;
for
(
int
bin_id
=
0
; bin_id < cnbins; ++bin_id)
hist[bin_id
*
48]
=
0.f
;
const
int
dist_x
=
-4
+
cell_thread_x
-
4
*
cell_x
;
const
int
dist_center_x
=
dist_x
-
4
*
(
1
-
2
*
cell_x
)
;
const
int
dist_y_begin
=
-4
-
4
*
lidY
;
for
(
int
dist_y
=
dist_y_begin
; dist_y < dist_y_begin + 12; ++dist_y)
{
float2
vote
=
(
float2
)
(
grad_ptr[0],
grad_ptr[1]
)
;
uchar2
bin
=
(
uchar2
)
(
qangle_ptr[0],
qangle_ptr[1]
)
;
grad_ptr
+=
grad_quadstep
;
qangle_ptr
+=
qangle_step
;
int
dist_center_y
=
dist_y
-
4
*
(
1
-
2
*
cell_y
)
;
int
idx
=
(
dist_center_y
+
8
)
*
16
+
(
dist_center_x
+
8
)
;
float
gaussian
=
gauss_w_lut[idx]
;
idx
=
(
dist_y
+
8
)
*
16
+
(
dist_x
+
8
)
;
float
interp_weight
=
gauss_w_lut[256+idx]
;
hist[bin.x
*
48]
+=
gaussian
*
interp_weight
*
vote.x
;
hist[bin.y
*
48]
+=
gaussian
*
interp_weight
*
vote.y
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
volatile
__local
float*
hist_
=
hist
;
for
(
int
bin_id
=
0
; bin_id < cnbins; ++bin_id, hist_ += 48)
{
if
(
cell_thread_x
<
6
)
hist_[0]
+=
hist_[6]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
cell_thread_x
<
3
)
hist_[0]
+=
hist_[3]
;
#
ifdef
CPU
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
if
(
cell_thread_x
==
0
)
final_hist[
(
cell_x
*
2
+
cell_y
)
*
cnbins
+
bin_id]
=
hist_[0]
+
hist_[1]
+
hist_[2]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
tid
=
(
cell_y
*
CELLS_PER_BLOCK_Y
+
cell_x
)
*
12
+
cell_thread_x
;
if
((
tid
<
cblock_hist_size
)
&&
(
gid
<
blocks_total
))
{
__global
float*
block_hist
=
block_hists
+
(
gidY
*
img_block_width
+
gidX
)
*
cblock_hist_size
;
block_hist[tid]
=
final_hist[tid]
;
}
}
//----------------------------------------------------------------------------
//
Histogram
computation
//
12
threads
for
a
cell,
12x4
threads
per
block
...
...
@@ -125,16 +218,14 @@ __kernel void compute_hists_kernel(
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
cell_thread_x
<
3
)
hist_[0]
+=
hist_[3]
;
#
ifdef
WAVE_SIZE_1
#
ifdef
CPU
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
if
(
cell_thread_x
==
0
)
final_hist[
(
cell_x
*
2
+
cell_y
)
*
cnbins
+
bin_id]
=
hist_[0]
+
hist_[1]
+
hist_[2]
;
}
#
ifdef
WAVE_SIZE_1
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
int
tid
=
(
cell_y
*
CELLS_PER_BLOCK_Y
+
cell_x
)
*
12
+
cell_thread_x
;
if
((
tid
<
cblock_hist_size
)
&&
(
gid
<
blocks_total
))
...
...
@@ -145,6 +236,57 @@ __kernel void compute_hists_kernel(
}
}
//-------------------------------------------------------------
//
Normalization
of
histograms
via
L2Hys_norm
//
optimized
for
the
case
of
9
bins
__kernel
void
normalize_hists_36_kernel
(
__global
float*
block_hists,
const
float
threshold,
__local
float
*squares
)
{
const
int
tid
=
get_local_id
(
0
)
;
const
int
gid
=
get_global_id
(
0
)
;
const
int
bid
=
tid
/
36
; /* block-hist id, (0 - 6) */
const
int
boffset
=
bid
*
36
; /* block-hist offset in the work-group */
const
int
hid
=
tid
-
boffset
; /* histogram bin id, (0 - 35) */
float
elem
=
block_hists[gid]
;
squares[tid]
=
elem
*
elem
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
__local
float*
smem
=
squares
+
boffset
;
float
sum
=
smem[hid]
;
if
(
hid
<
18
)
smem[hid]
=
sum
=
sum
+
smem[hid
+
18]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
hid
<
9
)
smem[hid]
=
sum
=
sum
+
smem[hid
+
9]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
hid
<
4
)
smem[hid]
=
sum
+
smem[hid
+
4]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
sum
=
smem[0]
+
smem[1]
+
smem[2]
+
smem[3]
+
smem[8]
;
elem
=
elem
/
(
sqrt
(
sum
)
+
3.6f
)
;
elem
=
min
(
elem,
threshold
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
squares[tid]
=
elem
*
elem
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
sum
=
smem[hid]
;
if
(
hid
<
18
)
smem[hid]
=
sum
=
sum
+
smem[hid
+
18]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
hid
<
9
)
smem[hid]
=
sum
=
sum
+
smem[hid
+
9]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
hid
<
4
)
smem[hid]
=
sum
+
smem[hid
+
4]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
sum
=
smem[0]
+
smem[1]
+
smem[2]
+
smem[3]
+
smem[8]
;
block_hists[gid]
=
elem
/
(
sqrt
(
sum
)
+
1e-3f
)
;
}
//-------------------------------------------------------------
//
Normalization
of
histograms
via
L2Hys_norm
//
...
...
@@ -153,76 +295,50 @@ float reduce_smem(volatile __local float* smem, int size)
unsigned
int
tid
=
get_local_id
(
0
)
;
float
sum
=
smem[tid]
;
if
(
size
>=
512
)
{
if
(
tid
<
256
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
256]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
size
>=
256
)
{
if
(
tid
<
128
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
128]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
size
>=
128
)
{
if
(
tid
<
64
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
64]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
size
>=
512
)
{
if
(
tid
<
256
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
256]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
; }
if
(
size
>=
256
)
{
if
(
tid
<
128
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
128]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
; }
if
(
size
>=
128
)
{
if
(
tid
<
64
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
64]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
; }
#
ifdef
CPU
if
(
size
>=
64
)
{
if
(
tid
<
32
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
32]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
; }
if
(
size
>=
32
)
{
if
(
tid
<
16
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
16]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
; }
if
(
size
>=
16
)
{
if
(
tid
<
8
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
8]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
; }
if
(
size
>=
8
)
{
if
(
tid
<
4
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
4]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
; }
if
(
size
>=
4
)
{
if
(
tid
<
2
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
2]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
; }
if
(
size
>=
2
)
{
if
(
tid
<
1
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
1]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
; }
#
else
if
(
tid
<
32
)
{
if
(
size
>=
64
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
32]
;
#
if
defined
(
WAVE_SIZE_16
)
|
| defined(WAVE_SIZE_1)
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
#endif
if
(
size
>=
32
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
16]
;
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
#endif
if
(
size
>=
16
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
8]
;
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
#endif
if
(
size
>=
8
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
4]
;
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
#endif
if
(
size
>=
4
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
2]
;
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
#endif
if
(
size
>=
2
)
smem[tid]
=
sum
=
sum
+
smem[tid
+
1]
;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = smem[0];
#
endif
return
sum
;
}
__kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width,
__global float* block_hists, const float threshold, __local float *squares)
__kernel
void
normalize_hists_kernel
(
const
int
nthreads,
const
int
block_hist_size,
const
int
img_block_width,
__global
float*
block_hists,
const
float
threshold,
__local
float
*squares
)
{
const
int
tid
=
get_local_id
(
0
)
;
const
int
gidX
=
get_group_id
(
0
)
;
const
int
gidY
=
get_group_id
(
1
)
;
__global float* hist = block_hists + (gidY * img_block_width + gidX) * block_hist_size + tid;
__global
float*
hist
=
block_hists
+
(
gidY
*
img_block_width
+
gidX
)
*
block_hist_size
+
tid
;
float
elem
=
0.f
;
if
(
tid
<
block_hist_size
)
...
...
@@ -249,100 +365,226 @@ __kernel void normalize_hists_kernel(const int nthreads, const int block_hist_si
//---------------------------------------------------------------------
//
Linear
SVM
based
classification
//
__kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width,
const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
__global const float * block_hists, __global const float* coefs,
float free_coef, float threshold, __global uchar* labels)
//
48x96
window,
9
bins
and
default
parameters
//
180
threads,
each
thread
corresponds
to
a
bin
in
a
row
__kernel
void
classify_hists_180_kernel
(
const
int
cdescr_width,
const
int
cdescr_height,
const
int
cblock_hist_size,
const
int
img_win_width,
const
int
img_block_width,
const
int
win_block_stride_x,
const
int
win_block_stride_y,
__global
const
float
*
block_hists,
__global
const
float*
coefs,
float
free_coef,
float
threshold,
__global
uchar*
labels
)
{
const
int
tid
=
get_local_id
(
0
)
;
const
int
gidX
=
get_group_id
(
0
)
;
const
int
gidY
=
get_group_id
(
1
)
;
__global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
__global
const
float*
hist
=
block_hists
+
(
gidY
*
win_block_stride_y
*
img_block_width
+
gidX
*
win_block_stride_x
)
*
cblock_hist_size
;
float
product
=
0.f
;
for (int i = tid; i < cdescr_size; i += NTHREADS)
for
(
int
i
=
0
; i < cdescr_height; i++)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
product
+=
coefs[i
*
cdescr_width
+
tid]
*
hist[i
*
img_block_width
*
cblock_hist_size
+
tid]
;
}
__local float products[
NTHREADS
];
__local
float
products[
180
]
;
products[tid]
=
product
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if (tid <
128) products[tid] = product = product + products[tid + 128
];
if
(
tid
<
90
)
products[tid]
=
product
=
product
+
products[tid
+
90
]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if (tid <
64) products[tid] = product = product + products[tid + 64
];
if
(
tid
<
45
)
products[tid]
=
product
=
product
+
products[tid
+
45
]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
volatile
__local
float*
smem
=
products
;
if (tid < 32)
#
ifdef
CPU
if
(
tid
<
13
)
smem[tid]
=
product
=
product
+
smem[tid
+
32]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
16
)
smem[tid]
=
product
=
product
+
smem[tid
+
16]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid<8
)
smem[tid]
=
product
=
product
+
smem[tid
+
8]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid<4
)
smem[tid]
=
product
=
product
+
smem[tid
+
4]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid<2
)
smem[tid]
=
product
=
product
+
smem[tid
+
2]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
else
if
(
tid
<
13
)
{
smem[tid]
=
product
=
product
+
smem[tid
+
32]
;
#if defined(WAVE_SIZE_16) |
|
defined
(
WAVE_SIZE_1
)
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
16
)
{
#
endif
smem[tid]
=
product
=
product
+
smem[tid
+
16]
;
#
ifdef
WAVE_SIZE_1
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
8
)
{
#
endif
smem[tid]
=
product
=
product
+
smem[tid
+
8]
;
#
ifdef
WAVE_SIZE_1
smem[tid]
=
product
=
product
+
smem[tid
+
4]
;
smem[tid]
=
product
=
product
+
smem[tid
+
2]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
4
)
{
#
endif
smem[tid]
=
product
=
product
+
smem[tid
+
4]
;
#
ifdef
WAVE_SIZE_1
if
(
tid
==
0
)
{
product
=
product
+
smem[tid
+
1]
;
labels[gidY
*
img_win_width
+
gidX]
=
(
product
+
free_coef
>=
threshold
)
;
}
}
//---------------------------------------------------------------------
//
Linear
SVM
based
classification
//
64x128
window,
9
bins
and
default
parameters
//
256
threads,
252
of
them
are
used
__kernel
void
classify_hists_252_kernel
(
const
int
cdescr_width,
const
int
cdescr_height,
const
int
cblock_hist_size,
const
int
img_win_width,
const
int
img_block_width,
const
int
win_block_stride_x,
const
int
win_block_stride_y,
__global
const
float
*
block_hists,
__global
const
float*
coefs,
float
free_coef,
float
threshold,
__global
uchar*
labels
)
{
const
int
tid
=
get_local_id
(
0
)
;
const
int
gidX
=
get_group_id
(
0
)
;
const
int
gidY
=
get_group_id
(
1
)
;
__global
const
float*
hist
=
block_hists
+
(
gidY
*
win_block_stride_y
*
img_block_width
+
gidX
*
win_block_stride_x
)
*
cblock_hist_size
;
float
product
=
0.f
;
if
(
tid
<
cdescr_width
)
{
for
(
int
i
=
0
; i < cdescr_height; i++)
product
+=
coefs[i
*
cdescr_width
+
tid]
*
hist[i
*
img_block_width
*
cblock_hist_size
+
tid]
;
}
__local
float
products[NTHREADS]
;
products[tid]
=
product
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
2
)
{
#
endif
if
(
tid
<
128
)
products[tid]
=
product
=
product
+
products[tid
+
128]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
64
)
products[tid]
=
product
=
product
+
products[tid
+
64]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
volatile
__local
float*
smem
=
products
;
#
ifdef
CPU
if
(
tid<32
)
smem[tid]
=
product
=
product
+
smem[tid
+
32]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid<16
)
smem[tid]
=
product
=
product
+
smem[tid
+
16]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid<8
)
smem[tid]
=
product
=
product
+
smem[tid
+
8]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid<4
)
smem[tid]
=
product
=
product
+
smem[tid
+
4]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid<2
)
smem[tid]
=
product
=
product
+
smem[tid
+
2]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
else
if
(
tid
<
32
)
{
smem[tid]
=
product
=
product
+
smem[tid
+
32]
;
smem[tid]
=
product
=
product
+
smem[tid
+
16]
;
smem[tid]
=
product
=
product
+
smem[tid
+
8]
;
smem[tid]
=
product
=
product
+
smem[tid
+
4]
;
smem[tid]
=
product
=
product
+
smem[tid
+
2]
;
#
ifdef
WAVE_SIZE_1
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
1
)
{
#
endif
smem[tid]
=
product
=
product
+
smem[tid
+
1]
;
if
(
tid
==
0
)
{
product
=
product
+
smem[tid
+
1]
;
labels[gidY
*
img_win_width
+
gidX]
=
(
product
+
free_coef
>=
threshold
)
;
}
}
//---------------------------------------------------------------------
//
Linear
SVM
based
classification
//
256
threads
__kernel
void
classify_hists_kernel
(
const
int
cdescr_size,
const
int
cdescr_width,
const
int
cblock_hist_size,
const
int
img_win_width,
const
int
img_block_width,
const
int
win_block_stride_x,
const
int
win_block_stride_y,
__global
const
float
*
block_hists,
__global
const
float*
coefs,
float
free_coef,
float
threshold,
__global
uchar*
labels
)
{
const
int
tid
=
get_local_id
(
0
)
;
const
int
gidX
=
get_group_id
(
0
)
;
const
int
gidY
=
get_group_id
(
1
)
;
__global
const
float*
hist
=
block_hists
+
(
gidY
*
win_block_stride_y
*
img_block_width
+
gidX
*
win_block_stride_x
)
*
cblock_hist_size
;
float
product
=
0.f
;
for
(
int
i
=
tid
; i < cdescr_size; i += NTHREADS)
{
int
offset_y
=
i
/
cdescr_width
;
int
offset_x
=
i
-
offset_y
*
cdescr_width
;
product
+=
coefs[i]
*
hist[offset_y
*
img_block_width
*
cblock_hist_size
+
offset_x]
;
}
if
(
tid
==
0
)
__local
float
products[NTHREADS]
;
products[tid]
=
product
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
128
)
products[tid]
=
product
=
product
+
products[tid
+
128]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid
<
64
)
products[tid]
=
product
=
product
+
products[tid
+
64]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
volatile
__local
float*
smem
=
products
;
#
ifdef
CPU
if
(
tid<32
)
smem[tid]
=
product
=
product
+
smem[tid
+
32]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid<16
)
smem[tid]
=
product
=
product
+
smem[tid
+
16]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid<8
)
smem[tid]
=
product
=
product
+
smem[tid
+
8]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid<4
)
smem[tid]
=
product
=
product
+
smem[tid
+
4]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
tid<2
)
smem[tid]
=
product
=
product
+
smem[tid
+
2]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
else
if
(
tid
<
32
)
{
smem[tid]
=
product
=
product
+
smem[tid
+
32]
;
smem[tid]
=
product
=
product
+
smem[tid
+
16]
;
smem[tid]
=
product
=
product
+
smem[tid
+
8]
;
smem[tid]
=
product
=
product
+
smem[tid
+
4]
;
smem[tid]
=
product
=
product
+
smem[tid
+
2]
;
}
#
endif
if
(
tid
==
0
)
{
smem[tid]
=
product
=
product
+
smem[tid
+
1]
;
labels[gidY
*
img_win_width
+
gidX]
=
(
product
+
free_coef
>=
threshold
)
;
}
}
//----------------------------------------------------------------------------
//
Extract
descriptors
__kernel
void
extract_descrs_by_rows_kernel
(
const
int
cblock_hist_size,
const
int
descriptors_quadstep,
const
int
cdescr_size,
const
int
cdescr_width,
const
int
img_block_width,
const
int
win_block_stride_x,
const
int
win_block_stride_y,
__global
const
float*
block_hists,
__global
float*
descriptors
)
__kernel
void
extract_descrs_by_rows_kernel
(
const
int
cblock_hist_size,
const
int
descriptors_quadstep,
const
int
cdescr_size,
const
int
cdescr_width,
const
int
img_block_width,
const
int
win_block_stride_x,
const
int
win_block_stride_y,
__global
const
float*
block_hists,
__global
float*
descriptors
)
{
int
tid
=
get_local_id
(
0
)
;
int
gidX
=
get_group_id
(
0
)
;
int
gidY
=
get_group_id
(
1
)
;
//
Get
left
top
corner
of
the
window
in
src
__global
const
float*
hist
=
block_hists
+
(
gidY
*
win_block_stride_y
*
img_block_width
+
gidX
*
win_block_stride_x
)
*
cblock_hist_size
;
__global
const
float*
hist
=
block_hists
+
(
gidY
*
win_block_stride_y
*
img_block_width
+
gidX
*
win_block_stride_x
)
*
cblock_hist_size
;
//
Get
left
top
corner
of
the
window
in
dst
__global
float*
descriptor
=
descriptors
+
(
gidY
*
get_num_groups
(
0
)
+
gidX
)
*
descriptors_quadstep
;
__global
float*
descriptor
=
descriptors
+
(
gidY
*
get_num_groups
(
0
)
+
gidX
)
*
descriptors_quadstep
;
//
Copy
elements
from
src
to
dst
for
(
int
i
=
tid
; i < cdescr_size; i += NTHREADS)
...
...
@@ -353,19 +595,23 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
}
}
__kernel
void
extract_descrs_by_cols_kernel
(
const
int
cblock_hist_size,
const
int
descriptors_quadstep,
const
int
cdescr_size,
const
int
cnblocks_win_x,
const
int
cnblocks_win_y,
const
int
img_block_width,
const
int
win_block_stride_x,
const
int
win_block_stride_y,
__global
const
float*
block_hists,
__global
float*
descriptors
)
__kernel
void
extract_descrs_by_cols_kernel
(
const
int
cblock_hist_size,
const
int
descriptors_quadstep,
const
int
cdescr_size,
const
int
cnblocks_win_x,
const
int
cnblocks_win_y,
const
int
img_block_width,
const
int
win_block_stride_x,
const
int
win_block_stride_y,
__global
const
float*
block_hists,
__global
float*
descriptors
)
{
int
tid
=
get_local_id
(
0
)
;
int
gidX
=
get_group_id
(
0
)
;
int
gidY
=
get_group_id
(
1
)
;
//
Get
left
top
corner
of
the
window
in
src
__global
const
float*
hist
=
block_hists
+
(
gidY
*
win_block_stride_y
*
img_block_width
+
gidX
*
win_block_stride_x
)
*
cblock_hist_size
;
__global
const
float*
hist
=
block_hists
+
(
gidY
*
win_block_stride_y
*
img_block_width
+
gidX
*
win_block_stride_x
)
*
cblock_hist_size
;
//
Get
left
top
corner
of
the
window
in
dst
__global
float*
descriptor
=
descriptors
+
(
gidY
*
get_num_groups
(
0
)
+
gidX
)
*
descriptors_quadstep
;
__global
float*
descriptor
=
descriptors
+
(
gidY
*
get_num_groups
(
0
)
+
gidX
)
*
descriptors_quadstep
;
//
Copy
elements
from
src
to
dst
for
(
int
i
=
tid
; i < cdescr_size; i += NTHREADS)
...
...
@@ -376,16 +622,19 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in
int
y
=
block_idx
/
cnblocks_win_x
;
int
x
=
block_idx
-
y
*
cnblocks_win_x
;
descriptor[
(
x
*
cnblocks_win_y
+
y
)
*
cblock_hist_size
+
idx_in_block]
=
hist[
(
y
*
img_block_width
+
x
)
*
cblock_hist_size
+
idx_in_block]
;
descriptor[
(
x
*
cnblocks_win_y
+
y
)
*
cblock_hist_size
+
idx_in_block]
=
hist[
(
y
*
img_block_width
+
x
)
*
cblock_hist_size
+
idx_in_block]
;
}
}
//----------------------------------------------------------------------------
//
Gradients
computation
__kernel
void
compute_gradients_8UC4_kernel
(
const
int
height,
const
int
width,
const
int
img_step,
const
int
grad_quadstep,
const
int
qangle_step,
const
__global
uchar4
*
img,
__global
float
*
grad,
__global
uchar
*
qangle,
const
float
angle_scale,
const
char
correct_gamma,
const
int
cnbins
)
__kernel
void
compute_gradients_8UC4_kernel
(
const
int
height,
const
int
width,
const
int
img_step,
const
int
grad_quadstep,
const
int
qangle_step,
const
__global
uchar4
*
img,
__global
float
*
grad,
__global
uchar
*
qangle,
const
float
angle_scale,
const
char
correct_gamma,
const
int
cnbins
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
tid
=
get_local_id
(
0
)
;
...
...
@@ -426,8 +675,10 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
width
)
{
float3
a
=
(
float3
)
(
sh_row[tid],
sh_row[tid
+
(
NTHREADS
+
2
)
],
sh_row[tid
+
2
*
(
NTHREADS
+
2
)
]
)
;
float3
b
=
(
float3
)
(
sh_row[tid
+
2],
sh_row[tid
+
2
+
(
NTHREADS
+
2
)
],
sh_row[tid
+
2
+
2
*
(
NTHREADS
+
2
)
]
)
;
float3
a
=
(
float3
)
(
sh_row[tid],
sh_row[tid
+
(
NTHREADS
+
2
)
],
sh_row[tid
+
2
*
(
NTHREADS
+
2
)
]
)
;
float3
b
=
(
float3
)
(
sh_row[tid
+
2],
sh_row[tid
+
2
+
(
NTHREADS
+
2
)
],
sh_row[tid
+
2
+
2
*
(
NTHREADS
+
2
)
]
)
;
float3
dx
;
if
(
correct_gamma
==
1
)
...
...
@@ -482,9 +733,11 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
}
}
__kernel
void
compute_gradients_8UC1_kernel
(
const
int
height,
const
int
width,
const
int
img_step,
const
int
grad_quadstep,
const
int
qangle_step,
__global
const
uchar
*
img,
__global
float
*
grad,
__global
uchar
*
qangle,
const
float
angle_scale,
const
char
correct_gamma,
const
int
cnbins
)
__kernel
void
compute_gradients_8UC1_kernel
(
const
int
height,
const
int
width,
const
int
img_step,
const
int
grad_quadstep,
const
int
qangle_step,
__global
const
uchar
*
img,
__global
float
*
grad,
__global
uchar
*
qangle,
const
float
angle_scale,
const
char
correct_gamma,
const
int
cnbins
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
tid
=
get_local_id
(
0
)
;
...
...
@@ -539,43 +792,4 @@ __kernel void compute_gradients_8UC1_kernel(const int height, const int width, c
grad[
(
gidY
*
grad_quadstep
+
x
)
<<
1
]
=
mag
*
(
1.f
-
ang
)
;
grad[
((
gidY
*
grad_quadstep
+
x
)
<<
1
)
+
1
]
=
mag
*
ang
;
}
}
//----------------------------------------------------------------------------
//
Resize
__kernel
void
resize_8UC4_kernel
(
__global
uchar4
*
dst,
__global
const
uchar4
*
src,
int
dst_offset,
int
src_offset,
int
dst_step,
int
src_step,
int
src_cols,
int
src_rows,
int
dst_cols,
int
dst_rows,
float
ifx,
float
ify
)
{
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
int
sx
=
(
int
)
floor
(
dx*ifx+0.5f
)
;
int
sy
=
(
int
)
floor
(
dy*ify+0.5f
)
;
sx
=
min
(
sx,
src_cols-1
)
;
sy
=
min
(
sy,
src_rows-1
)
;
int
dpos
=
(
dst_offset>>2
)
+
dy
*
(
dst_step>>2
)
+
dx
;
int
spos
=
(
src_offset>>2
)
+
sy
*
(
src_step>>2
)
+
sx
;
if
(
dx<dst_cols
&&
dy<dst_rows
)
dst[dpos]
=
src[spos]
;
}
__kernel
void
resize_8UC1_kernel
(
__global
uchar
*
dst,
__global
const
uchar
*
src,
int
dst_offset,
int
src_offset,
int
dst_step,
int
src_step,
int
src_cols,
int
src_rows,
int
dst_cols,
int
dst_rows,
float
ifx,
float
ify
)
{
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
int
sx
=
(
int
)
floor
(
dx*ifx+0.5f
)
;
int
sy
=
(
int
)
floor
(
dy*ify+0.5f
)
;
sx
=
min
(
sx,
src_cols-1
)
;
sy
=
min
(
sy,
src_rows-1
)
;
int
dpos
=
dst_offset
+
dy
*
dst_step
+
dx
;
int
spos
=
src_offset
+
sy
*
src_step
+
sx
;
if
(
dx<dst_cols
&&
dy<dst_rows
)
dst[dpos]
=
src[spos]
;
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment