Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
22146e4b
Commit
22146e4b
authored
Feb 04, 2014
by
Andrey Pavlenko
Committed by
OpenCV Buildbot
Feb 04, 2014
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #2234 from KonstantinMatskevich:ocl_tapi_hog
parents
bd6620fa
3495c595
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
1526 additions
and
18 deletions
+1526
-18
objdetect.hpp
modules/objdetect/include/opencv2/objdetect.hpp
+14
-10
perf_hogdetect.cpp
modules/objdetect/perf/opencl/perf_hogdetect.cpp
+94
-0
hog.cpp
modules/objdetect/src/hog.cpp
+571
-8
objdetect_hog.cl
modules/objdetect/src/opencl/objdetect_hog.cl
+726
-0
test_hogdetector.cpp
modules/objdetect/test/opencl/test_hogdetector.cpp
+121
-0
No files found.
modules/objdetect/include/opencv2/objdetect.hpp
View file @
22146e4b
...
...
@@ -285,10 +285,11 @@ public:
CV_WRAP
virtual
void
save
(
const
String
&
filename
,
const
String
&
objname
=
String
())
const
;
virtual
void
copyTo
(
HOGDescriptor
&
c
)
const
;
CV_WRAP
virtual
void
compute
(
const
Mat
&
img
,
CV_WRAP
virtual
void
compute
(
InputArray
img
,
CV_OUT
std
::
vector
<
float
>&
descriptors
,
Size
winStride
=
Size
(),
Size
padding
=
Size
(),
const
std
::
vector
<
Point
>&
locations
=
std
::
vector
<
Point
>
())
const
;
//with found weights output
CV_WRAP
virtual
void
detect
(
const
Mat
&
img
,
CV_OUT
std
::
vector
<
Point
>&
foundLocations
,
CV_OUT
std
::
vector
<
double
>&
weights
,
...
...
@@ -300,13 +301,14 @@ public:
double
hitThreshold
=
0
,
Size
winStride
=
Size
(),
Size
padding
=
Size
(),
const
std
::
vector
<
Point
>&
searchLocations
=
std
::
vector
<
Point
>
())
const
;
//with result weights output
CV_WRAP
virtual
void
detectMultiScale
(
const
Mat
&
img
,
CV_OUT
std
::
vector
<
Rect
>&
foundLocations
,
CV_WRAP
virtual
void
detectMultiScale
(
InputArray
img
,
CV_OUT
std
::
vector
<
Rect
>&
foundLocations
,
CV_OUT
std
::
vector
<
double
>&
foundWeights
,
double
hitThreshold
=
0
,
Size
winStride
=
Size
(),
Size
padding
=
Size
(),
double
scale
=
1.05
,
double
finalThreshold
=
2.0
,
bool
useMeanshiftGrouping
=
false
)
const
;
//without found weights output
virtual
void
detectMultiScale
(
const
Mat
&
img
,
CV_OUT
std
::
vector
<
Rect
>&
foundLocations
,
virtual
void
detectMultiScale
(
InputArray
img
,
CV_OUT
std
::
vector
<
Rect
>&
foundLocations
,
double
hitThreshold
=
0
,
Size
winStride
=
Size
(),
Size
padding
=
Size
(),
double
scale
=
1.05
,
double
finalThreshold
=
2.0
,
bool
useMeanshiftGrouping
=
false
)
const
;
...
...
@@ -328,25 +330,27 @@ public:
CV_PROP
double
L2HysThreshold
;
CV_PROP
bool
gammaCorrection
;
CV_PROP
std
::
vector
<
float
>
svmDetector
;
UMat
oclSvmDetector
;
float
free_coef
;
CV_PROP
int
nlevels
;
// evaluate specified ROI and return confidence value for each location
virtual
void
detectROI
(
const
cv
::
Mat
&
img
,
const
std
::
vector
<
cv
::
Point
>
&
locations
,
// evaluate specified ROI and return confidence value for each location
virtual
void
detectROI
(
const
cv
::
Mat
&
img
,
const
std
::
vector
<
cv
::
Point
>
&
locations
,
CV_OUT
std
::
vector
<
cv
::
Point
>&
foundLocations
,
CV_OUT
std
::
vector
<
double
>&
confidences
,
double
hitThreshold
=
0
,
cv
::
Size
winStride
=
Size
(),
cv
::
Size
padding
=
Size
())
const
;
// evaluate specified ROI and return confidence value for each location in multiple scales
virtual
void
detectMultiScaleROI
(
const
cv
::
Mat
&
img
,
// evaluate specified ROI and return confidence value for each location in multiple scales
virtual
void
detectMultiScaleROI
(
const
cv
::
Mat
&
img
,
CV_OUT
std
::
vector
<
cv
::
Rect
>&
foundLocations
,
std
::
vector
<
DetectionROI
>&
locations
,
double
hitThreshold
=
0
,
int
groupThreshold
=
0
)
const
;
// read/parse Dalal's alt model file
void
readALTModel
(
String
modelfile
);
void
groupRectangles
(
std
::
vector
<
cv
::
Rect
>&
rectList
,
std
::
vector
<
double
>&
weights
,
int
groupThreshold
,
double
eps
)
const
;
// read/parse Dalal's alt model file
void
readALTModel
(
String
modelfile
);
void
groupRectangles
(
std
::
vector
<
cv
::
Rect
>&
rectList
,
std
::
vector
<
double
>&
weights
,
int
groupThreshold
,
double
eps
)
const
;
};
...
...
modules/objdetect/perf/opencl/perf_hogdetect.cpp
0 → 100644
View file @
22146e4b
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Fangfang Bai, fangfang@multicorewareinc.com
// Jin Ma, jin@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
#include "opencv2/ts/ocl_perf.hpp"
#ifdef HAVE_OPENCL
namespace
cvtest
{
namespace
ocl
{
///////////// HOG////////////////////////
struct
RectLess
:
public
std
::
binary_function
<
cv
::
Rect
,
cv
::
Rect
,
bool
>
{
bool
operator
()(
const
cv
::
Rect
&
a
,
const
cv
::
Rect
&
b
)
const
{
if
(
a
.
x
!=
b
.
x
)
return
a
.
x
<
b
.
x
;
else
if
(
a
.
y
!=
b
.
y
)
return
a
.
y
<
b
.
y
;
else
if
(
a
.
width
!=
b
.
width
)
return
a
.
width
<
b
.
width
;
else
return
a
.
height
<
b
.
height
;
}
};
// Performance test for HOGDescriptor::detectMultiScale on a UMat input.
// Reads gpu/hog/road.png as grayscale, runs detection with the default people
// detector inside OCL_TEST_CYCLE, then sorts the rectangles with RectLess and
// hands them to SANITY_CHECK.
OCL_PERF_TEST
(
HOGFixture
,
HOG
)
{
UMat
src
;
// Load through imread (a Mat) and copy into the UMat used as detector input.
imread
(
getDataPath
(
"gpu/hog/road.png"
),
cv
::
IMREAD_GRAYSCALE
).
copyTo
(
src
);
ASSERT_FALSE
(
src
.
empty
());
vector
<
cv
::
Rect
>
found_locations
;
declare
.
in
(
src
);
HOGDescriptor
hog
;
hog
.
setSVMDetector
(
hog
.
getDefaultPeopleDetector
());
// The statement following OCL_TEST_CYCLE() is the measured body.
OCL_TEST_CYCLE
()
hog
.
detectMultiScale
(
src
,
found_locations
);
// Sort so the result order does not depend on the order of detection.
std
::
sort
(
found_locations
.
begin
(),
found_locations
.
end
(),
RectLess
());
// NOTE(review): 1 + DBL_EPSILON is presumably the allowed deviation, and the
// variable name may be used by SANITY_CHECK as the record key - confirm
// against the perf framework before renaming found_locations.
SANITY_CHECK
(
found_locations
,
1
+
DBL_EPSILON
);
}
}
}
#endif
modules/objdetect/src/hog.cpp
View file @
22146e4b
...
...
@@ -42,6 +42,7 @@
#include "precomp.hpp"
#include "opencv2/core/core_c.h"
#include "opencl_kernels.hpp"
#include <cstdio>
#include <iterator>
...
...
@@ -58,6 +59,29 @@
namespace
cv
{
// Work-group width used for several of the OpenCL HOG kernel launches in this
// file (first component of localThreads).
#define NTHREADS 256
// Descriptor layout selector for the OpenCL compute path: ocl_compute uses it
// to choose between the column-by-column and row-by-row extraction kernels.
enum
{
DESCR_FORMAT_COL_BY_COL
,
DESCR_FORMAT_ROW_BY_ROW
};
// Counts how many parts of length `part_size` can be placed inside a span of
// length `size` when consecutive parts start `stride` apart.
// Uses plain integer division, so the result truncates toward zero.
static int numPartsWithin(int size, int part_size, int stride)
{
    const int usable_span = size - part_size + stride;
    return usable_span / stride;
}
// Two-dimensional variant: applies the scalar numPartsWithin independently to
// the width and the height and returns the pair of counts as a Size.
static Size numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride)
{
    const int parts_along_x = numPartsWithin(size.width, part_size.width, stride.width);
    const int parts_along_y = numPartsWithin(size.height, part_size.height, stride.height);
    return Size(parts_along_x, parts_along_y);
}
// Number of histogram values stored per block: `nbins` bins for every cell
// of the block, where the cell grid is block_size / cell_size (integer
// division in each dimension).
static size_t getBlockHistogramSize(Size block_size, Size cell_size, int nbins)
{
    const Size cell_grid(block_size.width / cell_size.width,
                         block_size.height / cell_size.height);
    const int values_per_block = nbins * cell_grid.area();
    return (size_t)values_per_block;
}
size_t
HOGDescriptor
::
getDescriptorSize
()
const
{
CV_Assert
(
blockSize
.
width
%
cellSize
.
width
==
0
&&
...
...
@@ -88,7 +112,24 @@ bool HOGDescriptor::checkDetectorSize() const
// Stores the SVM coefficients (converted to float) in svmDetector and prepares
// the data used by the OpenCL path:
//   - oclSvmDetector: the per-block coefficient groups re-ordered from
//     column-major block order (index j * blocks.height + i) to row-major
//     block order (index i * blocks.width + j);
//   - free_coef: the element following the descriptor coefficients when the
//     detector carries one, otherwise 0.
// Asserts that checkDetectorSize() accepts the new detector.
void HOGDescriptor::setSVMDetector(InputArray _svmDetector)
{
    _svmDetector.getMat().convertTo(svmDetector, CV_32F);
    CV_Assert(checkDetectorSize());

    // Nothing to re-order for an empty detector. Without this guard the loop
    // below would take &svmDetector[0] of an empty vector and write through
    // the null data pointer of a zero-sized Mat.
    if (svmDetector.empty())
    {
        oclSvmDetector.release();
        free_coef = 0;
        return;
    }

    Mat detector_reordered(1, (int)svmDetector.size(), CV_32FC1);

    const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
    const cv::Size blocks_per_img = numPartsWithin(winSize, blockSize, blockStride);

    for (int i = 0; i < blocks_per_img.height; ++i)
        for (int j = 0; j < blocks_per_img.width; ++j)
        {
            // Source blocks are stored column by column, destination row by row.
            const float* src = &svmDetector[0] + (j * blocks_per_img.height + i) * block_hist_size;
            float* dst = (float*)detector_reordered.data + (i * blocks_per_img.width + j) * block_hist_size;
            for (size_t k = 0; k < block_hist_size; ++k)
                dst[k] = src[k];
        }

    const size_t descriptor_size = getDescriptorSize();
    free_coef = svmDetector.size() > descriptor_size ? svmDetector[descriptor_size] : 0;
    detector_reordered.copyTo(oclSvmDetector);
}
#define CV_TYPE_NAME_HOG_DESCRIPTOR "opencv-object-detector-hog"
...
...
@@ -1029,7 +1070,318 @@ static inline int gcd(int a, int b)
return
a
;
}
void
HOGDescriptor
::
compute
(
const
Mat
&
img
,
std
::
vector
<
float
>&
descriptors
,
#ifdef HAVE_OPENCL
// Launches "compute_gradients_8UC1_kernel" over a width x height grid.
// Reads `_img`, writes gradient data to `grad` and quantised angles to
// `qangle`. Returns false when the kernel cannot be created or the launch
// fails.
//   height, width  - extent of the image region to process
//   angle_scale    - factor passed through to the kernel
//   correct_gamma  - forwarded to the kernel as a 0/1 char flag
//   nbins          - number of orientation bins, passed through to the kernel
static bool ocl_compute_gradients_8UC1(int height, int width, InputArray _img, float angle_scale,
                                       UMat grad, UMat qangle, bool correct_gamma, int nbins)
{
    ocl::Kernel k("compute_gradients_8UC1_kernel", ocl::objdetect::objdetect_hog_oclsrc);
    if (k.empty())
        return false;

    UMat img = _img.getUMat();

    size_t localThreads[3] = { NTHREADS, 1, 1 };
    // Explicit casts: int -> size_t inside a brace initialiser is a narrowing
    // conversion, which is ill-formed since C++11.
    size_t globalThreads[3] = { (size_t)width, (size_t)height, 1 };
    char correctGamma = correct_gamma ? 1 : 0;
    // Row strides in element units: callers allocate grad as CV_32FC2
    // (8 bytes per element) and qangle as CV_8UC2 (2 bytes per element).
    int grad_quadstep = (int)grad.step >> 3;
    int qangle_step_shift = 0;
    int qangle_step = (int)qangle.step >> (1 + qangle_step_shift);

    int idx = 0;
    idx = k.set(idx, height);
    idx = k.set(idx, width);
    idx = k.set(idx, (int)img.step1());
    idx = k.set(idx, grad_quadstep);
    idx = k.set(idx, qangle_step);
    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(img));
    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(grad));
    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(qangle));
    idx = k.set(idx, angle_scale);
    idx = k.set(idx, correctGamma);
    idx = k.set(idx, nbins);

    return k.run(2, globalThreads, localThreads, false);
}
// Thin wrapper around ocl_compute_gradients_8UC1: derives the angle scale
// (nbins / pi) and unpacks `effect_size` into the height/width arguments.
// Returns whatever the kernel wrapper returns.
static bool ocl_computeGradient(InputArray img, UMat grad, UMat qangle, int nbins, Size effect_size, bool gamma_correction)
{
    const float angleScale = (float)(nbins / CV_PI);
    const int rows = effect_size.height;
    const int cols = effect_size.width;

    return ocl_compute_gradients_8UC1(rows, cols, img, angleScale, grad, qangle,
                                      gamma_correction, nbins);
}
// Fixed cell/block geometry used by the OpenCL host code below when it counts
// blocks per image: 8x8-pixel cells and 2x2 cells per block.
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2
// Launches "compute_hists_lut_kernel" to fill `block_hists` from the gradient
// data (`grad`, `qangle`) and the weight lookup table `gauss_w_lut`.
// The kernel is first created without options so its preferred work-group
// size multiple can be queried, then re-created with either "-D CPU " (CPU
// device) or "-D WAVE_SIZE=<n>".
// Returns false when the kernel cannot be created or the launch fails.
static bool ocl_compute_hists(int nbins, int block_stride_x, int block_stride_y, int height, int width,
                              UMat grad, UMat qangle, UMat gauss_w_lut, UMat block_hists, size_t block_hist_size)
{
    ocl::Kernel k("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc);
    if (k.empty())
        return false;

    bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
    cv::String opts;
    if (is_cpu)
        opts = "-D CPU ";
    else
        // The value is printed with %d, so hand cv::format an int explicitly.
        opts = cv::format("-D WAVE_SIZE=%d", (int)k.preferedWorkGroupSizeMultiple());
    k.create("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
    if (k.empty())
        return false;

    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
    int blocks_total = img_block_width * img_block_height;

    int qangle_step_shift = 0;
    int grad_quadstep = (int)grad.step >> 2;
    int qangle_step = (int)qangle.step >> qangle_step_shift;

    int blocks_in_group = 4;
    // Explicit cast: int -> size_t in a brace initialiser is a narrowing
    // conversion (ill-formed since C++11).
    size_t localThreads[3] = { (size_t)(blocks_in_group * 24), 2, 1 };
    // One work-group handles `blocks_in_group` blocks; round the group count up.
    size_t globalThreads[3] = {
        ((img_block_width * img_block_height + blocks_in_group - 1) / blocks_in_group) * localThreads[0], 2, 1 };

    // Local memory: per block, 12 partial histograms plus one final histogram.
    int hists_size = (int)((nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float));
    int final_hists_size = (int)((nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float));
    int smem = (hists_size + final_hists_size) * blocks_in_group;

    int idx = 0;
    idx = k.set(idx, block_stride_x);
    idx = k.set(idx, block_stride_y);
    idx = k.set(idx, nbins);
    idx = k.set(idx, (int)block_hist_size);
    idx = k.set(idx, img_block_width);
    idx = k.set(idx, blocks_in_group);
    idx = k.set(idx, blocks_total);
    idx = k.set(idx, grad_quadstep);
    idx = k.set(idx, qangle_step);
    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(grad));
    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(qangle));
    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(gauss_w_lut));
    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(block_hists));
    idx = k.set(idx, (void*)NULL, (size_t)smem);

    return k.run(2, globalThreads, localThreads, false);
}
// Returns the smallest power of two in [1, 1024] that is strictly greater
// than `n`, or -1 when no such value exists (n >= 1024).
static int power_2up(unsigned int n)
{
    unsigned int candidate = 1;
    while (candidate <= 1024)
    {
        if (candidate > n)
            return candidate;
        candidate <<= 1;
    }
    return -1; // Input is too big
}
// Normalises the block histograms in `block_hists` in place on the device.
// Uses "normalize_hists_36_kernel" when nbins == 9 and the generic
// "normalize_hists_kernel" otherwise. The kernel is created once without
// options to query its preferred work-group size multiple, then re-created
// with "-D CPU " or "-D WAVE_SIZE=<n>".
// Returns false when the kernel cannot be created, when the generic path
// would need fewer than 32 or more than 512 threads per block, or when the
// launch fails.
static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_y,
                                int height, int width, UMat block_hists, float threshold)
{
    int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
    int nthreads;
    size_t globalThreads[3] = { 1, 1, 1 };
    size_t localThreads[3] = { 1, 1, 1 };

    int idx = 0;
    bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;

    // Both paths build their kernel the same way; only the name differs.
    const bool use_36_kernel = (nbins == 9);
    const char* kernel_name = use_36_kernel ? "normalize_hists_36_kernel" : "normalize_hists_kernel";

    ocl::Kernel k;
    k.create(kernel_name, ocl::objdetect::objdetect_hog_oclsrc, "");
    if (k.empty())
        return false;

    cv::String opts;
    if (is_cpu)
        opts = "-D CPU ";
    else
        // The value is printed with %d, so hand cv::format an int explicitly.
        opts = cv::format("-D WAVE_SIZE=%d", (int)k.preferedWorkGroupSizeMultiple());
    k.create(kernel_name, ocl::objdetect::objdetect_hog_oclsrc, opts);
    if (k.empty())
        return false;

    if (use_36_kernel)
    {
        // Pack as many whole blocks as fit into NTHREADS work-items.
        int blocks_in_group = NTHREADS / block_hist_size;
        nthreads = blocks_in_group * block_hist_size;
        int num_groups = (img_block_width * img_block_height + blocks_in_group - 1) / blocks_in_group;
        globalThreads[0] = nthreads * num_groups;
        localThreads[0] = nthreads;
    }
    else
    {
        nthreads = power_2up(block_hist_size);
        // Reject unsupported sizes (including the -1 "too big" result) before
        // they are turned into unsigned launch dimensions.
        if ((nthreads < 32) || (nthreads > 512))
            return false;

        globalThreads[0] = img_block_width * nthreads;
        globalThreads[1] = img_block_height;
        localThreads[0] = nthreads;

        // Extra leading arguments taken only by the generic kernel.
        idx = k.set(idx, nthreads);
        idx = k.set(idx, block_hist_size);
        idx = k.set(idx, img_block_width);
    }

    idx = k.set(idx, ocl::KernelArg::PtrReadWrite(block_hists));
    idx = k.set(idx, threshold);
    idx = k.set(idx, (void*)NULL, nthreads * sizeof(float));

    return k.run(2, globalThreads, localThreads, false);
}
// Launches "extract_descrs_by_rows_kernel" to gather the block histograms of
// every detection window into `descriptors`. One work-group of NTHREADS
// work-items is launched per window position.
// Returns false when the kernel cannot be created or the launch fails.
static bool ocl_extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
                                       int height, int width, UMat block_hists, UMat descriptors,
                                       int block_hist_size, int descr_size, int descr_width)
{
    ocl::Kernel k("extract_descrs_by_rows_kernel", ocl::objdetect::objdetect_hog_oclsrc);
    if (k.empty())
        return false;

    // Window stride expressed in blocks.
    int win_block_stride_x = win_stride_x / block_stride_x;
    int win_block_stride_y = win_stride_y / block_stride_y;
    // Number of window positions and of blocks per image row.
    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
    // Row stride of `descriptors` in 4-byte units.
    int descriptors_quadstep = (int)descriptors.step >> 2;

    // Explicit casts: int -> size_t in a brace initialiser is a narrowing
    // conversion (ill-formed since C++11).
    size_t globalThreads[3] = { (size_t)(img_win_width * NTHREADS), (size_t)img_win_height, 1 };
    size_t localThreads[3] = { NTHREADS, 1, 1 };

    int idx = 0;
    idx = k.set(idx, block_hist_size);
    idx = k.set(idx, descriptors_quadstep);
    idx = k.set(idx, descr_size);
    idx = k.set(idx, descr_width);
    idx = k.set(idx, img_block_width);
    idx = k.set(idx, win_block_stride_x);
    idx = k.set(idx, win_block_stride_y);
    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(descriptors));

    return k.run(2, globalThreads, localThreads, false);
}
// Launches "extract_descrs_by_cols_kernel" to gather the block histograms of
// every detection window into `descriptors`. One work-group of NTHREADS
// work-items is launched per window position. Unlike the by-rows variant,
// the kernel receives the window size in blocks (nblocks_win_x/y).
// Returns false when the kernel cannot be created or the launch fails.
static bool ocl_extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
                                       int height, int width, UMat block_hists, UMat descriptors,
                                       int block_hist_size, int descr_size, int nblocks_win_x, int nblocks_win_y)
{
    ocl::Kernel k("extract_descrs_by_cols_kernel", ocl::objdetect::objdetect_hog_oclsrc);
    if (k.empty())
        return false;

    // Window stride expressed in blocks.
    int win_block_stride_x = win_stride_x / block_stride_x;
    int win_block_stride_y = win_stride_y / block_stride_y;
    // Number of window positions and of blocks per image row.
    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
    // Row stride of `descriptors` in 4-byte units.
    int descriptors_quadstep = (int)descriptors.step >> 2;

    // Explicit casts: int -> size_t in a brace initialiser is a narrowing
    // conversion (ill-formed since C++11).
    size_t globalThreads[3] = { (size_t)(img_win_width * NTHREADS), (size_t)img_win_height, 1 };
    size_t localThreads[3] = { NTHREADS, 1, 1 };

    int idx = 0;
    idx = k.set(idx, block_hist_size);
    idx = k.set(idx, descriptors_quadstep);
    idx = k.set(idx, descr_size);
    idx = k.set(idx, nblocks_win_x);
    idx = k.set(idx, nblocks_win_y);
    idx = k.set(idx, img_block_width);
    idx = k.set(idx, win_block_stride_x);
    idx = k.set(idx, win_block_stride_y);
    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(descriptors));

    return k.run(2, globalThreads, localThreads, false);
}
// OpenCL implementation of HOG descriptor computation for a whole image.
// Pipeline: gradients -> block histograms -> histogram normalisation ->
// per-window descriptor extraction (layout chosen by `descr_format`), then the
// result is copied back into `_descriptors` as one flat float vector.
// Returns false as soon as any stage fails or when `descr_format` is not one
// of the DESCR_FORMAT_* values; `_descriptors` is only written on success.
//
// Compared with the previous version, the unused `labels` device buffer (and
// the window count computed only to size it) is no longer allocated.
static bool ocl_compute(InputArray _img, Size win_stride, std::vector<float>& _descriptors, int descr_format, Size blockSize,
                        Size cellSize, int nbins, Size blockStride, Size winSize, float sigma, bool gammaCorrection, double L2HysThreshold)
{
    Size imgSize = _img.size();
    Size effect_size = imgSize;

    UMat grad(imgSize, CV_32FC2);
    UMat qangle(imgSize, CV_8UC2);

    const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
    const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
    // 256 extra floats are kept after the histogram data, as before.
    UMat block_hists(1, static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);

    // Lookup table with two 16x16 halves: Gaussian weights for offsets in
    // [-8, 8), followed by bilinear-style weights for the same offsets.
    float scale = 1.f / (2.f * sigma * sigma);
    Mat gaussian_lut(1, 512, CV_32FC1);
    int idx = 0;
    for (int i = -8; i < 8; i++)
        for (int j = -8; j < 8; j++)
            gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
    for (int i = -8; i < 8; i++)
        for (int j = -8; j < 8; j++)
            gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;

    if (!ocl_computeGradient(_img, grad, qangle, nbins, effect_size, gammaCorrection))
        return false;

    UMat gauss_w_lut;
    gaussian_lut.copyTo(gauss_w_lut);
    if (!ocl_compute_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
                           effect_size.width, grad, qangle, gauss_w_lut, block_hists, block_hist_size))
        return false;

    if (!ocl_normalize_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
                             effect_size.width, block_hists, (float)L2HysThreshold))
        return false;

    Size blocks_per_win = numPartsWithin(winSize, blockSize, blockStride);
    Size wins_per_img = numPartsWithin(effect_size, winSize, win_stride);

    int descr_size = blocks_per_win.area() * (int)block_hist_size;
    int descr_width = (int)block_hist_size * blocks_per_win.width;

    // One row per window position, one column per descriptor value.
    UMat descriptors(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
    switch (descr_format)
    {
    case DESCR_FORMAT_ROW_BY_ROW:
        if (!ocl_extract_descrs_by_rows(winSize.height, winSize.width,
                                        blockStride.height, blockStride.width, win_stride.height, win_stride.width, effect_size.height,
                                        effect_size.width, block_hists, descriptors, (int)block_hist_size, descr_size, descr_width))
            return false;
        break;
    case DESCR_FORMAT_COL_BY_COL:
        if (!ocl_extract_descrs_by_cols(winSize.height, winSize.width,
                                        blockStride.height, blockStride.width, win_stride.height, win_stride.width, effect_size.height, effect_size.width,
                                        block_hists, descriptors, (int)block_hist_size, descr_size, blocks_per_win.width, blocks_per_win.height))
            return false;
        break;
    default:
        return false;
    }

    // Flatten to a single column and copy to the caller's vector.
    descriptors.reshape(1, (int)descriptors.total()).getMat(ACCESS_READ).copyTo(_descriptors);
    return true;
}
#endif //HAVE_OPENCL
void
HOGDescriptor
::
compute
(
InputArray
_img
,
std
::
vector
<
float
>&
descriptors
,
Size
winStride
,
Size
padding
,
const
std
::
vector
<
Point
>&
locations
)
const
{
if
(
winStride
==
Size
()
)
...
...
@@ -1037,11 +1389,18 @@ void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
Size
cacheStride
(
gcd
(
winStride
.
width
,
blockStride
.
width
),
gcd
(
winStride
.
height
,
blockStride
.
height
));
Size
imgSize
=
_img
.
size
();
size_t
nwindows
=
locations
.
size
();
padding
.
width
=
(
int
)
alignSize
(
std
::
max
(
padding
.
width
,
0
),
cacheStride
.
width
);
padding
.
height
=
(
int
)
alignSize
(
std
::
max
(
padding
.
height
,
0
),
cacheStride
.
height
);
Size
paddedImgSize
(
img
.
cols
+
padding
.
width
*
2
,
img
.
rows
+
padding
.
height
*
2
);
Size
paddedImgSize
(
imgSize
.
width
+
padding
.
width
*
2
,
imgSize
.
height
+
padding
.
height
*
2
);
CV_OCL_RUN
(
_img
.
dims
()
<=
2
&&
_img
.
type
()
==
CV_8UC1
&&
_img
.
isUMat
(),
ocl_compute
(
_img
,
winStride
,
descriptors
,
DESCR_FORMAT_COL_BY_COL
,
blockSize
,
cellSize
,
nbins
,
blockStride
,
winSize
,
(
float
)
getWinSigma
(),
gammaCorrection
,
L2HysThreshold
))
Mat
img
=
_img
.
getMat
();
HOGCache
cache
(
this
,
img
,
padding
,
padding
,
nwindows
==
0
,
cacheStride
);
if
(
!
nwindows
)
...
...
@@ -1263,20 +1622,215 @@ private:
Mutex
*
mtx
;
};
#ifdef HAVE_OPENCL
// Launches the SVM classification kernel over every window position and
// writes one result per window into `labels`.
// The kernel variant depends on descr_size.width:
//   180     -> "classify_hists_180_kernel", 180 work-items per window
//   252     -> "classify_hists_252_kernel", 256 work-items per window
//   other   -> "classify_hists_kernel",     256 work-items per window
// The specialised kernels take descr_size.width as their first argument, the
// generic one takes descr_size.area(); all take descr_size.height second.
// The kernel is created once without options to query its preferred
// work-group size multiple, then re-created with "-D CPU " or
// "-D WAVE_SIZE=<n>". Returns false when the kernel cannot be created or the
// launch fails.
static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
                               int win_stride_y, int win_stride_x, int height, int width,
                               const UMat& block_hists, UMat detector,
                               float free_coef, float threshold, UMat& labels, Size descr_size, int block_hist_size)
{
    int nthreads;
    const char* kernel_name;
    int first_arg;
    switch (descr_size.width)
    {
    case 180:
        nthreads = 180;
        kernel_name = "classify_hists_180_kernel";
        first_arg = descr_size.width;
        break;
    case 252:
        nthreads = 256;
        kernel_name = "classify_hists_252_kernel";
        first_arg = descr_size.width;
        break;
    default:
        nthreads = 256;
        kernel_name = "classify_hists_kernel";
        first_arg = descr_size.area();
        break;
    }

    bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;

    ocl::Kernel k;
    k.create(kernel_name, ocl::objdetect::objdetect_hog_oclsrc, "");
    if (k.empty())
        return false;

    cv::String opts;
    if (is_cpu)
        opts = "-D CPU ";
    else
        // The value is printed with %d, so hand cv::format an int explicitly.
        opts = cv::format("-D WAVE_SIZE=%d", (int)k.preferedWorkGroupSizeMultiple());
    k.create(kernel_name, ocl::objdetect::objdetect_hog_oclsrc, opts);
    if (k.empty())
        return false;

    int idx = 0;
    idx = k.set(idx, first_arg);
    idx = k.set(idx, descr_size.height);

    // Window stride expressed in blocks.
    int win_block_stride_x = win_stride_x / block_stride_x;
    int win_block_stride_y = win_stride_y / block_stride_y;
    // Number of window positions and of blocks per image row.
    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;

    // Explicit casts: int -> size_t in a brace initialiser is a narrowing
    // conversion (ill-formed since C++11).
    size_t globalThreads[3] = { (size_t)(img_win_width * nthreads), (size_t)img_win_height, 1 };
    size_t localThreads[3] = { (size_t)nthreads, 1, 1 };

    idx = k.set(idx, block_hist_size);
    idx = k.set(idx, img_win_width);
    idx = k.set(idx, img_block_width);
    idx = k.set(idx, win_block_stride_x);
    idx = k.set(idx, win_block_stride_y);
    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(detector));
    idx = k.set(idx, free_coef);
    idx = k.set(idx, threshold);
    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(labels));

    return k.run(2, globalThreads, localThreads, false);
}
// Single-scale OpenCL HOG detection.
// Clears `hits`, then runs gradients -> block histograms -> normalisation ->
// SVM classification on the device. Every window whose label byte is non-zero
// is reported in `hits` as the top-left corner of that window in image pixels.
// Returns false (with `hits` empty) when no OpenCL detector is set or when any
// stage fails.
static bool ocl_detect(InputArray img, std::vector<Point> &hits, double hit_threshold, Size win_stride,
                       const UMat& oclSvmDetector, Size blockSize, Size cellSize, int nbins, Size blockStride,
                       Size winSize, bool gammaCorrection, double L2HysThreshold, float sigma, float free_coef)
{
    hits.clear();
    if (oclSvmDetector.empty())
        return false;

    const Size imgSize = img.size();
    const Size effect_size = imgSize;

    // Device buffers for the intermediate results.
    UMat grad(imgSize, CV_32FC2);
    UMat qangle(imgSize, CV_8UC2);

    const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
    const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
    UMat block_hists(1, static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);

    const Size wins_per_img = numPartsWithin(imgSize, winSize, win_stride);
    UMat labels(1, wins_per_img.area(), CV_8U);

    // Lookup table with two 16x16 halves: entries [0, 256) hold the Gaussian
    // weights, entries [256, 512) the bilinear-style weights, both indexed by
    // the offsets (i, j) in [-8, 8).
    const float scale = 1.f / (2.f * sigma * sigma);
    Mat gaussian_lut(1, 512, CV_32FC1);
    for (int i = -8; i < 8; i++)
    {
        for (int j = -8; j < 8; j++)
        {
            const int cell = (i + 8) * 16 + (j + 8);
            gaussian_lut.at<float>(cell) = std::exp(-(j * j + i * i) * scale);
            gaussian_lut.at<float>(256 + cell) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
        }
    }

    if (!ocl_computeGradient(img, grad, qangle, nbins, effect_size, gammaCorrection))
        return false;

    UMat gauss_w_lut;
    gaussian_lut.copyTo(gauss_w_lut);

    const bool hists_ok = ocl_compute_hists(nbins, blockStride.width, blockStride.height,
                                            effect_size.height, effect_size.width,
                                            grad, qangle, gauss_w_lut, block_hists, block_hist_size);
    if (!hists_ok)
        return false;

    const bool normalized_ok = ocl_normalize_hists(nbins, blockStride.width, blockStride.height,
                                                   effect_size.height, effect_size.width,
                                                   block_hists, (float)L2HysThreshold);
    if (!normalized_ok)
        return false;

    // Descriptor extent handed to the classifier: width in floats, height in blocks.
    const Size blocks_per_win = numPartsWithin(winSize, blockSize, blockStride);
    const Size descr_size((int)block_hist_size * blocks_per_win.width, blocks_per_win.height);

    const bool classified_ok = ocl_classify_hists(winSize.height, winSize.width,
                                                  blockStride.height, blockStride.width,
                                                  win_stride.height, win_stride.width,
                                                  effect_size.height, effect_size.width,
                                                  block_hists, oclSvmDetector, free_coef,
                                                  (float)hit_threshold, labels, descr_size,
                                                  (int)block_hist_size);
    if (!classified_ok)
        return false;

    // Read the per-window labels back and convert flagged indices to pixels.
    Mat labels_host = labels.getMat(ACCESS_READ);
    unsigned char* flags = labels_host.ptr();
    const int window_count = wins_per_img.area();
    for (int w = 0; w < window_count; w++)
    {
        if (!flags[w])
            continue;
        const int row = w / wins_per_img.width;
        const int col = w % wins_per_img.width;
        hits.push_back(Point(col * win_stride.width, row * win_stride.height));
    }
    return true;
}
// Multi-scale OpenCL HOG detection.
// For every factor in `level_scale` the image is detected at size
// round(imgSize / scale): the original input is used directly when that size
// equals the input size, otherwise it is resized first. Hits are mapped back
// to input coordinates by multiplying with the scale, collected over all
// levels, stored in `found_locations` and finally merged by
// cv::groupRectangles with eps 0.2.
// Returns false, leaving `found_locations` untouched, as soon as detection
// fails on any level.
static bool ocl_detectMultiScale(InputArray _img, std::vector<Rect> &found_locations, std::vector<double>& level_scale,
                                 double hit_threshold, Size win_stride, double group_threshold,
                                 const UMat& oclSvmDetector, Size blockSize, Size cellSize,
                                 int nbins, Size blockStride, Size winSize, bool gammaCorrection,
                                 double L2HysThreshold, float sigma, float free_coef)
{
    std::vector<Rect> candidates;
    std::vector<Point> level_hits;
    UMat resized;
    const Size imgSize = _img.size();
    resized.create(imgSize, _img.type());

    for (size_t level = 0; level < level_scale.size(); level++)
    {
        const double scale = level_scale[level];
        const Size level_size(cvRound(imgSize.width / scale), cvRound(imgSize.height / scale));

        bool detected;
        if (level_size == imgSize)
        {
            detected = ocl_detect(_img, level_hits, hit_threshold, win_stride, oclSvmDetector, blockSize, cellSize, nbins,
                                  blockStride, winSize, gammaCorrection, L2HysThreshold, sigma, free_coef);
        }
        else
        {
            resize(_img, resized, level_size);
            detected = ocl_detect(resized, level_hits, hit_threshold, win_stride, oclSvmDetector, blockSize, cellSize, nbins,
                                  blockStride, winSize, gammaCorrection, L2HysThreshold, sigma, free_coef);
        }
        if (!detected)
            return false;

        // Window size and hit positions expressed in input-image coordinates.
        const Size scaled_win_size(cvRound(winSize.width * scale),
                                   cvRound(winSize.height * scale));
        for (size_t h = 0; h < level_hits.size(); h++)
            candidates.push_back(Rect(Point2d(level_hits[h]) * scale, scaled_win_size));
    }

    found_locations.assign(candidates.begin(), candidates.end());
    cv::groupRectangles(found_locations, (int)group_threshold, 0.2);
    return true;
}
#endif //HAVE_OPENCL
void
HOGDescriptor
::
detectMultiScale
(
const
Mat
&
img
,
std
::
vector
<
Rect
>&
foundLocations
,
std
::
vector
<
double
>&
foundWeights
,
InputArray
_
img
,
std
::
vector
<
Rect
>&
foundLocations
,
std
::
vector
<
double
>&
foundWeights
,
double
hitThreshold
,
Size
winStride
,
Size
padding
,
double
scale0
,
double
finalThreshold
,
bool
useMeanshiftGrouping
)
const
{
double
scale
=
1.
;
int
levels
=
0
;
Size
imgSize
=
_img
.
size
();
std
::
vector
<
double
>
levelScale
;
for
(
levels
=
0
;
levels
<
nlevels
;
levels
++
)
{
levelScale
.
push_back
(
scale
);
if
(
cvRound
(
img
.
cols
/
scale
)
<
winSize
.
width
||
cvRound
(
img
.
rows
/
scale
)
<
winSize
.
height
||
if
(
cvRound
(
img
Size
.
width
/
scale
)
<
winSize
.
width
||
cvRound
(
imgSize
.
height
/
scale
)
<
winSize
.
height
||
scale0
<=
1
)
break
;
scale
*=
scale0
;
...
...
@@ -1284,12 +1838,21 @@ void HOGDescriptor::detectMultiScale(
levels
=
std
::
max
(
levels
,
1
);
levelScale
.
resize
(
levels
);
if
(
winStride
==
Size
())
winStride
=
blockStride
;
CV_OCL_RUN
(
_img
.
dims
()
<=
2
&&
_img
.
type
()
==
CV_8UC1
&&
scale0
>
1
&&
winStride
.
width
%
blockStride
.
width
==
0
&&
winStride
.
height
%
blockStride
.
height
==
0
&&
padding
==
Size
(
0
,
0
)
&&
_img
.
isUMat
(),
ocl_detectMultiScale
(
_img
,
foundLocations
,
levelScale
,
hitThreshold
,
winStride
,
finalThreshold
,
oclSvmDetector
,
blockSize
,
cellSize
,
nbins
,
blockStride
,
winSize
,
gammaCorrection
,
L2HysThreshold
,
(
float
)
getWinSigma
(),
free_coef
));
std
::
vector
<
Rect
>
allCandidates
;
std
::
vector
<
double
>
tempScales
;
std
::
vector
<
double
>
tempWeights
;
std
::
vector
<
double
>
foundScales
;
Mutex
mtx
;
Mutex
mtx
;
Mat
img
=
_img
.
getMat
();
Range
range
(
0
,
(
int
)
levelScale
.
size
());
HOGInvoker
invoker
(
this
,
img
,
hitThreshold
,
winStride
,
padding
,
&
levelScale
[
0
],
&
allCandidates
,
&
mtx
,
&
tempWeights
,
&
tempScales
);
parallel_for_
(
range
,
invoker
);
...
...
@@ -1306,7 +1869,7 @@ void HOGDescriptor::detectMultiScale(
groupRectangles
(
foundLocations
,
foundWeights
,
(
int
)
finalThreshold
,
0.2
);
}
void
HOGDescriptor
::
detectMultiScale
(
const
Mat
&
img
,
std
::
vector
<
Rect
>&
foundLocations
,
void
HOGDescriptor
::
detectMultiScale
(
InputArray
img
,
std
::
vector
<
Rect
>&
foundLocations
,
double
hitThreshold
,
Size
winStride
,
Size
padding
,
double
scale0
,
double
finalThreshold
,
bool
useMeanshiftGrouping
)
const
{
...
...
modules/objdetect/src/opencl/objdetect_hog.cl
0 → 100644
View file @
22146e4b
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//
IMPORTANT:
READ
BEFORE
DOWNLOADING,
COPYING,
INSTALLING
OR
USING.
//
//
By
downloading,
copying,
installing
or
using
the
software
you
agree
to
this
license.
//
If
you
do
not
agree
to
this
license,
do
not
download,
install,
//
copy
or
use
the
software.
//
//
//
License
Agreement
//
For
Open
Source
Computer
Vision
Library
//
//
Copyright
(
C
)
2010-2012,
Multicoreware,
Inc.,
all
rights
reserved.
//
Copyright
(
C
)
2010-2012,
Advanced
Micro
Devices,
Inc.,
all
rights
reserved.
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
//
@Authors
//
Wenju
He,
wenju@multicorewareinc.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
//
//
*
Redistribution
's
of
source
code
must
retain
the
above
copyright
notice,
//
this
list
of
conditions
and
the
following
disclaimer.
//
//
*
Redistribution
's
in
binary
form
must
reproduce
the
above
copyright
notice,
//
this
list
of
conditions
and
the
following
disclaimer
in
the
documentation
//
and/or
other
materials
provided
with
the
distribution.
//
//
*
The
name
of
the
copyright
holders
may
not
be
used
to
endorse
or
promote
products
//
derived
from
this
software
without
specific
prior
written
permission.
//
//
This
software
is
provided
by
the
copyright
holders
and
contributors
as
is
and
//
any
express
or
implied
warranties,
including,
but
not
limited
to,
the
implied
//
warranties
of
merchantability
and
fitness
for
a
particular
purpose
are
disclaimed.
//
In
no
event
shall
the
Intel
Corporation
or
contributors
be
liable
for
any
direct,
//
indirect,
incidental,
special,
exemplary,
or
consequential
damages
//
(
including,
but
not
limited
to,
procurement
of
substitute
goods
or
services
;
//
loss
of
use,
data,
or
profits
; or business interruption) however caused
//
and
on
any
theory
of
liability,
whether
in
contract,
strict
liability,
//
or
tort
(
including
negligence
or
otherwise
)
arising
in
any
way
out
of
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//M*/
// Geometry constants of the HOG layout.
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2

// Stride used by the kernels that loop with "i += NTHREADS" and size of their
// __local scratch arrays.
#define NTHREADS 256

#define CV_PI_F 3.1415926535897932384626433832795f

// Element type of the quantized-angle buffer: int when INTEL_DEVICE is
// defined, uchar otherwise.
#ifndef INTEL_DEVICE
#define QANGLE_TYPE uchar
#define QANGLE_TYPE2 uchar2
#else
#define QANGLE_TYPE int
#define QANGLE_TYPE2 int2
#endif
//----------------------------------------------------------------------------
// Histogram computation
// 12 threads for a cell, 12x4 threads per block
// Use pre-computed gaussian and interp_weight lookup tables
//
// Work decomposition (as implied by the index arithmetic below):
//   * 24 consecutive work-items of dimension 0 serve one HOG block; a
//     work-group handles blocks_in_group blocks (lp selects the block).
//   * Within those 24 work-items, cell_x = lidX / 12 selects the cell column,
//     get_local_id(1) selects the cell row, cell_thread_x (0..11) the pixel
//     column handled by the work-item. Each work-item walks 12 pixel rows.
//
// Parameters:
//   cblock_stride_x/y  block stride in pixels
//   cnbins             number of orientation bins
//   cblock_hist_size   number of floats per block histogram
//   img_block_width    number of blocks per image row
//   blocks_in_group    blocks processed by one work-group
//   blocks_total       total number of blocks; surplus work-items read from
//                      the start of grad/qangle and skip the final store
//   grad_quadstep      row pitch of grad, in floats
//   qangle_step        row pitch of qangle, in elements
//   grad, qangle       per pixel: two magnitudes and two bin indices
//   gauss_w_lut        two 16x16 tables: [0,256) gaussian weights,
//                      [256,512) interpolation weights
//   block_hists        output, cblock_hist_size floats per block
//   smem               local scratch; per block cnbins * (4 * 12 + 4) floats
__kernel void compute_hists_lut_kernel(
    const int cblock_stride_x, const int cblock_stride_y,
    const int cnbins, const int cblock_hist_size, const int img_block_width,
    const int blocks_in_group, const int blocks_total,
    const int grad_quadstep, const int qangle_step,
    __global const float* grad, __global const QANGLE_TYPE* qangle,
    __global const float* gauss_w_lut,
    __global float* block_hists, __local float* smem)
{
    const int lx = get_local_id(0);
    const int lp = lx / 24; /* local group id */
    const int gid = get_group_id(0) * blocks_in_group + lp; /* global group id */
    const int gidY = gid / img_block_width;
    const int gidX = gid - gidY * img_block_width;

    const int lidX = lx - lp * 24;
    const int lidY = get_local_id(1);

    const int cell_x = lidX / 12;
    const int cell_y = lidY;
    const int cell_thread_x = lidX - cell_x * 12;

    // Per-block scratch: 48 partial slots per bin, followed by the 4 final
    // cell histograms.
    __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
        CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
    __local float* final_hist = hists + cnbins *
        (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);

    const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
    const int offset_y = gidY * cblock_stride_y + (cell_y << 2);

    // Work-items beyond the last block read from the buffer start so that
    // their loads stay in bounds; their results are never stored.
    __global const float* grad_ptr = (gid < blocks_total) ?
        grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
    __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?
        qangle + offset_y * qangle_step + (offset_x << 1) : qangle;

    // Private partial histogram of this work-item: one slot per bin, bins
    // are 48 floats apart.
    __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
        cell_thread_x;
    for (int bin_id = 0; bin_id < cnbins; ++bin_id)
        hist[bin_id * 48] = 0.f;

    const int dist_x = -4 + cell_thread_x - 4 * cell_x;
    const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);

    const int dist_y_begin = -4 - 4 * lidY;
    for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
    {
        float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
        QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);

        grad_ptr += grad_quadstep;
        qangle_ptr += qangle_step;

        int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);

        int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);
        float gaussian = gauss_w_lut[idx];
        idx = (dist_y + 8) * 16 + (dist_x + 8);
        float interp_weight = gauss_w_lut[256+idx];

        hist[bin.x * 48] += gaussian * interp_weight * vote.x;
        hist[bin.y * 48] += gaussian * interp_weight * vote.y;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Fold the 12 partial slots of each cell into one value per bin.
    volatile __local float* hist_ = hist;
    for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
    {
        if (cell_thread_x < 6)
            hist_[0] += hist_[6];
        barrier(CLK_LOCAL_MEM_FENCE);
        if (cell_thread_x < 3)
            hist_[0] += hist_[3];
#ifdef CPU
        barrier(CLK_LOCAL_MEM_FENCE);
#endif
        if (cell_thread_x == 0)
            final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
                hist_[0] + hist_[1] + hist_[2];
    }

    // The store below reads final_hist entries written by other work-items,
    // including ones with a different get_local_id(1). Local memory is only
    // guaranteed consistent across a work-group at a barrier, so this
    // barrier must not be limited to CPU devices.
    barrier(CLK_LOCAL_MEM_FENCE);

    int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
    if ((tid < cblock_hist_size) && (gid < blocks_total))
    {
        __global float* block_hist = block_hists +
            (gidY * img_block_width + gidX) * cblock_hist_size;
        block_hist[tid] = final_hist[tid];
    }
}
//-------------------------------------------------------------
//  Normalization of histograms via L2Hys_norm
//  optimized for the case of 9 bins

// Sums the 36 values starting at part[0]; must be called by all work-items
// of the group with hid = index of the caller inside its 36-element slice.
// The slice is overwritten with partial sums. Every caller receives the
// total: elements 0..3 hold the folded pairs and element 8 is the odd one
// left over after the 18 -> 9 step.
inline float hog_sum36(__local float* part, const int hid)
{
    float acc = part[hid];

    if (hid < 18)
        part[hid] = acc = acc + part[hid + 18];
    barrier(CLK_LOCAL_MEM_FENCE);

    if (hid < 9)
        part[hid] = acc = acc + part[hid + 9];
    barrier(CLK_LOCAL_MEM_FENCE);

    if (hid < 4)
        part[hid] = acc + part[hid + 4];
    barrier(CLK_LOCAL_MEM_FENCE);

    return part[0] + part[1] + part[2] + part[3] + part[8];
}

// In-place two-pass normalization of 36-element block histograms.
//   block_hists  histogram values, one per global work-item
//   threshold    upper clip applied after the first normalization
//   squares      local scratch, one float per work-item of the group
// Consecutive runs of 36 work-items in a group form one histogram.
__kernel void normalize_hists_36_kernel(__global float* block_hists,
                                        const float threshold,
                                        __local float *squares)
{
    const int tid = get_local_id(0);
    const int gid = get_global_id(0);
    const int hid = tid % 36;          /* histogram bin id, (0 - 35) */
    const int boffset = tid - hid;     /* block-hist offset in the work-group */

    __local float* slice = squares + boffset;

    // Pass 1: scale by 1 / (norm + 3.6) and clip.
    float value = block_hists[gid];
    squares[tid] = value * value;
    barrier(CLK_LOCAL_MEM_FENCE);

    float total = hog_sum36(slice, hid);
    value = min(value / (sqrt(total) + 3.6f), threshold);

    // Pass 2: renormalize the clipped values.
    barrier(CLK_LOCAL_MEM_FENCE);
    squares[tid] = value * value;
    barrier(CLK_LOCAL_MEM_FENCE);

    total = hog_sum36(slice, hid);
    block_hists[gid] = value / (sqrt(total) + 1e-3f);
}
//-------------------------------------------------------------
//  Normalization of histograms via L2Hys_norm
//

// Sums the first `size` floats of smem (one per work-item of dimension 0)
// and returns the total to every calling work-item.
//
//   smem  local array; overwritten with partial sums, smem[0] ends up
//         holding the total
//   size  number of elements; the halving steps below assume a power of two
//         up to 512
//
// Must be called by all work-items of the work-group because it contains
// barriers. Without CPU defined, the last steps run without barriers
// between them. NOTE(review): that presumably relies on the participating
// work-items executing in lock-step within one wavefront - confirm for the
// target devices.
inline float reduce_smem(volatile __local float* smem, int size)
{
    unsigned int tid = get_local_id(0);
    float sum = smem[tid];

    if (size >= 512)
    {
        if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (size >= 256)
    {
        if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (size >= 128)
    {
        if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
#ifdef CPU
    if (size >= 64)
    {
        if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (size >= 32)
    {
        if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (size >= 16)
    {
        if (tid < 8) smem[tid] = sum = sum + smem[tid + 8];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (size >= 8)
    {
        if (tid < 4) smem[tid] = sum = sum + smem[tid + 4];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (size >= 4)
    {
        if (tid < 2) smem[tid] = sum = sum + smem[tid + 2];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (size >= 2)
    {
        if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
#else
    if (tid < 32)
    {
        if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
#if WAVE_SIZE < 32
    } barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 16) {
#endif
        if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
        if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
        if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
        if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
        if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
    }
#endif

    // Only work-item 0 has accumulated the complete sum in its private
    // `sum`; every other work-item holds a partial value. Callers use the
    // result in all work-items, so publish the total from smem[0].
    barrier(CLK_LOCAL_MEM_FENCE);
    sum = smem[0];

    return sum;
}
// Generic in-place two-pass normalization of block histograms.
// One work-group per block: group ids (0, 1) select the block column / row,
// work-item tid handles histogram element tid.
//
//   nthreads         element count handed to reduce_smem()
//   block_hist_size  number of floats per block histogram
//   img_block_width  number of blocks per image row
//   block_hists      histograms, normalized in place
//   threshold        upper clip applied after the first pass
//   squares          local scratch, one float per work-item
//
// Work-items with tid >= block_hist_size contribute 0 and store nothing.
__kernel void normalize_hists_kernel(
    const int nthreads, const int block_hist_size, const int img_block_width,
    __global float* block_hists, const float threshold, __local float *squares)
{
    const int tid = get_local_id(0);
    const int blockCol = get_group_id(0);
    const int blockRow = get_group_id(1);
    const bool inRange = tid < block_hist_size;

    __global float* slot = block_hists +
        (blockRow * img_block_width + blockCol) * block_hist_size + tid;

    float value = 0.f;
    if (inRange)
        value = slot[0];

    // Pass 1: scale by 1 / (norm + 0.1 * block_hist_size), then clip.
    squares[tid] = value * value;
    barrier(CLK_LOCAL_MEM_FENCE);
    float total = reduce_smem(squares, nthreads);

    float factor = 1.0f / (sqrt(total) + 0.1f * block_hist_size);
    value = min(value * factor, threshold);

    // Pass 2: renormalize the clipped values.
    barrier(CLK_LOCAL_MEM_FENCE);
    squares[tid] = value * value;
    barrier(CLK_LOCAL_MEM_FENCE);
    total = reduce_smem(squares, nthreads);

    factor = 1.0f / (sqrt(total) + 1e-3f);
    if (inRange)
        slot[0] = value * factor;
}
//---------------------------------------------------------------------
//  Linear SVM based classification
//  48x96 window, 9 bins and default parameters
//  180 threads, each thread corresponds to a bin in a row
//
// One work-group per detection window (group ids select window column/row).
// Work-item tid accumulates column tid of the descriptor over all
// cdescr_height rows; the 180 partial dot products are then summed in local
// memory and work-item 0 writes labels[window] = (dot + free_coef >= threshold).
__kernel void classify_hists_180_kernel(
    const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
    const int img_win_width, const int img_block_width,
    const int win_block_stride_x, const int win_block_stride_y,
    __global const float * block_hists, __global const float* coefs,
    float free_coef, float threshold, __global uchar* labels)
{
    __local float partial[180];

    const int tid = get_local_id(0);
    const int winCol = get_group_id(0);
    const int winRow = get_group_id(1);

    // First histogram value of the window inside block_hists.
    __global const float* window = block_hists +
        (winRow * win_block_stride_y * img_block_width +
         winCol * win_block_stride_x) * cblock_hist_size;

    float dot = 0.f;
    for (int row = 0; row < cdescr_height; ++row)
        dot += coefs[row * cdescr_width + tid] *
               window[row * img_block_width * cblock_hist_size + tid];

    partial[tid] = dot;
    barrier(CLK_LOCAL_MEM_FENCE);

    // 180 -> 90 -> 45 partial sums.
    if (tid < 90)
        partial[tid] = dot = dot + partial[tid + 90];
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 45)
        partial[tid] = dot = dot + partial[tid + 45];
    barrier(CLK_LOCAL_MEM_FENCE);

    volatile __local float* vpartial = partial;

    // Elements 32..44 are folded onto 0..12 first, leaving 32 values.
#ifdef CPU
    if (tid < 13)
        vpartial[tid] = dot = dot + vpartial[tid + 32];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 16)
        vpartial[tid] = dot = dot + vpartial[tid + 16];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 8)
        vpartial[tid] = dot = dot + vpartial[tid + 8];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 4)
        vpartial[tid] = dot = dot + vpartial[tid + 4];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 2)
        vpartial[tid] = dot = dot + vpartial[tid + 2];
    barrier(CLK_LOCAL_MEM_FENCE);
#else
    if (tid < 13)
        vpartial[tid] = dot = dot + vpartial[tid + 32];
#if WAVE_SIZE < 32
    barrier(CLK_LOCAL_MEM_FENCE);
#endif
    // No barriers between the remaining steps on this path.
    if (tid < 16)
    {
        vpartial[tid] = dot = dot + vpartial[tid + 16];
        vpartial[tid] = dot = dot + vpartial[tid + 8];
        vpartial[tid] = dot = dot + vpartial[tid + 4];
        vpartial[tid] = dot = dot + vpartial[tid + 2];
    }
#endif

    if (tid == 0)
    {
        dot = dot + vpartial[tid + 1];
        labels[winRow * img_win_width + winCol] = (dot + free_coef >= threshold);
    }
}
//---------------------------------------------------------------------
//  Linear SVM based classification
//  64x128 window, 9 bins and default parameters
//  256 threads, 252 of them are used
//
// One work-group per detection window. Work-items with tid < cdescr_width
// accumulate column tid of the descriptor over cdescr_height rows, the rest
// contribute 0. The NTHREADS partial values are summed in local memory and
// work-item 0 writes labels[window] = (dot + free_coef >= threshold).
__kernel void classify_hists_252_kernel(
    const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
    const int img_win_width, const int img_block_width,
    const int win_block_stride_x, const int win_block_stride_y,
    __global const float * block_hists, __global const float* coefs,
    float free_coef, float threshold, __global uchar* labels)
{
    __local float partial[NTHREADS];

    const int tid = get_local_id(0);
    const int winCol = get_group_id(0);
    const int winRow = get_group_id(1);

    // First histogram value of the window inside block_hists.
    __global const float* window = block_hists +
        (winRow * win_block_stride_y * img_block_width +
         winCol * win_block_stride_x) * cblock_hist_size;

    float dot = 0.f;
    if (tid < cdescr_width)
    {
        for (int row = 0; row < cdescr_height; ++row)
            dot += coefs[row * cdescr_width + tid] *
                   window[row * img_block_width * cblock_hist_size + tid];
    }

    partial[tid] = dot;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 128)
        partial[tid] = dot = dot + partial[tid + 128];
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 64)
        partial[tid] = dot = dot + partial[tid + 64];
    barrier(CLK_LOCAL_MEM_FENCE);

    volatile __local float* vpartial = partial;
#ifdef CPU
    if (tid < 32)
        vpartial[tid] = dot = dot + vpartial[tid + 32];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 16)
        vpartial[tid] = dot = dot + vpartial[tid + 16];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 8)
        vpartial[tid] = dot = dot + vpartial[tid + 8];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 4)
        vpartial[tid] = dot = dot + vpartial[tid + 4];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 2)
        vpartial[tid] = dot = dot + vpartial[tid + 2];
    barrier(CLK_LOCAL_MEM_FENCE);
#else
    if (tid < 32)
        vpartial[tid] = dot = dot + vpartial[tid + 32];
    // With WAVE_SIZE < 32 a barrier separates the 64 -> 32 step from the
    // rest and only 16 work-items continue; otherwise the same 32 continue.
#if WAVE_SIZE < 32
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 16)
#else
    if (tid < 32)
#endif
    {
        vpartial[tid] = dot = dot + vpartial[tid + 16];
        vpartial[tid] = dot = dot + vpartial[tid + 8];
        vpartial[tid] = dot = dot + vpartial[tid + 4];
        vpartial[tid] = dot = dot + vpartial[tid + 2];
    }
#endif

    if (tid == 0)
    {
        dot = dot + vpartial[tid + 1];
        labels[winRow * img_win_width + winCol] = (dot + free_coef >= threshold);
    }
}
//---------------------------------------------------------------------
//  Linear SVM based classification
//  256 threads
//
// Generic variant: one work-group per detection window. Work-item tid
// handles descriptor elements tid, tid + NTHREADS, ... up to cdescr_size,
// where element i lies in descriptor row i / cdescr_width. The NTHREADS
// partial values are summed in local memory and work-item 0 writes
// labels[window] = (dot + free_coef >= threshold).
__kernel void classify_hists_kernel(
    const int cdescr_size, const int cdescr_width, const int cblock_hist_size,
    const int img_win_width, const int img_block_width,
    const int win_block_stride_x, const int win_block_stride_y,
    __global const float * block_hists, __global const float* coefs,
    float free_coef, float threshold, __global uchar* labels)
{
    __local float partial[NTHREADS];

    const int tid = get_local_id(0);
    const int winCol = get_group_id(0);
    const int winRow = get_group_id(1);

    // First histogram value of the window inside block_hists.
    __global const float* window = block_hists +
        (winRow * win_block_stride_y * img_block_width +
         winCol * win_block_stride_x) * cblock_hist_size;

    float dot = 0.f;
    for (int k = tid; k < cdescr_size; k += NTHREADS)
    {
        const int row = k / cdescr_width;
        const int col = k - row * cdescr_width;
        dot += coefs[k] * window[row * img_block_width * cblock_hist_size + col];
    }

    partial[tid] = dot;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 128)
        partial[tid] = dot = dot + partial[tid + 128];
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 64)
        partial[tid] = dot = dot + partial[tid + 64];
    barrier(CLK_LOCAL_MEM_FENCE);

    volatile __local float* vpartial = partial;
#ifdef CPU
    if (tid < 32)
        vpartial[tid] = dot = dot + vpartial[tid + 32];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 16)
        vpartial[tid] = dot = dot + vpartial[tid + 16];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 8)
        vpartial[tid] = dot = dot + vpartial[tid + 8];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 4)
        vpartial[tid] = dot = dot + vpartial[tid + 4];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 2)
        vpartial[tid] = dot = dot + vpartial[tid + 2];
    barrier(CLK_LOCAL_MEM_FENCE);
#else
    if (tid < 32)
        vpartial[tid] = dot = dot + vpartial[tid + 32];
    // With WAVE_SIZE < 32 a barrier separates the 64 -> 32 step from the
    // rest and only 16 work-items continue; otherwise the same 32 continue.
#if WAVE_SIZE < 32
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 16)
#else
    if (tid < 32)
#endif
    {
        vpartial[tid] = dot = dot + vpartial[tid + 16];
        vpartial[tid] = dot = dot + vpartial[tid + 8];
        vpartial[tid] = dot = dot + vpartial[tid + 4];
        vpartial[tid] = dot = dot + vpartial[tid + 2];
    }
#endif

    if (tid == 0)
    {
        vpartial[tid] = dot = dot + vpartial[tid + 1];
        labels[winRow * img_win_width + winCol] = (dot + free_coef >= threshold);
    }
}
//----------------------------------------------------------------------------
// Extract descriptors
//
// Copies the descriptor of each detection window out of block_hists, keeping
// the row-by-row order of the source. One work-group per window; work-item
// tid copies elements tid, tid + NTHREADS, ... below cdescr_size.
//
//   cblock_hist_size      floats per block histogram
//   descriptors_quadstep  distance in floats between consecutive descriptors
//   cdescr_size           floats per descriptor
//   cdescr_width          floats per descriptor row
//   img_block_width       blocks per image row
//   win_block_stride_x/y  window stride measured in blocks
__kernel void extract_descrs_by_rows_kernel(
    const int cblock_hist_size, const int descriptors_quadstep,
    const int cdescr_size, const int cdescr_width, const int img_block_width,
    const int win_block_stride_x, const int win_block_stride_y,
    __global const float* block_hists, __global float* descriptors)
{
    const int tid = get_local_id(0);
    const int winCol = get_group_id(0);
    const int winRow = get_group_id(1);

    // Top-left corner of the window in the source histograms.
    __global const float* src = block_hists +
        (winRow * win_block_stride_y * img_block_width +
         winCol * win_block_stride_x) * cblock_hist_size;

    // Start of this window's descriptor in the destination.
    __global float* dst = descriptors +
        (winRow * get_num_groups(0) + winCol) * descriptors_quadstep;

    for (int k = tid; k < cdescr_size; k += NTHREADS)
    {
        const int row = k / cdescr_width;
        const int col = k % cdescr_width;
        dst[k] = src[row * img_block_width * cblock_hist_size + col];
    }
}
// Copies the descriptor of each detection window out of block_hists while
// transposing the block order: the block at (x, y) inside the window is
// stored at block position x * cnblocks_win_y + y of the descriptor.
// One work-group per window; work-item tid copies elements
// tid, tid + NTHREADS, ... below cdescr_size.
//
//   cblock_hist_size      floats per block histogram
//   descriptors_quadstep  distance in floats between consecutive descriptors
//   cdescr_size           floats per descriptor
//   cnblocks_win_x/y      blocks per window horizontally / vertically
//   img_block_width       blocks per image row
//   win_block_stride_x/y  window stride measured in blocks
__kernel void extract_descrs_by_cols_kernel(
    const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
    const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width,
    const int win_block_stride_x, const int win_block_stride_y,
    __global const float* block_hists, __global float* descriptors)
{
    const int tid = get_local_id(0);
    const int winCol = get_group_id(0);
    const int winRow = get_group_id(1);

    // Top-left corner of the window in the source histograms.
    __global const float* src = block_hists +
        (winRow * win_block_stride_y * img_block_width +
         winCol * win_block_stride_x) * cblock_hist_size;

    // Start of this window's descriptor in the destination.
    __global float* dst = descriptors +
        (winRow * get_num_groups(0) + winCol) * descriptors_quadstep;

    for (int k = tid; k < cdescr_size; k += NTHREADS)
    {
        const int block = k / cblock_hist_size;
        const int inBlock = k % cblock_hist_size;

        const int by = block / cnblocks_win_x;
        const int bx = block % cnblocks_win_x;

        dst[(bx * cnblocks_win_y + by) * cblock_hist_size + inBlock] =
            src[(by * img_block_width + bx) * cblock_hist_size + inBlock];
    }
}
//----------------------------------------------------------------------------
// Gradients computation
//
// Per-pixel gradient magnitude and quantized orientation of a 4-channel
// 8-bit image (the .xyz channels are used; the channel with the largest
// gradient magnitude wins).
//
//   height, width  image size in pixels (indexing with width - 2 below
//                  assumes width >= 2)
//   img_step       row pitch of img, in uchar4 elements
//   grad_quadstep  row pitch factor of grad (index is (y * step + x) * 2)
//   qangle_step    row pitch factor of qangle (same indexing)
//   grad           output: two interpolated magnitudes per pixel
//   qangle         output: the two neighbouring bin indices per pixel
//   angle_scale    converts (angle + pi) into bin units
//   correct_gamma  1 -> take sqrt of the pixel values before differencing
//   cnbins         number of orientation bins
//
// One work-item per pixel of a row; the image row is get_group_id(1).
// The row is staged in local memory with one halo element on each side.
// Columns outside the image are mirrored without repeating the border pixel
// (-1 -> 1, width -> width - 2); the vertical gradient is 0 on the first and
// last row.
__kernel void compute_gradients_8UC4_kernel(
    const int height, const int width,
    const int img_step, const int grad_quadstep, const int qangle_step,
    const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,
    const float angle_scale, const char correct_gamma, const int cnbins)
{
    const int x = get_global_id(0);
    const int tid = get_local_id(0);
    const int gSizeX = get_local_size(0);
    const int gidY = get_group_id(1);

    __global const uchar4* row = img + gidY * img_step;

    // Three channel planes of (NTHREADS + 2) floats each.
    __local float sh_row[(NTHREADS + 2) * 3];

    uchar4 val;
    if (x < width)
        val = row[x];
    else
        val = row[width - 2];

    sh_row[tid + 1] = val.x;
    sh_row[tid + 1 + (NTHREADS + 2)] = val.y;
    sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z;

    // Left halo: x - 1, or column 1 when x == 0.
    if (tid == 0)
    {
        val = row[max(x - 1, 1)];
        sh_row[0] = val.x;
        sh_row[(NTHREADS + 2)] = val.y;
        sh_row[2 * (NTHREADS + 2)] = val.z;
    }

    // Right halo: x + 1 while it is inside the image, otherwise the mirrored
    // column width - 2. (min(x + 1, width - 2) would wrongly replace the
    // valid column width - 1 when x == width - 2.)
    if (tid == gSizeX - 1)
    {
        val = row[(x + 1 < width) ? (x + 1) : (width - 2)];
        sh_row[gSizeX + 1] = val.x;
        sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y;
        sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z;
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    if (x < width)
    {
        float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
            sh_row[tid + 2 * (NTHREADS + 2)]);
        float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
            sh_row[tid + 2 + 2 * (NTHREADS + 2)]);

        float3 dx;
        if (correct_gamma == 1)
            dx = sqrt(b) - sqrt(a);
        else
            dx = b - a;

        float3 dy = (float3) 0.f;

        if (gidY > 0 && gidY < height - 1)
        {
            a = convert_float3(img[(gidY - 1) * img_step + x].xyz);
            b = convert_float3(img[(gidY + 1) * img_step + x].xyz);

            if (correct_gamma == 1)
                dy = sqrt(b) - sqrt(a);
            else
                dy = b - a;
        }

        // Keep the channel with the largest squared magnitude.
        float best_dx = dx.x;
        float best_dy = dy.x;

        float mag0 = dx.x * dx.x + dy.x * dy.x;
        float mag1 = dx.y * dx.y + dy.y * dy.y;
        if (mag0 < mag1)
        {
            best_dx = dx.y;
            best_dy = dy.y;
            mag0 = mag1;
        }

        mag1 = dx.z * dx.z + dy.z * dy.z;
        if (mag0 < mag1)
        {
            best_dx = dx.z;
            best_dy = dy.z;
            mag0 = mag1;
        }

        mag0 = sqrt(mag0);

        // Split the magnitude linearly between the two nearest bins.
        float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
        int hidx = (int)floor(ang);
        ang -= hidx;
        hidx = (hidx + cnbins) % cnbins;

        qangle[(gidY * qangle_step + x) << 1] = hidx;
        qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;
        grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang);
        grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang;
    }
}
// Per-pixel gradient magnitude and quantized orientation of a single-channel
// 8-bit image.
//
//   height, width  image size in pixels (indexing with width - 2 below
//                  assumes width >= 2)
//   img_step       row pitch of img, in bytes/elements
//   grad_quadstep  row pitch factor of grad (index is (y * step + x) * 2)
//   qangle_step    row pitch factor of qangle (same indexing)
//   grad           output: two interpolated magnitudes per pixel
//   qangle         output: the two neighbouring bin indices per pixel
//   angle_scale    converts (angle + pi) into bin units
//   correct_gamma  1 -> take sqrt of the pixel values before differencing
//   cnbins         number of orientation bins
//
// One work-item per pixel of a row; the image row is get_group_id(1).
// The row is staged in local memory with one halo element on each side.
// Columns outside the image are mirrored without repeating the border pixel
// (-1 -> 1, width -> width - 2); the vertical gradient is 0 on the first and
// last row.
__kernel void compute_gradients_8UC1_kernel(
    const int height, const int width,
    const int img_step, const int grad_quadstep, const int qangle_step,
    __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,
    const float angle_scale, const char correct_gamma, const int cnbins)
{
    const int x = get_global_id(0);
    const int tid = get_local_id(0);
    const int gSizeX = get_local_size(0);
    const int gidY = get_group_id(1);

    __global const uchar* row = img + gidY * img_step;

    __local float sh_row[NTHREADS + 2];

    if (x < width)
        sh_row[tid + 1] = row[x];
    else
        sh_row[tid + 1] = row[width - 2];

    // Left halo: x - 1, or column 1 when x == 0.
    if (tid == 0)
        sh_row[0] = row[max(x - 1, 1)];

    // Right halo: x + 1 while it is inside the image, otherwise the mirrored
    // column width - 2. (min(x + 1, width - 2) would wrongly replace the
    // valid column width - 1 when x == width - 2.)
    if (tid == gSizeX - 1)
        sh_row[gSizeX + 1] = row[(x + 1 < width) ? (x + 1) : (width - 2)];

    barrier(CLK_LOCAL_MEM_FENCE);
    if (x < width)
    {
        float dx;

        if (correct_gamma == 1)
            dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);
        else
            dx = sh_row[tid + 2] - sh_row[tid];

        float dy = 0.f;
        if (gidY > 0 && gidY < height - 1)
        {
            float a = (float) img[ (gidY + 1) * img_step + x ];
            float b = (float) img[ (gidY - 1) * img_step + x ];
            if (correct_gamma == 1)
                dy = sqrt(a) - sqrt(b);
            else
                dy = a - b;
        }
        float mag = sqrt(dx * dx + dy * dy);

        // Split the magnitude linearly between the two nearest bins.
        float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
        int hidx = (int)floor(ang);
        ang -= hidx;
        hidx = (hidx + cnbins) % cnbins;

        qangle[ (gidY * qangle_step + x) << 1 ] = hidx;
        qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins;
        grad[ (gidY * grad_quadstep + x) << 1 ] = mag * (1.f - ang);
        grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang;
    }
}
modules/objdetect/test/opencl/test_hogdetector.cpp
0 → 100644
View file @
22146e4b
///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Shengen Yan, yanshengen@gmail.com
// Jiang Liyuan,jlyuan001.good@163.com
// Rock Li, Rock.Li@amd.com
// Zailong Wu, bullet@yeah.net
// Yao Wang, bitwangyaoyao@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#include "opencv2/ts/ocl_test.hpp"
#ifdef HAVE_OPENCL
namespace cvtest {
namespace ocl {

///////////////////// HOG /////////////////////////////

// Fixture shared by the HOG tests. Parameters: detection window size and
// matrix type. The test image is always loaded as grayscale and mirrored
// into a UMat so the same data feeds the CPU and the OpenCL code path.
PARAM_TEST_CASE(HOG, Size, MatType)
{
    Size winSize;
    int type;
    Mat img;
    UMat uimg;

    virtual void SetUp()
    {
        winSize = GET_PARAM(0);
        type = GET_PARAM(1);

        const char* const imagePath = "cascadeandhog/images/image_00000000_0.png";
        img = readImage(imagePath, IMREAD_GRAYSCALE);
        ASSERT_FALSE(img.empty());
        img.copyTo(uimg);
    }
};

// Descriptors computed with OpenCL disabled (Mat input) and enabled (UMat
// input) must be similar within 1e-1. Uses the default HOGDescriptor
// window, not the fixture's winSize.
OCL_TEST_P(HOG, GetDescriptors)
{
    HOGDescriptor hog;
    hog.gammaCorrection = true;
    hog.setSVMDetector(hog.getDefaultPeopleDetector());

    std::vector<float> reference;
    std::vector<float> accelerated;

    OCL_OFF(hog.compute(img, reference, hog.winSize));
    OCL_ON(hog.compute(uimg, accelerated, hog.winSize));

    Mat cpu_desc(reference), gpu_desc(accelerated);

    EXPECT_MAT_SIMILAR(cpu_desc, gpu_desc, 1e-1);
}

// Multi-scale detection results of the CPU and OpenCL paths must be close
// according to checkRectSimilarity(). A 48x96 window selects the Daimler
// detector, any other size the default people detector.
OCL_TEST_P(HOG, Detect)
{
    HOGDescriptor hog;
    hog.winSize = winSize;
    hog.gammaCorrection = true;

    const bool daimlerWindow = (winSize.width == 48 && winSize.height == 96);
    if (daimlerWindow)
        hog.setSVMDetector(hog.getDaimlerPeopleDetector());
    else
        hog.setSVMDetector(hog.getDefaultPeopleDetector());

    std::vector<Rect> cpu_found;
    std::vector<Rect> gpu_found;

    const Size stride(8, 8);
    const Size noPadding(0, 0);

    OCL_OFF(hog.detectMultiScale(img, cpu_found, 0, stride, noPadding, 1.05, 6));
    OCL_ON(hog.detectMultiScale(uimg, gpu_found, 0, stride, noPadding, 1.05, 6));

    EXPECT_LT(checkRectSimilarity(img.size(), cpu_found, gpu_found), 1.0);
}

// Both supported window sizes, 8-bit single-channel input only.
INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
                            testing::Values(Size(64, 128), Size(48, 96)),
                            testing::Values(MatType(CV_8UC1))));

}}
#endif
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment