Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
d11921f1
Commit
d11921f1
authored
Oct 30, 2013
by
Roman Donchenko
Committed by
OpenCV Buildbot
Oct 30, 2013
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #1675 from krodyush:opencl-optimization
parents
9f162cb8
973137c7
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
448 additions
and
11 deletions
+448
-11
ocl.hpp
modules/ocl/include/opencv2/ocl/ocl.hpp
+3
-1
cl_context.cpp
modules/ocl/src/cl_context.cpp
+14
-1
haar.cpp
modules/ocl/src/haar.cpp
+129
-7
imgproc.cpp
modules/ocl/src/imgproc.cpp
+54
-2
haarobjectdetect.cl
modules/ocl/src/opencl/haarobjectdetect.cl
+139
-0
imgproc_sobel3.cl
modules/ocl/src/opencl/imgproc_sobel3.cl
+109
-0
No files found.
modules/ocl/include/opencv2/ocl/ocl.hpp
View file @
d11921f1
...
...
@@ -111,6 +111,7 @@ namespace cv
bool
haveDoubleSupport
;
bool
isUnifiedMemory
;
// 1 means integrated GPU, otherwise this value is 0
bool
isIntelDevice
;
std
::
string
compilationExtraOptions
;
...
...
@@ -157,7 +158,8 @@ namespace cv
{
FEATURE_CL_DOUBLE
=
1
,
FEATURE_CL_UNIFIED_MEM
,
FEATURE_CL_VER_1_2
FEATURE_CL_VER_1_2
,
FEATURE_CL_INTEL_DEVICE
};
// Represents OpenCL context, interface
...
...
modules/ocl/src/cl_context.cpp
View file @
d11921f1
...
...
@@ -448,6 +448,17 @@ static int initializeOpenCLDevices()
{
deviceInfo
.
info
.
haveDoubleSupport
=
false
;
}
size_t
intel_platform
=
platformInfo
.
info
.
platformVendor
.
find
(
"Intel"
);
if
(
intel_platform
!=
std
::
string
::
npos
)
{
deviceInfo
.
info
.
compilationExtraOptions
+=
" -D INTEL_DEVICE"
;
deviceInfo
.
info
.
isIntelDevice
=
true
;
}
else
{
deviceInfo
.
info
.
isIntelDevice
=
false
;
}
}
}
}
...
...
@@ -471,7 +482,7 @@ DeviceInfo::DeviceInfo()
deviceVendorId
(
-
1
),
maxWorkGroupSize
(
0
),
maxComputeUnits
(
0
),
localMemorySize
(
0
),
maxMemAllocSize
(
0
),
deviceVersionMajor
(
0
),
deviceVersionMinor
(
0
),
haveDoubleSupport
(
false
),
isUnifiedMemory
(
false
),
haveDoubleSupport
(
false
),
isUnifiedMemory
(
false
),
isIntelDevice
(
false
),
platform
(
NULL
)
{
// nothing
...
...
@@ -572,6 +583,8 @@ bool ContextImpl::supportsFeature(FEATURE_TYPE featureType) const
{
switch
(
featureType
)
{
case
FEATURE_CL_INTEL_DEVICE
:
return
deviceInfo
.
isIntelDevice
;
case
FEATURE_CL_DOUBLE
:
return
deviceInfo
.
haveDoubleSupport
;
case
FEATURE_CL_UNIFIED_MEM
:
...
...
modules/ocl/src/haar.cpp
View file @
d11921f1
...
...
@@ -849,16 +849,138 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args
.
push_back
(
make_pair
(
sizeof
(
cl_int4
)
,
(
void
*
)
&
pq
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
)
,
(
void
*
)
&
correction
));
const
char
*
build_options
=
gcascade
->
is_stump_based
?
"-D STUMP_BASED=1"
:
"-D STUMP_BASED=0"
;
if
(
gcascade
->
is_stump_based
&&
gsum
.
clCxt
->
supportsFeature
(
FEATURE_CL_INTEL_DEVICE
))
{
//setup local group size
localThreads
[
0
]
=
8
;
localThreads
[
1
]
=
16
;
localThreads
[
2
]
=
1
;
//init maximal number of workgroups
int
WGNumX
=
1
+
(
sizev
[
0
].
width
/
(
localThreads
[
0
]));
int
WGNumY
=
1
+
(
sizev
[
0
].
height
/
(
localThreads
[
1
]));
int
WGNumZ
=
loopcount
;
int
WGNum
=
0
;
//accurate number of non -empty workgroups
oclMat
oclWGInfo
(
1
,
sizeof
(
cl_int4
)
*
WGNumX
*
WGNumY
*
WGNumZ
,
CV_8U
);
{
cl_int4
*
pWGInfo
=
(
cl_int4
*
)
clEnqueueMapBuffer
(
getClCommandQueue
(
oclWGInfo
.
clCxt
),(
cl_mem
)
oclWGInfo
.
datastart
,
true
,
CL_MAP_WRITE
,
0
,
oclWGInfo
.
step
,
0
,
0
,
0
,
&
status
);
openCLVerifyCall
(
status
);
for
(
int
z
=
0
;
z
<
WGNumZ
;
++
z
)
{
int
Width
=
(
scaleinfo
[
z
].
width_height
>>
16
)
&
0xFFFF
;
int
Height
=
(
scaleinfo
[
z
].
width_height
>>
0
)
&
0xFFFF
;
for
(
int
y
=
0
;
y
<
WGNumY
;
++
y
)
{
int
gy
=
y
*
localThreads
[
1
];
if
(
gy
>=
(
Height
-
cascade
->
orig_window_size
.
height
))
continue
;
// no data to process
for
(
int
x
=
0
;
x
<
WGNumX
;
++
x
)
{
int
gx
=
x
*
localThreads
[
0
];
if
(
gx
>=
(
Width
-
cascade
->
orig_window_size
.
width
))
continue
;
// no data to process
// save no-empty workgroup info into array
pWGInfo
[
WGNum
].
s
[
0
]
=
scaleinfo
[
z
].
width_height
;
pWGInfo
[
WGNum
].
s
[
1
]
=
(
gx
<<
16
)
|
gy
;
pWGInfo
[
WGNum
].
s
[
2
]
=
scaleinfo
[
z
].
imgoff
;
memcpy
(
&
(
pWGInfo
[
WGNum
].
s
[
3
]),
&
(
scaleinfo
[
z
].
factor
),
sizeof
(
float
));
WGNum
++
;
}
}
}
openCLSafeCall
(
clEnqueueUnmapMemObject
(
getClCommandQueue
(
oclWGInfo
.
clCxt
),(
cl_mem
)
oclWGInfo
.
datastart
,
pWGInfo
,
0
,
0
,
0
));
pWGInfo
=
NULL
;
}
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascade"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
// setup global sizes to have linear array of workgroups with WGNum size
globalThreads
[
0
]
=
localThreads
[
0
]
*
WGNum
;
globalThreads
[
1
]
=
localThreads
[
1
];
globalThreads
[
2
]
=
1
;
#define NODE_SIZE 12
// pack node info to have less memory loads
oclMat
oclNodesPK
(
1
,
sizeof
(
cl_int
)
*
NODE_SIZE
*
nodenum
,
CV_8U
);
{
cl_int
status
;
cl_int
*
pNodesPK
=
(
cl_int
*
)
clEnqueueMapBuffer
(
getClCommandQueue
(
oclNodesPK
.
clCxt
),(
cl_mem
)
oclNodesPK
.
datastart
,
true
,
CL_MAP_WRITE
,
0
,
oclNodesPK
.
step
,
0
,
0
,
0
,
&
status
);
openCLVerifyCall
(
status
);
//use known local data stride to precalulate indexes
int
DATA_SIZE_X
=
(
localThreads
[
0
]
+
cascade
->
orig_window_size
.
width
);
// check that maximal value is less than maximal unsigned short
assert
(
DATA_SIZE_X
*
cascade
->
orig_window_size
.
height
+
cascade
->
orig_window_size
.
width
<
USHRT_MAX
);
for
(
int
i
=
0
;
i
<
nodenum
;
++
i
)
{
//process each node from classifier
struct
NodePK
{
unsigned
short
slm_index
[
3
][
4
];
float
weight
[
3
];
float
threshold
;
float
alpha
[
2
];
};
struct
NodePK
*
pOut
=
(
struct
NodePK
*
)(
pNodesPK
+
NODE_SIZE
*
i
);
for
(
int
k
=
0
;
k
<
3
;
++
k
)
{
// calc 4 short indexes in shared local mem for each rectangle instead of 2 (x,y) pair.
int
*
p
=
&
(
node
[
i
].
p
[
k
][
0
]);
pOut
->
slm_index
[
k
][
0
]
=
(
unsigned
short
)(
p
[
1
]
*
DATA_SIZE_X
+
p
[
0
]);
pOut
->
slm_index
[
k
][
1
]
=
(
unsigned
short
)(
p
[
1
]
*
DATA_SIZE_X
+
p
[
2
]);
pOut
->
slm_index
[
k
][
2
]
=
(
unsigned
short
)(
p
[
3
]
*
DATA_SIZE_X
+
p
[
0
]);
pOut
->
slm_index
[
k
][
3
]
=
(
unsigned
short
)(
p
[
3
]
*
DATA_SIZE_X
+
p
[
2
]);
}
//store used float point values for each node
pOut
->
weight
[
0
]
=
node
[
i
].
weight
[
0
];
pOut
->
weight
[
1
]
=
node
[
i
].
weight
[
1
];
pOut
->
weight
[
2
]
=
node
[
i
].
weight
[
2
];
pOut
->
threshold
=
node
[
i
].
threshold
;
pOut
->
alpha
[
0
]
=
node
[
i
].
alpha
[
0
];
pOut
->
alpha
[
1
]
=
node
[
i
].
alpha
[
1
];
}
openCLSafeCall
(
clEnqueueUnmapMemObject
(
getClCommandQueue
(
oclNodesPK
.
clCxt
),(
cl_mem
)
oclNodesPK
.
datastart
,
pNodesPK
,
0
,
0
,
0
));
pNodesPK
=
NULL
;
}
// add 2 additional buffers (WGinfo and packed nodes) as 2 last args
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
oclNodesPK
.
datastart
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
oclWGInfo
.
datastart
));
//form build options for kernel
string
options
=
"-D PACKED_CLASSIFIER"
;
options
+=
format
(
" -D NODE_SIZE=%d"
,
NODE_SIZE
);
options
+=
format
(
" -D WND_SIZE_X=%d"
,
cascade
->
orig_window_size
.
width
);
options
+=
format
(
" -D WND_SIZE_Y=%d"
,
cascade
->
orig_window_size
.
height
);
options
+=
format
(
" -D STUMP_BASED=%d"
,
gcascade
->
is_stump_based
);
options
+=
format
(
" -D LSx=%d"
,
localThreads
[
0
]);
options
+=
format
(
" -D LSy=%d"
,
localThreads
[
1
]);
options
+=
format
(
" -D SPLITNODE=%d"
,
splitnode
);
options
+=
format
(
" -D SPLITSTAGE=%d"
,
splitstage
);
options
+=
format
(
" -D OUTPUTSZ=%d"
,
outputsz
);
// init candiate global count by 0
int
pattern
=
0
;
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
candidatebuffer
,
1
,
0
,
1
*
sizeof
(
pattern
),
&
pattern
,
0
,
NULL
,
NULL
));
// execute face detector
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascadePacked"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
options
.
c_str
());
//read candidate buffer back and put it into host list
openCLReadBuffer
(
gsum
.
clCxt
,
candidatebuffer
,
candidate
,
4
*
sizeof
(
int
)
*
outputsz
);
assert
(
candidate
[
0
]
<
outputsz
);
//printf("candidate[0]=%d\n",candidate[0]);
for
(
int
i
=
1
;
i
<=
candidate
[
0
];
i
++
)
{
allCandidates
.
push_back
(
Rect
(
candidate
[
4
*
i
],
candidate
[
4
*
i
+
1
],
candidate
[
4
*
i
+
2
],
candidate
[
4
*
i
+
3
]));
}
}
else
{
const
char
*
build_options
=
gcascade
->
is_stump_based
?
"-D STUMP_BASED=1"
:
"-D STUMP_BASED=0"
;
openCLReadBuffer
(
gsum
.
clCxt
,
candidatebuffer
,
candidate
,
4
*
sizeof
(
int
)
*
outputsz
);
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascade"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
for
(
int
i
=
0
;
i
<
outputsz
;
i
++
)
if
(
candidate
[
4
*
i
+
2
]
!=
0
)
allCandidates
.
push_back
(
Rect
(
candidate
[
4
*
i
],
candidate
[
4
*
i
+
1
],
candidate
[
4
*
i
+
2
],
candidate
[
4
*
i
+
3
]));
openCLReadBuffer
(
gsum
.
clCxt
,
candidatebuffer
,
candidate
,
4
*
sizeof
(
int
)
*
outputsz
);
for
(
int
i
=
0
;
i
<
outputsz
;
i
++
)
if
(
candidate
[
4
*
i
+
2
]
!=
0
)
allCandidates
.
push_back
(
Rect
(
candidate
[
4
*
i
],
candidate
[
4
*
i
+
1
],
candidate
[
4
*
i
+
2
],
candidate
[
4
*
i
+
3
]));
}
free
(
scaleinfo
);
free
(
candidate
);
...
...
modules/ocl/src/imgproc.cpp
View file @
d11921f1
...
...
@@ -876,8 +876,60 @@ namespace cv
if
(
ksize
>
0
)
{
Sobel
(
src
,
Dx
,
CV_32F
,
1
,
0
,
ksize
,
scale
,
0
,
borderType
);
Sobel
(
src
,
Dy
,
CV_32F
,
0
,
1
,
ksize
,
scale
,
0
,
borderType
);
Context
*
clCxt
=
Context
::
getContext
();
if
(
clCxt
->
supportsFeature
(
FEATURE_CL_INTEL_DEVICE
)
&&
src
.
type
()
==
CV_8UC1
&&
src
.
cols
%
8
==
0
&&
src
.
rows
%
8
==
0
&&
ksize
==
3
&&
(
borderType
==
cv
::
BORDER_REFLECT
||
borderType
==
cv
::
BORDER_REPLICATE
||
borderType
==
cv
::
BORDER_REFLECT101
||
borderType
==
cv
::
BORDER_WRAP
))
{
Dx
.
create
(
src
.
size
(),
CV_32FC1
);
Dy
.
create
(
src
.
size
(),
CV_32FC1
);
const
unsigned
int
block_x
=
8
;
const
unsigned
int
block_y
=
8
;
unsigned
int
src_pitch
=
src
.
step
;
unsigned
int
dst_pitch
=
Dx
.
cols
;
float
_scale
=
scale
;
std
::
vector
<
std
::
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
std
::
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
src
.
data
));
args
.
push_back
(
std
::
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
Dx
.
data
));
args
.
push_back
(
std
::
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
Dy
.
data
));
args
.
push_back
(
std
::
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
std
::
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
std
::
make_pair
(
sizeof
(
cl_uint
)
,
(
void
*
)
&
src_pitch
));
args
.
push_back
(
std
::
make_pair
(
sizeof
(
cl_uint
)
,
(
void
*
)
&
dst_pitch
));
args
.
push_back
(
std
::
make_pair
(
sizeof
(
cl_float
)
,
(
void
*
)
&
_scale
));
size_t
gt2
[
3
]
=
{
src
.
cols
,
src
.
rows
,
1
},
lt2
[
3
]
=
{
block_x
,
block_y
,
1
};
string
option
=
"-D BLK_X=8 -D BLK_Y=8"
;
switch
(
borderType
)
{
case
cv
:
:
BORDER_REPLICATE
:
option
+=
" -D BORDER_REPLICATE"
;
break
;
case
cv
:
:
BORDER_REFLECT
:
option
+=
" -D BORDER_REFLECT"
;
break
;
case
cv
:
:
BORDER_REFLECT101
:
option
+=
" -D BORDER_REFLECT101"
;
break
;
case
cv
:
:
BORDER_WRAP
:
option
+=
" -D BORDER_WRAP"
;
break
;
}
openCLExecuteKernel
(
src
.
clCxt
,
&
imgproc_sobel3
,
"sobel3"
,
gt2
,
lt2
,
args
,
-
1
,
-
1
,
option
.
c_str
()
);
}
else
{
Sobel
(
src
,
Dx
,
CV_32F
,
1
,
0
,
ksize
,
scale
,
0
,
borderType
);
Sobel
(
src
,
Dy
,
CV_32F
,
0
,
1
,
ksize
,
scale
,
0
,
borderType
);
}
}
else
{
...
...
modules/ocl/src/opencl/haarobjectdetect.cl
View file @
d11921f1
...
...
@@ -101,6 +101,144 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
float
inv_window_area
__attribute__
((
aligned
(
4
)))
;
}
GpuHidHaarClassifierCascade
;
#
ifdef
PACKED_CLASSIFIER
//
this
code
is
scalar,
one
pixel
->
one
workitem
__kernel
void
gpuRunHaarClassifierCascadePacked
(
global
const
GpuHidHaarStageClassifier
*
stagecascadeptr,
global
const
int4
*
info,
global
const
GpuHidHaarTreeNode
*
nodeptr,
global
const
int
*
restrict
sum,
global
const
float
*
restrict
sqsum,
volatile
global
int4
*
candidate,
const
int
pixelstep,
const
int
loopcount,
const
int
start_stage,
const
int
split_stage,
const
int
end_stage,
const
int
startnode,
const
int
splitnode,
const
int4
p,
const
int4
pq,
const
float
correction,
global
const
int*
pNodesPK,
global
const
int4*
pWGInfo
)
{
//
this
version
used
information
provided
for
each
workgroup
//
no
empty
WG
int
gid
=
(
int
)
get_group_id
(
0
)
;
int
lid_x
=
(
int
)
get_local_id
(
0
)
;
int
lid_y
=
(
int
)
get_local_id
(
1
)
;
int
lid
=
lid_y*LSx+lid_x
;
int4
WGInfo
=
pWGInfo[gid]
;
int
GroupX
=
(
WGInfo.y
>>
16
)
&0xFFFF
;
int
GroupY
=
(
WGInfo.y
>>
0
)
&
0xFFFF
;
int
Width
=
(
WGInfo.x
>>
16
)
&0xFFFF
;
int
Height
=
(
WGInfo.x
>>
0
)
&
0xFFFF
;
int
ImgOffset
=
WGInfo.z
;
float
ScaleFactor
=
as_float
(
WGInfo.w
)
;
#
define
DATA_SIZE_X
(
LSx+WND_SIZE_X
)
#
define
DATA_SIZE_Y
(
LSy+WND_SIZE_Y
)
#
define
DATA_SIZE
(
DATA_SIZE_X*DATA_SIZE_Y
)
local
int
SumL[DATA_SIZE]
;
//
read
input
data
window
into
local
mem
for
(
int
i
=
0
; i<DATA_SIZE; i+=(LSx*LSy))
{
int
index
=
i+lid
; // index in shared local memory
if
(
index<DATA_SIZE
)
{//
calc
global
x,y
coordinat
and
read
data
from
there
int
x
=
min
(
GroupX
+
(
index
%
(
DATA_SIZE_X
))
,
Width-1
)
;
int
y
=
min
(
GroupY
+
(
index
/
(
DATA_SIZE_X
))
,
Height-1
)
;
SumL[index]
=
sum[ImgOffset+y*pixelstep+x]
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
calc
variance_norm_factor
for
all
stages
float
variance_norm_factor
;
int
nodecounter=
startnode
;
int4
info1
=
p
;
int4
info2
=
pq
;
{
int
xl
=
lid_x
;
int
yl
=
lid_y
;
int
OffsetLocal
=
yl
*
DATA_SIZE_X
+
xl
;
int
OffsetGlobal
=
(
GroupY+yl
)
*
pixelstep
+
(
GroupX+xl
)
;
//
add
shift
to
get
position
on
scaled
image
OffsetGlobal
+=
ImgOffset
;
float
mean
=
SumL[info1.y*DATA_SIZE_X+info1.x+OffsetLocal]
-
SumL[info1.y*DATA_SIZE_X+info1.z+OffsetLocal]
-
SumL[info1.w*DATA_SIZE_X+info1.x+OffsetLocal]
+
SumL[info1.w*DATA_SIZE_X+info1.z+OffsetLocal]
;
float
sq
=
sqsum[info2.y*pixelstep+info2.x+OffsetGlobal]
-
sqsum[info2.y*pixelstep+info2.z+OffsetGlobal]
-
sqsum[info2.w*pixelstep+info2.x+OffsetGlobal]
+
sqsum[info2.w*pixelstep+info2.z+OffsetGlobal]
;
mean
*=
correction
;
sq
*=
correction
;
variance_norm_factor
=
sq
-
mean
*
mean
;
variance_norm_factor
=
(
variance_norm_factor
>=0.f
)
?
sqrt
(
variance_norm_factor
)
:
1.f
;
}//
end
calc
variance_norm_factor
for
all
stages
int
result
=
(
1.0f>0.0f
)
;
for
(
int
stageloop
=
start_stage
; (stageloop < end_stage) && result; stageloop++ )
{//
iterate
until
candidate
is
exist
float
stage_sum
=
0.0f
;
int2
stageinfo
=
*
(
global
int2*
)(
stagecascadeptr+stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
int
lcl_off
=
(
lid_y*DATA_SIZE_X
)
+
(
lid_x
)
;
for
(
int
nodeloop
=
0
; nodeloop < stageinfo.x; nodecounter++,nodeloop++ )
{
//
simple
macro
to
extract
shorts
from
int
#
define
M0
(
_t
)
((
_t
)
&0xFFFF
)
#
define
M1
(
_t
)
(((
_t
)
>>16
)
&0xFFFF
)
//
load
packed
node
data
from
global
memory
(
L3
)
into
registers
global
const
int4*
pN
=
(
__global
int4*
)(
pNodesPK+nodecounter*NODE_SIZE
)
;
int4
n0
=
pN[0]
;
int4
n1
=
pN[1]
;
int4
n2
=
pN[2]
;
float
nodethreshold
=
as_float
(
n2.y
)
*
variance_norm_factor
;
//
calc
sum
of
intensity
pixels
according
to
node
information
float
classsum
=
(
SumL[M0
(
n0.x
)
+lcl_off]
-
SumL[M1
(
n0.x
)
+lcl_off]
-
SumL[M0
(
n0.y
)
+lcl_off]
+
SumL[M1
(
n0.y
)
+lcl_off]
)
*
as_float
(
n1.z
)
+
(
SumL[M0
(
n0.z
)
+lcl_off]
-
SumL[M1
(
n0.z
)
+lcl_off]
-
SumL[M0
(
n0.w
)
+lcl_off]
+
SumL[M1
(
n0.w
)
+lcl_off]
)
*
as_float
(
n1.w
)
+
(
SumL[M0
(
n1.x
)
+lcl_off]
-
SumL[M1
(
n1.x
)
+lcl_off]
-
SumL[M0
(
n1.y
)
+lcl_off]
+
SumL[M1
(
n1.y
)
+lcl_off]
)
*
as_float
(
n2.x
)
;
//accumulate
stage
responce
stage_sum
+=
(
classsum
>=
nodethreshold
)
?
as_float
(
n2.w
)
:
as_float
(
n2.z
)
;
}
result
=
(
stage_sum
>=
stagethreshold
)
;
}//
next
stage
if
needed
if
(
result
)
{//
all
stages
will
be
passed
and
there
is
a
detected
face
on
the
tested
position
int
index
=
1+atomic_inc
((
volatile
global
int*
)
candidate
)
; //get index to write global data with face info
if
(
index<OUTPUTSZ
)
{
int
x
=
GroupX+lid_x
;
int
y
=
GroupY+lid_y
;
int4
candidate_result
;
candidate_result.x
=
convert_int_rtn
(
x*ScaleFactor
)
;
candidate_result.y
=
convert_int_rtn
(
y*ScaleFactor
)
;
candidate_result.z
=
convert_int_rtn
(
ScaleFactor*WND_SIZE_X
)
;
candidate_result.w
=
convert_int_rtn
(
ScaleFactor*WND_SIZE_Y
)
;
candidate[index]
=
candidate_result
;
}
}
}//end
gpuRunHaarClassifierCascade
#
else
__kernel
void
__attribute__
((
reqd_work_group_size
(
8
,
8
,
1
)))
gpuRunHaarClassifierCascade
(
global
GpuHidHaarStageClassifier
*
stagecascadeptr,
global
int4
*
info,
...
...
@@ -421,3 +559,4 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
}//end
for
(
int
grploop=grpidx
;grploop<totalgrp;grploop+=grpnumx)
}//end
for
(
int
scalei
=
0
; scalei <loopcount; scalei++)
}
#
endif
modules/ocl/src/opencl/imgproc_sobel3.cl
0 → 100644
View file @
d11921f1
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro
for
border
type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#
ifdef
BORDER_REPLICATE
//BORDER_REPLICATE:
aaaaaa|abcdefgh|hhhhhhh
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
l_edge
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
r_edge
)
-1
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
t_edge
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
b_edge
)
-1
:
(
addr
))
#
endif
#
ifdef
BORDER_REFLECT
//BORDER_REFLECT:
fedcba|abcdefgh|hgfedcb
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
-
(
i
)
-1
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-1+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
-
(
i
)
-1
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-1+
((
b_edge
)
<<1
)
:
(
addr
))
#
endif
#
ifdef
BORDER_REFLECT101
//BORDER_REFLECT101:
gfedcb|abcdefgh|gfedcba
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
-
(
i
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-2+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
-
(
i
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-2+
((
b_edge
)
<<1
)
:
(
addr
))
#
endif
#
ifdef
BORDER_WRAP
//BORDER_WRAP:
cdefgh|abcdefgh|abcdefg
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
i
)
+
(
r_edge
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
i
)
-
(
r_edge
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
i
)
+
(
b_edge
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
i
)
-
(
b_edge
)
:
(
addr
))
#
endif
__kernel
void
sobel3
(
__global
uchar*
Src,
__global
float*
DstX,
__global
float*
DstY,
int
width,
int
height,
uint
srcStride,
uint
dstStride,
float
scale
)
{
__local
float
lsmem[BLK_Y+2][BLK_X+2]
;
int
lix
=
get_local_id
(
0
)
;
int
liy
=
get_local_id
(
1
)
;
int
gix
=
get_group_id
(
0
)
;
int
giy
=
get_group_id
(
1
)
;
int
id_x
=
get_global_id
(
0
)
;
int
id_y
=
get_global_id
(
1
)
;
lsmem[liy+1][lix+1]
=
convert_float
(
Src[
id_y
*
srcStride
+
id_x
]
)
;
int
id_y_h
=
ADDR_H
(
id_y-1,
0
,
height
)
;
int
id_y_b
=
ADDR_B
(
id_y+1,
height,id_y+1
)
;
int
id_x_l
=
ADDR_L
(
id_x-1,
0
,
width
)
;
int
id_x_r
=
ADDR_R
(
id_x+1,
width,id_x+1
)
;
if
(
liy==0
)
{
lsmem[0][lix+1]=convert_float
(
Src[
id_y_h
*
srcStride
+
id_x
]
)
;
if
(
lix==0
)
lsmem[0][0]=convert_float
(
Src[
id_y_h
*
srcStride
+
id_x_l
]
)
;
else
if
(
lix==BLK_X-1
)
lsmem[0][BLK_X+1]=convert_float
(
Src[
id_y_h
*
srcStride
+
id_x_r
]
)
;
}
else
if
(
liy==BLK_Y-1
)
{
lsmem[BLK_Y+1][lix+1]=convert_float
(
Src[
id_y_b
*
srcStride
+
id_x
]
)
;
if
(
lix==0
)
lsmem[BLK_Y+1][0]=convert_float
(
Src[
id_y_b
*
srcStride
+
id_x_l
]
)
;
else
if
(
lix==BLK_X-1
)
lsmem[BLK_Y+1][BLK_X+1]=convert_float
(
Src[
id_y_b
*
srcStride
+
id_x_r
]
)
;
}
if
(
lix==0
)
lsmem[liy+1][0]
=
convert_float
(
Src[
id_y
*
srcStride
+
id_x_l
]
)
;
else
if
(
lix==BLK_X-1
)
lsmem[liy+1][BLK_X+1]
=
convert_float
(
Src[
id_y
*
srcStride
+
id_x_r
]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
float
u1
=
lsmem[liy][lix]
;
float
u2
=
lsmem[liy][lix+1]
;
float
u3
=
lsmem[liy][lix+2]
;
float
m1
=
lsmem[liy+1][lix]
;
float
m2
=
lsmem[liy+1][lix+1]
;
float
m3
=
lsmem[liy+1][lix+2]
;
float
b1
=
lsmem[liy+2][lix]
;
float
b2
=
lsmem[liy+2][lix+1]
;
float
b3
=
lsmem[liy+2][lix+2]
;
//m2
*
scale
;//
float
dx
=
mad
(
2.0f,
m3
-
m1,
u3
-
u1
+
b3
-
b1
)
;
DstX[
id_y
*
dstStride
+
id_x
]
=
dx
*
scale
;
float
dy
=
mad
(
2.0f,
b2
-
u2,
b1
-
u1
+
b3
-
u3
)
;
DstY[
id_y
*
dstStride
+
id_x
]
=
dy
*
scale
;
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment