Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
0bf9ece9
Commit
0bf9ece9
authored
Oct 26, 2013
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocl: rewrite boxFilter
parent
cb6ea8bf
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
319 additions
and
579 deletions
+319
-579
ocl.hpp
modules/ocl/include/opencv2/ocl/ocl.hpp
+1
-5
filtering.cpp
modules/ocl/src/filtering.cpp
+85
-235
filtering_boxFilter.cl
modules/ocl/src/opencl/filtering_boxFilter.cl
+233
-339
No files found.
modules/ocl/include/opencv2/ocl/ocl.hpp
View file @
0bf9ece9
...
@@ -722,7 +722,7 @@ namespace cv
...
@@ -722,7 +722,7 @@ namespace cv
CV_EXPORTS
void
Laplacian
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
int
ksize
=
1
,
double
scale
=
1
);
CV_EXPORTS
void
Laplacian
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
int
ksize
=
1
,
double
scale
=
1
);
//! returns 2D box filter
//! returns 2D box filter
//
supports CV_8UC1 and CV_8UC4 source type,
dst type must be the same as source type
// dst type must be the same as source type
CV_EXPORTS
Ptr
<
BaseFilter_GPU
>
getBoxFilter_GPU
(
int
srcType
,
int
dstType
,
CV_EXPORTS
Ptr
<
BaseFilter_GPU
>
getBoxFilter_GPU
(
int
srcType
,
int
dstType
,
const
Size
&
ksize
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
const
Size
&
ksize
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
...
@@ -740,8 +740,6 @@ namespace cv
...
@@ -740,8 +740,6 @@ namespace cv
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
//! smooths the image using the normalized box filter
//! smooths the image using the normalized box filter
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
// supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP
CV_EXPORTS
void
boxFilter
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
Size
ksize
,
CV_EXPORTS
void
boxFilter
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
Size
ksize
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
Point
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
...
@@ -757,8 +755,6 @@ namespace cv
...
@@ -757,8 +755,6 @@ namespace cv
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
);
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
);
//! a synonym for normalized box filter
//! a synonym for normalized box filter
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
// supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
static
inline
void
blur
(
const
oclMat
&
src
,
oclMat
&
dst
,
Size
ksize
,
Point
anchor
=
Point
(
-
1
,
-
1
),
static
inline
void
blur
(
const
oclMat
&
src
,
oclMat
&
dst
,
Size
ksize
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_CONSTANT
)
int
borderType
=
BORDER_CONSTANT
)
{
{
...
...
modules/ocl/src/filtering.cpp
View file @
0bf9ece9
...
@@ -11,7 +11,7 @@
...
@@ -11,7 +11,7 @@
// For Open Source Computer Vision Library
// For Open Source Computer Vision Library
//
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-201
2
, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-201
3
, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
// Third party copyrights are property of their respective owners.
//
//
// @Authors
// @Authors
...
@@ -713,276 +713,126 @@ Ptr<FilterEngine_GPU> cv::ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter
...
@@ -713,276 +713,126 @@ Ptr<FilterEngine_GPU> cv::ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter
return
Ptr
<
FilterEngine_GPU
>
(
new
SeparableFilterEngine_GPU
(
rowFilter
,
columnFilter
));
return
Ptr
<
FilterEngine_GPU
>
(
new
SeparableFilterEngine_GPU
(
rowFilter
,
columnFilter
));
}
}
/*
static
void
GPUFilterBox
(
const
oclMat
&
src
,
oclMat
&
dst
,
**data type supported: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4
**support four border types: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101
*/
static
void
GPUFilterBox_8u_C1R
(
const
oclMat
&
src
,
oclMat
&
dst
,
Size
&
ksize
,
const
Point
anchor
,
const
int
borderType
)
Size
&
ksize
,
const
Point
anchor
,
const
int
borderType
)
{
{
//Normalize the result by default
//Normalize the result by default
float
alpha
=
ksize
.
height
*
ksize
.
width
;
float
alpha
=
1.0
f
/
(
ksize
.
height
*
ksize
.
width
)
;
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
(
src
.
rows
==
dst
.
rows
));
(
src
.
rows
==
dst
.
rows
));
Context
*
clCxt
=
src
.
clCxt
;
CV_Assert
(
src
.
oclchannels
()
==
dst
.
oclchannels
());
string
kernelName
=
"boxFilter_C1_D0"
;
char
btype
[
30
];
switch
(
borderType
)
{
case
0
:
sprintf
(
btype
,
"BORDER_CONSTANT"
);
break
;
case
1
:
sprintf
(
btype
,
"BORDER_REPLICATE"
);
break
;
case
2
:
sprintf
(
btype
,
"BORDER_REFLECT"
);
break
;
case
3
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
4
:
sprintf
(
btype
,
"BORDER_REFLECT_101"
);
break
;
}
char
build_options
[
150
];
size_t
BLOCK_SIZE
=
src
.
clCxt
->
getDeviceInfo
().
maxWorkItemSizes
[
0
];
sprintf
(
build_options
,
"-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s"
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
btype
);
size_t
BLOCK_SIZE_Y
=
8
;
// TODO Check heuristic value on devices
while
(
BLOCK_SIZE_Y
<
BLOCK_SIZE
/
8
&&
BLOCK_SIZE_Y
*
src
.
clCxt
->
getDeviceInfo
().
maxComputeUnits
*
32
<
(
size_t
)
src
.
rows
)
BLOCK_SIZE_Y
*=
2
;
size_t
blockSizeX
=
256
,
blockSizeY
=
1
;
CV_Assert
((
size_t
)
ksize
.
width
<=
BLOCK_SIZE
);
size_t
gSize
=
blockSizeX
-
(
ksize
.
width
-
1
);
size_t
threads
=
(
dst
.
offset
%
dst
.
step
%
4
+
dst
.
cols
+
3
)
/
4
;
size_t
globalSizeX
=
threads
%
gSize
==
0
?
threads
/
gSize
*
blockSizeX
:
(
threads
/
gSize
+
1
)
*
blockSizeX
;
size_t
globalSizeY
=
((
dst
.
rows
+
1
)
/
2
)
%
blockSizeY
==
0
?
((
dst
.
rows
+
1
)
/
2
)
:
(((
dst
.
rows
+
1
)
/
2
)
/
blockSizeY
+
1
)
*
blockSizeY
;
size_t
globalThreads
[
3
]
=
{
globalSizeX
,
globalSizeY
,
1
};
bool
isIsolatedBorder
=
(
borderType
&
BORDER_ISOLATED
)
!=
0
;
size_t
localThreads
[
3
]
=
{
blockSizeX
,
blockSizeY
,
1
};
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
alpha
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
openCLExecuteKernel
(
clCxt
,
&
filtering_boxFilter
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
}
static
void
GPUFilterBox_8u_C4R
(
const
oclMat
&
src
,
oclMat
&
dst
,
Size
&
ksize
,
const
Point
anchor
,
const
int
borderType
)
{
//Normalize the result by default
float
alpha
=
ksize
.
height
*
ksize
.
width
;
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
(
src
.
rows
==
dst
.
rows
));
Context
*
clCxt
=
src
.
clCxt
;
string
kernelName
=
"boxFilter_C4_D0"
;
char
btype
[
30
];
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
cl_uint
stepBytes
=
src
.
step
;
switch
(
borderType
)
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
stepBytes
));
int
offsetXBytes
=
src
.
offset
%
src
.
step
;
int
offsetX
=
offsetXBytes
/
src
.
elemSize
();
CV_Assert
((
int
)(
offsetX
*
src
.
elemSize
())
==
offsetXBytes
);
int
offsetY
=
src
.
offset
/
src
.
step
;
int
endX
=
(
offsetX
+
src
.
cols
);
int
endY
=
(
offsetY
+
src
.
rows
);
cl_int
rect
[
4
]
=
{
offsetX
,
offsetY
,
endX
,
endY
};
if
(
!
isIsolatedBorder
)
{
{
case
0
:
rect
[
2
]
=
src
.
wholecols
;
sprintf
(
btype
,
"BORDER_CONSTANT"
);
rect
[
3
]
=
src
.
wholerows
;
break
;
case
1
:
sprintf
(
btype
,
"BORDER_REPLICATE"
);
break
;
case
2
:
sprintf
(
btype
,
"BORDER_REFLECT"
);
break
;
case
3
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
4
:
sprintf
(
btype
,
"BORDER_REFLECT_101"
);
break
;
}
}
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
rect
[
0
]));
char
build_options
[
150
];
sprintf
(
build_options
,
"-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s"
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
btype
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
cl_uint
_stepBytes
=
dst
.
step
;
size_t
blockSizeX
=
256
,
blockSizeY
=
1
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
_stepBytes
));
size_t
gSize
=
blockSizeX
-
ksize
.
width
/
2
*
2
;
int
_offsetXBytes
=
dst
.
offset
%
dst
.
step
;
size_t
globalSizeX
=
(
src
.
cols
)
%
gSize
==
0
?
src
.
cols
/
gSize
*
blockSizeX
:
(
src
.
cols
/
gSize
+
1
)
*
blockSizeX
;
int
_offsetX
=
_offsetXBytes
/
dst
.
elemSize
();
size_t
rows_per_thread
=
2
;
CV_Assert
((
int
)(
_offsetX
*
dst
.
elemSize
())
==
_offsetXBytes
);
size_t
globalSizeY
=
((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
%
blockSizeY
==
0
?
((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
:
(((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
/
blockSizeY
+
1
)
*
blockSizeY
;
int
_offsetY
=
dst
.
offset
/
dst
.
step
;
int
_endX
=
(
_offsetX
+
dst
.
cols
);
size_t
globalThreads
[
3
]
=
{
globalSizeX
,
globalSizeY
,
1
};
int
_endY
=
(
_offsetY
+
dst
.
rows
);
size_t
localThreads
[
3
]
=
{
blockSizeX
,
blockSizeY
,
1
};
cl_int
_rect
[
4
]
=
{
_offsetX
,
_offsetY
,
_endX
,
_endY
};
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
_rect
[
0
]));
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
src
.
data
));
bool
useDouble
=
src
.
depth
()
==
CV_64F
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
alpha
));
float
borderValue
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
offset
));
double
borderValueDouble
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
if
((
borderType
&
~
BORDER_ISOLATED
)
==
BORDER_CONSTANT
)
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
openCLExecuteKernel
(
clCxt
,
&
filtering_boxFilter
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
}
static
void
GPUFilterBox_32F_C1R
(
const
oclMat
&
src
,
oclMat
&
dst
,
Size
&
ksize
,
const
Point
anchor
,
const
int
borderType
)
{
//Normalize the result by default
float
alpha
=
ksize
.
height
*
ksize
.
width
;
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
(
src
.
rows
==
dst
.
rows
));
Context
*
clCxt
=
src
.
clCxt
;
string
kernelName
=
"boxFilter_C1_D5"
;
char
btype
[
30
];
switch
(
borderType
)
{
{
case
0
:
if
(
useDouble
)
sprintf
(
btype
,
"BORDER_CONSTANT"
);
args
.
push_back
(
make_pair
(
sizeof
(
double
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValue
[
0
]));
break
;
else
case
1
:
args
.
push_back
(
make_pair
(
sizeof
(
float
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValueDouble
[
0
]));
sprintf
(
btype
,
"BORDER_REPLICATE"
);
break
;
case
2
:
sprintf
(
btype
,
"BORDER_REFLECT"
);
break
;
case
3
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
4
:
sprintf
(
btype
,
"BORDER_REFLECT_101"
);
break
;
}
}
char
build_options
[
150
];
double
alphaDouble
=
alpha
;
// DON'T move into 'if' body
sprintf
(
build_options
,
"-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s"
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
btype
);
if
(
useDouble
)
args
.
push_back
(
make_pair
(
sizeof
(
double
),
(
void
*
)
&
alphaDouble
));
size_t
blockSizeX
=
256
,
blockSizeY
=
1
;
else
size_t
gSize
=
blockSizeX
-
ksize
.
width
/
2
*
2
;
args
.
push_back
(
make_pair
(
sizeof
(
float
),
(
void
*
)
&
alpha
));
size_t
globalSizeX
=
(
src
.
cols
)
%
gSize
==
0
?
src
.
cols
/
gSize
*
blockSizeX
:
(
src
.
cols
/
gSize
+
1
)
*
blockSizeX
;
size_t
rows_per_thread
=
2
;
size_t
globalSizeY
=
((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
%
blockSizeY
==
0
?
((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
:
(((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
/
blockSizeY
+
1
)
*
blockSizeY
;
size_t
globalThreads
[
3
]
=
{
globalSizeX
,
globalSizeY
,
1
};
size_t
localThreads
[
3
]
=
{
blockSizeX
,
blockSizeY
,
1
};
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
alpha
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
openCLExecuteKernel
(
clCxt
,
&
filtering_boxFilter
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
}
static
void
GPUFilterBox_32F_C4R
(
const
oclMat
&
src
,
oclMat
&
dst
,
Size
&
ksize
,
const
Point
anchor
,
const
int
borderType
)
{
//Normalize the result by default
float
alpha
=
ksize
.
height
*
ksize
.
width
;
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
(
src
.
rows
==
dst
.
rows
));
Context
*
clCxt
=
src
.
clCxt
;
string
kernelName
=
"boxFilter_C4_D5"
;
c
har
btype
[
30
]
;
c
onst
char
*
btype
=
NULL
;
switch
(
borderType
)
switch
(
borderType
&
~
BORDER_ISOLATED
)
{
{
case
0
:
case
BORDER_CONSTANT
:
sprintf
(
btype
,
"BORDER_CONSTANT"
)
;
btype
=
"BORDER_CONSTANT"
;
break
;
break
;
case
1
:
case
BORDER_REPLICATE
:
sprintf
(
btype
,
"BORDER_REPLICATE"
)
;
btype
=
"BORDER_REPLICATE"
;
break
;
break
;
case
2
:
case
BORDER_REFLECT
:
sprintf
(
btype
,
"BORDER_REFLECT"
)
;
btype
=
"BORDER_REFLECT"
;
break
;
break
;
case
3
:
case
BORDER_WRAP
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
return
;
case
4
:
case
BORDER_REFLECT101
:
sprintf
(
btype
,
"BORDER_REFLECT_101"
)
;
btype
=
"BORDER_REFLECT_101"
;
break
;
break
;
}
}
char
build_options
[
150
];
int
requiredTop
=
anchor
.
y
;
sprintf
(
build_options
,
"-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s"
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
btype
);
int
requiredLeft
=
BLOCK_SIZE
;
// not this: anchor.x;
int
requiredBottom
=
ksize
.
height
-
1
-
anchor
.
y
;
size_t
blockSizeX
=
256
,
blockSizeY
=
1
;
int
requiredRight
=
BLOCK_SIZE
;
// not this: ksize.width - 1 - anchor.x;
size_t
gSize
=
blockSizeX
-
ksize
.
width
/
2
*
2
;
int
h
=
isIsolatedBorder
?
src
.
rows
:
src
.
wholerows
;
size_t
globalSizeX
=
(
src
.
cols
)
%
gSize
==
0
?
src
.
cols
/
gSize
*
blockSizeX
:
(
src
.
cols
/
gSize
+
1
)
*
blockSizeX
;
int
w
=
isIsolatedBorder
?
src
.
cols
:
src
.
wholecols
;
size_t
rows_per_thread
=
2
;
bool
extra_extrapolation
=
h
<
requiredTop
||
h
<
requiredBottom
||
w
<
requiredLeft
||
w
<
requiredRight
;
size_t
globalSizeY
=
((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
%
blockSizeY
==
0
?
((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
:
(((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
/
blockSizeY
+
1
)
*
blockSizeY
;
CV_Assert
(
w
>=
ksize
.
width
&&
h
>=
ksize
.
height
);
// TODO Other cases are not tested well
size_t
globalThreads
[
3
]
=
{
globalSizeX
,
globalSizeY
,
1
};
char
build_options
[
1024
];
size_t
localThreads
[
3
]
=
{
blockSizeX
,
blockSizeY
,
1
};
sprintf
(
build_options
,
"-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s"
,
(
int
)
BLOCK_SIZE
,
(
int
)
BLOCK_SIZE_Y
,
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
src
.
depth
(),
src
.
oclchannels
(),
useDouble
?
1
:
0
,
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
src
.
data
));
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
dst
.
data
));
btype
,
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
alpha
));
extra_extrapolation
?
"EXTRA_EXTRAPOLATION"
:
"NO_EXTRA_EXTRAPOLATION"
,
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
offset
));
isIsolatedBorder
?
"BORDER_ISOLATED"
:
"NO_BORDER_ISOLATED"
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
size_t
gt
[
3
]
=
{
divUp
(
dst
.
cols
,
BLOCK_SIZE
-
(
ksize
.
width
-
1
))
*
BLOCK_SIZE
,
divUp
(
dst
.
rows
,
BLOCK_SIZE_Y
),
1
},
lt
[
3
]
=
{
BLOCK_SIZE
,
1
,
1
};
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
step
));
openCLExecuteKernel
(
src
.
clCxt
,
&
filtering_boxFilter
,
"boxFilter"
,
gt
,
lt
,
args
,
-
1
,
-
1
,
build_options
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
openCLExecuteKernel
(
clCxt
,
&
filtering_boxFilter
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
}
}
Ptr
<
BaseFilter_GPU
>
cv
::
ocl
::
getBoxFilter_GPU
(
int
/*srcType*/
,
int
/*dstType*/
,
Ptr
<
BaseFilter_GPU
>
cv
::
ocl
::
getBoxFilter_GPU
(
int
srcType
,
int
dstType
,
const
Size
&
ksize
,
Point
anchor
,
int
borderType
)
const
Size
&
ksize
,
Point
anchor
,
int
borderType
)
{
{
static
const
FilterBox_t
FilterBox_callers
[
2
][
5
]
=
{{
0
,
GPUFilterBox_8u_C1R
,
0
,
GPUFilterBox_8u_C4R
,
GPUFilterBox_8u_C4R
},
{
0
,
GPUFilterBox_32F_C1R
,
0
,
GPUFilterBox_32F_C4R
,
GPUFilterBox_32F_C4R
}
};
//Remove this check if more data types need to be supported.
CV_Assert
((
srcType
==
CV_8UC1
||
srcType
==
CV_8UC3
||
srcType
==
CV_8UC4
||
srcType
==
CV_32FC1
||
srcType
==
CV_32FC3
||
srcType
==
CV_32FC4
)
&&
dstType
==
srcType
);
normalizeAnchor
(
anchor
,
ksize
);
normalizeAnchor
(
anchor
,
ksize
);
return
Ptr
<
BaseFilter_GPU
>
(
new
GPUBoxFilter
(
ksize
,
anchor
,
return
Ptr
<
BaseFilter_GPU
>
(
new
GPUBoxFilter
(
ksize
,
anchor
,
borderType
,
FilterBox_callers
[(
CV_MAT_DEPTH
(
srcType
)
==
CV_32F
)][
CV_MAT_CN
(
srcType
)]
));
borderType
,
GPUFilterBox
));
}
}
Ptr
<
FilterEngine_GPU
>
cv
::
ocl
::
createBoxFilter_GPU
(
int
srcType
,
int
dstType
,
Ptr
<
FilterEngine_GPU
>
cv
::
ocl
::
createBoxFilter_GPU
(
int
srcType
,
int
dstType
,
...
...
modules/ocl/src/opencl/filtering_boxFilter.cl
View file @
0bf9ece9
...
@@ -10,13 +10,9 @@
...
@@ -10,13 +10,9 @@
//
License
Agreement
//
License
Agreement
//
For
Open
Source
Computer
Vision
Library
//
For
Open
Source
Computer
Vision
Library
//
//
//
Copyright
(
C
)
2010-2012,
Institute
Of
Software
Chinese
Academy
Of
Science,
all
rights
reserved.
//
Copyright
(
C
)
2010-2013,
Advanced
Micro
Devices,
Inc.,
all
rights
reserved.
//
Copyright
(
C
)
2010-2012,
Advanced
Micro
Devices,
Inc.,
all
rights
reserved.
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
//
//
@Authors
//
Zhang
Ying,
zhangying913@gmail.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
//
are
permitted
provided
that
the
following
conditions
are
met:
//
//
...
@@ -79,400 +75,298 @@
...
@@ -79,400 +75,298 @@
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
i
)
-
(
b_edge
)
:
(
addr
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
i
)
-
(
b_edge
)
:
(
addr
))
#
endif
#
endif
#
define
THREADS
256
#
ifdef
EXTRA_EXTRAPOLATION
//
border
>
src
image
size
#
define
ELEM
(
i,
l_edge,
r_edge,
elem1,
elem2
)
(
i
)
>=
(
l_edge
)
&&
(
i
)
<
(
r_edge
)
?
(
elem1
)
:
(
elem2
)
#
ifdef
BORDER_CONSTANT
//
None
inline
void
update_dst_C1_D0
(
__global
uchar
*dst,
__local
uint*
temp,
#
elif
defined
BORDER_REPLICATE
int
dst_rows,
int
dst_cols,
#
define
EXTRAPOLATE
(
x,
y,
minX,
minY,
maxX,
maxY
)
\
int
dst_startX,
int
dst_x_off,
{
\
float
alpha
)
x
=
max
(
min
(
x,
maxX
-
1
)
,
minX
)
; \
{
y
=
max
(
min
(
y,
maxY
-
1
)
,
minY
)
; \
if
(
get_local_id
(
0
)
<
anX
|
| get_local_id(0) >= (THREADS-ksX+anX+1))
{
return;
}
}
#
elif
defined
BORDER_WRAP
uint4 tmp_sum = 0;
#
define
EXTRAPOLATE
(
x,
y,
minX,
minY,
maxX,
maxY
)
\
int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4;
{
\
int posY = (get_group_id(1) << 1);
if
(
x
<
minX
)
\
x
-=
((
x
-
maxX
+
1
)
/
maxX
)
*
maxX
; \
for(int i=-anX; i<=anX; i++)
if
(
x
>=
maxX
)
\
{
x
%=
maxX
; \
tmp_sum += vload4(get_local_id(0), temp+i);
if
(
y
<
minY
)
\
y
-=
((
y
-
maxY
+
1
)
/
maxY
)
*
maxY
; \
if
(
y
>=
maxY
)
\
y
%=
maxY
; \
}
}
#
elif
defined
(
BORDER_REFLECT
)
|
| defined(BORDER_REFLECT_101)
if(posY < dst_rows && posX < dst_cols)
#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \
{
{ \
tmp_sum /= (uint4) alpha;
if (maxX - minX == 1) \
if(posX >= 0 && posX < dst_cols)
x = minX; \
*(dst) = tmp_sum.x;
else \
if(posX+1 >= 0 && posX+1 < dst_cols)
do \
*(dst + 1) = tmp_sum.y;
{ \
if(posX+2 >= 0 && posX+2 < dst_cols)
if (x < minX) \
*(dst + 2) = tmp_sum.z;
x = -(x - minX) - 1 + delta; \
if(posX+3 >= 0 && posX+3 < dst_cols)
else \
*(dst + 3) = tmp_sum.w;
x = maxX - 1 - (x - maxX) - delta; \
} \
while (x >= maxX || x < minX); \
\
if (maxY - minY == 1) \
y = minY; \
else \
do \
{ \
if (y < minY) \
y = -(y - minY) - 1 + delta; \
else \
y = maxY - 1 - (y - maxY) - delta; \
} \
while (y >= maxY || y < minY); \
}
}
}
#ifdef BORDER_REFLECT
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
#elif defined(BORDER_REFLECT_101)
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
#endif
#else
#error No extrapolation method
#endif
#else
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
{ \
int _row = y - minY, _col = x - minX; \
_row = ADDR_H(_row, 0, maxY - minY); \
_row = ADDR_B(_row, maxY - minY, _row); \
y = _row + minY; \
\
_col = ADDR_L(_col, 0, maxX - minX); \
_col = ADDR_R(_col, maxX - minX, _col); \
x = _col + minX; \
}
#endif
#if USE_DOUBLE
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define FPTYPE double
#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE)
#else
#define FPTYPE float
#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE)
#endif
inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp,
#if DATA_DEPTH == 0
int dst_rows, int dst_cols,
#define BASE_TYPE uchar
int dst_startX, int dst_x_off,
#elif DATA_DEPTH == 1
float alpha)
#define BASE_TYPE char
{
#elif DATA_DEPTH == 2
if(get_local_id(0) >= (THREADS-ksX+1))
#define BASE_TYPE ushort
{
#elif DATA_DEPTH == 3
return;
#define BASE_TYPE short
}
#elif DATA_DEPTH == 4
#define BASE_TYPE int
#elif DATA_DEPTH == 5
#define BASE_TYPE float
#elif DATA_DEPTH == 6
#define BASE_TYPE double
#else
#error data_depth
#endif
int posX = dst_startX - dst_x_off + get_local_id(0);
#define __CAT(x, y) x##y
int posY = (get_group_id(1) << 1);
#define CAT(x, y) __CAT(x, y)
#define uchar1 uchar
#define char1 char
#define ushort1 ushort
#define short1 short
#define int1 int
#define float1 float
#define double1 double
#define convert_uchar1_sat_rte convert_uchar_sat_rte
#define convert_char1_sat_rte convert_char_sat_rte
#define convert_ushort1_sat_rte convert_ushort_sat_rte
#define convert_short1_sat_rte convert_short_sat_rte
#define convert_int1_sat_rte convert_int_sat_rte
#define convert_float1
#define convert_double1
#if DATA_DEPTH == 5 |
|
DATA_DEPTH
==
6
#
define
CONVERT_TO_TYPE
CAT
(
CAT
(
convert_,
BASE_TYPE
)
,
VEC_SIZE
)
#
else
#
define
CONVERT_TO_TYPE
CAT
(
CAT
(
CAT
(
convert_,
BASE_TYPE
)
,
VEC_SIZE
)
,
_sat_rte
)
#
endif
uint4 temp_sum = 0;
#
define
VEC_SIZE
DATA_CHAN
for(int i=-anX; i<=anX; i++)
{
temp_sum += temp[get_local_id(0) + anX + i];
}
if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows)
#
define
VEC_TYPE
CAT
(
BASE_TYPE,
VEC_SIZE
)
*dst = convert_uchar4(convert_float4(temp_sum)/alpha);
#
define
TYPE
VEC_TYPE
}
///////////////////////////////////////////////////////////////////////////////////////////////////
#
define
SCALAR_TYPE
CAT
(
FPTYPE,
VEC_SIZE
)
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
#
define
INTERMEDIATE_TYPE
CAT
(
FPTYPE,
VEC_SIZE
)
const int gX = get_group_id(0);
const int gY = get_group_id(1);
int src_x_off = src_offset % src_step;
int src_y_off = src_offset / src_step;
int dst_x_off = dst_offset % dst_step;
int dst_y_off = dst_offset / dst_step;
int head_off = dst_x_off%4;
struct
RectCoords
int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off;
{
int startY = (gY << 1) - anY + src_y_off;
int
x1,
y1,
x2,
y2
;
int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
}
;
int dst_startY = (gY << 1) + dst_y_off;
uint4 data[ksY+1];
//#define
DEBUG
__local uint4 temp[2][THREADS];
#
ifdef
DEBUG
#
define
DEBUG_ONLY
(
x
)
x
#
define
ASSERT
(
condition
)
do
{
if
(
!
(
condition
))
{
printf
(
"BUG in boxFilter kernel (global=%d,%d): "
#
condition
"\n"
,
get_global_id
(
0
)
,
get_global_id
(
1
))
; } } while (0)
#
else
#
define
DEBUG_ONLY
(
x
)
#
define
ASSERT
(
condition
)
#
endif
#ifdef BORDER_CONSTANT
for(int i=0; i < ksY+1; i++)
inline
INTERMEDIATE_TYPE
readSrcPixel
(
int2
pos,
__global
TYPE
*src,
const
unsigned
int
srcStepBytes,
const
struct
RectCoords
srcCoords
{
#
ifdef
BORDER_CONSTANT
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
,
SCALAR_TYPE
borderValue
#
endif
)
{
#
ifdef
BORDER_ISOLATED
if
(
pos.x
>=
srcCoords.x1
&&
pos.y
>=
srcCoords.y1
&&
pos.x
<
srcCoords.x2
&&
pos.y
<
srcCoords.y2
)
#
else
if
(
pos.x
>=
0
&&
pos.y
>=
0
&&
pos.x
<
srcCoords.x2
&&
pos.y
<
srcCoords.y2
)
#
endif
{
{
data[i].x = *(src+(startY+i)*src_step + startX + col * 4);
__global
TYPE*
ptr
=
(
__global
TYPE*
)((
__global
char*
)
src
+
pos.x
*
sizeof
(
TYPE
)
+
pos.y
*
srcStepBytes
)
;
data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1);
return
CONVERT_TO_FPTYPE
(
*ptr
)
;
data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2);
data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3);
}
}
else
else
{
{
data[i]=0;
#
ifdef
BORDER_CONSTANT
int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
return
borderValue
;
if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
}
}
#
else
#
else
int not_all_in_range;
int
selected_col
=
pos.x
;
for(int i=0; i < ksY+1; i++)
int
selected_row
=
pos.y
;
{
not_all_in_range = (startX+col*4<0) |
(
startX+col*4+3>src_whole_cols-1
)
| (startY+i<0) |
(
startY+i>src_whole_rows-1
)
;
if
(
not_all_in_range
)
{
int
selected_row
;
int4
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
selected_col.x
=
ADDR_L
(
startX+col*4,
0
,
src_whole_cols
)
;
selected_col.x
=
ADDR_R
(
startX+col*4,
src_whole_cols,
selected_col.x
)
;
selected_col.y
=
ADDR_L
(
startX+col*4+1,
0
,
src_whole_cols
)
;
EXTRAPOLATE
(
selected_col,
selected_row,
selected_col.y
=
ADDR_R
(
startX+col*4+1,
src_whole_cols,
selected_col.y
)
;
#
ifdef
BORDER_ISOLATED
srcCoords.x1,
srcCoords.y1,
selected_col.z
=
ADDR_L
(
startX+col*4+2,
0
,
src_whole_cols
)
;
#
else
selected_col.z
=
ADDR_R
(
startX+col*4+2,
src_whole_cols,
selected_col.z
)
;
0
,
0
,
#
endif
srcCoords.x2,
srcCoords.y2
)
;
selected_col.w
=
ADDR_L
(
startX+col*4+3,
0
,
src_whole_cols
)
;
//
debug
border
mapping
selected_col.w
=
ADDR_R
(
startX+col*4+3,
src_whole_cols,
selected_col.
w
)
;
//printf
(
"pos=%d,%d --> %d, %d\n"
,
pos.x,
pos.y,
selected_col,
selected_ro
w
)
;
data[i].x
=
*
(
src
+
selected_row
*
src_step
+
selected_col.x
)
;
pos
=
(
int2
)(
selected_col,
selected_row
)
;
data[i].y
=
*
(
src
+
selected_row
*
src_step
+
selected_col.y
)
;
if
(
pos.x
>=
0
&&
pos.y
>=
0
&&
pos.x
<
srcCoords.x2
&&
pos.y
<
srcCoords.y2
)
data[i].z
=
*
(
src
+
selected_row
*
src_step
+
selected_col.z
)
;
{
data[i].w
=
*
(
src
+
selected_row
*
src_step
+
selected_col.w
)
;
__global
TYPE*
ptr
=
(
__global
TYPE*
)((
__global
char*
)
src
+
pos.x
*
sizeof
(
TYPE
)
+
pos.y
*
srcStepBytes
)
;
return
CONVERT_TO_FPTYPE
(
*ptr
)
;
}
}
else
else
{
{
data[i]
=
convert_uint4
(
vload4
(
col,
(
__global
uchar*
)(
src+
(
startY+i
)
*src_step
+
startX
)))
;
//
for
debug
only
}
DEBUG_ONLY
(
printf
(
"BUG in boxFilter kernel\n"
))
;
return
(
FPTYPE
)(
0.0f
)
;
}
}
#
endif
#
endif
uint4
tmp_sum
=
0
;
for
(
int
i=1
; i < ksY; i++)
{
tmp_sum
+=
(
data[i]
)
;
}
}
int
index
=
dst_startY
*
dst_step
+
dst_startX
+
(
col-anX
)
*4
;
temp[0][col]
=
tmp_sum
+
(
data[0]
)
;
temp[1][col]
=
tmp_sum
+
(
data[ksY]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
update_dst_C1_D0
(
dst+index,
(
__local
uint
*
)(
temp[0]
)
,
dst_rows,
dst_cols,
dst_startX,
dst_x_off,
alpha
)
;
update_dst_C1_D0
(
dst+index+dst_step,
(
__local
uint
*
)(
temp[1]
)
,
dst_rows,
dst_cols,
dst_startX,
dst_x_off,
alpha
)
;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
//
INPUT
PARAMETER:
BLOCK_SIZE_Y
(
via
defines
)
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
__kernel
void
boxFilter_C4_D0
(
__global
const
uchar4
*
restrict
src,
__global
uchar4
*dst,
float
alpha,
__attribute__
((
reqd_work_group_size
(
LOCAL_SIZE,
1
,
1
)))
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
void
boxFilter
(
__global
TYPE
*src,
const
unsigned
int
srcStepBytes,
const
int4
srcRC,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
__global
TYPE
*dst,
const
unsigned
int
dstStepBytes,
const
int4
dstRC,
#
ifdef
BORDER_CONSTANT
SCALAR_TYPE
borderValue,
#
endif
FPTYPE
alpha
)
)
{
{
int
col
=
get_local_id
(
0
)
;
const
struct
RectCoords
srcCoords
=
{srcRC.s0,
srcRC.s1,
srcRC.s2,
srcRC.s3}
; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
const
int
gX
=
get_group_id
(
0
)
;
const
struct
RectCoords
dstCoords
=
{dstRC.s0,
dstRC.s1,
dstRC.s2,
dstRC.s3}
;
const
int
gY
=
get_group_id
(
1
)
;
int
src_x_off
=
(
src_offset
%
src_step
)
>>
2
;
const
int
x
=
get_local_id
(
0
)
+
(
LOCAL_SIZE
-
(
KERNEL_SIZE_X
-
1
))
*
get_group_id
(
0
)
-
ANCHOR_X
;
int
src_y_off
=
src_offset
/
src_step
;
const
int
y
=
get_global_id
(
1
)
*
BLOCK_SIZE_Y
;
int
dst_x_off
=
(
dst_offset
%
dst_step
)
>>
2
;
int
dst_y_off
=
dst_offset
/
dst_step
;
int
startX
=
gX
*
(
THREADS-ksX+1
)
-
anX
+
src_x_off
;
const
int
local_id
=
get_local_id
(
0
)
;
int
startY
=
(
gY
<<
1
)
-
anY
+
src_y_off
;
int
dst_startX
=
gX
*
(
THREADS-ksX+1
)
+
dst_x_off
;
int
dst_startY
=
(
gY
<<
1
)
+
dst_y_off
;
uint4
data[ksY+1
]
;
INTERMEDIATE_TYPE
data[KERNEL_SIZE_Y
]
;
__local
uint4
temp[2][THREADS
]
;
__local
INTERMEDIATE_TYPE
sumOfCols[LOCAL_SIZE
]
;
#
ifdef
BORDER_CONSTANT
int2
srcPos
=
(
int2
)(
srcCoords.x1
+
x,
srcCoords.y1
+
y
-
ANCHOR_Y
)
;
bool
con
;
for
(
int
sy
=
0
; sy < KERNEL_SIZE_Y; sy++, srcPos.y++)
for
(
int
i=0
; i < ksY+1; i++)
{
{
con
=
startX+col
>=
0
&&
startX+col
<
src_whole_cols
&&
startY+i
>=
0
&&
startY+i
<
src_whole_rows
;
data[sy]
=
readSrcPixel
(
srcPos,
src,
srcStepBytes,
srcCoords
int
cur_col
=
clamp
(
startX
+
col,
0
,
src_whole_cols
)
;
#
ifdef
BORDER_CONSTANT
,
borderValue
data[i].x
=
con
?
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col].x
:
0
;
#
endif
data[i].y
=
con
?
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col].y
:
0
;
)
;
data[i].z
=
con
?
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col].z
:
0
;
data[i].w
=
con
?
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col].w
:
0
;
}
#
else
for
(
int
i=0
; i < ksY+1; i++)
{
int
selected_row
;
int
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
selected_col
=
ADDR_L
(
startX+col,
0
,
src_whole_cols
)
;
selected_col
=
ADDR_R
(
startX+col,
src_whole_cols,
selected_col
)
;
data[i]
=
convert_uint4
(
src[selected_row
*
(
src_step>>2
)
+
selected_col]
)
;
}
}
#
endif
INTERMEDIATE_TYPE
tmp_sum
=
0
;
uint4
tmp_sum
=
0
;
for
(
int
sy
=
0
; sy < KERNEL_SIZE_Y; sy++)
for
(
int
i=1
; i < ksY; i++)
{
{
tmp_sum
+=
(
data[
i
]
)
;
tmp_sum
+=
(
data[
sy
]
)
;
}
}
int
index
=
dst_startY
*
(
dst_step>>2
)
+
dst_startX
+
col
;
sumOfCols[local_id]
=
tmp_sum
;
temp[0][col]
=
tmp_sum
+
(
data[0]
)
;
temp[1][col]
=
tmp_sum
+
(
data[ksY]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
update_dst_C4_D0
(
dst+index,
(
__local
uint4
*
)(
temp[0]
)
,
dst_rows,
dst_cols,
dst_startX,
dst_x_off,
alpha
)
;
update_dst_C4_D0
(
dst+index+
(
dst_step>>2
)
,
(
__local
uint4
*
)(
temp[1]
)
,
dst_rows,
dst_cols,
dst_startX,
dst_x_off,
alpha
)
;
}
int2
pos
=
(
int2
)(
dstCoords.x1
+
x,
dstCoords.y1
+
y
)
;
__global
TYPE*
dstPtr
=
(
__global
TYPE*
)((
__global
char*
)
dst
+
pos.x
*
sizeof
(
TYPE
)
+
pos.y
*
dstStepBytes
)
; // Pointer can be out of bounds!
///////////////////////////////////////////////////////////////////////////////////////////////////
int
sy_index
=
0
; // current index in data[] array
/////////////////////////////////////////32fC1////////////////////////////////////////////////////////
int
stepsY
=
min
(
dstCoords.y2
-
pos.y,
BLOCK_SIZE_Y
)
;
////////////////////////////////////////////////////////////////////////////////////////////////////
ASSERT
(
stepsY
>
0
)
;
__kernel
void
boxFilter_C1_D5
(
__global
const
float
*restrict
src,
__global
float
*dst,
float
alpha,
for
(
; ;)
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
)
{
int
col
=
get_local_id
(
0
)
;
const
int
gX
=
get_group_id
(
0
)
;
const
int
gY
=
get_group_id
(
1
)
;
int
src_x_off
=
(
src_offset
%
src_step
)
>>
2
;
int
src_y_off
=
src_offset
/
src_step
;
int
dst_x_off
=
(
dst_offset
%
dst_step
)
>>
2
;
int
dst_y_off
=
dst_offset
/
dst_step
;
int
startX
=
gX
*
(
THREADS-ksX+1
)
-
anX
+
src_x_off
;
int
startY
=
(
gY
<<
1
)
-
anY
+
src_y_off
;
int
dst_startX
=
gX
*
(
THREADS-ksX+1
)
+
dst_x_off
;
int
dst_startY
=
(
gY
<<
1
)
+
dst_y_off
;
float
data[ksY+1]
;
__local
float
temp[2][THREADS]
;
#
ifdef
BORDER_CONSTANT
bool
con
;
float
ss
;
for
(
int
i=0
; i < ksY+1; i++)
{
{
con
=
startX+col
>=
0
&&
startX+col
<
src_whole_cols
&&
startY+i
>=
0
&&
startY+i
<
src_whole_rows
;
ASSERT
(
pos.y
<
dstCoords.y2
)
;
int
cur_col
=
clamp
(
startX
+
col,
0
,
src_whole_cols
)
;
ss
=
(
startY+i
)
<src_whole_rows&&
(
startY+i
)
>=0&&cur_col>=0&&cur_col<src_whole_cols?src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col]:
(
float
)
0
;
data[i]
=
con
?
ss
:
0.f
;
if
(
local_id
>=
ANCHOR_X
&&
local_id
<
LOCAL_SIZE
-
(
KERNEL_SIZE_X
-
1
-
ANCHOR_X
)
&&
}
pos.x
>=
dstCoords.x1
&&
pos.x
<
dstCoords.x2
)
#
else
for
(
int
i=0
; i < ksY+1; i++)
{
{
int
selected_row
;
ASSERT
(
pos.y
>=
dstCoords.y1
&&
pos.y
<
dstCoords.y2
)
;
int
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
selected_col
=
ADDR_L
(
startX+col,
0
,
src_whole_cols
)
;
INTERMEDIATE_TYPE
total_sum
=
0
;
selected_col
=
ADDR_R
(
startX+col,
src_whole_cols,
selected_col
)
;
#
pragma
unroll
for
(
int
sx
=
0
; sx < KERNEL_SIZE_X; sx++)
data[i]
=
src[selected_row
*
(
src_step>>2
)
+
selected_col]
;
}
#
endif
float
sum0
=
0.0
,
sum1
=
0.0
,
sum2
=
0.0
;
for
(
int
i=1
; i < ksY; i++)
{
{
sum0
+=
(
data[i]
)
;
total_sum
+=
sumOfCols[local_id
+
sx
-
ANCHOR_X]
;
}
}
sum1
=
sum0
+
(
data[0]
)
;
*dstPtr
=
CONVERT_TO_TYPE
(((
INTERMEDIATE_TYPE
)
alpha
)
*
total_sum
)
;
sum2
=
sum0
+
(
data[ksY]
)
;
temp[0][col]
=
sum1
;
temp[1][col]
=
sum2
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
col
<
(
THREADS-
(
ksX-1
)))
{
col
+=
anX
;
int
posX
=
dst_startX
-
dst_x_off
+
col
-
anX
;
int
posY
=
(
gY
<<
1
)
;
float
tmp_sum[2]=
{0.0,
0.0}
;
for
(
int
k=0
; k<2; k++)
for
(
int
i=-anX
; i<=anX; i++)
{
tmp_sum[k]
+=
temp[k][col+i]
;
}
for
(
int
i=0
; i<2; i++)
{
if
(
posX
>=
0
&&
posX
<
dst_cols
&&
(
posY+i
)
>=
0
&&
(
posY+i
)
<
dst_rows
)
dst[
(
dst_startY+i
)
*
(
dst_step>>2
)
+
dst_startX
+
col
-
anX]
=
tmp_sum[i]/alpha
;
}
}
}
#
if
BLOCK_SIZE_Y
==
1
}
break
;
#
else
if
(
--stepsY
==
0
)
break
;
///////////////////////////////////////////////////////////////////////////////////////////////////
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/////////////////////////////////////////32fC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
boxFilter_C4_D5
(
__global
const
float4
*restrict
src,
__global
float4
*dst,
float
alpha,
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
)
{
int
col
=
get_local_id
(
0
)
;
const
int
gX
=
get_group_id
(
0
)
;
const
int
gY
=
get_group_id
(
1
)
;
int
src_x_off
=
(
src_offset
%
src_step
)
>>
4
;
int
src_y_off
=
src_offset
/
src_step
;
int
dst_x_off
=
(
dst_offset
%
dst_step
)
>>
4
;
int
dst_y_off
=
dst_offset
/
dst_step
;
int
startX
=
gX
*
(
THREADS-ksX+1
)
-
anX
+
src_x_off
;
int
startY
=
(
gY
<<
1
)
-
anY
+
src_y_off
;
int
dst_startX
=
gX
*
(
THREADS-ksX+1
)
+
dst_x_off
;
int
dst_startY
=
(
gY
<<
1
)
+
dst_y_off
;
float4
data[ksY+1]
;
__local
float4
temp[2][THREADS]
;
#
ifdef
BORDER_CONSTANT
bool
con
;
float4
ss
;
for
(
int
i=0
; i < ksY+1; i++)
{
con
=
startX+col
>=
0
&&
startX+col
<
src_whole_cols
&&
startY+i
>=
0
&&
startY+i
<
src_whole_rows
;
int
cur_col
=
clamp
(
startX
+
col,
0
,
src_whole_cols
)
;
tmp_sum
=
sumOfCols[local_id]
; // TODO FIX IT: workaround for BUG in OpenCL compiler
ss
=
(
startY+i
)
<src_whole_rows&&
(
startY+i
)
>=0&&cur_col>=0&&cur_col<src_whole_cols?src[
(
startY+i
)
*
(
src_step>>4
)
+
cur_col]:
(
float4
)
0
;
//
only
works
with
scalars:
ASSERT
(
fabs
(
tmp_sum
-
sumOfCols[local_id]
)
<
(
INTERMEDIATE_TYPE
)
1e-6
)
;
tmp_sum
-=
data[sy_index]
;
data[i]
=
con
?
ss
:
(
float4
)(
0.0
,
0.0
,
0.0
,
0.0
)
;
data[sy_index]
=
readSrcPixel
(
srcPos,
src,
srcStepBytes,
srcCoords
}
#
ifdef
BORDER_CONSTANT
#
else
,
borderValue
for
(
int
i=0
; i < ksY+1; i++)
#
endif
{
)
;
int
selected_row
;
srcPos.y++
;
int
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
selected_col
=
ADDR_L
(
startX+col,
0
,
src_whole_cols
)
;
tmp_sum
+=
data[sy_index]
;
s
elected_col
=
ADDR_R
(
startX+col,
src_whole_cols,
selected_col
)
;
s
umOfCols[local_id]
=
tmp_sum
;
data[i]
=
src[selected_row
*
(
src_step>>4
)
+
selected_col]
;
sy_index
=
(
sy_index
+
1
<
KERNEL_SIZE_Y
)
?
sy_index
+
1
:
0
;
}
#
endif
float4
sum0
=
0.0
,
sum1
=
0.0
,
sum2
=
0.0
;
for
(
int
i=1
; i < ksY; i++)
{
sum0
+=
(
data[i]
)
;
}
sum1
=
sum0
+
(
data[0]
)
;
sum2
=
sum0
+
(
data[ksY]
)
;
temp[0][col]
=
sum1
;
temp[1][col]
=
sum2
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
col
<
(
THREADS-
(
ksX-1
)))
{
col
+=
anX
;
int
posX
=
dst_startX
-
dst_x_off
+
col
-
anX
;
int
posY
=
(
gY
<<
1
)
;
float4
tmp_sum[2]=
{
(
float4
)(
0.0
,
0.0
,
0.0
,
0.0
)
,
(
float4
)(
0.0
,
0.0
,
0.0
,
0.0
)
}
;
for
(
int
k=0
; k<2; k++)
for
(
int
i=-anX
; i<=anX; i++)
{
tmp_sum[k]
+=
temp[k][col+i]
;
}
for
(
int
i=0
; i<2; i++)
{
if
(
posX
>=
0
&&
posX
<
dst_cols
&&
(
posY+i
)
>=
0
&&
(
posY+i
)
<
dst_rows
)
dst[
(
dst_startY+i
)
*
(
dst_step>>4
)
+
dst_startX
+
col
-
anX]
=
tmp_sum[i]/alpha
;
}
//
next
line
DEBUG_ONLY
(
pos.y++
)
;
dstPtr
=
(
__global
TYPE*
)((
__global
char*
)
dstPtr
+
dstStepBytes
)
; // Pointer can be out of bounds!
#
endif
//
BLOCK_SIZE_Y
==
1
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment