Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
0bf9ece9
Commit
0bf9ece9
authored
Oct 26, 2013
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocl: rewrite boxFilter
parent
cb6ea8bf
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
324 additions
and
584 deletions
+324
-584
ocl.hpp
modules/ocl/include/opencv2/ocl/ocl.hpp
+1
-5
filtering.cpp
modules/ocl/src/filtering.cpp
+85
-235
filtering_boxFilter.cl
modules/ocl/src/opencl/filtering_boxFilter.cl
+238
-344
No files found.
modules/ocl/include/opencv2/ocl/ocl.hpp
View file @
0bf9ece9
...
...
@@ -722,7 +722,7 @@ namespace cv
CV_EXPORTS
void
Laplacian
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
int
ksize
=
1
,
double
scale
=
1
);
//! returns 2D box filter
//
supports CV_8UC1 and CV_8UC4 source type,
dst type must be the same as source type
// dst type must be the same as source type
CV_EXPORTS
Ptr
<
BaseFilter_GPU
>
getBoxFilter_GPU
(
int
srcType
,
int
dstType
,
const
Size
&
ksize
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
...
...
@@ -740,8 +740,6 @@ namespace cv
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
//! smooths the image using the normalized box filter
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
// supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP
CV_EXPORTS
void
boxFilter
(
const
oclMat
&
src
,
oclMat
&
dst
,
int
ddepth
,
Size
ksize
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_DEFAULT
);
...
...
@@ -757,8 +755,6 @@ namespace cv
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
);
//! a synonym for normalized box filter
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
// supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
static
inline
void
blur
(
const
oclMat
&
src
,
oclMat
&
dst
,
Size
ksize
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
borderType
=
BORDER_CONSTANT
)
{
...
...
modules/ocl/src/filtering.cpp
View file @
0bf9ece9
...
...
@@ -11,7 +11,7 @@
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-201
2
, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-201
3
, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
...
...
@@ -713,276 +713,126 @@ Ptr<FilterEngine_GPU> cv::ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter
return
Ptr
<
FilterEngine_GPU
>
(
new
SeparableFilterEngine_GPU
(
rowFilter
,
columnFilter
));
}
/*
**data type supported: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4
**support four border types: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101
*/
static
void
GPUFilterBox_8u_C1R
(
const
oclMat
&
src
,
oclMat
&
dst
,
static
void
GPUFilterBox
(
const
oclMat
&
src
,
oclMat
&
dst
,
Size
&
ksize
,
const
Point
anchor
,
const
int
borderType
)
{
//Normalize the result by default
float
alpha
=
ksize
.
height
*
ksize
.
width
;
float
alpha
=
1.0
f
/
(
ksize
.
height
*
ksize
.
width
)
;
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
(
src
.
rows
==
dst
.
rows
));
Context
*
clCxt
=
src
.
clCxt
;
string
kernelName
=
"boxFilter_C1_D0"
;
char
btype
[
30
];
switch
(
borderType
)
{
case
0
:
sprintf
(
btype
,
"BORDER_CONSTANT"
);
break
;
case
1
:
sprintf
(
btype
,
"BORDER_REPLICATE"
);
break
;
case
2
:
sprintf
(
btype
,
"BORDER_REFLECT"
);
break
;
case
3
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
4
:
sprintf
(
btype
,
"BORDER_REFLECT_101"
);
break
;
}
CV_Assert
(
src
.
oclchannels
()
==
dst
.
oclchannels
());
char
build_options
[
150
];
sprintf
(
build_options
,
"-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s"
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
btype
);
size_t
BLOCK_SIZE
=
src
.
clCxt
->
getDeviceInfo
().
maxWorkItemSizes
[
0
];
size_t
BLOCK_SIZE_Y
=
8
;
// TODO Check heuristic value on devices
while
(
BLOCK_SIZE_Y
<
BLOCK_SIZE
/
8
&&
BLOCK_SIZE_Y
*
src
.
clCxt
->
getDeviceInfo
().
maxComputeUnits
*
32
<
(
size_t
)
src
.
rows
)
BLOCK_SIZE_Y
*=
2
;
size_t
blockSizeX
=
256
,
blockSizeY
=
1
;
size_t
gSize
=
blockSizeX
-
(
ksize
.
width
-
1
);
size_t
threads
=
(
dst
.
offset
%
dst
.
step
%
4
+
dst
.
cols
+
3
)
/
4
;
size_t
globalSizeX
=
threads
%
gSize
==
0
?
threads
/
gSize
*
blockSizeX
:
(
threads
/
gSize
+
1
)
*
blockSizeX
;
size_t
globalSizeY
=
((
dst
.
rows
+
1
)
/
2
)
%
blockSizeY
==
0
?
((
dst
.
rows
+
1
)
/
2
)
:
(((
dst
.
rows
+
1
)
/
2
)
/
blockSizeY
+
1
)
*
blockSizeY
;
CV_Assert
((
size_t
)
ksize
.
width
<=
BLOCK_SIZE
);
size_t
globalThreads
[
3
]
=
{
globalSizeX
,
globalSizeY
,
1
};
size_t
localThreads
[
3
]
=
{
blockSizeX
,
blockSizeY
,
1
};
bool
isIsolatedBorder
=
(
borderType
&
BORDER_ISOLATED
)
!=
0
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
alpha
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
openCLExecuteKernel
(
clCxt
,
&
filtering_boxFilter
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
}
static
void
GPUFilterBox_8u_C4R
(
const
oclMat
&
src
,
oclMat
&
dst
,
Size
&
ksize
,
const
Point
anchor
,
const
int
borderType
)
{
//Normalize the result by default
float
alpha
=
ksize
.
height
*
ksize
.
width
;
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
(
src
.
rows
==
dst
.
rows
));
Context
*
clCxt
=
src
.
clCxt
;
string
kernelName
=
"boxFilter_C4_D0"
;
char
btype
[
30
];
switch
(
borderType
)
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src
.
data
));
cl_uint
stepBytes
=
src
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
stepBytes
));
int
offsetXBytes
=
src
.
offset
%
src
.
step
;
int
offsetX
=
offsetXBytes
/
src
.
elemSize
();
CV_Assert
((
int
)(
offsetX
*
src
.
elemSize
())
==
offsetXBytes
);
int
offsetY
=
src
.
offset
/
src
.
step
;
int
endX
=
(
offsetX
+
src
.
cols
);
int
endY
=
(
offsetY
+
src
.
rows
);
cl_int
rect
[
4
]
=
{
offsetX
,
offsetY
,
endX
,
endY
};
if
(
!
isIsolatedBorder
)
{
case
0
:
sprintf
(
btype
,
"BORDER_CONSTANT"
);
break
;
case
1
:
sprintf
(
btype
,
"BORDER_REPLICATE"
);
break
;
case
2
:
sprintf
(
btype
,
"BORDER_REFLECT"
);
break
;
case
3
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
4
:
sprintf
(
btype
,
"BORDER_REFLECT_101"
);
break
;
rect
[
2
]
=
src
.
wholecols
;
rect
[
3
]
=
src
.
wholerows
;
}
char
build_options
[
150
];
sprintf
(
build_options
,
"-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s"
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
btype
);
size_t
blockSizeX
=
256
,
blockSizeY
=
1
;
size_t
gSize
=
blockSizeX
-
ksize
.
width
/
2
*
2
;
size_t
globalSizeX
=
(
src
.
cols
)
%
gSize
==
0
?
src
.
cols
/
gSize
*
blockSizeX
:
(
src
.
cols
/
gSize
+
1
)
*
blockSizeX
;
size_t
rows_per_thread
=
2
;
size_t
globalSizeY
=
((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
%
blockSizeY
==
0
?
((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
:
(((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
/
blockSizeY
+
1
)
*
blockSizeY
;
size_t
globalThreads
[
3
]
=
{
globalSizeX
,
globalSizeY
,
1
};
size_t
localThreads
[
3
]
=
{
blockSizeX
,
blockSizeY
,
1
};
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
alpha
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
openCLExecuteKernel
(
clCxt
,
&
filtering_boxFilter
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
}
static
void
GPUFilterBox_32F_C1R
(
const
oclMat
&
src
,
oclMat
&
dst
,
Size
&
ksize
,
const
Point
anchor
,
const
int
borderType
)
{
//Normalize the result by default
float
alpha
=
ksize
.
height
*
ksize
.
width
;
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
(
src
.
rows
==
dst
.
rows
));
Context
*
clCxt
=
src
.
clCxt
;
string
kernelName
=
"boxFilter_C1_D5"
;
char
btype
[
30
];
switch
(
borderType
)
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
rect
[
0
]));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
cl_uint
_stepBytes
=
dst
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_uint
),
(
void
*
)
&
_stepBytes
));
int
_offsetXBytes
=
dst
.
offset
%
dst
.
step
;
int
_offsetX
=
_offsetXBytes
/
dst
.
elemSize
();
CV_Assert
((
int
)(
_offsetX
*
dst
.
elemSize
())
==
_offsetXBytes
);
int
_offsetY
=
dst
.
offset
/
dst
.
step
;
int
_endX
=
(
_offsetX
+
dst
.
cols
);
int
_endY
=
(
_offsetY
+
dst
.
rows
);
cl_int
_rect
[
4
]
=
{
_offsetX
,
_offsetY
,
_endX
,
_endY
};
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
*
4
,
(
void
*
)
&
_rect
[
0
]));
bool
useDouble
=
src
.
depth
()
==
CV_64F
;
float
borderValue
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
double
borderValueDouble
[
4
]
=
{
0
,
0
,
0
,
0
};
// DON'T move into 'if' body
if
((
borderType
&
~
BORDER_ISOLATED
)
==
BORDER_CONSTANT
)
{
case
0
:
sprintf
(
btype
,
"BORDER_CONSTANT"
);
break
;
case
1
:
sprintf
(
btype
,
"BORDER_REPLICATE"
);
break
;
case
2
:
sprintf
(
btype
,
"BORDER_REFLECT"
);
break
;
case
3
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
4
:
sprintf
(
btype
,
"BORDER_REFLECT_101"
);
break
;
if
(
useDouble
)
args
.
push_back
(
make_pair
(
sizeof
(
double
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValue
[
0
]));
else
args
.
push_back
(
make_pair
(
sizeof
(
float
)
*
src
.
oclchannels
(),
(
void
*
)
&
borderValueDouble
[
0
]));
}
char
build_options
[
150
];
sprintf
(
build_options
,
"-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s"
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
btype
);
size_t
blockSizeX
=
256
,
blockSizeY
=
1
;
size_t
gSize
=
blockSizeX
-
ksize
.
width
/
2
*
2
;
size_t
globalSizeX
=
(
src
.
cols
)
%
gSize
==
0
?
src
.
cols
/
gSize
*
blockSizeX
:
(
src
.
cols
/
gSize
+
1
)
*
blockSizeX
;
size_t
rows_per_thread
=
2
;
size_t
globalSizeY
=
((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
%
blockSizeY
==
0
?
((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
:
(((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
/
blockSizeY
+
1
)
*
blockSizeY
;
size_t
globalThreads
[
3
]
=
{
globalSizeX
,
globalSizeY
,
1
};
size_t
localThreads
[
3
]
=
{
blockSizeX
,
blockSizeY
,
1
};
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
alpha
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
openCLExecuteKernel
(
clCxt
,
&
filtering_boxFilter
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
}
static
void
GPUFilterBox_32F_C4R
(
const
oclMat
&
src
,
oclMat
&
dst
,
Size
&
ksize
,
const
Point
anchor
,
const
int
borderType
)
{
//Normalize the result by default
float
alpha
=
ksize
.
height
*
ksize
.
width
;
CV_Assert
(
src
.
clCxt
==
dst
.
clCxt
);
CV_Assert
((
src
.
cols
==
dst
.
cols
)
&&
(
src
.
rows
==
dst
.
rows
));
Context
*
clCxt
=
src
.
clCxt
;
string
kernelName
=
"boxFilter_C4_D5"
;
double
alphaDouble
=
alpha
;
// DON'T move into 'if' body
if
(
useDouble
)
args
.
push_back
(
make_pair
(
sizeof
(
double
),
(
void
*
)
&
alphaDouble
));
else
args
.
push_back
(
make_pair
(
sizeof
(
float
),
(
void
*
)
&
alpha
));
c
har
btype
[
30
]
;
c
onst
char
*
btype
=
NULL
;
switch
(
borderType
)
switch
(
borderType
&
~
BORDER_ISOLATED
)
{
case
0
:
sprintf
(
btype
,
"BORDER_CONSTANT"
)
;
case
BORDER_CONSTANT
:
btype
=
"BORDER_CONSTANT"
;
break
;
case
1
:
sprintf
(
btype
,
"BORDER_REPLICATE"
)
;
case
BORDER_REPLICATE
:
btype
=
"BORDER_REPLICATE"
;
break
;
case
2
:
sprintf
(
btype
,
"BORDER_REFLECT"
)
;
case
BORDER_REFLECT
:
btype
=
"BORDER_REFLECT"
;
break
;
case
3
:
case
BORDER_WRAP
:
CV_Error
(
CV_StsUnsupportedFormat
,
"BORDER_WRAP is not supported!"
);
return
;
case
4
:
sprintf
(
btype
,
"BORDER_REFLECT_101"
)
;
case
BORDER_REFLECT101
:
btype
=
"BORDER_REFLECT_101"
;
break
;
}
char
build_options
[
150
];
sprintf
(
build_options
,
"-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s"
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
btype
);
size_t
blockSizeX
=
256
,
blockSizeY
=
1
;
size_t
gSize
=
blockSizeX
-
ksize
.
width
/
2
*
2
;
size_t
globalSizeX
=
(
src
.
cols
)
%
gSize
==
0
?
src
.
cols
/
gSize
*
blockSizeX
:
(
src
.
cols
/
gSize
+
1
)
*
blockSizeX
;
size_t
rows_per_thread
=
2
;
size_t
globalSizeY
=
((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
%
blockSizeY
==
0
?
((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
:
(((
src
.
rows
+
rows_per_thread
-
1
)
/
rows_per_thread
)
/
blockSizeY
+
1
)
*
blockSizeY
;
size_t
globalThreads
[
3
]
=
{
globalSizeX
,
globalSizeY
,
1
};
size_t
localThreads
[
3
]
=
{
blockSizeX
,
blockSizeY
,
1
};
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
alpha
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholerows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
wholecols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
openCLExecuteKernel
(
clCxt
,
&
filtering_boxFilter
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
build_options
);
int
requiredTop
=
anchor
.
y
;
int
requiredLeft
=
BLOCK_SIZE
;
// not this: anchor.x;
int
requiredBottom
=
ksize
.
height
-
1
-
anchor
.
y
;
int
requiredRight
=
BLOCK_SIZE
;
// not this: ksize.width - 1 - anchor.x;
int
h
=
isIsolatedBorder
?
src
.
rows
:
src
.
wholerows
;
int
w
=
isIsolatedBorder
?
src
.
cols
:
src
.
wholecols
;
bool
extra_extrapolation
=
h
<
requiredTop
||
h
<
requiredBottom
||
w
<
requiredLeft
||
w
<
requiredRight
;
CV_Assert
(
w
>=
ksize
.
width
&&
h
>=
ksize
.
height
);
// TODO Other cases are not tested well
char
build_options
[
1024
];
sprintf
(
build_options
,
"-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s"
,
(
int
)
BLOCK_SIZE
,
(
int
)
BLOCK_SIZE_Y
,
src
.
depth
(),
src
.
oclchannels
(),
useDouble
?
1
:
0
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
btype
,
extra_extrapolation
?
"EXTRA_EXTRAPOLATION"
:
"NO_EXTRA_EXTRAPOLATION"
,
isIsolatedBorder
?
"BORDER_ISOLATED"
:
"NO_BORDER_ISOLATED"
);
size_t
gt
[
3
]
=
{
divUp
(
dst
.
cols
,
BLOCK_SIZE
-
(
ksize
.
width
-
1
))
*
BLOCK_SIZE
,
divUp
(
dst
.
rows
,
BLOCK_SIZE_Y
),
1
},
lt
[
3
]
=
{
BLOCK_SIZE
,
1
,
1
};
openCLExecuteKernel
(
src
.
clCxt
,
&
filtering_boxFilter
,
"boxFilter"
,
gt
,
lt
,
args
,
-
1
,
-
1
,
build_options
);
}
Ptr
<
BaseFilter_GPU
>
cv
::
ocl
::
getBoxFilter_GPU
(
int
srcType
,
int
dstType
,
Ptr
<
BaseFilter_GPU
>
cv
::
ocl
::
getBoxFilter_GPU
(
int
/*srcType*/
,
int
/*dstType*/
,
const
Size
&
ksize
,
Point
anchor
,
int
borderType
)
{
static
const
FilterBox_t
FilterBox_callers
[
2
][
5
]
=
{{
0
,
GPUFilterBox_8u_C1R
,
0
,
GPUFilterBox_8u_C4R
,
GPUFilterBox_8u_C4R
},
{
0
,
GPUFilterBox_32F_C1R
,
0
,
GPUFilterBox_32F_C4R
,
GPUFilterBox_32F_C4R
}
};
//Remove this check if more data types need to be supported.
CV_Assert
((
srcType
==
CV_8UC1
||
srcType
==
CV_8UC3
||
srcType
==
CV_8UC4
||
srcType
==
CV_32FC1
||
srcType
==
CV_32FC3
||
srcType
==
CV_32FC4
)
&&
dstType
==
srcType
);
normalizeAnchor
(
anchor
,
ksize
);
return
Ptr
<
BaseFilter_GPU
>
(
new
GPUBoxFilter
(
ksize
,
anchor
,
borderType
,
FilterBox_callers
[(
CV_MAT_DEPTH
(
srcType
)
==
CV_32F
)][
CV_MAT_CN
(
srcType
)]
));
borderType
,
GPUFilterBox
));
}
Ptr
<
FilterEngine_GPU
>
cv
::
ocl
::
createBoxFilter_GPU
(
int
srcType
,
int
dstType
,
...
...
modules/ocl/src/opencl/filtering_boxFilter.cl
View file @
0bf9ece9
...
...
@@ -10,13 +10,9 @@
//
License
Agreement
//
For
Open
Source
Computer
Vision
Library
//
//
Copyright
(
C
)
2010-2012,
Institute
Of
Software
Chinese
Academy
Of
Science,
all
rights
reserved.
//
Copyright
(
C
)
2010-2012,
Advanced
Micro
Devices,
Inc.,
all
rights
reserved.
//
Copyright
(
C
)
2010-2013,
Advanced
Micro
Devices,
Inc.,
all
rights
reserved.
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
//
@Authors
//
Zhang
Ying,
zhangying913@gmail.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
//
...
...
@@ -79,400 +75,298 @@
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
i
)
-
(
b_edge
)
:
(
addr
))
#
endif
#
define
THREADS
256
#
define
ELEM
(
i,
l_edge,
r_edge,
elem1,
elem2
)
(
i
)
>=
(
l_edge
)
&&
(
i
)
<
(
r_edge
)
?
(
elem1
)
:
(
elem2
)
inline
void
update_dst_C1_D0
(
__global
uchar
*dst,
__local
uint*
temp,
int
dst_rows,
int
dst_cols,
int
dst_startX,
int
dst_x_off,
float
alpha
)
{
if
(
get_local_id
(
0
)
<
anX
|
| get_local_id(0) >= (THREADS-ksX+anX+1))
{
return;
#
ifdef
EXTRA_EXTRAPOLATION
//
border
>
src
image
size
#
ifdef
BORDER_CONSTANT
//
None
#
elif
defined
BORDER_REPLICATE
#
define
EXTRAPOLATE
(
x,
y,
minX,
minY,
maxX,
maxY
)
\
{
\
x
=
max
(
min
(
x,
maxX
-
1
)
,
minX
)
; \
y
=
max
(
min
(
y,
maxY
-
1
)
,
minY
)
; \
}
uint4 tmp_sum = 0;
int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4;
int posY = (get_group_id(1) << 1);
for(int i=-anX; i<=anX; i++)
{
tmp_sum += vload4(get_local_id(0), temp+i);
#
elif
defined
BORDER_WRAP
#
define
EXTRAPOLATE
(
x,
y,
minX,
minY,
maxX,
maxY
)
\
{
\
if
(
x
<
minX
)
\
x
-=
((
x
-
maxX
+
1
)
/
maxX
)
*
maxX
; \
if
(
x
>=
maxX
)
\
x
%=
maxX
; \
if
(
y
<
minY
)
\
y
-=
((
y
-
maxY
+
1
)
/
maxY
)
*
maxY
; \
if
(
y
>=
maxY
)
\
y
%=
maxY
; \
}
if(posY < dst_rows && posX < dst_cols)
{
tmp_sum /= (uint4) alpha;
if(posX >= 0 && posX < dst_cols)
*(dst) = tmp_sum.x;
if(posX+1 >= 0 && posX+1 < dst_cols)
*(dst + 1) = tmp_sum.y;
if(posX+2 >= 0 && posX+2 < dst_cols)
*(dst + 2) = tmp_sum.z;
if(posX+3 >= 0 && posX+3 < dst_cols)
*(dst + 3) = tmp_sum.w;
#
elif
defined
(
BORDER_REFLECT
)
|
| defined(BORDER_REFLECT_101)
#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \
{ \
if (maxX - minX == 1) \
x = minX; \
else \
do \
{ \
if (x < minX) \
x = -(x - minX) - 1 + delta; \
else \
x = maxX - 1 - (x - maxX) - delta; \
} \
while (x >= maxX || x < minX); \
\
if (maxY - minY == 1) \
y = minY; \
else \
do \
{ \
if (y < minY) \
y = -(y - minY) - 1 + delta; \
else \
y = maxY - 1 - (y - maxY) - delta; \
} \
while (y >= maxY || y < minY); \
}
}
#ifdef BORDER_REFLECT
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
#elif defined(BORDER_REFLECT_101)
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
#endif
#else
#error No extrapolation method
#endif
#else
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
{ \
int _row = y - minY, _col = x - minX; \
_row = ADDR_H(_row, 0, maxY - minY); \
_row = ADDR_B(_row, maxY - minY, _row); \
y = _row + minY; \
\
_col = ADDR_L(_col, 0, maxX - minX); \
_col = ADDR_R(_col, maxX - minX, _col); \
x = _col + minX; \
}
#endif
#if USE_DOUBLE
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define FPTYPE double
#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE)
#else
#define FPTYPE float
#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE)
#endif
inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp,
int dst_rows, int dst_cols,
int dst_startX, int dst_x_off,
float alpha)
{
if(get_local_id(0) >= (THREADS-ksX+1))
{
return;
}
#if DATA_DEPTH == 0
#define BASE_TYPE uchar
#elif DATA_DEPTH == 1
#define BASE_TYPE char
#elif DATA_DEPTH == 2
#define BASE_TYPE ushort
#elif DATA_DEPTH == 3
#define BASE_TYPE short
#elif DATA_DEPTH == 4
#define BASE_TYPE int
#elif DATA_DEPTH == 5
#define BASE_TYPE float
#elif DATA_DEPTH == 6
#define BASE_TYPE double
#else
#error data_depth
#endif
int posX = dst_startX - dst_x_off + get_local_id(0);
int posY = (get_group_id(1) << 1);
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
#define uchar1 uchar
#define char1 char
#define ushort1 ushort
#define short1 short
#define int1 int
#define float1 float
#define double1 double
#define convert_uchar1_sat_rte convert_uchar_sat_rte
#define convert_char1_sat_rte convert_char_sat_rte
#define convert_ushort1_sat_rte convert_ushort_sat_rte
#define convert_short1_sat_rte convert_short_sat_rte
#define convert_int1_sat_rte convert_int_sat_rte
#define convert_float1
#define convert_double1
#if DATA_DEPTH == 5 |
|
DATA_DEPTH
==
6
#
define
CONVERT_TO_TYPE
CAT
(
CAT
(
convert_,
BASE_TYPE
)
,
VEC_SIZE
)
#
else
#
define
CONVERT_TO_TYPE
CAT
(
CAT
(
CAT
(
convert_,
BASE_TYPE
)
,
VEC_SIZE
)
,
_sat_rte
)
#
endif
uint4 temp_sum = 0;
for(int i=-anX; i<=anX; i++)
{
temp_sum += temp[get_local_id(0) + anX + i];
}
#
define
VEC_SIZE
DATA_CHAN
if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows)
*dst = convert_uchar4(convert_float4(temp_sum)/alpha);
}
#
define
VEC_TYPE
CAT
(
BASE_TYPE,
VEC_SIZE
)
#
define
TYPE
VEC_TYPE
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
#
define
SCALAR_TYPE
CAT
(
FPTYPE,
VEC_SIZE
)
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
int src_x_off = src_offset % src_step;
int src_y_off = src_offset / src_step;
int dst_x_off = dst_offset % dst_step;
int dst_y_off = dst_offset / dst_step;
#
define
INTERMEDIATE_TYPE
CAT
(
FPTYPE,
VEC_SIZE
)
struct
RectCoords
{
int
x1,
y1,
x2,
y2
;
}
;
int head_off = dst_x_off%4;
int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
//#define
DEBUG
#
ifdef
DEBUG
#
define
DEBUG_ONLY
(
x
)
x
#
define
ASSERT
(
condition
)
do
{
if
(
!
(
condition
))
{
printf
(
"BUG in boxFilter kernel (global=%d,%d): "
#
condition
"\n"
,
get_global_id
(
0
)
,
get_global_id
(
1
))
; } } while (0)
#
else
#
define
DEBUG_ONLY
(
x
)
#
define
ASSERT
(
condition
)
#
endif
uint4 data[ksY+1];
__local uint4 temp[2][THREADS];
inline
INTERMEDIATE_TYPE
readSrcPixel
(
int2
pos,
__global
TYPE
*src,
const
unsigned
int
srcStepBytes,
const
struct
RectCoords
srcCoords
#
ifdef
BORDER_CONSTANT
for(int i=0; i < ksY+1; i++)
,
SCALAR_TYPE
borderValue
#
endif
)
{
#
ifdef
BORDER_ISOLATED
if
(
pos.x
>=
srcCoords.x1
&&
pos.y
>=
srcCoords.y1
&&
pos.x
<
srcCoords.x2
&&
pos.y
<
srcCoords.y2
)
#
else
if
(
pos.x
>=
0
&&
pos.y
>=
0
&&
pos.x
<
srcCoords.x2
&&
pos.y
<
srcCoords.y2
)
#
endif
{
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
{
data[i].x = *(src+(startY+i)*src_step + startX + col * 4);
data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1);
data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2);
data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3);
}
else
{
data[i]=0;
int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
}
__global
TYPE*
ptr
=
(
__global
TYPE*
)((
__global
char*
)
src
+
pos.x
*
sizeof
(
TYPE
)
+
pos.y
*
srcStepBytes
)
;
return
CONVERT_TO_FPTYPE
(
*ptr
)
;
}
#else
int not_all_in_range;
for(int i=0; i < ksY+1; i++)
else
{
not_all_in_range = (startX+col*4<0) |
(
startX+col*4+3>src_whole_cols-1
)
| (startY+i<0) |
(
startY+i>src_whole_rows-1
)
;
if
(
not_all_in_range
)
{
int
selected_row
;
int4
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
selected_col.x
=
ADDR_L
(
startX+col*4,
0
,
src_whole_cols
)
;
selected_col.x
=
ADDR_R
(
startX+col*4,
src_whole_cols,
selected_col.x
)
;
selected_col.y
=
ADDR_L
(
startX+col*4+1,
0
,
src_whole_cols
)
;
selected_col.y
=
ADDR_R
(
startX+col*4+1,
src_whole_cols,
selected_col.y
)
;
#
ifdef
BORDER_CONSTANT
return
borderValue
;
#
else
int
selected_col
=
pos.x
;
int
selected_row
=
pos.y
;
selected_col.z
=
ADDR_L
(
startX+col*4+2,
0
,
src_whole_cols
)
;
selected_col.z
=
ADDR_R
(
startX+col*4+2,
src_whole_cols,
selected_col.z
)
;
EXTRAPOLATE
(
selected_col,
selected_row,
#
ifdef
BORDER_ISOLATED
srcCoords.x1,
srcCoords.y1,
#
else
0
,
0
,
#
endif
srcCoords.x2,
srcCoords.y2
)
;
selected_col.w
=
ADDR_L
(
startX+col*4+3,
0
,
src_whole_cols
)
;
selected_col.w
=
ADDR_R
(
startX+col*4+3,
src_whole_cols,
selected_col.
w
)
;
//
debug
border
mapping
//printf
(
"pos=%d,%d --> %d, %d\n"
,
pos.x,
pos.y,
selected_col,
selected_ro
w
)
;
data[i].x
=
*
(
src
+
selected_row
*
src_step
+
selected_col.x
)
;
data[i].y
=
*
(
src
+
selected_row
*
src_step
+
selected_col.y
)
;
data[i].z
=
*
(
src
+
selected_row
*
src_step
+
selected_col.z
)
;
data[i].w
=
*
(
src
+
selected_row
*
src_step
+
selected_col.w
)
;
pos
=
(
int2
)(
selected_col,
selected_row
)
;
if
(
pos.x
>=
0
&&
pos.y
>=
0
&&
pos.x
<
srcCoords.x2
&&
pos.y
<
srcCoords.y2
)
{
__global
TYPE*
ptr
=
(
__global
TYPE*
)((
__global
char*
)
src
+
pos.x
*
sizeof
(
TYPE
)
+
pos.y
*
srcStepBytes
)
;
return
CONVERT_TO_FPTYPE
(
*ptr
)
;
}
else
{
data[i]
=
convert_uint4
(
vload4
(
col,
(
__global
uchar*
)(
src+
(
startY+i
)
*src_step
+
startX
)))
;
//
for
debug
only
DEBUG_ONLY
(
printf
(
"BUG in boxFilter kernel\n"
))
;
return
(
FPTYPE
)(
0.0f
)
;
}
}
#
endif
uint4
tmp_sum
=
0
;
for
(
int
i=1
; i < ksY; i++)
{
tmp_sum
+=
(
data[i]
)
;
}
int
index
=
dst_startY
*
dst_step
+
dst_startX
+
(
col-anX
)
*4
;
temp[0][col]
=
tmp_sum
+
(
data[0]
)
;
temp[1][col]
=
tmp_sum
+
(
data[ksY]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
update_dst_C1_D0
(
dst+index,
(
__local
uint
*
)(
temp[0]
)
,
dst_rows,
dst_cols,
dst_startX,
dst_x_off,
alpha
)
;
update_dst_C1_D0
(
dst+index+dst_step,
(
__local
uint
*
)(
temp[1]
)
,
dst_rows,
dst_cols,
dst_startX,
dst_x_off,
alpha
)
;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
boxFilter_C4_D0
(
__global
const
uchar4
*
restrict
src,
__global
uchar4
*dst,
float
alpha,
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
)
{
int
col
=
get_local_id
(
0
)
;
const
int
gX
=
get_group_id
(
0
)
;
const
int
gY
=
get_group_id
(
1
)
;
int
src_x_off
=
(
src_offset
%
src_step
)
>>
2
;
int
src_y_off
=
src_offset
/
src_step
;
int
dst_x_off
=
(
dst_offset
%
dst_step
)
>>
2
;
int
dst_y_off
=
dst_offset
/
dst_step
;
int
startX
=
gX
*
(
THREADS-ksX+1
)
-
anX
+
src_x_off
;
int
startY
=
(
gY
<<
1
)
-
anY
+
src_y_off
;
int
dst_startX
=
gX
*
(
THREADS-ksX+1
)
+
dst_x_off
;
int
dst_startY
=
(
gY
<<
1
)
+
dst_y_off
;
uint4
data[ksY+1]
;
__local
uint4
temp[2][THREADS]
;
//
INPUT
PARAMETER:
BLOCK_SIZE_Y
(
via
defines
)
__kernel
__attribute__
((
reqd_work_group_size
(
LOCAL_SIZE,
1
,
1
)))
void
boxFilter
(
__global
TYPE
*src,
const
unsigned
int
srcStepBytes,
const
int4
srcRC,
__global
TYPE
*dst,
const
unsigned
int
dstStepBytes,
const
int4
dstRC,
#
ifdef
BORDER_CONSTANT
bool
con
;
for
(
int
i=0
; i < ksY+1; i++)
{
con
=
startX+col
>=
0
&&
startX+col
<
src_whole_cols
&&
startY+i
>=
0
&&
startY+i
<
src_whole_rows
;
int
cur_col
=
clamp
(
startX
+
col,
0
,
src_whole_cols
)
;
SCALAR_TYPE
borderValue,
#
endif
FPTYPE
alpha
)
{
const
struct
RectCoords
srcCoords
=
{srcRC.s0,
srcRC.s1,
srcRC.s2,
srcRC.s3}
; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
const
struct
RectCoords
dstCoords
=
{dstRC.s0,
dstRC.s1,
dstRC.s2,
dstRC.s3}
;
data[i].x
=
con
?
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col].x
:
0
;
data[i].y
=
con
?
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col].y
:
0
;
data[i].z
=
con
?
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col].z
:
0
;
data[i].w
=
con
?
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col].w
:
0
;
}
#
else
for
(
int
i=0
; i < ksY+1; i++)
{
int
selected_row
;
int
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
const
int
x
=
get_local_id
(
0
)
+
(
LOCAL_SIZE
-
(
KERNEL_SIZE_X
-
1
))
*
get_group_id
(
0
)
-
ANCHOR_X
;
const
int
y
=
get_global_id
(
1
)
*
BLOCK_SIZE_Y
;
selected_col
=
ADDR_L
(
startX+col,
0
,
src_whole_cols
)
;
selected_col
=
ADDR_R
(
startX+col,
src_whole_cols,
selected_col
)
;
const
int
local_id
=
get_local_id
(
0
)
;
INTERMEDIATE_TYPE
data[KERNEL_SIZE_Y]
;
__local
INTERMEDIATE_TYPE
sumOfCols[LOCAL_SIZE]
;
data[i]
=
convert_uint4
(
src[selected_row
*
(
src_step>>2
)
+
selected_col]
)
;
int2
srcPos
=
(
int2
)(
srcCoords.x1
+
x,
srcCoords.y1
+
y
-
ANCHOR_Y
)
;
for
(
int
sy
=
0
; sy < KERNEL_SIZE_Y; sy++, srcPos.y++)
{
data[sy]
=
readSrcPixel
(
srcPos,
src,
srcStepBytes,
srcCoords
#
ifdef
BORDER_CONSTANT
,
borderValue
#
endif
)
;
}
#
endif
uint4
tmp_sum
=
0
;
for
(
int
i=1
; i < ksY; i++)
INTERMEDIATE_TYPE
tmp_sum
=
0
;
for
(
int
sy
=
0
; sy < KERNEL_SIZE_Y; sy++)
{
tmp_sum
+=
(
data[
i
]
)
;
tmp_sum
+=
(
data[
sy
]
)
;
}
int
index
=
dst_startY
*
(
dst_step>>2
)
+
dst_startX
+
col
;
temp[0][col]
=
tmp_sum
+
(
data[0]
)
;
temp[1][col]
=
tmp_sum
+
(
data[ksY]
)
;
sumOfCols[local_id]
=
tmp_sum
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
update_dst_C4_D0
(
dst+index,
(
__local
uint4
*
)(
temp[0]
)
,
dst_rows,
dst_cols,
dst_startX,
dst_x_off,
alpha
)
;
update_dst_C4_D0
(
dst+index+
(
dst_step>>2
)
,
(
__local
uint4
*
)(
temp[1]
)
,
dst_rows,
dst_cols,
dst_startX,
dst_x_off,
alpha
)
;
}
int2
pos
=
(
int2
)(
dstCoords.x1
+
x,
dstCoords.y1
+
y
)
;
__global
TYPE*
dstPtr
=
(
__global
TYPE*
)((
__global
char*
)
dst
+
pos.x
*
sizeof
(
TYPE
)
+
pos.y
*
dstStepBytes
)
; // Pointer can be out of bounds!
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32fC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
boxFilter_C1_D5
(
__global
const
float
*restrict
src,
__global
float
*dst,
float
alpha,
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
)
{
int
col
=
get_local_id
(
0
)
;
const
int
gX
=
get_group_id
(
0
)
;
const
int
gY
=
get_group_id
(
1
)
;
int
src_x_off
=
(
src_offset
%
src_step
)
>>
2
;
int
src_y_off
=
src_offset
/
src_step
;
int
dst_x_off
=
(
dst_offset
%
dst_step
)
>>
2
;
int
dst_y_off
=
dst_offset
/
dst_step
;
int
startX
=
gX
*
(
THREADS-ksX+1
)
-
anX
+
src_x_off
;
int
startY
=
(
gY
<<
1
)
-
anY
+
src_y_off
;
int
dst_startX
=
gX
*
(
THREADS-ksX+1
)
+
dst_x_off
;
int
dst_startY
=
(
gY
<<
1
)
+
dst_y_off
;
float
data[ksY+1]
;
__local
float
temp[2][THREADS]
;
#
ifdef
BORDER_CONSTANT
bool
con
;
float
ss
;
for
(
int
i=0
; i < ksY+1; i++)
int
sy_index
=
0
; // current index in data[] array
int
stepsY
=
min
(
dstCoords.y2
-
pos.y,
BLOCK_SIZE_Y
)
;
ASSERT
(
stepsY
>
0
)
;
for
(
; ;)
{
con
=
startX+col
>=
0
&&
startX+col
<
src_whole_cols
&&
startY+i
>=
0
&&
startY+i
<
src_whole_rows
;
int
cur_col
=
clamp
(
startX
+
col,
0
,
src_whole_cols
)
;
ss
=
(
startY+i
)
<src_whole_rows&&
(
startY+i
)
>=0&&cur_col>=0&&cur_col<src_whole_cols?src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col]:
(
float
)
0
;
data[i]
=
con
?
ss
:
0.f
;
}
#
else
for
(
int
i=0
; i < ksY+1; i++)
{
int
selected_row
;
int
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
selected_col
=
ADDR_L
(
startX+col,
0
,
src_whole_cols
)
;
selected_col
=
ADDR_R
(
startX+col,
src_whole_cols,
selected_col
)
;
data[i]
=
src[selected_row
*
(
src_step>>2
)
+
selected_col]
;
}
ASSERT
(
pos.y
<
dstCoords.y2
)
;
#
endif
float
sum0
=
0.0
,
sum1
=
0.0
,
sum2
=
0.0
;
for
(
int
i=1
; i < ksY; i++)
{
sum0
+=
(
data[i]
)
;
}
sum1
=
sum0
+
(
data[0]
)
;
sum2
=
sum0
+
(
data[ksY]
)
;
temp[0][col]
=
sum1
;
temp[1][col]
=
sum2
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
col
<
(
THREADS-
(
ksX-1
)))
{
col
+=
anX
;
int
posX
=
dst_startX
-
dst_x_off
+
col
-
anX
;
int
posY
=
(
gY
<<
1
)
;
if
(
local_id
>=
ANCHOR_X
&&
local_id
<
LOCAL_SIZE
-
(
KERNEL_SIZE_X
-
1
-
ANCHOR_X
)
&&
pos.x
>=
dstCoords.x1
&&
pos.x
<
dstCoords.x2
)
{
ASSERT
(
pos.y
>=
dstCoords.y1
&&
pos.y
<
dstCoords.y2
)
;
float
tmp_sum[2]=
{0.0,
0.0}
;
for
(
int
k=0
; k<2; k++)
for
(
int
i=-anX
; i<=anX; i
++)
INTERMEDIATE_TYPE
total_sum
=
0
;
#
pragma
unroll
for
(
int
sx
=
0
; sx < KERNEL_SIZE_X; sx
++)
{
t
mp_sum[k]
+=
temp[k][col+i
]
;
t
otal_sum
+=
sumOfCols[local_id
+
sx
-
ANCHOR_X
]
;
}
for
(
int
i=0
; i<2; i++)
{
if
(
posX
>=
0
&&
posX
<
dst_cols
&&
(
posY+i
)
>=
0
&&
(
posY+i
)
<
dst_rows
)
dst[
(
dst_startY+i
)
*
(
dst_step>>2
)
+
dst_startX
+
col
-
anX]
=
tmp_sum[i]/alpha
;
*dstPtr
=
CONVERT_TO_TYPE
(((
INTERMEDIATE_TYPE
)
alpha
)
*
total_sum
)
;
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32fC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
boxFilter_C4_D5
(
__global
const
float4
*restrict
src,
__global
float4
*dst,
float
alpha,
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
)
{
int
col
=
get_local_id
(
0
)
;
const
int
gX
=
get_group_id
(
0
)
;
const
int
gY
=
get_group_id
(
1
)
;
int
src_x_off
=
(
src_offset
%
src_step
)
>>
4
;
int
src_y_off
=
src_offset
/
src_step
;
int
dst_x_off
=
(
dst_offset
%
dst_step
)
>>
4
;
int
dst_y_off
=
dst_offset
/
dst_step
;
int
startX
=
gX
*
(
THREADS-ksX+1
)
-
anX
+
src_x_off
;
int
startY
=
(
gY
<<
1
)
-
anY
+
src_y_off
;
int
dst_startX
=
gX
*
(
THREADS-ksX+1
)
+
dst_x_off
;
int
dst_startY
=
(
gY
<<
1
)
+
dst_y_off
;
float4
data[ksY+1]
;
__local
float4
temp[2][THREADS]
;
#
ifdef
BORDER_CONSTANT
bool
con
;
float4
ss
;
for
(
int
i=0
; i < ksY+1; i++)
{
con
=
startX+col
>=
0
&&
startX+col
<
src_whole_cols
&&
startY+i
>=
0
&&
startY+i
<
src_whole_rows
;
int
cur_col
=
clamp
(
startX
+
col,
0
,
src_whole_cols
)
;
ss
=
(
startY+i
)
<src_whole_rows&&
(
startY+i
)
>=0&&cur_col>=0&&cur_col<src_whole_cols?src[
(
startY+i
)
*
(
src_step>>4
)
+
cur_col]:
(
float4
)
0
;
data[i]
=
con
?
ss
:
(
float4
)(
0.0
,
0.0
,
0.0
,
0.0
)
;
}
#
if
BLOCK_SIZE_Y
==
1
break
;
#
else
for
(
int
i=0
; i < ksY+1; i++)
{
int
selected_row
;
int
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
if
(
--stepsY
==
0
)
break
;
selected_col
=
ADDR_L
(
startX+col,
0
,
src_whole_cols
)
;
selected_col
=
ADDR_R
(
startX+col,
src_whole_cols,
selected_col
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
data[i]
=
src[selected_row
*
(
src_step>>4
)
+
selected_col]
;
}
tmp_sum
=
sumOfCols[local_id]
; // TODO FIX IT: workaround for BUG in OpenCL compiler
//
only
works
with
scalars:
ASSERT
(
fabs
(
tmp_sum
-
sumOfCols[local_id]
)
<
(
INTERMEDIATE_TYPE
)
1e-6
)
;
tmp_sum
-=
data[sy_index]
;
data[sy_index]
=
readSrcPixel
(
srcPos,
src,
srcStepBytes,
srcCoords
#
ifdef
BORDER_CONSTANT
,
borderValue
#
endif
float4
sum0
=
0.0
,
sum1
=
0.0
,
sum2
=
0.0
;
for
(
int
i=1
; i < ksY; i++)
{
sum0
+=
(
data[i]
)
;
}
sum1
=
sum0
+
(
data[0]
)
;
sum2
=
sum0
+
(
data[ksY]
)
;
temp[0][col]
=
sum1
;
temp[1][col]
=
sum2
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
col
<
(
THREADS-
(
ksX-1
)))
{
col
+=
anX
;
int
posX
=
dst_startX
-
dst_x_off
+
col
-
anX
;
int
posY
=
(
gY
<<
1
)
;
)
;
srcPos.y++
;
float4
tmp_sum[2]=
{
(
float4
)(
0.0
,
0.0
,
0.0
,
0.0
)
,
(
float4
)(
0.0
,
0.0
,
0.0
,
0.0
)
}
;
for
(
int
k=0
; k<2; k++)
for
(
int
i=-anX
; i<=anX; i++)
{
tmp_sum[k]
+=
temp[k][col+i]
;
}
for
(
int
i=0
; i<2; i++)
{
if
(
posX
>=
0
&&
posX
<
dst_cols
&&
(
posY+i
)
>=
0
&&
(
posY+i
)
<
dst_rows
)
dst[
(
dst_startY+i
)
*
(
dst_step>>4
)
+
dst_startX
+
col
-
anX]
=
tmp_sum[i]/alpha
;
}
tmp_sum
+=
data[sy_index]
;
sumOfCols[local_id]
=
tmp_sum
;
sy_index
=
(
sy_index
+
1
<
KERNEL_SIZE_Y
)
?
sy_index
+
1
:
0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
next
line
DEBUG_ONLY
(
pos.y++
)
;
dstPtr
=
(
__global
TYPE*
)((
__global
char*
)
dstPtr
+
dstStepBytes
)
; // Pointer can be out of bounds!
#
endif
//
BLOCK_SIZE_Y
==
1
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment