Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
b449b0bf
Commit
b449b0bf
authored
Mar 19, 2014
by
Ilya Lavrenov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
simplified cv::sepFilter2D OpenCL part
parent
82e6edfb
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
167 additions
and
420 deletions
+167
-420
filter.cpp
modules/imgproc/src/filter.cpp
+56
-81
filterSepCol.cl
modules/imgproc/src/opencl/filterSepCol.cl
+22
-40
filterSepRow.cl
modules/imgproc/src/opencl/filterSepRow.cl
+81
-292
filterSep_singlePass.cl
modules/imgproc/src/opencl/filterSep_singlePass.cl
+5
-6
test_sepfilter2D.cpp
modules/imgproc/test/ocl/test_sepfilter2D.cpp
+3
-1
No files found.
modules/imgproc/src/filter.cpp
View file @
b449b0bf
...
...
@@ -41,6 +41,7 @@
//M*/
#include "precomp.hpp"
#define CV_OPENCL_RUN_ASSERT
#include "opencl_kernels.hpp"
#include <sstream>
...
...
@@ -3317,11 +3318,9 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
return
kernel
.
run
(
2
,
globalsize
,
localsize
,
true
);
}
static
bool
ocl_sepRowFilter2D
(
UMat
&
src
,
UMat
&
buf
,
Mat
&
kernelX
,
int
anchor
,
int
borderType
,
bool
sync
)
static
bool
ocl_sepRowFilter2D
(
UMat
&
src
,
UMat
&
buf
,
Mat
&
kernelX
,
int
anchor
,
int
borderType
)
{
int
type
=
src
.
type
();
int
cn
=
CV_MAT_CN
(
type
);
int
sdepth
=
CV_MAT_DEPTH
(
type
);
int
type
=
src
.
type
(),
cn
=
CV_MAT_CN
(
type
),
sdepth
=
CV_MAT_DEPTH
(
type
);
Size
bufSize
=
buf
.
size
();
#ifdef ANDROID
...
...
@@ -3329,27 +3328,14 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor,
#else
size_t
localsize
[
2
]
=
{
16
,
16
};
#endif
size_t
globalsize
[
2
]
=
{
DIVUP
(
bufSize
.
width
,
localsize
[
0
])
*
localsize
[
0
],
DIVUP
(
bufSize
.
height
,
localsize
[
1
])
*
localsize
[
1
]};
if
(
CV_8U
==
sdepth
)
{
switch
(
cn
)
{
case
1
:
globalsize
[
0
]
=
DIVUP
((
bufSize
.
width
+
3
)
>>
2
,
localsize
[
0
])
*
localsize
[
0
];
break
;
case
2
:
globalsize
[
0
]
=
DIVUP
((
bufSize
.
width
+
1
)
>>
1
,
localsize
[
0
])
*
localsize
[
0
];
break
;
case
4
:
globalsize
[
0
]
=
DIVUP
(
bufSize
.
width
,
localsize
[
0
])
*
localsize
[
0
];
break
;
}
}
if
(
type
==
CV_8UC1
)
globalsize
[
0
]
=
DIVUP
((
bufSize
.
width
+
3
)
>>
2
,
localsize
[
0
])
*
localsize
[
0
];
int
radiusX
=
anchor
;
int
radiusY
=
(
int
)((
buf
.
rows
-
src
.
rows
)
>>
1
);
int
radiusX
=
anchor
,
radiusY
=
(
buf
.
rows
-
src
.
rows
)
>>
1
;
bool
is
IsolatedBorder
=
(
borderType
&
BORDER_ISOLATED
)
!=
0
;
bool
is
olated
=
(
borderType
&
BORDER_ISOLATED
)
!=
0
;
const
char
*
const
borderMap
[]
=
{
"BORDER_CONSTANT"
,
"BORDER_REPLICATE"
,
"BORDER_REFLECT"
,
"BORDER_WRAP"
,
"BORDER_REFLECT_101"
},
*
const
btype
=
borderMap
[
borderType
&
~
BORDER_ISOLATED
];
...
...
@@ -3358,49 +3344,38 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor,
extra_extrapolation
|=
src
.
cols
<
(
int
)((
-
radiusX
+
globalsize
[
0
]
+
8
*
localsize
[
0
]
+
3
)
>>
1
)
+
1
;
extra_extrapolation
|=
src
.
cols
<
radiusX
;
cv
::
String
build_options
=
cv
::
format
(
"-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
,
radiusX
,
(
int
)
localsize
[
0
],
(
int
)
localsize
[
1
],
cn
,
btype
,
extra_extrapolation
?
"EXTRA_EXTRAPOLATION"
:
"NO_EXTRA_EXTRAPOLATION"
,
isIsolatedBorder
?
"BORDER_ISOLATED"
:
"NO_BORDER_ISOLATED"
);
char
cvt
[
40
];
cv
::
String
build_options
=
cv
::
format
(
"-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
" -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s"
,
radiusX
,
(
int
)
localsize
[
0
],
(
int
)
localsize
[
1
],
cn
,
btype
,
extra_extrapolation
?
"EXTRA_EXTRAPOLATION"
:
"NO_EXTRA_EXTRAPOLATION"
,
isolated
?
"BORDER_ISOLATED"
:
"NO_BORDER_ISOLATED"
,
ocl
::
typeToStr
(
type
),
ocl
::
typeToStr
(
CV_32FC
(
cn
)),
ocl
::
convertTypeStr
(
sdepth
,
CV_32F
,
cn
,
cvt
),
ocl
::
typeToStr
(
sdepth
),
ocl
::
typeToStr
(
CV_32F
));
build_options
+=
ocl
::
kernelToStr
(
kernelX
,
CV_32F
);
Size
srcWholeSize
;
Point
srcOffset
;
src
.
locateROI
(
srcWholeSize
,
srcOffset
);
std
::
stringstream
strKernel
;
strKernel
<<
"row_filter"
;
if
(
-
1
!=
cn
)
strKernel
<<
"_C"
<<
cn
;
if
(
-
1
!=
sdepth
)
strKernel
<<
"_D"
<<
sdepth
;
String
kernelName
(
"row_filter"
);
if
(
type
==
CV_8UC1
)
kernelName
+=
"_C1_D0"
;
ocl
::
Kernel
k
ernelRow
;
if
(
!
kernelRow
.
create
(
strKernel
.
str
().
c_str
(),
cv
::
ocl
::
imgproc
::
filterSepRow_oclsrc
,
build_options
))
ocl
::
Kernel
k
(
kernelName
.
c_str
(),
cv
::
ocl
::
imgproc
::
filterSepRow_oclsrc
,
build_options
);
if
(
k
.
empty
(
))
return
false
;
int
idxArg
=
0
;
idxArg
=
kernelRow
.
set
(
idxArg
,
ocl
::
KernelArg
::
PtrReadOnly
(
src
));
idxArg
=
kernelRow
.
set
(
idxArg
,
(
int
)(
src
.
step
/
src
.
elemSize
()));
idxArg
=
kernelRow
.
set
(
idxArg
,
srcOffset
.
x
);
idxArg
=
kernelRow
.
set
(
idxArg
,
srcOffset
.
y
);
idxArg
=
kernelRow
.
set
(
idxArg
,
src
.
cols
);
idxArg
=
kernelRow
.
set
(
idxArg
,
src
.
rows
);
idxArg
=
kernelRow
.
set
(
idxArg
,
srcWholeSize
.
width
);
idxArg
=
kernelRow
.
set
(
idxArg
,
srcWholeSize
.
height
);
idxArg
=
kernelRow
.
set
(
idxArg
,
ocl
::
KernelArg
::
PtrWriteOnly
(
buf
));
idxArg
=
kernelRow
.
set
(
idxArg
,
(
int
)(
buf
.
step
/
buf
.
elemSize
()));
idxArg
=
kernelRow
.
set
(
idxArg
,
buf
.
cols
);
idxArg
=
kernelRow
.
set
(
idxArg
,
buf
.
rows
);
idxArg
=
kernelRow
.
set
(
idxArg
,
radiusY
);
return
kernelRow
.
run
(
2
,
globalsize
,
localsize
,
sync
);
k
.
args
(
ocl
::
KernelArg
::
PtrReadOnly
(
src
),
(
int
)(
src
.
step
/
src
.
elemSize
()),
srcOffset
.
x
,
srcOffset
.
y
,
src
.
cols
,
src
.
rows
,
srcWholeSize
.
width
,
srcWholeSize
.
height
,
ocl
::
KernelArg
::
PtrWriteOnly
(
buf
),
(
int
)(
buf
.
step
/
buf
.
elemSize
()),
buf
.
cols
,
buf
.
rows
,
radiusY
);
return
k
.
run
(
2
,
globalsize
,
localsize
,
false
);
}
static
bool
ocl_sepColFilter2D
(
const
UMat
&
buf
,
UMat
&
dst
,
Mat
&
kernelY
,
int
anchor
,
bool
sync
)
static
bool
ocl_sepColFilter2D
(
const
UMat
&
buf
,
UMat
&
dst
,
Mat
&
kernelY
,
int
anchor
)
{
#ifdef ANDROID
size_t
localsize
[
2
]
=
{
16
,
10
};
...
...
@@ -3420,28 +3395,23 @@ static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anc
globalsize
[
0
]
=
DIVUP
(
sz
.
width
,
localsize
[
0
])
*
localsize
[
0
];
char
cvt
[
40
];
cv
::
String
build_options
=
cv
::
format
(
"-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s"
,
anchor
,
(
int
)
localsize
[
0
],
(
int
)
localsize
[
1
],
cn
,
ocl
::
typeToStr
(
buf
.
type
()),
ocl
::
typeToStr
(
dtype
),
ocl
::
convertTypeStr
(
CV_32F
,
ddepth
,
cn
,
cvt
));
cv
::
String
build_options
=
cv
::
format
(
"-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
" -D srcT=%s -D dstT=%s -D convertToDstT=%s"
,
anchor
,
(
int
)
localsize
[
0
],
(
int
)
localsize
[
1
],
cn
,
ocl
::
typeToStr
(
buf
.
type
()),
ocl
::
typeToStr
(
dtype
),
ocl
::
convertTypeStr
(
CV_32F
,
ddepth
,
cn
,
cvt
));
build_options
+=
ocl
::
kernelToStr
(
kernelY
,
CV_32F
);
ocl
::
Kernel
kernelCol
;
if
(
!
kernelCol
.
create
(
"col_filter"
,
cv
::
ocl
::
imgproc
::
filterSepCol_oclsrc
,
build_options
))
ocl
::
Kernel
k
(
"col_filter"
,
cv
::
ocl
::
imgproc
::
filterSepCol_oclsrc
,
build_options
);
if
(
k
.
empty
())
return
false
;
int
idxArg
=
0
;
idxArg
=
kernelCol
.
set
(
idxArg
,
ocl
::
KernelArg
::
PtrReadOnly
(
buf
));
idxArg
=
kernelCol
.
set
(
idxArg
,
(
int
)(
buf
.
step
/
buf
.
elemSize
()));
idxArg
=
kernelCol
.
set
(
idxArg
,
buf
.
cols
);
idxArg
=
kernelCol
.
set
(
idxArg
,
buf
.
rows
);
idxArg
=
kernelCol
.
set
(
idxArg
,
ocl
::
KernelArg
::
PtrWriteOnly
(
dst
));
idxArg
=
kernelCol
.
set
(
idxArg
,
(
int
)(
dst
.
offset
/
dst
.
elemSize
()));
idxArg
=
kernelCol
.
set
(
idxArg
,
(
int
)(
dst
.
step
/
dst
.
elemSize
()));
idxArg
=
kernelCol
.
set
(
idxArg
,
dst
.
cols
);
idxArg
=
kernelCol
.
set
(
idxArg
,
dst
.
rows
);
return
kernelCol
.
run
(
2
,
globalsize
,
localsize
,
sync
);
k
.
args
(
ocl
::
KernelArg
::
PtrReadOnly
(
buf
),
(
int
)(
buf
.
step
/
buf
.
elemSize
()),
buf
.
cols
,
buf
.
rows
,
ocl
::
KernelArg
::
PtrWriteOnly
(
dst
),
(
int
)(
dst
.
offset
/
dst
.
elemSize
()),
(
int
)(
dst
.
step
/
dst
.
elemSize
()),
dst
.
cols
,
dst
.
rows
);
return
k
.
run
(
2
,
globalsize
,
localsize
,
false
);
}
const
int
optimizedSepFilterLocalSize
=
16
;
...
...
@@ -3473,12 +3443,14 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
String
opts
=
cv
::
format
(
"-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
" -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
" -D %s"
,
(
int
)
lt2
[
0
],
(
int
)
lt2
[
1
],
_row_kernel
.
size
().
height
/
2
,
_col_kernel
.
size
().
height
/
2
,
" -D %s -D srcT1=%s -D dstT1=%s -D cn=%d"
,
(
int
)
lt2
[
0
],
(
int
)
lt2
[
1
],
_row_kernel
.
size
().
height
/
2
,
_col_kernel
.
size
().
height
/
2
,
ocl
::
kernelToStr
(
_row_kernel
,
CV_32F
,
"KERNEL_MATRIX_X"
).
c_str
(),
ocl
::
kernelToStr
(
_col_kernel
,
CV_32F
,
"KERNEL_MATRIX_Y"
).
c_str
(),
ocl
::
typeToStr
(
stype
),
ocl
::
convertTypeStr
(
sdepth
,
wdepth
,
cn
,
cvt
[
0
]),
ocl
::
typeToStr
(
CV_MAKE_TYPE
(
wdepth
,
cn
)),
ocl
::
typeToStr
(
dtype
),
ocl
::
convertTypeStr
(
wdepth
,
ddepth
,
cn
,
cvt
[
1
]),
borderMap
[
borderType
]);
ocl
::
convertTypeStr
(
wdepth
,
ddepth
,
cn
,
cvt
[
1
]),
borderMap
[
borderType
],
ocl
::
typeToStr
(
sdepth
),
ocl
::
typeToStr
(
ddepth
),
cn
);
ocl
::
Kernel
k
(
"sep_filter"
,
ocl
::
imgproc
::
filterSep_singlePass_oclsrc
,
opts
);
if
(
k
.
empty
())
...
...
@@ -3529,10 +3501,13 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
if
(
ddepth
<
0
)
ddepth
=
sdepth
;
CV_OCL_RUN_
(
kernelY
.
rows
<=
21
&&
kernelX
.
rows
<=
21
&&
imgSize
.
width
>
optimizedSepFilterLocalSize
+
(
kernelX
.
rows
>>
1
)
&&
imgSize
.
height
>
optimizedSepFilterLocalSize
+
(
kernelY
.
rows
>>
1
),
ocl_sepFilter2D_SinglePass
(
_src
,
_dst
,
_kernelX
,
_kernelY
,
borderType
,
ddepth
),
true
)
// printf("%d %d\n", imgSize.width, optimizedSepFilterLocalSize + (kernelX.rows >> 1));
// printf("%d %d\n", imgSize.height, optimizedSepFilterLocalSize + (kernelY.rows >> 1));
// CV_OCL_RUN_(kernelY.rows <= 21 && kernelX.rows <= 21 &&
// imgSize.width > optimizedSepFilterLocalSize + (kernelX.rows >> 1) &&
// imgSize.height > optimizedSepFilterLocalSize + (kernelY.rows >> 1),
// ocl_sepFilter2D_SinglePass(_src, _dst, _kernelX, _kernelY, borderType, ddepth), true)
UMat
src
=
_src
.
getUMat
();
Size
srcWholeSize
;
Point
srcOffset
;
...
...
@@ -3546,12 +3521,12 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
Size
srcSize
=
src
.
size
();
Size
bufSize
(
srcSize
.
width
,
srcSize
.
height
+
kernelY
.
cols
-
1
);
UMat
buf
;
buf
.
create
(
bufSize
,
CV_MAKETYPE
(
CV_32F
,
cn
));
if
(
!
ocl_sepRowFilter2D
(
src
,
buf
,
kernelX
,
anchor
.
x
,
borderType
,
false
))
if
(
!
ocl_sepRowFilter2D
(
src
,
buf
,
kernelX
,
anchor
.
x
,
borderType
))
return
false
;
_dst
.
create
(
srcSize
,
CV_MAKETYPE
(
ddepth
,
cn
));
UMat
dst
=
_dst
.
getUMat
();
return
ocl_sepColFilter2D
(
buf
,
dst
,
kernelY
,
anchor
.
y
,
false
);
return
ocl_sepColFilter2D
(
buf
,
dst
,
kernelY
,
anchor
.
y
);
}
#endif
...
...
modules/imgproc/src/opencl/filterSepCol.cl
View file @
b449b0bf
...
...
@@ -36,16 +36,6 @@
#
define
READ_TIMES_COL
((
2*
(
RADIUSY+LSIZE1
)
-1
)
/LSIZE1
)
#
define
RADIUS
1
#
if
CN
==1
#
define
ALIGN
(((
RADIUS
)
+3
)
>>2<<2
)
#
elif
CN==2
#
define
ALIGN
(((
RADIUS
)
+1
)
>>1<<1
)
#
elif
CN==3
#
define
ALIGN
(((
RADIUS
)
+3
)
>>2<<2
)
#
elif
CN==4
#
define
ALIGN
(
RADIUS
)
#
define
READ_TIMES_ROW
((
2*
(
RADIUS+LSIZE0
)
-1
)
/LSIZE0
)
#
endif
#
define
noconvert
...
...
@@ -65,16 +55,8 @@ The info above maybe obsolete.
#
define
DIG
(
a
)
a,
__constant
float
mat_kernel[]
=
{
COEFF
}
;
__kernel
__attribute__
((
reqd_work_group_size
(
LSIZE0,LSIZE1,1
)))
void
col_filter
(
__global
const
GENTYPE_SRC
*
restrict
src,
const
int
src_step_in_pixel,
const
int
src_whole_cols,
const
int
src_whole_rows,
__global
GENTYPE_DST
*
dst,
const
int
dst_offset_in_pixel,
const
int
dst_step_in_pixel,
const
int
dst_cols,
const
int
dst_rows
)
__kernel
void
col_filter
(
__global
const
srcT
*
src,
int
src_step_in_pixel,
int
src_whole_cols,
int
src_whole_rows,
__global
dstT
*
dst,
int
dst_offset_in_pixel,
int
dst_step_in_pixel,
int
dst_cols,
int
dst_rows
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -85,35 +67,35 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
int
start_addr
=
mad24
(
y,
src_step_in_pixel,
x
)
;
int
end_addr
=
mad24
(
src_whole_rows
-
1
,
src_step_in_pixel,
src_whole_cols
)
;
int
i
;
GENTYPE_SRC
sum,
temp[READ_TIMES_COL]
;
__local
GENTYPE_SRC
LDS_DAT[LSIZE1
*
READ_TIMES_COL][LSIZE0
+
1]
;
srcT
sum,
temp[READ_TIMES_COL]
;
__local
srcT
LDS_DAT[LSIZE1
*
READ_TIMES_COL][LSIZE0
+
1]
;
//read
pixels
from
src
for
(
i
=
0
;i<READ_TIMES_COL;i++
)
//
read
pixels
from
src
for
(
int
i
=
0
; i < READ_TIMES_COL; ++i
)
{
int
current_addr
=
start_addr+i*LSIZE1*src_step_in_pixel
;
int
current_addr
=
mad24
(
i,
LSIZE1
*
src_step_in_pixel,
start_addr
)
;
current_addr
=
current_addr
<
end_addr
?
current_addr
:
0
;
temp[i]
=
src[current_addr]
;
}
//save
pixels
to
lds
for
(
i
=
0
;i<READ_TIMES_COL;i++)
{
LDS_DAT[l_y+i*LSIZE1][l_x]
=
temp[i]
;
}
//
save
pixels
to
lds
for
(
int
i
=
0
; i < READ_TIMES_COL; ++i)
LDS_DAT[mad24
(
i,
LSIZE1,
l_y
)
][l_x]
=
temp[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//read
pixels
from
lds
and
calculate
the
result
sum
=
LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY]
;
for
(
i=1
;i<=RADIUSY;i++)
//
read
pixels
from
lds
and
calculate
the
result
sum
=
LDS_DAT[l_y
+
RADIUSY][l_x]
*
mat_kernel[RADIUSY]
;
for
(
int
i
=
1
; i <= RADIUSY; ++i)
{
temp[0]
=LDS_DAT[l_y+RADIUSY-
i][l_x]
;
temp[1]
=LDS_DAT[l_y+RADIUSY+
i][l_x]
;
sum
+=
temp[0]
*
mat_kernel[RADIUSY-i]+temp[1]
*
mat_kernel[RADIUSY+i]
;
temp[0]
=
LDS_DAT[l_y
+
RADIUSY
-
i][l_x]
;
temp[1]
=
LDS_DAT[l_y
+
RADIUSY
+
i][l_x]
;
sum
+=
mad
(
temp[0],
mat_kernel[RADIUSY
-
i],
temp[1]
*
mat_kernel[RADIUSY
+
i]
)
;
}
//write
the
result
to
dst
if
((
x<dst_cols
)
&
(
y<dst_rows
))
//
write
the
result
to
dst
if
(
x
<
dst_cols
&&
y
<
dst_rows
)
{
start_addr
=
mad24
(
y,
dst_step_in_pixel,
x
+
dst_offset_in_pixel
)
;
dst[start_addr]
=
convert
_to_DS
T
(
sum
)
;
dst[start_addr]
=
convert
ToDst
T
(
sum
)
;
}
}
modules/imgproc/src/opencl/filterSepRow.cl
View file @
b449b0bf
...
...
@@ -35,40 +35,28 @@
//
#
define
READ_TIMES_ROW
((
2*
(
RADIUSX+LSIZE0
)
-1
)
/LSIZE0
)
//for
c4
only
#
define
READ_TIMES_COL
((
2*
(
RADIUSY+LSIZE1
)
-1
)
/LSIZE1
)
//#pragma
OPENCL
EXTENSION
cl_amd_printf
:
enable
#
define
RADIUS
1
#
if
CN
==1
#
define
ALIGN
(((
RADIUS
)
+3
)
>>2<<2
)
#
elif
CN==2
#
define
ALIGN
(((
RADIUS
)
+1
)
>>1<<1
)
#
elif
CN==3
#
define
ALIGN
(((
RADIUS
)
+3
)
>>2<<2
)
#
elif
CN==4
#
define
ALIGN
(
RADIUS
)
#
endif
#
ifdef
BORDER_REPLICATE
//
BORDER_REPLICATE:
aaaaaa|abcdefgh|hhhhhhh
//
BORDER_REPLICATE:
aaaaaa|abcdefgh|hhhhhhh
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
l_edge
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
r_edge
)
-1
:
(
addr
))
#
endif
#
ifdef
BORDER_REFLECT
//
BORDER_REFLECT:
fedcba|abcdefgh|hgfedcb
//
BORDER_REFLECT:
fedcba|abcdefgh|hgfedcb
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
-
(
i
)
-1
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-1+
((
r_edge
)
<<1
)
:
(
addr
))
#
endif
#
ifdef
BORDER_REFLECT_101
//
BORDER_REFLECT_101:
gfedcb|abcdefgh|gfedcba
//
BORDER_REFLECT_101:
gfedcb|abcdefgh|gfedcba
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
-
(
i
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-2+
((
r_edge
)
<<1
)
:
(
addr
))
#
endif
//blur
function
does
not
support
BORDER_WRAP
#
ifdef
BORDER_WRAP
//
BORDER_WRAP:
cdefgh|abcdefgh|abcdefg
//
BORDER_WRAP:
cdefgh|abcdefgh|abcdefg
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
i
)
+
(
r_edge
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
i
)
-
(
r_edge
)
:
(
addr
))
#
endif
...
...
@@ -127,65 +115,56 @@
#
endif
//BORDER_CONSTANT
#
endif
//EXTRA_EXTRAPOLATION
/**********************************************************************************
These
kernels
are
written
for
separable
filters
such
as
Sobel,
Scharr,
GaussianBlur.
Now
(
6/29/2011
)
the
kernels
only
support
8U
data
type
and
the
anchor
of
the
convolve
kernel
must
be
in
the
center.
ROI
is
not
supported
either.
For
channels
=1,2,4,
each
kernel
reads
4
elements
(
not
4
pixels
)
,
and
for
channels
=3,
the
kernel
read
4
pixels,
save
them
to
LDS
and
read
the
data
needed
from
LDS
to
calculate
the
result.
The
length
of
the
convolve
kernel
supported
is
related
to
the
LSIZE0
and
the
MAX
size
of
LDS,
which
is
HW
related.
For
channels
=
1
,
3
the
RADIUS
is
no
more
than
LSIZE0*2
For
channels
=
2
,
the
RADIUS
is
no
more
than
LSIZE0
For
channels
=
4
,
arbitrary
RADIUS
is
supported
unless
the
LDS
is
not
enough
Niko
6/29/2011
The
info
above
may be
obsolete.
***********************************************************************************
/
#
define
noconvert
#
if
cn
!=
3
#
define
loadpix
(
addr
)
*
(
__global
const
srcT
*
)(
addr
)
#
define
storepix
(
val,
addr
)
*
(
__global
dstT
*
)(
addr
)
=
val
#
define
SRCSIZE
((
int
)
sizeof
(
srcT
))
#
define
DSTSIZE
((
int
)
sizeof
(
dstT
))
#
else
#
define
loadpix
(
addr
)
vload3
(
0
,
(
__global
const
srcT1
*
)(
addr
))
#
define
storepix
(
val,
addr
)
vstore3
(
val,
0
,
(
__global
dstT1
*
)(
addr
))
#
define
SRCSIZE
((
int
)
sizeof
(
srcT1
)
*3
)
#
define
DSTSIZE
((
int
)
sizeof
(
dstT1
)
*3
)
#
endif
#
define
DIG
(
a
)
a,
__constant
float
mat_kernel[]
=
{
COEFF
}
;
__kernel
__attribute__
((
reqd_work_group_size
(
LSIZE0,LSIZE1,1
)))
void
row_filter_C1_D0
(
__global
uchar
*
restrict
src,
int
src_step_in_pixel,
int
src_offset_x,
int
src_offset_y,
int
src_cols,
int
src_rows,
int
src_whole_cols,
int
src_whole_rows,
__global
float
*
dst,
int
dst_step_in_pixel,
int
dst_cols,
int
dst_rows,
int
radiusy
)
__kernel
void
row_filter_C1_D0
(
__global
const
uchar
*
src,
int
src_step_in_pixel,
int
src_offset_x,
int
src_offset_y,
int
src_cols,
int
src_rows,
int
src_whole_cols,
int
src_whole_rows,
__global
float
*
dst,
int
dst_step_in_pixel,
int
dst_cols,
int
dst_rows,
int
radiusy
)
{
int
x
=
get_global_id
(
0
)
<<2
;
int
y
=
get_global_id
(
1
)
;
int
l_x
=
get_local_id
(
0
)
;
int
l_y
=
get_local_id
(
1
)
;
int
start_x
=
x
+
src_offset_x
-
RADIUSX
&
0xfffffffc
;
int
start_x
=
x
+
src_offset_x
-
RADIUSX
&
0xfffffffc
;
int
offset
=
src_offset_x
-
RADIUSX
&
3
;
int
start_y
=
y
+
src_offset_y
-
radiusy
;
int
start_addr
=
mad24
(
start_y,
src_step_in_pixel,
start_x
)
;
int
i
;
float4
sum
;
uchar4
temp[READ_TIMES_ROW]
;
__local
uchar4
LDS_DAT[LSIZE1][READ_TIMES_ROW
*LSIZE0+
1]
;
__local
uchar4
LDS_DAT[LSIZE1][READ_TIMES_ROW
*
LSIZE0
+
1]
;
#
ifdef
BORDER_CONSTANT
int
end_addr
=
mad24
(
src_whole_rows
-
1
,
src_step_in_pixel,
src_whole_cols
)
;
//
read
pixels
from
src
for
(
i
=
0
; i < READ_TIMES_ROW; i++
)
for
(
i
nt
i
=
0
; i < READ_TIMES_ROW; ++i
)
{
int
current_addr
=
start_addr+i*LSIZE0*4
;
current_addr
=
((
current_addr
<
end_addr
)
&&
(
current_addr
>
0
))
?
current_addr
:
0
;
temp[i]
=
*
(
__global
uchar4
*
)
&src[current_addr]
;
int
current_addr
=
mad24
(
i,
LSIZE0
<<
2
,
start_addr
)
;
current_addr
=
current_addr
<
end_addr
&&
current_addr
>
0
?
current_addr
:
0
;
temp[i]
=
*
(
__global
const
uchar4
*
)
&src[current_addr]
;
}
//
judge
if
read
out
of
boundary
#
ifdef
BORDER_ISOLATED
for
(
i
=
0
; i<READ_TIMES_ROW; i++
)
for
(
i
nt
i
=
0
; i < READ_TIMES_ROW; ++i
)
{
temp[i].x
=
ELEM
(
start_x+i*LSIZE0*4,
src_offset_x,
src_offset_x
+
src_cols,
0
,
temp[i].x
)
;
temp[i].y
=
ELEM
(
start_x+i*LSIZE0*4+1,
src_offset_x,
src_offset_x
+
src_cols,
0
,
temp[i].y
)
;
...
...
@@ -194,7 +173,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
temp[i]
=
ELEM
(
start_y,
src_offset_y,
src_offset_y
+
src_rows,
(
uchar4
)
0
,
temp[i]
)
;
}
#
else
for
(
i
=
0
; i<READ_TIMES_ROW; i++
)
for
(
i
nt
i
=
0
; i < READ_TIMES_ROW; ++i
)
{
temp[i].x
=
ELEM
(
start_x+i*LSIZE0*4,
0
,
src_whole_cols,
0
,
temp[i].x
)
;
temp[i].y
=
ELEM
(
start_x+i*LSIZE0*4+1,
0
,
src_whole_cols,
0
,
temp[i].y
)
;
...
...
@@ -209,16 +188,15 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
#else
int not_all_in_range = (start_x<0) |
(
start_x
+
READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols
)
| (start_y<0) |
(
start_y
>=
src_whole_rows
)
;
#
endif
int4
index[READ_TIMES_ROW]
;
int4
addr
;
int4
index[READ_TIMES_ROW],
addr
;
int
s_y
;
if
(
not_all_in_range
)
{
//
judge
if
read
out
of
boundary
for
(
i
=
0
; i < READ_TIMES_ROW; i++
)
for
(
i
nt
i
=
0
; i < READ_TIMES_ROW; ++i
)
{
index[i]
=
(
int4
)(
start_x+i*LSIZE0*4
)
+
(
int4
)(
0
,
1
,
2
,
3
)
;
index[i]
=
(
int4
)(
mad24
(
i,
LSIZE0
<<
2
,
start_x
)
)
+
(
int4
)(
0
,
1
,
2
,
3
)
;
#
ifdef
BORDER_ISOLATED
EXTRAPOLATE
(
index[i].x,
src_offset_x,
src_offset_x
+
src_cols
)
;
EXTRAPOLATE
(
index[i].y,
src_offset_x,
src_offset_x
+
src_cols
)
;
...
...
@@ -231,6 +209,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
EXTRAPOLATE
(
index[i].w,
0
,
src_whole_cols
)
;
#
endif
}
s_y
=
start_y
;
#
ifdef
BORDER_ISOLATED
EXTRAPOLATE
(
s_y,
src_offset_y,
src_offset_y
+
src_rows
)
;
...
...
@@ -239,9 +218,9 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
#
endif
//
read
pixels
from
src
for
(
i
=
0
; i<READ_TIMES_ROW; i++
)
for
(
i
nt
i
=
0
; i < READ_TIMES_ROW; ++i
)
{
addr
=
mad24
((
int4
)
s_y,
(
int4
)
src_step_in_pixel,
index[i]
)
;
addr
=
mad24
((
int4
)
s_y,
(
int4
)
src_step_in_pixel,
index[i]
)
;
temp[i].x
=
src[addr.x]
;
temp[i].y
=
src[addr.y]
;
temp[i].z
=
src[addr.z]
;
...
...
@@ -251,26 +230,26 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
else
{
//
read
pixels
from
src
for
(
i
=
0
; i<READ_TIMES_ROW; i++
)
temp[i]
=
*
(
__global
uchar4*
)
&src[
start_addr+i*LSIZE0*4
]
;
for
(
i
nt
i
=
0
; i < READ_TIMES_ROW; ++i
)
temp[i]
=
*
(
__global
uchar4*
)
&src[
mad24
(
i,
LSIZE0
<<
2
,
start_addr
)
]
;
}
#
endif
//BORDER_CONSTANT
//
save
pixels
to
lds
for
(
i
=
0
; i<READ_TIMES_ROW; i++
)
LDS_DAT[l_y][
l_x+i*LSIZE0]=
temp[i]
;
for
(
i
nt
i
=
0
; i < READ_TIMES_ROW; ++i
)
LDS_DAT[l_y][
mad24
(
i,
LSIZE0,
l_x
)
]
=
temp[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
read
pixels
from
lds
and
calculate
the
result
sum
=
convert_float4
(
vload4
(
0
,
(
__local
uchar*
)
&LDS_DAT[l_y][l_x]+RADIUSX+offset
))
*
mat_kernel[RADIUSX]
;
for
(
i
=1
; i<=RADIUSX; i++
)
sum
=
convert_float4
(
vload4
(
0
,
(
__local
uchar
*
)
&LDS_DAT[l_y][l_x]+RADIUSX+offset
))
*
mat_kernel[RADIUSX]
;
for
(
i
nt
i
=
1
; i <= RADIUSX; ++i
)
{
temp[0]
=
vload4
(
0
,
(
__local
uchar*
)
&LDS_DAT[l_y][l_x]
+
RADIUSX
+
offset
-
i
)
;
temp[1]
=
vload4
(
0
,
(
__local
uchar*
)
&LDS_DAT[l_y][l_x]
+
RADIUSX
+
offset
+
i
)
;
sum
+=
convert_float4
(
temp[0]
)
*
mat_kernel[RADIUSX-i]
+
convert_float4
(
temp[1]
)
*
mat_kernel[RADIUSX+i]
;
sum
+=
mad
(
convert_float4
(
temp[0]
)
,
mat_kernel[RADIUSX-i],
convert_float4
(
temp[1]
)
*
mat_kernel[RADIUSX
+
i]
)
;
}
start_addr
=
mad24
(
y,
dst_step_in_pixel,
x
)
;
start_addr
=
mad24
(
y,
dst_step_in_pixel,
x
)
;
//
write
the
result
to
dst
if
((
x+3<dst_cols
)
&
(
y<dst_rows
))
...
...
@@ -290,154 +269,58 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
dst[start_addr]
=
sum.x
;
}
__kernel
__attribute__
((
reqd_work_group_size
(
LSIZE0,LSIZE1,1
)))
void
row_filter_C4_D0
(
__global
uchar4
*
restrict
src,
int
src_step_in_pixel,
int
src_offset_x,
int
src_offset_y,
int
src_cols,
int
src_rows,
int
src_whole_cols,
int
src_whole_rows,
__global
float4
*
dst,
int
dst_step_in_pixel,
int
dst_cols,
int
dst_rows,
int
radiusy
)
__kernel
void
row_filter
(
__global
const
srcT
*
src,
int
src_step_in_pixel,
int
src_offset_x,
int
src_offset_y,
int
src_cols,
int
src_rows,
int
src_whole_cols,
int
src_whole_rows,
__global
dstT
*
dst,
int
dst_step_in_pixel,
int
dst_cols,
int
dst_rows,
int
radiusy
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
l_x
=
get_local_id
(
0
)
;
int
l_y
=
get_local_id
(
1
)
;
int
start_x
=
x
+src_offset_x-
RADIUSX
;
int
start_y
=
y
+src_offset_y-
radiusy
;
int
start_addr
=
mad24
(
start_y,
src_step_in_pixel,
start_x
)
;
int
i
;
float4
sum
;
uchar4
temp[READ_TIMES_ROW]
;
int
start_x
=
x
+
src_offset_x
-
RADIUSX
;
int
start_y
=
y
+
src_offset_y
-
radiusy
;
int
start_addr
=
mad24
(
start_y,
src_step_in_pixel,
start_x
)
;
dstT
sum
;
srcT
temp[READ_TIMES_ROW]
;
__local
uchar4
LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+
1]
;
__local
srcT
LDS_DAT[LSIZE1][READ_TIMES_ROW
*
LSIZE0
+
1]
;
#
ifdef
BORDER_CONSTANT
int
end_addr
=
mad24
(
src_whole_rows
-
1
,
src_step_in_pixel,
src_whole_cols
)
;
int
end_addr
=
mad24
(
src_whole_rows
-
1
,
src_step_in_pixel,
src_whole_cols
)
;
//
read
pixels
from
src
for
(
i
=
0
; i<
READ_TIMES_ROW; i++)
for
(
i
nt
i
=
0
; i <
READ_TIMES_ROW; i++)
{
int
current_addr
=
start_addr+i*LSIZE0
;
current_addr
=
((
current_addr
<
end_addr
)
&&
(
current_addr
>
0
))
?
current_addr
:
0
;
int
current_addr
=
mad24
(
i,
LSIZE0,
start_addr
)
;
current_addr
=
current_addr
<
end_addr
&&
current_addr
>
0
?
current_addr
:
0
;
temp[i]
=
src[current_addr]
;
}
//judge
if
read
out
of
boundary
//
judge
if
read
out
of
boundary
#
ifdef
BORDER_ISOLATED
for
(
i
=
0
; i<READ_TIMES_ROW; i++
)
for
(
i
nt
i
=
0
; i < READ_TIMES_ROW; ++i
)
{
temp[i]
=
ELEM
(
start_x+i*LSIZE0,
src_offset_x,
src_offset_x
+
src_cols,
(
uchar4
)
0
,
temp[i]
)
;
temp[i]
=
ELEM
(
start_y,
src_offset_y,
src_offset_y
+
src_rows,
(
uchar4
)
0
,
temp[i]
)
;
temp[i]
=
ELEM
(
mad24
(
i,
LSIZE0,
start_x
)
,
src_offset_x,
src_offset_x
+
src_cols,
(
srcT
)(
0
)
,
temp[i]
)
;
temp[i]
=
ELEM
(
start_y,
src_offset_y,
src_offset_y
+
src_rows,
(
srcT
)(
0
)
,
temp[i]
)
;
}
#
else
for
(
i
=
0
; i<READ_TIMES_ROW; i++
)
for
(
i
nt
i
=
0
; i < READ_TIMES_ROW; ++i
)
{
temp[i]
=
ELEM
(
start_x+i*LSIZE0,
0
,
src_whole_cols,
(
uchar4
)
0
,
temp[i]
)
;
temp[i]
=
ELEM
(
start_y,
0
,
src_whole_rows,
(
uchar4
)
0
,
temp[i]
)
;
temp[i]
=
ELEM
(
mad24
(
i,
LSIZE0,
start_x
)
,
0
,
src_whole_cols,
(
srcT
)(
0
)
,
temp[i]
)
;
temp[i]
=
ELEM
(
start_y,
0
,
src_whole_rows,
(
srcT
)(
0
)
,
temp[i]
)
;
}
#
endif
#
else
int
index[READ_TIMES_ROW]
;
int
s_x,s_y
;
int
s_x,
s_y
;
//
judge
if
read
out
of
boundary
for
(
i
=
0
; i<READ_TIMES_ROW; i++
)
for
(
i
nt
i
=
0
; i < READ_TIMES_ROW; ++i
)
{
s_x
=
start_x+i*LSIZE0
;
s_x
=
mad24
(
i,
LSIZE0,
start_x
)
;
s_y
=
start_y
;
#
ifdef
BORDER_ISOLATED
EXTRAPOLATE
(
s_x,
src_offset_x,
src_offset_x
+
src_cols
)
;
EXTRAPOLATE
(
s_y,
src_offset_y,
src_offset_y
+
src_rows
)
;
#
else
EXTRAPOLATE
(
s_x,
0
,
src_whole_cols
)
;
EXTRAPOLATE
(
s_y,
0
,
src_whole_rows
)
;
#
endif
index[i]=mad24
(
s_y,
src_step_in_pixel,
s_x
)
;
}
//read
pixels
from
src
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
temp[i]
=
src[index[i]]
;
#
endif
//BORDER_CONSTANT
//save
pixels
to
lds
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//read
pixels
from
lds
and
calculate
the
result
sum
=convert_float4
(
LDS_DAT[l_y][l_x+RADIUSX]
)
*mat_kernel[RADIUSX]
;
for
(
i=1
; i<=RADIUSX; i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i]
;
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i]
;
sum
+=
convert_float4
(
temp[0]
)
*mat_kernel[RADIUSX-i]+convert_float4
(
temp[1]
)
*mat_kernel[RADIUSX+i]
;
}
//write
the
result
to
dst
if
(
x<dst_cols
&&
y<dst_rows
)
{
start_addr
=
mad24
(
y,dst_step_in_pixel,x
)
;
dst[start_addr]
=
sum
;
}
}
__kernel
__attribute__
((
reqd_work_group_size
(
LSIZE0,LSIZE1,1
)))
void
row_filter_C1_D5
(
__global
float
*
restrict
src,
int
src_step_in_pixel,
int
src_offset_x,
int
src_offset_y,
int
src_cols,
int
src_rows,
int
src_whole_cols,
int
src_whole_rows,
__global
float
*
dst,
int
dst_step_in_pixel,
int
dst_cols,
int
dst_rows,
int
radiusy
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
l_x
=
get_local_id
(
0
)
;
int
l_y
=
get_local_id
(
1
)
;
int
start_x
=
x+src_offset_x-RADIUSX
;
int
start_y
=
y+src_offset_y-radiusy
;
int
start_addr
=
mad24
(
start_y,src_step_in_pixel,start_x
)
;
int
i
;
float
sum
;
float
temp[READ_TIMES_ROW]
;
__local
float
LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]
;
#
ifdef
BORDER_CONSTANT
int
end_addr
=
mad24
(
src_whole_rows
-
1
,
src_step_in_pixel,src_whole_cols
)
;
//
read
pixels
from
src
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
{
int
current_addr
=
start_addr+i*LSIZE0
;
current_addr
=
((
current_addr
<
end_addr
)
&&
(
current_addr
>
0
))
?
current_addr
:
0
;
temp[i]
=
src[current_addr]
;
}
//
judge
if
read
out
of
boundary
#
ifdef
BORDER_ISOLATED
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
{
temp[i]=
ELEM
(
start_x+i*LSIZE0,
src_offset_x,
src_offset_x
+
src_cols,
(
float
)
0
,
temp[i]
)
;
temp[i]=
ELEM
(
start_y,
src_offset_y,
src_offset_y
+
src_rows,
(
float
)
0
,
temp[i]
)
;
}
#
else
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
{
temp[i]=
ELEM
(
start_x+i*LSIZE0,
0
,
src_whole_cols,
(
float
)
0
,
temp[i]
)
;
temp[i]=
ELEM
(
start_y,
0
,
src_whole_rows,
(
float
)
0
,
temp[i]
)
;
}
#
endif
#
else
//
BORDER_CONSTANT
int
index[READ_TIMES_ROW]
;
int
s_x,s_y
;
//
judge
if
read
out
of
boundary
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
{
s_x
=
start_x
+
i*LSIZE0,
s_y
=
start_y
;
#
ifdef
BORDER_ISOLATED
EXTRAPOLATE
(
s_x,
src_offset_x,
src_offset_x
+
src_cols
)
;
EXTRAPOLATE
(
s_y,
src_offset_y,
src_offset_y
+
src_rows
)
;
...
...
@@ -445,125 +328,31 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
EXTRAPOLATE
(
s_x,
0
,
src_whole_cols
)
;
EXTRAPOLATE
(
s_y,
0
,
src_whole_rows
)
;
#
endif
index[i]=mad24
(
s_y,
src_step_in_pixel,
s_x
)
;
}
//
read
pixels
from
src
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
temp[i]
=
src[index[i]]
;
#
endif//
BORDER_CONSTANT
//save
pixels
to
lds
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
read
pixels
from
lds
and
calculate
the
result
sum
=LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX]
;
for
(
i=1
; i<=RADIUSX; i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i]
;
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i]
;
sum
+=
temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i]
;
}
//
write
the
result
to
dst
if
(
x<dst_cols
&&
y<dst_rows
)
{
start_addr
=
mad24
(
y,dst_step_in_pixel,x
)
;
dst[start_addr]
=
sum
;
index[i]
=
mad24
(
s_y,
src_step_in_pixel,
s_x
)
;
}
}
__kernel
__attribute__
((
reqd_work_group_size
(
LSIZE0,LSIZE1,1
)))
void
row_filter_C4_D5
(
__global
float4
*
restrict
src,
int
src_step_in_pixel,
int
src_offset_x,
int
src_offset_y,
int
src_cols,
int
src_rows,
int
src_whole_cols,
int
src_whole_rows,
__global
float4
*
dst,
int
dst_step_in_pixel,
int
dst_cols,
int
dst_rows,
int
radiusy
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
l_x
=
get_local_id
(
0
)
;
int
l_y
=
get_local_id
(
1
)
;
int
start_x
=
x+src_offset_x-RADIUSX
;
int
start_y
=
y+src_offset_y-radiusy
;
int
start_addr
=
mad24
(
start_y,src_step_in_pixel,start_x
)
;
int
i
;
float4
sum
;
float4
temp[READ_TIMES_ROW]
;
__local
float4
LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]
;
#
ifdef
BORDER_CONSTANT
int
end_addr
=
mad24
(
src_whole_rows
-
1
,
src_step_in_pixel,src_whole_cols
)
;
//
read
pixels
from
src
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
{
int
current_addr
=
start_addr+i*LSIZE0
;
current_addr
=
((
current_addr
<
end_addr
)
&&
(
current_addr
>
0
))
?
current_addr
:
0
;
temp[i]
=
src[current_addr]
;
}
//
judge
if
read
out
of
boundary
#
ifdef
BORDER_ISOLATED
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
{
temp[i]=
ELEM
(
start_x+i*LSIZE0,
src_offset_x,
src_offset_x
+
src_cols,
(
float4
)
0
,
temp[i]
)
;
temp[i]=
ELEM
(
start_y,
src_offset_y,
src_offset_y
+
src_rows,
(
float4
)
0
,
temp[i]
)
;
}
#
else
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
{
temp[i]=
ELEM
(
start_x+i*LSIZE0,
0
,
src_whole_cols,
(
float4
)
0
,
temp[i]
)
;
temp[i]=
ELEM
(
start_y,
0
,
src_whole_rows,
(
float4
)
0
,
temp[i]
)
;
}
#
endif
#
else
int
index[READ_TIMES_ROW]
;
int
s_x,s_y
;
//
judge
if
read
out
of
boundary
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
{
s_x
=
start_x
+
i*LSIZE0,
s_y
=
start_y
;
#
ifdef
BORDER_ISOLATED
EXTRAPOLATE
(
s_x,
src_offset_x,
src_offset_x
+
src_cols
)
;
EXTRAPOLATE
(
s_y,
src_offset_y,
src_offset_y
+
src_rows
)
;
#
else
EXTRAPOLATE
(
s_x,
0
,
src_whole_cols
)
;
EXTRAPOLATE
(
s_y,
0
,
src_whole_rows
)
;
#
endif
index[i]=mad24
(
s_y,src_step_in_pixel,s_x
)
;
}
//
read
pixels
from
src
for
(
i
=
0
; i<READ_TIMES_ROW; i++)
for
(
int
i
=
0
; i < READ_TIMES_ROW; ++i)
temp[i]
=
src[index[i]]
;
#
endif
#
endif
//
BORDER_CONSTANT
//
save
pixels
to
lds
for
(
i
=
0
; i<READ_TIMES_ROW; i++
)
LDS_DAT[l_y][
l_x+i*LSIZE0]=
temp[i]
;
for
(
i
nt
i
=
0
; i < READ_TIMES_ROW; ++i
)
LDS_DAT[l_y][
mad24
(
i,
LSIZE0,
l_x
)
]
=
temp[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
read
pixels
from
lds
and
calculate
the
result
sum
=
LDS_DAT[l_y][l_x+RADIUSX]*
mat_kernel[RADIUSX]
;
for
(
i
=1
; i<=RADIUSX; i++
)
sum
=
convertToDstT
(
LDS_DAT[l_y][l_x
+
RADIUSX]
)
*
mat_kernel[RADIUSX]
;
for
(
i
nt
i
=
1
; i <= RADIUSX; ++i
)
{
temp[0]
=LDS_DAT[l_y][l_x+RADIUSX-
i]
;
temp[1]
=LDS_DAT[l_y][l_x+RADIUSX+
i]
;
sum
+=
temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i]
;
temp[0]
=
LDS_DAT[l_y][l_x
+
RADIUSX
-
i]
;
temp[1]
=
LDS_DAT[l_y][l_x
+
RADIUSX
+
i]
;
sum
+=
mad
(
convertToDstT
(
temp[0]
)
,
mat_kernel[RADIUSX
-
i],
convertToDstT
(
temp[1]
)
*
mat_kernel[RADIUSX
+
i]
)
;
}
//
write
the
result
to
dst
if
(
x
<dst_cols
&&
y<
dst_rows
)
if
(
x
<
dst_cols
&&
y
<
dst_rows
)
{
start_addr
=
mad24
(
y,
dst_step_in_pixel,
x
)
;
start_addr
=
mad24
(
y,
dst_step_in_pixel,
x
)
;
dst[start_addr]
=
sum
;
}
}
modules/imgproc/src/opencl/filterSep_singlePass.cl
View file @
b449b0bf
...
...
@@ -75,6 +75,7 @@
#endif
#define SRC(_x,_y) convertToWT(((global srcT*)(Src+(_y)*src_step))[_x])
#define DST(_x,_y) (((global dstT*)(Dst+dst_offset+(_y)*dst_step))[_x])
#ifdef BORDER_CONSTANT
// CCCCCC|abcdefgh|CCCCCCC
...
...
@@ -83,8 +84,6 @@
#
define
ELEM
(
_x,_y,r_edge,t_edge,const_v
)
SRC
((
_x
)
,
(
_y
))
#
endif
#
define
DST
(
_x,_y
)
(((
global
dstT*
)(
Dst+dst_offset+
(
_y
)
*dst_step
))
[_x]
)
#
define
noconvert
//
horizontal
and
vertical
filter
kernels
...
...
@@ -101,15 +100,15 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
//
all
these
should
be
defined
on
host
during
compile
time
//
first
lsmem
array
for
source
pixels
used
in
first
pass,
//
second
lsmemDy
for
storing
first
pass
results
__local
WT
lsmem[BLK_Y
+2*RADIUSY][BLK_X+2*
RADIUSX]
;
__local
WT
lsmemDy[BLK_Y][BLK_X
+2*
RADIUSX]
;
__local
WT
lsmem[BLK_Y
+
2
*
RADIUSY][BLK_X
+
2
*
RADIUSX]
;
__local
WT
lsmemDy[BLK_Y][BLK_X
+
2
*
RADIUSX]
;
//
get
local
and
global
ids
-
used
as
image
and
local
memory
array
indexes
int
lix
=
get_local_id
(
0
)
;
int
liy
=
get_local_id
(
1
)
;
int
x
=
(
int
)
get_global_id
(
0
)
;
int
y
=
(
int
)
get_global_id
(
1
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
//
calculate
pixel
position
in
source
image
taking
image
offset
into
account
int
srcX
=
x
+
srcOffsetX
-
RADIUSX
;
...
...
modules/imgproc/test/ocl/test_sepfilter2D.cpp
View file @
b449b0bf
...
...
@@ -79,12 +79,14 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
ksize
.
width
++
;
if
(
1
!=
(
ksize
.
height
%
2
))
ksize
.
height
++
;
Mat
temp
=
randomMat
(
Size
(
ksize
.
width
,
1
),
CV_MAKE_TYPE
(
CV_32F
,
1
),
-
MAX_VALUE
,
MAX_VALUE
);
cv
::
normalize
(
temp
,
kernelX
,
1.0
,
0.0
,
NORM_L1
);
temp
=
randomMat
(
Size
(
1
,
ksize
.
height
),
CV_MAKE_TYPE
(
CV_32F
,
1
),
-
MAX_VALUE
,
MAX_VALUE
);
cv
::
normalize
(
temp
,
kernelY
,
1.0
,
0.0
,
NORM_L1
);
Size
roiSize
=
randomSize
(
ksize
.
width
,
MAX_VALUE
,
ksize
.
height
,
MAX_VALUE
);
Size
roiSize
=
randomSize
(
ksize
.
width
+
16
,
MAX_VALUE
,
ksize
.
height
+
20
,
MAX_VALUE
);
std
::
cout
<<
roiSize
<<
std
::
endl
;
int
rest
=
roiSize
.
width
%
4
;
if
(
0
!=
rest
)
roiSize
.
width
+=
(
4
-
rest
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment