Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
dec6a3b0
Commit
dec6a3b0
authored
Apr 15, 2013
by
yao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
make boxfilter kernel compile on Mac GPU OCL
parent
abe2ea59
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
184 additions
and
183 deletions
+184
-183
filtering_boxFilter.cl
modules/ocl/src/opencl/filtering_boxFilter.cl
+184
-183
No files found.
modules/ocl/src/opencl/filtering_boxFilter.cl
View file @
dec6a3b0
...
...
@@ -79,15 +79,73 @@
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
i
)
-
(
b_edge
)
:
(
addr
))
#
endif
#
define
THREADS
256
#
define
ELEM
(
i,
l_edge,
r_edge,
elem1,
elem2
)
(
i
)
>=
(
l_edge
)
&&
(
i
)
<
(
r_edge
)
?
(
elem1
)
:
(
elem2
)
inline
void
update_dst_C1_D0
(
__global
uchar
*dst,
__local
uint*
temp,
int
dst_rows,
int
dst_cols,
int
dst_startX,
int
dst_x_off,
float
alpha
)
{
if
(
get_local_id
(
0
)
<
anX
|
| get_local_id(0) >= (THREADS-ksX+anX+1))
{
return;
}
uint4 tmp_sum = 0;
int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4;
int posY = (get_group_id(1) << 1);
for(int i=-anX; i<=anX; i++)
{
tmp_sum += vload4(get_local_id(0), temp+i);
}
if(posY < dst_rows && posX < dst_cols)
{
tmp_sum /= (uint4) alpha;
if(posX >= 0 && posX < dst_cols)
*(dst) = tmp_sum.x;
if(posX+1 >= 0 && posX+1 < dst_cols)
*(dst + 1) = tmp_sum.y;
if(posX+2 >= 0 && posX+2 < dst_cols)
*(dst + 2) = tmp_sum.z;
if(posX+3 >= 0 && posX+3 < dst_cols)
*(dst + 3) = tmp_sum.w;
}
}
inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp,
int dst_rows, int dst_cols,
int dst_startX, int dst_x_off,
float alpha)
{
if(get_local_id(0) >= (THREADS-ksX+1))
{
return;
}
int posX = dst_startX - dst_x_off + get_local_id(0);
int posY = (get_group_id(1) << 1);
uint4 temp_sum = 0;
for(int i=-anX; i<=anX; i++)
{
temp_sum += temp[get_local_id(0) + anX + i];
}
if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows)
*dst = convert_uchar4(convert_float4(temp_sum)/alpha);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#
define
THREADS
256
#
define
ELEM
(
i,
l_edge,
r_edge,
elem1,
elem2
)
(
i
)
>=
(
l_edge
)
&&
(
i
)
<
(
r_edge
)
?
(
elem1
)
:
(
elem2
)
__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
)
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
...
...
@@ -105,115 +163,84 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
int dst_startY = (gY << 1) + dst_y_off;
uint4 data[ksY+1];
__local
uint4
temp[
(
THREADS<<1
)
]
;
__local uint4 temp[
2][THREADS
];
#ifdef BORDER_CONSTANT
for
(
int
i=0
; i < ksY+1; i++)
for(int i=0; i < ksY+1; i++)
{
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
{
if
(
startY+i
>=0
&&
startY+i
<
src_whole_rows
&&
startX+col*4
>=0
&&
startX+col*4+3<src_whole_cols
)
data[i]
=
convert_uint4
(
vload4
(
col,
(
__global
uchar*
)(
src+
(
startY+i
)
*src_step
+
startX
)))
;
else
{
data[i]=0
;
int
con
=
startY+i
>=0
&&
startY+i
<
src_whole_rows
&&
startX+col*4
>=0
&&
startX+col*4<src_whole_cols
;
if
(
con
)
data[i].s0
=
*
(
src+
(
startY+i
)
*src_step
+
startX
+
col*4
)
;
con
=
startY+i
>=0
&&
startY+i
<
src_whole_rows
&&
startX+col*4+1
>=0
&&
startX+col*4+1<src_whole_cols
;
if
(
con
)
data[i].s1
=
*
(
src+
(
startY+i
)
*src_step
+
startX
+
col*4+1
)
;
con
=
startY+i
>=0
&&
startY+i
<
src_whole_rows
&&
startX+col*4+2
>=0
&&
startX+col*4+2<src_whole_cols
;
if
(
con
)
data[i].s2
=
*
(
src+
(
startY+i
)
*src_step
+
startX
+
col*4+2
)
;
con
=
startY+i
>=0
&&
startY+i
<
src_whole_rows
&&
startX+col*4+3
>=0
&&
startX+col*4+3<src_whole_cols
;
if
(
con
)
data[i].s3
=
*
(
src+
(
startY+i
)
*src_step
+
startX
+
col*4+3
)
;
}
data[i].x = *(src+(startY+i)*src_step + startX + col * 4);
data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1);
data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2);
data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3);
}
else
{
data[i]=0;
int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
}
}
#else
int
not_all_in_range
;
for
(
int
i=0
; i < ksY+1; i++)
{
not_all_in_range
=
(
startX+col*4<0
)
| (startX+col*4+3>src_whole_cols-1)
|
(
startY+i<0
)
|
(
startY+i>src_whole_rows-1
)
;
if
(
not_all_in_range
)
{
int
selected_row
;
int4
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
selected_col.x
=
ADDR_L
(
startX+col*4,
0
,
src_whole_cols
)
;
selected_col.x
=
ADDR_R
(
startX+col*4,
src_whole_cols,
selected_col.x
)
;
selected_col.y
=
ADDR_L
(
startX+col*4+1,
0
,
src_whole_cols
)
;
selected_col.y
=
ADDR_R
(
startX+col*4+1,
src_whole_cols,
selected_col.y
)
;
selected_col.z
=
ADDR_L
(
startX+col*4+2,
0
,
src_whole_cols
)
;
selected_col.z
=
ADDR_R
(
startX+col*4+2,
src_whole_cols,
selected_col.z
)
;
selected_col.w
=
ADDR_L
(
startX+col*4+3,
0
,
src_whole_cols
)
;
selected_col.w
=
ADDR_R
(
startX+col*4+3,
src_whole_cols,
selected_col.w
)
;
data[i].x
=
*
(
src
+
selected_row
*
src_step
+
selected_col.x
)
;
data[i].y
=
*
(
src
+
selected_row
*
src_step
+
selected_col.y
)
;
data[i].z
=
*
(
src
+
selected_row
*
src_step
+
selected_col.z
)
;
data[i].w
=
*
(
src
+
selected_row
*
src_step
+
selected_col.w
)
;
}
else
{
data[i]
=
convert_uint4
(
vload4
(
col,
(
__global
uchar*
)(
src+
(
startY+i
)
*src_step
+
startX
)))
;
}
}
#
endif
uint4
sum0
=
0
,
sum1
=
0
,
sum2
=
0
;
for
(
int
i=1
; i < ksY; i++)
int not_all_in_range;
for(int i=0; i < ksY+1; i++)
{
sum0
+=
(
data[i]
)
;
}
sum1
=
sum0
+
(
data[0]
)
;
sum2
=
sum0
+
(
data[ksY]
)
;
not_all_in_range = (startX+col*4<0) |
(
startX+col*4+3>src_whole_cols-1
)
| (startY+i<0) |
(
startY+i>src_whole_rows-1
)
;
if
(
not_all_in_range
)
{
int
selected_row
;
int4
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
temp[col]
=
sum1
;
temp[col+THREADS]
=
sum2
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
selected_col.x
=
ADDR_L
(
startX+col*4,
0
,
src_whole_cols
)
;
selected_col.x
=
ADDR_R
(
startX+col*4,
src_whole_cols,
selected_col.x
)
;
if
(
col
>=
anX
&&
col
<
(
THREADS-ksX+anX+1
))
{
int
posX
=
dst_startX
-
dst_x_off
+
(
col-anX
)
*4
;
int
posY
=
(
gY
<<
1
)
;
uint4
tmp_sum1=0,
tmp_sum2=0
;
for
(
int
i=-anX
; i<=anX; i++)
{
tmp_sum1
+=
vload4
(
col,
(
__local
uint*
)
temp+i
)
;
}
selected_col.y
=
ADDR_L
(
startX+col*4+1,
0
,
src_whole_cols
)
;
selected_col.y
=
ADDR_R
(
startX+col*4+1,
src_whole_cols,
selected_col.y
)
;
for
(
int
i=-anX
; i<=anX; i++)
{
tmp_sum2
+=
vload4
(
col,
(
__local
uint*
)(
temp+THREADS
)
+i
)
;
}
selected_col.z
=
ADDR_L
(
startX+col*4+2,
0
,
src_whole_cols
)
;
selected_col.z
=
ADDR_R
(
startX+col*4+2,
src_whole_cols,
selected_col.z
)
;
if
(
posY
<
dst_rows
&&
posX
<
dst_cols
)
{
if
(
posX
>=
0
&&
posX
<
dst_cols
)
*
(
dst+dst_startY
*
dst_step
+
dst_startX
+
(
col-anX
)
*4
)
=
tmp_sum1.x/alpha
;
if
(
posX+1
>=
0
&&
posX+1
<
dst_cols
)
*
(
dst+dst_startY
*
dst_step
+
dst_startX+1
+
(
col-anX
)
*4
)
=
tmp_sum1.y/alpha
;
if
(
posX+2
>=
0
&&
posX+2
<
dst_cols
)
*
(
dst+dst_startY
*
dst_step
+
dst_startX+2
+
(
col-anX
)
*4
)
=
tmp_sum1.z/alpha
;
if
(
posX+3
>=
0
&&
posX+3
<
dst_cols
)
*
(
dst+dst_startY
*
dst_step
+
dst_startX+3
+
(
col-anX
)
*4
)
=
tmp_sum1.w/alpha
;
selected_col.w
=
ADDR_L
(
startX+col*4+3,
0
,
src_whole_cols
)
;
selected_col.w
=
ADDR_R
(
startX+col*4+3,
src_whole_cols,
selected_col.w
)
;
data[i].x
=
*
(
src
+
selected_row
*
src_step
+
selected_col.x
)
;
data[i].y
=
*
(
src
+
selected_row
*
src_step
+
selected_col.y
)
;
data[i].z
=
*
(
src
+
selected_row
*
src_step
+
selected_col.z
)
;
data[i].w
=
*
(
src
+
selected_row
*
src_step
+
selected_col.w
)
;
}
if
(
posY+1
<
dst_rows
&&
posX
<
dst_cols
)
else
{
dst_startY+=1
;
if
(
posX
>=
0
&&
posX
<
dst_cols
)
*
(
dst+dst_startY
*
dst_step
+
dst_startX
+
(
col-anX
)
*4
)
=
tmp_sum2.x/alpha
;
if
(
posX+1
>=
0
&&
posX+1
<
dst_cols
)
*
(
dst+dst_startY
*
dst_step
+
dst_startX+1
+
(
col-anX
)
*4
)
=
tmp_sum2.y/alpha
;
if
(
posX+2
>=
0
&&
posX+2
<
dst_cols
)
*
(
dst+dst_startY
*
dst_step
+
dst_startX+2
+
(
col-anX
)
*4
)
=
tmp_sum2.z/alpha
;
if
(
posX+3
>=
0
&&
posX+3
<
dst_cols
)
*
(
dst+dst_startY
*
dst_step
+
dst_startX+3
+
(
col-anX
)
*4
)
=
tmp_sum2.w/alpha
;
data[i]
=
convert_uint4
(
vload4
(
col,
(
__global
uchar*
)(
src+
(
startY+i
)
*src_step
+
startX
)))
;
}
}
#
endif
uint4
tmp_sum
=
0
;
for
(
int
i=1
; i < ksY; i++)
{
tmp_sum
+=
(
data[i]
)
;
}
int
index
=
dst_startY
*
dst_step
+
dst_startX
+
(
col-anX
)
*4
;
temp[0][col]
=
tmp_sum
+
(
data[0]
)
;
temp[1][col]
=
tmp_sum
+
(
data[ksY]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
update_dst_C1_D0
(
dst+index,
(
__local
uint
*
)(
temp[0]
)
,
dst_rows,
dst_cols,
dst_startX,
dst_x_off,
alpha
)
;
update_dst_C1_D0
(
dst+index+dst_step,
(
__local
uint
*
)(
temp[1]
)
,
dst_rows,
dst_cols,
dst_startX,
dst_x_off,
alpha
)
;
}
...
...
@@ -221,9 +248,9 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
boxFilter_C4_D0
(
__global
const
uchar4
*
restrict
src,
__global
uchar4
*dst,
float
alpha,
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
)
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
)
{
int
col
=
get_local_id
(
0
)
;
const
int
gX
=
get_group_id
(
0
)
;
...
...
@@ -238,81 +265,63 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
int
startY
=
(
gY
<<
1
)
-
anY
+
src_y_off
;
int
dst_startX
=
gX
*
(
THREADS-ksX+1
)
+
dst_x_off
;
int
dst_startY
=
(
gY
<<
1
)
+
dst_y_off
;
//int
end_addr
=
(
src_whole_rows-1
)
*
(
src_step>>2
)
+
src_whole_cols-4
;
int
end_addr
=
src_whole_cols-4
;
uint4
data[ksY+1]
;
__local
uint4
temp[2][THREADS]
;
#
ifdef
BORDER_CONSTANT
bool
con
;
uint4
ss
;
for
(
int
i=0
; i < ksY+1; i++)
{
con
=
startX+col
>=
0
&&
startX+col
<
src_whole_cols
&&
startY+i
>=
0
&&
startY+i
<
src_whole_rows
;
//int
cur_addr
=
clamp
((
startY+i
)
*
(
src_step>>2
)
+
(
startX+col
)
,
0
,
end_addr
)
;
//ss
=
convert_uint4
(
src[cur_addr]
)
;
int
cur_col
=
clamp
(
startX
+
col,
0
,
src_whole_cols
)
;
if
(
con
)
ss
=
convert_uint4
(
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col]
)
;
data[i]
=
con
?
ss
:
0
;
data[i].x
=
con
?
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col].x
:
0
;
data[i].y
=
con
?
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col].y
:
0
;
data[i].z
=
con
?
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col].z
:
0
;
data[i].w
=
con
?
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col].w
:
0
;
}
#
else
for
(
int
i=0
; i < ksY+1; i++)
{
int
selected_row
;
int
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
for
(
int
i=0
; i < ksY+1; i++)
{
int
selected_row
;
int
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
selected_col
=
ADDR_L
(
startX+col,
0
,
src_whole_cols
)
;
selected_col
=
ADDR_R
(
startX+col,
src_whole_cols,
selected_col
)
;
selected_col
=
ADDR_L
(
startX+col,
0
,
src_whole_cols
)
;
selected_col
=
ADDR_R
(
startX+col,
src_whole_cols,
selected_col
)
;
data[i]
=
convert_uint4
(
src[selected_row
*
(
src_step>>2
)
+
selected_col]
)
;
}
data[i]
=
convert_uint4
(
src[selected_row
*
(
src_step>>2
)
+
selected_col]
)
;
}
#
endif
uint4
sum0
=
0
,
sum1
=
0
,
sum2
=
0
;
uint4
tmp_sum
=
0
;
for
(
int
i=1
; i < ksY; i++)
{
sum0
+=
(
data[i]
)
;
tmp_sum
+=
(
data[i]
)
;
}
sum1
=
sum0
+
(
data[0]
)
;
sum2
=
sum0
+
(
data[ksY]
)
;
temp[0][col]
=
sum1
;
temp[1][col]
=
sum2
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
col
<
(
THREADS-
(
ksX-1
)))
{
col
+=
anX
;
int
posX
=
dst_startX
-
dst_x_off
+
col
-
anX
;
int
posY
=
(
gY
<<
1
)
;
uint4
tmp_sum[2]={
(
uint4
)(
0
,
0
,
0
,
0
)
,
(
uint4
)(
0
,
0
,
0
,
0
)
}
;
for
(
int
k=0
; k<2; k++)
for
(
int
i=-anX
; i<=anX; i++)
{
tmp_sum[k]
+=
temp[k][col+i]
;
}
for
(
int
i=0
; i<2; i++)
{
if
(
posX
>=
0
&&
posX
<
dst_cols
&&
(
posY+i
)
>=
0
&&
(
posY+i
)
<
dst_rows
)
dst[
(
dst_startY+i
)
*
(
dst_step>>2
)
+
dst_startX
+
col
-
anX]
=
convert_uchar4
(
convert_float4
(
tmp_sum[i]
)
/alpha
)
;
}
int
index
=
dst_startY
*
(
dst_step>>2
)
+
dst_startX
+
col
;
temp[0][col]
=
tmp_sum
+
(
data[0]
)
;
temp[1][col]
=
tmp_sum
+
(
data[ksY]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
update_dst_C4_D0
(
dst+index,
(
__local
uint4
*
)(
temp[0]
)
,
dst_rows,
dst_cols,
dst_startX,
dst_x_off,
alpha
)
;
update_dst_C4_D0
(
dst+index+
(
dst_step>>2
)
,
(
__local
uint4
*
)(
temp[1]
)
,
dst_rows,
dst_cols,
dst_startX,
dst_x_off,
alpha
)
;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32fC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
boxFilter_C1_D5
(
__global
const
float
*restrict
src,
__global
float
*dst,
float
alpha,
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
)
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
)
{
int
col
=
get_local_id
(
0
)
;
const
int
gX
=
get_group_id
(
0
)
;
...
...
@@ -327,7 +336,6 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
int
startY
=
(
gY
<<
1
)
-
anY
+
src_y_off
;
int
dst_startX
=
gX
*
(
THREADS-ksX+1
)
+
dst_x_off
;
int
dst_startY
=
(
gY
<<
1
)
+
dst_y_off
;
int
end_addr
=
(
src_whole_rows-1
)
*
(
src_step>>2
)
+
src_whole_cols-4
;
float
data[ksY+1]
;
__local
float
temp[2][THREADS]
;
#
ifdef
BORDER_CONSTANT
...
...
@@ -336,28 +344,25 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
for
(
int
i=0
; i < ksY+1; i++)
{
con
=
startX+col
>=
0
&&
startX+col
<
src_whole_cols
&&
startY+i
>=
0
&&
startY+i
<
src_whole_rows
;
//int
cur_addr
=
clamp
((
startY+i
)
*
(
src_step>>2
)
+
(
startX+col
)
,
0
,
end_addr
)
;
//ss
=
src[cur_addr]
;
int
cur_col
=
clamp
(
startX
+
col,
0
,
src_whole_cols
)
;
//ss
=
src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col]
;
ss
=
(
startY+i
)
<src_whole_rows&&
(
startY+i
)
>=0&&cur_col>=0&&cur_col<src_whole_cols?src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col]:0
;
ss
=
(
startY+i
)
<src_whole_rows&&
(
startY+i
)
>=0&&cur_col>=0&&cur_col<src_whole_cols?src[
(
startY+i
)
*
(
src_step>>2
)
+
cur_col]:
(
float
)
0
;
data[i]
=
con
?
ss
:
0.f
;
}
#
else
for
(
int
i=0
; i < ksY+1; i++)
{
int
selected_row
;
int
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
for
(
int
i=0
; i < ksY+1; i++)
{
int
selected_row
;
int
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
selected_col
=
ADDR_L
(
startX+col,
0
,
src_whole_cols
)
;
selected_col
=
ADDR_R
(
startX+col,
src_whole_cols,
selected_col
)
;
selected_col
=
ADDR_L
(
startX+col,
0
,
src_whole_cols
)
;
selected_col
=
ADDR_R
(
startX+col,
src_whole_cols,
selected_col
)
;
data[i]
=
src[selected_row
*
(
src_step>>2
)
+
selected_col]
;
}
data[i]
=
src[selected_row
*
(
src_step>>2
)
+
selected_col]
;
}
#
endif
float
sum0
=
0.0
,
sum1
=
0.0
,
sum2
=
0.0
;
...
...
@@ -376,7 +381,7 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
int
posX
=
dst_startX
-
dst_x_off
+
col
-
anX
;
int
posY
=
(
gY
<<
1
)
;
float
tmp_sum[2]={0.0,
0.0}
;
float
tmp_sum[2]=
{0.0,
0.0}
;
for
(
int
k=0
; k<2; k++)
for
(
int
i=-anX
; i<=anX; i++)
{
...
...
@@ -395,9 +400,9 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
/////////////////////////////////////////32fC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
boxFilter_C4_D5
(
__global
const
float4
*restrict
src,
__global
float4
*dst,
float
alpha,
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
)
int
src_offset,
int
src_whole_rows,
int
src_whole_cols,
int
src_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
int
dst_step
)
{
int
col
=
get_local_id
(
0
)
;
const
int
gX
=
get_group_id
(
0
)
;
...
...
@@ -412,7 +417,6 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
int
startY
=
(
gY
<<
1
)
-
anY
+
src_y_off
;
int
dst_startX
=
gX
*
(
THREADS-ksX+1
)
+
dst_x_off
;
int
dst_startY
=
(
gY
<<
1
)
+
dst_y_off
;
int
end_addr
=
(
src_whole_rows-1
)
*
(
src_step>>4
)
+
src_whole_cols-16
;
float4
data[ksY+1]
;
__local
float4
temp[2][THREADS]
;
#
ifdef
BORDER_CONSTANT
...
...
@@ -421,28 +425,25 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
for
(
int
i=0
; i < ksY+1; i++)
{
con
=
startX+col
>=
0
&&
startX+col
<
src_whole_cols
&&
startY+i
>=
0
&&
startY+i
<
src_whole_rows
;
//int
cur_addr
=
clamp
((
startY+i
)
*
(
src_step>>4
)
+
(
startX+col
)
,
0
,
end_addr
)
;
//ss
=
src[cur_addr]
;
int
cur_col
=
clamp
(
startX
+
col,
0
,
src_whole_cols
)
;
//ss
=
src[
(
startY+i
)
*
(
src_step>>4
)
+
cur_col]
;
ss
=
(
startY+i
)
<src_whole_rows&&
(
startY+i
)
>=0&&cur_col>=0&&cur_col<src_whole_cols?src[
(
startY+i
)
*
(
src_step>>4
)
+
cur_col]:0
;
ss
=
(
startY+i
)
<src_whole_rows&&
(
startY+i
)
>=0&&cur_col>=0&&cur_col<src_whole_cols?src[
(
startY+i
)
*
(
src_step>>4
)
+
cur_col]:
(
float4
)
0
;
data[i]
=
con
?
ss
:
(
float4
)(
0.0
,
0.0
,
0.0
,
0.0
)
;
}
#
else
for
(
int
i=0
; i < ksY+1; i++)
{
int
selected_row
;
int
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
for
(
int
i=0
; i < ksY+1; i++)
{
int
selected_row
;
int
selected_col
;
selected_row
=
ADDR_H
(
startY+i,
0
,
src_whole_rows
)
;
selected_row
=
ADDR_B
(
startY+i,
src_whole_rows,
selected_row
)
;
selected_col
=
ADDR_L
(
startX+col,
0
,
src_whole_cols
)
;
selected_col
=
ADDR_R
(
startX+col,
src_whole_cols,
selected_col
)
;
selected_col
=
ADDR_L
(
startX+col,
0
,
src_whole_cols
)
;
selected_col
=
ADDR_R
(
startX+col,
src_whole_cols,
selected_col
)
;
data[i]
=
src[selected_row
*
(
src_step>>4
)
+
selected_col]
;
}
data[i]
=
src[selected_row
*
(
src_step>>4
)
+
selected_col]
;
}
#
endif
float4
sum0
=
0.0
,
sum1
=
0.0
,
sum2
=
0.0
;
...
...
@@ -461,7 +462,7 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
int
posX
=
dst_startX
-
dst_x_off
+
col
-
anX
;
int
posY
=
(
gY
<<
1
)
;
float4
tmp_sum[2]={
(
float4
)(
0.0
,
0.0
,
0.0
,
0.0
)
,
(
float4
)(
0.0
,
0.0
,
0.0
,
0.0
)
}
;
float4
tmp_sum[2]=
{
(
float4
)(
0.0
,
0.0
,
0.0
,
0.0
)
,
(
float4
)(
0.0
,
0.0
,
0.0
,
0.0
)
}
;
for
(
int
k=0
; k<2; k++)
for
(
int
i=-anX
; i<=anX; i++)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment