Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
310bd870
Commit
310bd870
authored
Sep 18, 2012
by
yao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
a little optimizations to ocl::pyrDown
parent
b1d14f76
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
400 additions
and
473 deletions
+400
-473
pyr_down.cl
modules/ocl/src/kernels/pyr_down.cl
+400
-473
No files found.
modules/ocl/src/kernels/pyr_down.cl
View file @
310bd870
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//
IMPORTANT:
READ
BEFORE
DOWNLOADING,
COPYING,
INSTALLING
OR
USING.
//
//
By
downloading,
copying,
installing
or
using
the
software
you
agree
to
this
license.
//
If
you
do
not
agree
to
this
license,
do
not
download,
install,
//
copy
or
use
the
software.
//
//
//
License
Agreement
//
For
Open
Source
Computer
Vision
Library
//
//
Copyright
(
C
)
2010-2012,
Multicoreware,
Inc.,
all
rights
reserved.
//
Copyright
(
C
)
2010-2012,
Advanced
Micro
Devices,
Inc.,
all
rights
reserved.
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
//
@Authors
//
Dachuan
Zhao,
dachuan@multicorewareinc.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
//
//
*
Redistribution
's
of
source
code
must
retain
the
above
copyright
notice,
//
this
list
of
conditions
and
the
following
disclaimer.
//
//
*
Redistribution
's
in
binary
form
must
reproduce
the
above
copyright
notice,
//
this
list
of
conditions
and
the
following
disclaimer
in
the
documentation
//
and/or
other
oclMaterials
provided
with
the
distribution.
//
//
*
The
name
of
the
copyright
holders
may
not
be
used
to
endorse
or
promote
products
//
derived
from
this
software
without
specific
prior
written
permission.
//
//
This
software
is
provided
by
the
copyright
holders
and
contributors
as
is
and
//
any
express
or
implied
warranties,
including,
but
not
limited
to,
the
implied
//
warranties
of
merchantability
and
fitness
for
a
particular
purpose
are
disclaimed.
//
In
no
event
shall
the
Intel
Corporation
or
contributors
be
liable
for
any
direct,
//
indirect,
incidental,
special,
exemplary,
or
consequential
damages
//
(
including,
but
not
limited
to,
procurement
of
substitute
goods
or
services
;
//
loss
of
use,
data,
or
profits
; or business interruption) however caused
//
and
on
any
theory
of
liability,
whether
in
contract,
strict
liability,
//
or
tort
(
including
negligence
or
otherwise
)
arising
in
any
way
out
of
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//M*/
#
pragma
OPENCL
EXTENSION
cl_amd_printf
:
enable
uchar
round_uchar_uchar
(
uchar
v
)
{
return
v
;
}
uchar
round_uchar_int
(
int
v
)
{
return
(
uchar
)((
uint
)
v
<=
255
?
v
:
v
>
0
?
255
:
0
)
;
}
uchar
round_uchar_float
(
float
v
)
{
if
(
v
-
convert_int_sat_rte
(
v
)
>
1e-6
|
| v - convert_int_sat_rte(v) < -1e-6)
{
if(((int)v + 1) - (v + 0.5f) < 1e-6 && ((int)v + 1) - (v + 0.5f) > -1e-6)
{
v = (int)v + 0.51f;
}
}
int iv = convert_int_sat_rte(v);
return round_uchar_int(iv);
}
uchar4 round_uchar4_uchar4(uchar4 v)
{
return v;
}
uchar4 round_uchar4_int4(int4 v)
{
uchar4 result;
result.x = (uchar)(v.x <= 255 ? v.x : v.x > 0 ? 255 : 0);
result.y = (uchar)(v.y <= 255 ? v.y : v.y > 0 ? 255 : 0);
result.z = (uchar)(v.z <= 255 ? v.z : v.z > 0 ? 255 : 0);
result.w = (uchar)(v.w <= 255 ? v.w : v.w > 0 ? 255 : 0);
return result;
}
uchar4 round_uchar4_float4(float4 v)
{
if(v.x - convert_int_sat_rte(v.x) > 1e-6 || v.x - convert_int_sat_rte(v.x) < -1e-6)
{
if(((int)(v.x) + 1) - (v.x + 0.5f) < 1e-6 && ((int)(v.x) + 1) - (v.x + 0.5f) > -1e-6)
{
v.x = (int)(v.x) + 0.51f;
}
}
if(v.y - convert_int_sat_rte(v.y) > 1e-6 || v.y - convert_int_sat_rte(v.y) < -1e-6)
{
if(((int)(v.y) + 1) - (v.y + 0.5f) < 1e-6 && ((int)(v.y) + 1) - (v.y + 0.5f) > -1e-6)
{
v.y = (int)(v.y) + 0.51f;
}
}
if(v.z - convert_int_sat_rte(v.z) > 1e-6 || v.z - convert_int_sat_rte(v.z) < -1e-6)
{
if(((int)(v.z) + 1) - (v.z + 0.5f) < 1e-6 && ((int)(v.z) + 1) - (v.z + 0.5f) > -1e-6)
{
v.z = (int)(v.z) + 0.51f;
}
}
if(v.w - convert_int_sat_rte(v.w) > 1e-6 |
|
v.w
-
convert_int_sat_rte
(
v.w
)
<
-1e-6
)
{
if
(((
int
)(
v.w
)
+
1
)
-
(
v.w
+
0.5f
)
<
1e-6
&&
((
int
)(
v.w
)
+
1
)
-
(
v.w
+
0.5f
)
>
-1e-6
)
{
v.w
=
(
int
)(
v.w
)
+
0.51f
;
}
}
int4
iv
=
convert_int4_sat_rte
(
v
)
;
return
round_uchar4_int4
(
iv
)
;
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//
IMPORTANT:
READ
BEFORE
DOWNLOADING,
COPYING,
INSTALLING
OR
USING.
//
//
By
downloading,
copying,
installing
or
using
the
software
you
agree
to
this
license.
//
If
you
do
not
agree
to
this
license,
do
not
download,
install,
//
copy
or
use
the
software.
//
//
//
License
Agreement
//
For
Open
Source
Computer
Vision
Library
//
//
Copyright
(
C
)
2010-2012,
Multicoreware,
Inc.,
all
rights
reserved.
//
Copyright
(
C
)
2010-2012,
Advanced
Micro
Devices,
Inc.,
all
rights
reserved.
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
//
@Authors
//
Dachuan
Zhao,
dachuan@multicorewareinc.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
//
//
*
Redistribution
's
of
source
code
must
retain
the
above
copyright
notice,
//
this
list
of
conditions
and
the
following
disclaimer.
//
//
*
Redistribution
's
in
binary
form
must
reproduce
the
above
copyright
notice,
//
this
list
of
conditions
and
the
following
disclaimer
in
the
documentation
//
and/or
other
oclMaterials
provided
with
the
distribution.
//
//
*
The
name
of
the
copyright
holders
may
not
be
used
to
endorse
or
promote
products
//
derived
from
this
software
without
specific
prior
written
permission.
//
//
This
software
is
provided
by
the
copyright
holders
and
contributors
as
is
and
//
any
express
or
implied
warranties,
including,
but
not
limited
to,
the
implied
//
warranties
of
merchantability
and
fitness
for
a
particular
purpose
are
disclaimed.
//
In
no
event
shall
the
Intel
Corporation
or
contributors
be
liable
for
any
direct,
//
indirect,
incidental,
special,
exemplary,
or
consequential
damages
//
(
including,
but
not
limited
to,
procurement
of
substitute
goods
or
services
;
//
loss
of
use,
data,
or
profits
; or business interruption) however caused
//
and
on
any
theory
of
liability,
whether
in
contract,
strict
liability,
//
or
tort
(
including
negligence
or
otherwise
)
arising
in
any
way
out
of
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//M*/
#
pragma
OPENCL
EXTENSION
cl_amd_printf
:
enable
uchar
round_uchar_uchar
(
uchar
v
)
{
return
v
;
}
uchar
round_uchar_int
(
int
v
)
{
return
(
uchar
)((
uint
)
v
<=
255
?
v
:
v
>
0
?
255
:
0
)
;
}
uchar
round_uchar_float
(
float
v
)
{
int
iv
=
convert_int_sat_rte
(
v
)
;
return
round_uchar_int
(
iv
)
;
}
uchar4
round_uchar4_uchar4
(
uchar4
v
)
{
return
v
;
}
uchar4
round_uchar4_int4
(
int4
v
)
{
uchar4
result
;
result.x
=
(
uchar
)(
v.x
<=
255
?
v.x
:
v.x
>
0
?
255
:
0
)
;
result.y
=
(
uchar
)(
v.y
<=
255
?
v.y
:
v.y
>
0
?
255
:
0
)
;
result.z
=
(
uchar
)(
v.z
<=
255
?
v.z
:
v.z
>
0
?
255
:
0
)
;
result.w
=
(
uchar
)(
v.w
<=
255
?
v.w
:
v.w
>
0
?
255
:
0
)
;
return
result
;
}
uchar4
round_uchar4_float4
(
float4
v
)
{
int4
iv
=
convert_int4_sat_rte
(
v
)
;
return
round_uchar4_int4
(
iv
)
;
}
int
idx_row_low
(
int
y,
int
last_row
)
{
if
(
y
<
0
)
{
y
=
-y
;
}
return
y
%
(
last_row
+
1
)
;
return
abs
(
y
)
%
(
last_row
+
1
)
;
}
int
idx_row_high
(
int
y,
int
last_row
)
{
int
i
;
int
j
;
if
(
last_row
-
y
<
0
)
{
i
=
(
y
-
last_row
)
;
}
else
{
i
=
(
last_row
-
y
)
;
}
if
(
last_row
-
i
<
0
)
{
j
=
i
-
last_row
;
}
else
{
j
=
last_row
-
i
;
}
int
i=abs_diff
(
y,last_row
)
;
int
j=abs_diff
(
i,last_row
)
;
return
j
%
(
last_row
+
1
)
;
}
...
...
@@ -160,33 +105,14 @@ int idx_row(int y, int last_row)
int
idx_col_low
(
int
x,
int
last_col
)
{
if
(
x
<
0
)
{
x
=
-x
;
}
return
x
%
(
last_col
+
1
)
;
return
abs
(
x
)
%
(
last_col
+
1
)
;
}
int
idx_col_high
(
int
x,
int
last_col
)
{
int
i
;
int
j
;
if
(
last_col
-
x
<
0
)
{
i
=
(
x
-
last_col
)
;
}
else
{
i
=
(
last_col
-
x
)
;
}
if
(
last_col
-
i
<
0
)
{
j
=
i
-
last_col
;
}
else
{
j
=
last_col
-
i
;
}
int
i=abs_diff
(
x,last_col
)
;
int
j=abs_diff
(
i,last_col
)
;
return
j
%
(
last_col
+
1
)
;
}
...
...
@@ -194,307 +120,308 @@ int idx_col(int x, int last_col)
{
return
idx_col_low
(
idx_col_high
(
x,
last_col
)
,
last_col
)
;
}
__kernel
void
pyrDown_C1_D0
(
__global
uchar
*
srcData,
int
srcStep,
int
srcOffset,
int
srcRows,
int
srcCols,
__global
uchar
*dst,
int
dstStep,
int
dstOffset,
int
dstCols
)
{
const
int
x
=
get_group_id
(
0
)
*
get_local_size
(
0
)
+
get_local_id
(
0
)
;
const
int
y
=
get_group_id
(
1
)
;
__local
float
smem[256
+
4]
;
float
sum
;
const
int
src_y
=
2*y
;
const
int
last_row
=
srcRows
-
1
;
const
int
last_col
=
srcCols
-
1
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
)
;
sum
=
sum
+
0.25f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
)
;
sum
=
sum
+
0.375f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
)
;
sum
=
sum
+
0.25f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
)
;
sum
=
sum
+
0.0625f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
)
;
smem[2
+
get_local_id
(
0
)
]
=
sum
;
if
(
get_local_id
(
0
)
<
2
)
{
const
int
left_x
=
x
-
2
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
)
;
sum
=
sum
+
0.25f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
)
;
sum
=
sum
+
0.375f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
)
;
sum
=
sum
+
0.25f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
)
;
sum
=
sum
+
0.0625f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
)
;
smem[get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
0
)
>
253
)
{
const
int
right_x
=
x
+
2
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
)
;
sum
=
sum
+
0.25f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
)
;
sum
=
sum
+
0.375f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
)
;
sum
=
sum
+
0.25f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
)
;
sum
=
sum
+
0.0625f
*
round_uchar_uchar
(((
__global
uchar*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
)
;
smem[4
+
get_local_id
(
0
)
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
128
)
{
const
int
tid2
=
get_local_id
(
0
)
*
2
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
smem[2
+
tid2
-
2]
;
sum
=
sum
+
0.25f
*
smem[2
+
tid2
-
1]
;
sum
=
sum
+
0.375f
*
smem[2
+
tid2
]
;
sum
=
sum
+
0.25f
*
smem[2
+
tid2
+
1]
;
sum
=
sum
+
0.0625f
*
smem[2
+
tid2
+
2]
;
const
int
dst_x
=
(
get_group_id
(
0
)
*
get_local_size
(
0
)
+
tid2
)
/
2
;
if
(
dst_x
<
dstCols
)
dst[y
*
dstStep
+
dst_x]
=
round_uchar_float
(
sum
)
;
}
}
__kernel
void
pyrDown_C4_D0
(
__global
uchar4
*
srcData,
int
srcStep,
int
srcOffset,
int
srcRows,
int
srcCols,
__global
uchar4
*dst,
int
dstStep,
int
dstOffset,
int
dstCols
)
{
const
int
x
=
get_group_id
(
0
)
*
get_local_size
(
0
)
+
get_local_id
(
0
)
;
const
int
y
=
get_group_id
(
1
)
;
__local
float4
smem[256
+
4]
;
float4
sum
;
const
int
src_y
=
2*y
;
const
int
last_row
=
srcRows
-
1
;
const
int
last_col
=
srcCols
-
1
;
float4
co1
=
(
float4
)(
0.375f,
0.375f,
0.375f,
0.375f
)
;
float4
co2
=
(
float4
)(
0.25f,
0.25f,
0.25f,
0.25f
)
;
float4
co3
=
(
float4
)(
0.0625f,
0.0625f,
0.0625f,
0.0625f
)
;
sum
=
0
;
sum
=
sum
+
co3
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
))
;
sum
=
sum
+
co2
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
))
;
sum
=
sum
+
co1
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
))
;
sum
=
sum
+
co2
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
))
;
sum
=
sum
+
co3
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
))
;
smem[2
+
get_local_id
(
0
)
]
=
sum
;
if
(
get_local_id
(
0
)
<
2
)
{
const
int
left_x
=
x
-
2
;
sum
=
0
;
sum
=
sum
+
co3
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
))
;
sum
=
sum
+
co2
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
))
;
sum
=
sum
+
co1
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
))
;
sum
=
sum
+
co2
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
))
;
sum
=
sum
+
co3
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
))
;
smem[get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
0
)
>
253
)
{
const
int
right_x
=
x
+
2
;
sum
=
0
;
sum
=
sum
+
co3
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
))
;
sum
=
sum
+
co2
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
))
;
sum
=
sum
+
co1
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
))
;
sum
=
sum
+
co2
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
))
;
sum
=
sum
+
co3
*
convert_float4
(
round_uchar4_uchar4
(((
__global
uchar4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
))
;
smem[4
+
get_local_id
(
0
)
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
128
)
{
const
int
tid2
=
get_local_id
(
0
)
*
2
;
sum
=
0
;
sum
=
sum
+
co3
*
smem[2
+
tid2
-
2]
;
sum
=
sum
+
co2
*
smem[2
+
tid2
-
1]
;
sum
=
sum
+
co1
*
smem[2
+
tid2
]
;
sum
=
sum
+
co2
*
smem[2
+
tid2
+
1]
;
sum
=
sum
+
co3
*
smem[2
+
tid2
+
2]
;
const
int
dst_x
=
(
get_group_id
(
0
)
*
get_local_size
(
0
)
+
tid2
)
/
2
;
if
(
dst_x
<
dstCols
)
dst[y
*
dstStep
/
4
+
dst_x]
=
round_uchar4_float4
(
sum
)
;
}
}
__kernel
void
pyrDown_C1_D5
(
__global
float
*
srcData,
int
srcStep,
int
srcOffset,
int
srcRows,
int
srcCols,
__global
float
*dst,
int
dstStep,
int
dstOffset,
int
dstCols
)
{
const
int
x
=
get_group_id
(
0
)
*
get_local_size
(
0
)
+
get_local_id
(
0
)
;
const
int
y
=
get_group_id
(
1
)
;
__local
float
smem[256
+
4]
;
float
sum
;
const
int
src_y
=
2*y
;
const
int
last_row
=
srcRows
-
1
;
const
int
last_col
=
srcCols
-
1
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
0.25f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
0.375f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
0.25f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
0.0625f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
;
smem[2
+
get_local_id
(
0
)
]
=
sum
;
if
(
get_local_id
(
0
)
<
2
)
{
const
int
left_x
=
x
-
2
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
0.25f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
0.375f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
0.25f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
0.0625f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
;
smem[get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
0
)
>
253
)
{
const
int
right_x
=
x
+
2
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
0.25f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
0.375f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
0.25f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
0.0625f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
;
smem[4
+
get_local_id
(
0
)
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
128
)
{
const
int
tid2
=
get_local_id
(
0
)
*
2
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
smem[2
+
tid2
-
2]
;
sum
=
sum
+
0.25f
*
smem[2
+
tid2
-
1]
;
sum
=
sum
+
0.375f
*
smem[2
+
tid2
]
;
sum
=
sum
+
0.25f
*
smem[2
+
tid2
+
1]
;
sum
=
sum
+
0.0625f
*
smem[2
+
tid2
+
2]
;
const
int
dst_x
=
(
get_group_id
(
0
)
*
get_local_size
(
0
)
+
tid2
)
/
2
;
if
(
dst_x
<
dstCols
)
dst[y
*
dstStep
/
4
+
dst_x]
=
sum
;
}
}
__kernel
void
pyrDown_C4_D5
(
__global
float4
*
srcData,
int
srcStep,
int
srcOffset,
int
srcRows,
int
srcCols,
__global
float4
*dst,
int
dstStep,
int
dstOffset,
int
dstCols
)
{
const
int
x
=
get_group_id
(
0
)
*
get_local_size
(
0
)
+
get_local_id
(
0
)
;
const
int
y
=
get_group_id
(
1
)
;
__local
float4
smem[256
+
4]
;
float4
sum
;
const
int
src_y
=
2*y
;
const
int
last_row
=
srcRows
-
1
;
const
int
last_col
=
srcCols
-
1
;
float4
co1
=
(
float4
)(
0.375f,
0.375f,
0.375f,
0.375f
)
;
float4
co2
=
(
float4
)(
0.25f,
0.25f,
0.25f,
0.25f
)
;
float4
co3
=
(
float4
)(
0.0625f,
0.0625f,
0.0625f,
0.0625f
)
;
sum
=
0
;
sum
=
sum
+
co3
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
co2
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
co1
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
co2
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
co3
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
;
smem[2
+
get_local_id
(
0
)
]
=
sum
;
if
(
get_local_id
(
0
)
<
2
)
{
const
int
left_x
=
x
-
2
;
sum
=
0
;
sum
=
sum
+
co3
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
co2
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
co1
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
co2
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
co3
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
;
smem[get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
0
)
>
253
)
{
const
int
right_x
=
x
+
2
;
sum
=
0
;
sum
=
sum
+
co3
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
co2
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
co1
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
co2
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
co3
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
;
smem[4
+
get_local_id
(
0
)
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
128
)
{
const
int
tid2
=
get_local_id
(
0
)
*
2
;
sum
=
0
;
sum
=
sum
+
co3
*
smem[2
+
tid2
-
2]
;
sum
=
sum
+
co2
*
smem[2
+
tid2
-
1]
;
sum
=
sum
+
co1
*
smem[2
+
tid2
]
;
sum
=
sum
+
co2
*
smem[2
+
tid2
+
1]
;
sum
=
sum
+
co3
*
smem[2
+
tid2
+
2]
;
const
int
dst_x
=
(
get_group_id
(
0
)
*
get_local_size
(
0
)
+
tid2
)
/
2
;
if
(
dst_x
<
dstCols
)
dst[y
*
dstStep
/
16
+
dst_x]
=
sum
;
}
}
__kernel
void
pyrDown_C1_D0
(
__global
uchar
*
srcData,
int
srcStep,
int
srcOffset,
int
srcRows,
int
srcCols,
__global
uchar
*dst,
int
dstStep,
int
dstOffset,
int
dstCols
)
{
const
int
x
=
get_group_id
(
0
)
*
get_local_size
(
0
)
+
get_local_id
(
0
)
;
const
int
y
=
get_group_id
(
1
)
;
__local
float
smem[256
+
4]
;
float
sum
;
const
int
src_y
=
2*y
;
const
int
last_row
=
srcRows
-
1
;
const
int
last_col
=
srcCols
-
1
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
(((
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
)
;
sum
=
sum
+
0.25f
*
(((
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
)
;
sum
=
sum
+
0.375f
*
(((
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
)
;
sum
=
sum
+
0.25f
*
(((
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
)
;
sum
=
sum
+
0.0625f
*
(((
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
)
;
smem[2
+
get_local_id
(
0
)
]
=
sum
;
if
(
get_local_id
(
0
)
<
2
)
{
const
int
left_x
=
x
-
2
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
(((
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
)
;
sum
=
sum
+
0.25f
*
(((
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
)
;
sum
=
sum
+
0.375f
*
(((
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
)
;
sum
=
sum
+
0.25f
*
(((
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
)
;
sum
=
sum
+
0.0625f
*
(((
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
)
;
smem[get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
0
)
>
253
)
{
const
int
right_x
=
x
+
2
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
(((
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
)
;
sum
=
sum
+
0.25f
*
(((
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
)
;
sum
=
sum
+
0.375f
*
(((
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
)
;
sum
=
sum
+
0.25f
*
(((
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
)
;
sum
=
sum
+
0.0625f
*
(((
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
)
;
smem[4
+
get_local_id
(
0
)
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
128
)
{
const
int
tid2
=
get_local_id
(
0
)
*
2
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
smem[2
+
tid2
-
2]
;
sum
=
sum
+
0.25f
*
smem[2
+
tid2
-
1]
;
sum
=
sum
+
0.375f
*
smem[2
+
tid2
]
;
sum
=
sum
+
0.25f
*
smem[2
+
tid2
+
1]
;
sum
=
sum
+
0.0625f
*
smem[2
+
tid2
+
2]
;
const
int
dst_x
=
(
get_group_id
(
0
)
*
get_local_size
(
0
)
+
tid2
)
/
2
;
if
(
dst_x
<
dstCols
)
dst[y
*
dstStep
+
dst_x]
=
round_uchar_float
(
sum
)
;
}
}
__kernel
void
pyrDown_C4_D0
(
__global
uchar4
*
srcData,
int
srcStep,
int
srcOffset,
int
srcRows,
int
srcCols,
__global
uchar4
*dst,
int
dstStep,
int
dstOffset,
int
dstCols
)
{
const
int
x
=
get_group_id
(
0
)
*
get_local_size
(
0
)
+
get_local_id
(
0
)
;
const
int
y
=
get_group_id
(
1
)
;
__local
float4
smem[256
+
4]
;
float4
sum
;
const
int
src_y
=
2*y
;
const
int
last_row
=
srcRows
-
1
;
const
int
last_col
=
srcCols
-
1
;
float4
co1
=
(
float4
)(
0.375f,
0.375f,
0.375f,
0.375f
)
;
float4
co2
=
(
float4
)(
0.25f,
0.25f,
0.25f,
0.25f
)
;
float4
co3
=
(
float4
)(
0.0625f,
0.0625f,
0.0625f,
0.0625f
)
;
sum
=
0
;
sum
=
sum
+
co3
*
convert_float4
((((
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
))
;
sum
=
sum
+
co2
*
convert_float4
((((
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
))
;
sum
=
sum
+
co1
*
convert_float4
((((
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
))
;
sum
=
sum
+
co2
*
convert_float4
((((
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
))
;
sum
=
sum
+
co3
*
convert_float4
((((
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
))
;
smem[2
+
get_local_id
(
0
)
]
=
sum
;
if
(
get_local_id
(
0
)
<
2
)
{
const
int
left_x
=
x
-
2
;
sum
=
0
;
sum
=
sum
+
co3
*
convert_float4
((((
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
))
;
sum
=
sum
+
co2
*
convert_float4
((((
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
))
;
sum
=
sum
+
co1
*
convert_float4
((((
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
))
;
sum
=
sum
+
co2
*
convert_float4
((((
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
))
;
sum
=
sum
+
co3
*
convert_float4
((((
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
))
;
smem[get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
0
)
>
253
)
{
const
int
right_x
=
x
+
2
;
sum
=
0
;
sum
=
sum
+
co3
*
convert_float4
((((
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
))
;
sum
=
sum
+
co2
*
convert_float4
((((
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
))
;
sum
=
sum
+
co1
*
convert_float4
((((
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
))
;
sum
=
sum
+
co2
*
convert_float4
((((
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
))
;
sum
=
sum
+
co3
*
convert_float4
((((
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
))
;
smem[4
+
get_local_id
(
0
)
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
128
)
{
const
int
tid2
=
get_local_id
(
0
)
*
2
;
sum
=
0
;
sum
=
sum
+
co3
*
smem[2
+
tid2
-
2]
;
sum
=
sum
+
co2
*
smem[2
+
tid2
-
1]
;
sum
=
sum
+
co1
*
smem[2
+
tid2
]
;
sum
=
sum
+
co2
*
smem[2
+
tid2
+
1]
;
sum
=
sum
+
co3
*
smem[2
+
tid2
+
2]
;
const
int
dst_x
=
(
get_group_id
(
0
)
*
get_local_size
(
0
)
+
tid2
)
/
2
;
if
(
dst_x
<
dstCols
)
dst[y
*
dstStep
/
4
+
dst_x]
=
round_uchar4_float4
(
sum
)
;
}
}
__kernel
void
pyrDown_C1_D5
(
__global
float
*
srcData,
int
srcStep,
int
srcOffset,
int
srcRows,
int
srcCols,
__global
float
*dst,
int
dstStep,
int
dstOffset,
int
dstCols
)
{
const
int
x
=
get_group_id
(
0
)
*
get_local_size
(
0
)
+
get_local_id
(
0
)
;
const
int
y
=
get_group_id
(
1
)
;
__local
float
smem[256
+
4]
;
float
sum
;
const
int
src_y
=
2*y
;
const
int
last_row
=
srcRows
-
1
;
const
int
last_col
=
srcCols
-
1
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
0.25f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
0.375f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
0.25f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
0.0625f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
))
[idx_col
(
x,
last_col
)
]
;
smem[2
+
get_local_id
(
0
)
]
=
sum
;
if
(
get_local_id
(
0
)
<
2
)
{
const
int
left_x
=
x
-
2
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
0.25f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
0.375f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
0.25f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
0.0625f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
))
[idx_col
(
left_x,
last_col
)
]
;
smem[get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
0
)
>
253
)
{
const
int
right_x
=
x
+
2
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
0.25f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
0.375f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
0.25f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
0.0625f
*
((
__global
float*
)((
__global
char*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
))
[idx_col
(
right_x,
last_col
)
]
;
smem[4
+
get_local_id
(
0
)
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
128
)
{
const
int
tid2
=
get_local_id
(
0
)
*
2
;
sum
=
0
;
sum
=
sum
+
0.0625f
*
smem[2
+
tid2
-
2]
;
sum
=
sum
+
0.25f
*
smem[2
+
tid2
-
1]
;
sum
=
sum
+
0.375f
*
smem[2
+
tid2
]
;
sum
=
sum
+
0.25f
*
smem[2
+
tid2
+
1]
;
sum
=
sum
+
0.0625f
*
smem[2
+
tid2
+
2]
;
const
int
dst_x
=
(
get_group_id
(
0
)
*
get_local_size
(
0
)
+
tid2
)
/
2
;
if
(
dst_x
<
dstCols
)
dst[y
*
dstStep
/
4
+
dst_x]
=
sum
;
}
}
__kernel
void
pyrDown_C4_D5
(
__global
float4
*
srcData,
int
srcStep,
int
srcOffset,
int
srcRows,
int
srcCols,
__global
float4
*dst,
int
dstStep,
int
dstOffset,
int
dstCols
)
{
const
int
x
=
get_group_id
(
0
)
*
get_local_size
(
0
)
+
get_local_id
(
0
)
;
const
int
y
=
get_group_id
(
1
)
;
__local
float4
smem[256
+
4]
;
float4
sum
;
const
int
src_y
=
2*y
;
const
int
last_row
=
srcRows
-
1
;
const
int
last_col
=
srcCols
-
1
;
float4
co1
=
(
float4
)(
0.375f,
0.375f,
0.375f,
0.375f
)
;
float4
co2
=
(
float4
)(
0.25f,
0.25f,
0.25f,
0.25f
)
;
float4
co3
=
(
float4
)(
0.0625f,
0.0625f,
0.0625f,
0.0625f
)
;
sum
=
0
;
sum
=
sum
+
co3
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
co2
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
co1
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
co2
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
;
sum
=
sum
+
co3
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
x,
last_col
)
]
;
smem[2
+
get_local_id
(
0
)
]
=
sum
;
if
(
get_local_id
(
0
)
<
2
)
{
const
int
left_x
=
x
-
2
;
sum
=
0
;
sum
=
sum
+
co3
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
co2
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
co1
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
co2
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
;
sum
=
sum
+
co3
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
left_x,
last_col
)
]
;
smem[get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
0
)
>
253
)
{
const
int
right_x
=
x
+
2
;
sum
=
0
;
sum
=
sum
+
co3
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
co2
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
-
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
co1
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
co2
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
1
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
;
sum
=
sum
+
co3
*
((
__global
float4*
)((
__global
char4*
)
srcData
+
idx_row
(
src_y
+
2
,
last_row
)
*
srcStep
/
4
))
[idx_col
(
right_x,
last_col
)
]
;
smem[4
+
get_local_id
(
0
)
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
128
)
{
const
int
tid2
=
get_local_id
(
0
)
*
2
;
sum
=
0
;
sum
=
sum
+
co3
*
smem[2
+
tid2
-
2]
;
sum
=
sum
+
co2
*
smem[2
+
tid2
-
1]
;
sum
=
sum
+
co1
*
smem[2
+
tid2
]
;
sum
=
sum
+
co2
*
smem[2
+
tid2
+
1]
;
sum
=
sum
+
co3
*
smem[2
+
tid2
+
2]
;
const
int
dst_x
=
(
get_group_id
(
0
)
*
get_local_size
(
0
)
+
tid2
)
/
2
;
if
(
dst_x
<
dstCols
)
dst[y
*
dstStep
/
16
+
dst_x]
=
sum
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment