opencv / Commits

Commit 161674bf
authored Sep 24, 2013 by Ilya Lavrenov
refactored and extended ocl::addWeighted
parent b20bd470
Showing 2 changed files with 51 additions and 407 deletions:

    modules/ocl/src/arithm.cpp                      +39   -37
    modules/ocl/src/opencl/arithm_addWeighted.cl    +12   -370
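The public entry point keeps its signature and still computes dst = src1*alpha + src2*beta + gama per element; only the OpenCL plumbing behind it changes. For orientation, a minimal host-side sketch of a call, assuming the usual 2.4-era ocl workflow (oclMat constructed from a Mat for upload, download() to read back). This snippet is illustrative only and is not part of the commit:

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        // Two single-channel float images; addWeighted computes
        // dst = src1 * alpha + src2 * beta + gama element-wise.
        cv::Mat src1 = cv::Mat::ones(480, 640, CV_32FC1);
        cv::Mat src2 = cv::Mat::ones(480, 640, CV_32FC1) * 3.f;

        cv::ocl::oclMat d_src1(src1), d_src2(src2), d_dst;
        cv::ocl::addWeighted(d_src1, 0.5, d_src2, 0.25, 1.0, d_dst);

        cv::Mat dst;
        d_dst.download(dst);   // every element should now be 0.5*1 + 0.25*3 + 1 = 2.25
        return 0;
    }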
modules/ocl/src/arithm.cpp
@@ -1795,64 +1795,66 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst)

 void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst)
 {
+    Context *clCxt = src1.clCxt;
+    bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE);
+    if (!hasDouble && src1.depth() == CV_64F)
+    {
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
+        return;
+    }
+
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
     dst.create(src1.size(), src1.type());
-    CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
-              src1.rows == src2.rows && src2.rows == dst.rows);
-    CV_Assert(src1.type() == src2.type() && src1.type() == dst.type());
-
-    Context *clCxt = src1.clCxt;
     int channels = dst.oclchannels();
     int depth = dst.depth();

+    int cols1 = src1.cols * channels;
+    int src1step1 = src1.step1(), src1offset1 = src1.offset / src1.elemSize1();
+    int src2step1 = src2.step1(), src2offset1 = src2.offset / src1.elemSize1();
+    int dststep1 = dst.step1(), dstoffset1 = dst.offset / dst.elemSize1();
-    int vector_lengths[4][7] = {{4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4}
-    };
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-    int cols = divUp(dst.cols * channels + offset_cols, vector_length);

+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    std::string buildOptions = format("-D T=%s -D WT=%s -D convertToT=convert_%s%s",
+                                      typeMap[depth], hasDouble ? "double" : "float",
+                                      typeMap[depth], depth >= CV_32F ? "" : "_sat_rte");

     size_t localThreads[3] = { 256, 1, 1 };
-    size_t globalThreads[3] = { cols, dst.rows, 1 };
+    size_t globalThreads[3] = { cols1, dst.rows, 1 };

+    float alpha_f = static_cast<float>(alpha),
+          beta_f = static_cast<float>(beta),
+          gama_f = static_cast<float>(gama);
-    int dst_step1 = dst.cols * dst.elemSize();
-    int src1_step = (int) src1.step;
-    int src2_step = (int) src2.step;
-    int dst_step = (int) dst.step;
-    float alpha_f = alpha, beta_f = beta, gama_f = gama;

     vector<pair<size_t, const void *> > args;
     args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1_step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1step1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1offset1 ));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src2step1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src2offset1 ));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1 ));

-    if (src1.clCxt->supportsFeature(Context::CL_DOUBLE))
-    {
-        args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
-        args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
-        args.push_back( make_pair( sizeof(cl_double), (void *)&gama ));
-    }
-    else
+    if (!hasDouble)
     {
         args.push_back( make_pair( sizeof(cl_float), (void *)&alpha_f ));
         args.push_back( make_pair( sizeof(cl_float), (void *)&beta_f ));
         args.push_back( make_pair( sizeof(cl_float), (void *)&gama_f ));
     }
+    else
+    {
+        args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
+        args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
+        args.push_back( make_pair( sizeof(cl_double), (void *)&gama ));
+    }

-    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));

-    openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads,
+                        args, -1, -1, buildOptions.c_str());
 }

 static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const char **kernelString)
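The behavioural switch is in the last two lines of the hunk: the old call passed the depth, apparently so the helper could select one of the per-depth addWeighted_D* kernels below, while the new call passes -1, -1 plus buildOptions and always launches the single addWeighted kernel, specialised at compile time. A small standalone sketch (plain C++, no OpenCV; the depth indices and the table simply mirror typeMap and the format string in the hunk above) of what those options expand to on a device with double support:

    #include <cstdio>

    int main()
    {
        // Mirrors typeMap[] and the format() call in the refactored addWeighted.
        const char *const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
        const int CV_32F = 5;          // depth index of float, as in OpenCV
        bool hasDouble = true;         // assume the device reports CL_DOUBLE

        const int depths[] = { 0 /*CV_8U*/, 3 /*CV_16S*/, 5 /*CV_32F*/ };
        for (int depth : depths)
        {
            char buf[128];
            std::snprintf(buf, sizeof(buf), "-D T=%s -D WT=%s -D convertToT=convert_%s%s",
                          typeMap[depth], hasDouble ? "double" : "float",
                          typeMap[depth], depth >= CV_32F ? "" : "_sat_rte");
            std::printf("%s\n", buf);
        }
        return 0;
    }

For CV_8U this prints "-D T=uchar -D WT=double -D convertToT=convert_uchar_sat_rte"; with those defines the one kernel in arithm_addWeighted.cl covers the per-type variants the file used to spell out by hand.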
modules/ocl/src/opencl/arithm_addWeighted.cl
@@ -42,392 +42,34 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F;
#else
typedef float F;
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
Removed: addWeighted_D0, the hand-vectorized uchar kernel.

__kernel void addWeighted_D0 (__global uchar *src1, int src1_step, int src1_offset,
                              __global uchar *src2, int src2_step, int src2_offset,
                              F alpha, F beta, F gama,
                              __global uchar *dst,  int dst_step, int dst_offset,
                              int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        uchar4 src1_data, src2_data;

        src1_data.x = src1_index+0 >= 0 ? src1[src1_index+0] : 0;
        src1_data.y = src1_index+1 >= 0 ? src1[src1_index+1] : 0;
        src1_data.z = src1_index+2 >= 0 ? src1[src1_index+2] : 0;
        src1_data.w = src1_index+3 >= 0 ? src1[src1_index+3] : 0;

        src2_data.x = src2_index+0 >= 0 ? src2[src2_index+0] : 0;
        src2_data.y = src2_index+1 >= 0 ? src2[src2_index+1] : 0;
        src2_data.z = src2_index+2 >= 0 ? src2[src2_index+2] : 0;
        src2_data.w = src2_index+3 >= 0 ? src2[src2_index+3] : 0;

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        // short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
        short4 tmp;
        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global uchar4 *)(dst + dst_index)) = dst_data;
        // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
    }
}
Removed: addWeighted_D2, the ushort kernel.

__kernel void addWeighted_D2 (__global ushort *src1, int src1_step, int src1_offset,
                              __global ushort *src2, int src2_step, int src2_offset,
                              F alpha, F beta, F gama,
                              __global ushort *dst,  int dst_step, int dst_offset,
                              int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
        if (src1_index < 0)
        {
            ushort4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy : src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz : tmp.xyzw;
        }
        if (src2_index < 0)
        {
            ushort4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy : src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz : tmp.xyzw;
        }

        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
        int4 tmp;
        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
        ushort4 tmp_data = convert_ushort4_sat(tmp);

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
Removed: addWeighted_D3, the short kernel.

__kernel void addWeighted_D3 (__global short *src1, int src1_step, int src1_offset,
                              __global short *src2, int src2_step, int src2_offset,
                              F alpha, F beta, F gama,
                              __global short *dst,  int dst_step, int dst_offset,
                              int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) - (dst_align << 1));

        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
        if (src1_index < 0)
        {
            short4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy : src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz : tmp.xyzw;
        }
        if (src2_index < 0)
        {
            short4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy : src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz : tmp.xyzw;
        }

        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
        // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
        int4 tmp;
        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
        short4 tmp_data = convert_short4_sat(tmp);

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
Removed: addWeighted_D4, the int kernel.

__kernel void addWeighted_D4 (__global int *src1, int src1_step, int src1_offset,
                              __global int *src2, int src2_step, int src2_offset,
                              F alpha, F beta, F gama,
                              __global int *dst,  int dst_step, int dst_offset,
                              int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define bitOfInt (sizeof(int) == 4 ? 2 : 3)

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> bitOfInt) & 3)
        int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
        int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << bitOfInt) - (dst_align << bitOfInt));

        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
        if (src1_index < 0)
        {
            int4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy : src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz : tmp.xyzw;
        }
        if (src2_index < 0)
        {
            int4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy : src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz : tmp.xyzw;
        }

        int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
        // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama;
        float4 tmp;
        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
        int4 tmp_data = convert_int4_sat(tmp);

        dst_data.x = ((dst_index + 0  >= dst_start) && (dst_index + 0  < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 4  >= dst_start) && (dst_index + 4  < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 8  >= dst_start) && (dst_index + 8  < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global int4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}

Added in place of the per-depth kernels: the single type-generic kernel; T, WT and convertToT are injected through the -D build options set in arithm.cpp.

__kernel void addWeighted(__global T * src1, int src1_step1, int src1_offset1,
                          __global T * src2, int src2_step1, int src2_offset1,
                          __global T * dst, int dst_step1, int dst_offset1,
                          WT alpha, WT beta, WT gama,
                          int cols1, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols1 && y < rows)
    {
        int src1_index = mad24(y, src1_step1, x + src1_offset1);
        int src2_index = mad24(y, src2_step1, x + src2_offset1);
        int dst_index  = mad24(y, dst_step1, x + dst_offset1);

        dst[dst_index] = convertToT(src1[src1_index]*alpha + src2[src2_index]*beta + gama);
    }
}
Removed: addWeighted_D5, the float kernel.

__kernel void addWeighted_D5 (__global float *src1, int src1_step, int src1_offset,
                              __global float *src2, int src2_step, int src2_offset,
                              F alpha, F beta, F gama,
                              __global float *dst,  int dst_step, int dst_offset,
                              int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) - (dst_align << 2));

        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
        float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
        if (src1_index < 0)
        {
            float4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy : src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz : tmp.xyzw;
        }
        if (src2_index < 0)
        {
            float4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy : src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz : tmp.xyzw;
        }

        // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama;
        // float4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama;
        float4 tmp_data;
        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
        tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
        // float4 tmp_data = convert_float4(tmp);

        dst_data.x = ((dst_index + 0  >= dst_start) && (dst_index + 0  < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 4  >= dst_start) && (dst_index + 4  < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 8  >= dst_start) && (dst_index + 8  < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
Removed: addWeighted_D6, the double kernel, guarded by DOUBLE_SUPPORT.

#if defined (DOUBLE_SUPPORT)
__kernel void addWeighted_D6 (__global double *src1, int src1_step, int src1_offset,
                              __global double *src2, int src2_step, int src2_offset,
                              F alpha, F beta, F gama,
                              __global double *dst,  int dst_step, int dst_offset,
                              int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 3) - (dst_align << 3));

        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
        double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
        if (src1_index < 0)
        {
            double4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy : src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz : tmp.xyzw;
        }
        if (src2_index < 0)
        {
            double4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy : src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz : tmp.xyzw;
        }

        // double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama;
        double4 tmp_data;
        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
        tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;

        dst_data.x = ((dst_index + 0  >= dst_start) && (dst_index + 0  < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 8  >= dst_start) && (dst_index + 8  < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 16 >= dst_start) && (dst_index + 16 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 24 >= dst_start) && (dst_index + 24 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global double4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
#endif
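All of the kernels above, removed and added alike, implement the same per-element formula; only the vectorisation and addressing differ. As a reference point, a scalar CPU sketch of the 8-bit case (illustration only, not code from the commit; the _sat_rte suffix in the build options corresponds to the saturate plus round-to-nearest step here):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Weighted sum computed in double ("WT"), then rounded and saturated
    // back to uchar ("convertToT=convert_uchar_sat_rte").
    std::vector<uint8_t> addWeightedRef(const std::vector<uint8_t> &src1,
                                        const std::vector<uint8_t> &src2,
                                        double alpha, double beta, double gama)
    {
        std::vector<uint8_t> dst(src1.size());
        for (size_t i = 0; i < src1.size(); ++i)
        {
            double v = src1[i] * alpha + src2[i] * beta + gama;
            v = std::nearbyint(v);                       // _rte: round to nearest (even) under the default FP mode
            v = std::min(255.0, std::max(0.0, v));       // _sat: clamp to the uchar range
            dst[i] = static_cast<uint8_t>(v);
        }
        return dst;
    }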