submodule / opencv / Commits

Commit 7726e273
authored Apr 06, 2013 by yao
merge add and sub into one set of kernels
parent bee970ab
Showing 7 changed files with 115 additions and 2009 deletions.
modules/ocl/src/arithm.cpp                          +40   −24
modules/ocl/src/opencl/arithm_add.cl                +29   −24
modules/ocl/src/opencl/arithm_add_scalar.cl         +23   −18
modules/ocl/src/opencl/arithm_add_scalar_mask.cl    +23   −19
modules/ocl/src/opencl/arithm_sub.cl                 +0  −802
modules/ocl/src/opencl/arithm_sub_scalar.cl          +0  −511
modules/ocl/src/opencl/arithm_sub_scalar_mask.cl     +0  −611
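The shape of the change is already visible in this list: the three arithm_sub*.cl sources are deleted outright, and subtraction is folded into the arithm_add*.cl sources behind a compile-time switch, selected by passing -D ARITHM_ADD or -D ARITHM_SUB when the program is built. A minimal standalone sketch of that build-option technique, using the stock OpenCL C API rather than OpenCV's wrappers (the kernel source and names here are illustrative, not taken from the repository):

// build_twice.cpp -- illustrative only; mirrors the -D specialization idea.
// Compile with: g++ build_twice.cpp -lOpenCL
#include <CL/cl.h>
#include <cstdio>

// One source, two behaviours: the preprocessor picks the operator.
static const char *kSource =
    "#ifdef ARITHM_ADD\n"
    "#define ARITHM_OP(A,B) ((A) + (B))\n"
    "#elif defined ARITHM_SUB\n"
    "#define ARITHM_OP(A,B) ((A) - (B))\n"
    "#endif\n"
    "__kernel void arith(__global const int *a, __global const int *b,\n"
    "                    __global int *c)\n"
    "{\n"
    "    int i = get_global_id(0);\n"
    "    c[i] = ARITHM_OP(a[i], b[i]);\n"
    "}\n";

static cl_program buildVariant(cl_context ctx, cl_device_id dev, const char *options)
{
    cl_int err;
    cl_program prog = clCreateProgramWithSource(ctx, 1, &kSource, NULL, &err);
    // The only difference between the two variants is the options string.
    err = clBuildProgram(prog, 1, &dev, options, NULL, NULL);
    if (err != CL_SUCCESS)
        std::fprintf(stderr, "build failed: %d\n", err);
    return prog;
}

int main()
{
    cl_platform_id plat;  clGetPlatformIDs(1, &plat, NULL);
    cl_device_id dev;     clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
    cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, NULL);

    cl_program add = buildVariant(ctx, dev, "-D ARITHM_ADD");  // behaves as add
    cl_program sub = buildVariant(ctx, dev, "-D ARITHM_SUB");  // behaves as sub

    clReleaseProgram(add); clReleaseProgram(sub);
    clReleaseContext(ctx);
    return 0;
}

One design consequence: because both variants now come from the same source and kernel name, anything that caches built programs has to key on the options string as well; the options argument of clBuildProgram is the only thing distinguishing the two binaries.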
modules/ocl/src/arithm.cpp

...
@@ -92,9 +92,6 @@ namespace cv
         extern const char *arithm_bitwise_xor_scalar_mask;
         extern const char *arithm_compare_eq;
         extern const char *arithm_compare_ne;
-        extern const char *arithm_sub;
-        extern const char *arithm_sub_scalar;
-        extern const char *arithm_sub_scalar_mask;
         extern const char *arithm_mul;
         extern const char *arithm_div;
         extern const char *arithm_absdiff;
...
@@ -130,7 +127,8 @@ inline int divUp(int total, int grain)
 /////////////////////// add subtract multiply divide /////////////////////////
 //////////////////////////////////////////////////////////////////////////////
 template<typename T>
-void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString, void *_scalar)
+void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString, void *_scalar, int op_type = 0)
 {
     if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
     {
...
@@ -186,14 +184,25 @@ void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string
         scalar = (T)scalar1;
         args.push_back(make_pair(sizeof(T), (void *)&scalar));
     }
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
+    switch (op_type)
+    {
+    case MAT_ADD:
+        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, "-D ARITHM_ADD");
+        break;
+    case MAT_SUB:
+        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, "-D ARITHM_SUB");
+        break;
+    default:
+        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
+    }
 }
-static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString)
+static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString, int op_type = 0)
 {
-    arithmetic_run<char>(src1, src2, dst, kernelName, kernelString, (void *)NULL);
+    arithmetic_run<char>(src1, src2, dst, kernelName, kernelString, (void *)NULL, op_type);
 }
-static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString)
+static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int op_type = 0)
 {
     if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
     {
...
@@ -248,24 +257,34 @@ static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
     args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
     args.push_back(make_pair(sizeof(cl_int), (void *)&dst_step1));
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth);
+    switch (op_type)
+    {
+    case MAT_ADD:
+        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_ADD");
+        break;
+    case MAT_SUB:
+        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_SUB");
+        break;
+    default:
+        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth);
+    }
 }
 void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst)
 {
-    arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add);
+    arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add, MAT_ADD);
 }
 void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
 {
-    arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add);
+    arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add, MAT_ADD);
 }
 void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst)
 {
-    arithmetic_run(src1, src2, dst, "arithm_sub", &arithm_sub);
+    arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add, MAT_SUB);
 }
 void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
 {
-    arithmetic_run(src1, src2, dst, mask, "arithm_sub_with_mask", &arithm_sub);
+    arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add, MAT_SUB);
 }
 typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString, void *scalar);
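After this hunk, cv::ocl::subtract no longer references any arithm_sub kernel at all: it launches the same "arithm_add" kernels as cv::ocl::add and differs only in the op_type tag, which the runner translates into a -D build option. A condensed sketch of that dispatch pattern (runKernel is a hypothetical stand-in for openCLExecuteKernel; only the option-selection logic mirrors the diff):

#include <string>
#include <cstdio>

enum { MAT_ADD = 1, MAT_SUB = 2 };   // tags as used by the host code above

// Stand-in for openCLExecuteKernel: this only shows how op_type
// selects the extra "-D ..." build option at launch time.
static void runKernel(const std::string &kernelName, const char *buildOptions)
{
    std::printf("launch %s with options '%s'\n",
                kernelName.c_str(), buildOptions ? buildOptions : "");
}

static void arithmeticRun(const std::string &kernelName, int op_type)
{
    switch (op_type)
    {
    case MAT_ADD: runKernel(kernelName, "-D ARITHM_ADD"); break;
    case MAT_SUB: runKernel(kernelName, "-D ARITHM_SUB"); break;
    default:      runKernel(kernelName, NULL);            break;
    }
}

int main()
{
    arithmeticRun("arithm_add", MAT_ADD);  // cv::ocl::add path
    arithmeticRun("arithm_add", MAT_SUB);  // cv::ocl::subtract now reuses the same kernels
    return 0;
}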
...
@@ -351,12 +370,9 @@ void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst,
     args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
     args.push_back(make_pair(sizeof(cl_int), (void *)&dst_step1));
     if(isMatSubScalar != 0)
-    {
-        isMatSubScalar = isMatSubScalar > 0 ? 1 : 0;
-        args.push_back(make_pair(sizeof(cl_int), (void *)&isMatSubScalar));
-    }
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth);
+        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_SUB");
+    else
+        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_ADD");
 }
 static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, const char **kernelString, double scalar)
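The scalar path makes the motivation concrete: previously isMatSubScalar was pushed as an extra kernel argument, so the (uniform) add-versus-subtract decision travelled to the device and was re-evaluated per element; now it is resolved once on the host by picking the build option. A sketch of the contrast, with illustrative kernel strings rather than the verbatim OpenCV ones:

// contrast.cpp -- why this hunk drops the isMatSubScalar kernel argument.
#include <cstdio>

// Old style: the operation is a run-time kernel argument, so every
// work-item evaluates a (uniform) branch.
static const char *kRuntimeFlag =
    "__kernel void op(__global const int *a, int scalar, __global int *c, int isSub)\n"
    "{ int i = get_global_id(0); c[i] = isSub ? (a[i] - scalar) : (a[i] + scalar); }\n";

// New style: the operation is baked in when the program is built with
// -D ARITHM_ADD or -D ARITHM_SUB; no per-item branch remains.
static const char *kCompileTime =
    "__kernel void op(__global const int *a, int scalar, __global int *c)\n"
    "{ int i = get_global_id(0); c[i] = ARITHM_OP(a[i], scalar); }\n";

int main()
{
    int isMatSubScalar = 1;  // nonzero, as passed by the subtract wrappers below
    const char *options = (isMatSubScalar != 0) ? "-D ARITHM_SUB" : "-D ARITHM_ADD";
    std::printf("old: pass isSub=%d as a kernel argument\nnew: build with '%s'\n\n%s\n%s",
                isMatSubScalar, options, kRuntimeFlag, kCompileTime);
    return 0;
}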
...
@@ -452,14 +468,14 @@ void cv::ocl::add(const oclMat &src1, const Scalar &src2, oclMat &dst, const ocl
 void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
 {
-    string kernelName = mask.data ? "arithm_s_sub_with_mask" : "arithm_s_sub";
-    const char **kernelString = mask.data ? &arithm_sub_scalar_mask : &arithm_sub_scalar;
+    string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add";
+    const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
     arithmetic_scalar(src1, src2, dst, mask, kernelName, kernelString, 1);
 }
 void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, const oclMat &mask)
 {
-    string kernelName = mask.data ? "arithm_s_sub_with_mask" : "arithm_s_sub";
-    const char **kernelString = mask.data ? &arithm_sub_scalar_mask : &arithm_sub_scalar;
+    string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add";
+    const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
     arithmetic_scalar(src1, src2, dst, mask, kernelName, kernelString, -1);
 }
 void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)
...

modules/ocl/src/opencl/arithm_add.cl

...
@@ -52,6 +52,11 @@
 #endif
 #endif
+#ifdef ARITHM_ADD
+#define ARITHM_OP(A,B) ((A) + (B))
+#elif defined ARITHM_SUB
+#define ARITHM_OP(A,B) ((A) - (B))
+#endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////ADD////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
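This conditional macro is the heart of the merge: every kernel body below is rewritten in terms of ARITHM_OP(A,B), and whichever -D definition the host passes decides whether the compiled binary adds or subtracts. The pattern is plain C preprocessor, so it can be reproduced outside OpenCL; a minimal sketch in C++, compiled twice:

// arithm_op_demo.cpp -- the ARITHM_OP pattern on the C++ preprocessor.
// Build two variants of the same source:
//   g++ -DARITHM_ADD arithm_op_demo.cpp -o demo_add
//   g++ -DARITHM_SUB arithm_op_demo.cpp -o demo_sub
#include <cstdio>

#ifdef ARITHM_ADD
#define ARITHM_OP(A,B) ((A) + (B))
#elif defined ARITHM_SUB
#define ARITHM_OP(A,B) ((A) - (B))
#else
#error "build with -DARITHM_ADD or -DARITHM_SUB"
#endif

int main()
{
    // With -DARITHM_ADD this prints 12; with -DARITHM_SUB it prints 8.
    std::printf("%d\n", ARITHM_OP(10, 2));
    return 0;
}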
...
@@ -95,7 +100,7 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
         src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
     }
     uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-    short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
+    short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
     uchar4 tmp_data = convert_uchar4_sat(tmp);
     dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
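Worth noting in this and the following hunks: the operands are widened (uchar4 to short4 here) before ARITHM_OP and narrowed back with a saturating convert, so add clamps at the type's maximum and sub clamps at its minimum instead of wrapping. A scalar C++ model of the widen-then-saturate step (convert_uchar_sat is a hand-rolled stand-in for OpenCL's built-in):

#include <cstdio>
#include <algorithm>

// Scalar model of the pattern in arithm_add_D0: compute in a wider
// signed type, then clamp into the destination range.
static unsigned char convert_uchar_sat(short v)
{
    return (unsigned char)std::min<short>(255, std::max<short>(0, v));
}

int main()
{
    unsigned char a = 200, b = 100;
    short sum  = (short)a + (short)b;   // 300: would wrap in 8 bits
    short diff = (short)b - (short)a;   // -100: would wrap in 8 bits
    std::printf("add: %u  sub: %u\n",
                convert_uchar_sat(sum), convert_uchar_sat(diff));  // 255, 0
    return 0;
}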
...
@@ -134,7 +139,7 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
     ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
     ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-    int4 tmp = convert_int4_sat(src1_data) + convert_int4_sat(src2_data);
+    int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
     ushort4 tmp_data = convert_ushort4_sat(tmp);
     dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
...
@@ -172,7 +177,7 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
     short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
     short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-    int4 tmp = convert_int4_sat(src1_data) + convert_int4_sat(src2_data);
+    int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
     short4 tmp_data = convert_short4_sat(tmp);
     dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
...
@@ -200,7 +205,7 @@ __kernel void arithm_add_D4 (__global int *src1, int src1_step, int src1_offset,
     int data1 = *((__global int *)((__global char *)src1 + src1_index));
     int data2 = *((__global int *)((__global char *)src2 + src2_index));
-    long tmp = (long)(data1) + (long)(data2);
+    long tmp = ARITHM_OP((long)(data1), (long)(data2));
     *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
 }
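For the 32-bit integer kernel the same guard needs 64-bit intermediates: both ints are cast to long before ARITHM_OP and squeezed back with convert_int_sat, because int + int can exceed INT_MAX and int - int can undershoot INT_MIN. A scalar model (OpenCL's long is always 64-bit, hence long long in host C++):

#include <cstdio>
#include <climits>

// Scalar model of arithm_add_D4's (long)a OP (long)b -> convert_int_sat.
static int convert_int_sat(long long v)
{
    if (v > INT_MAX) return INT_MAX;
    if (v < INT_MIN) return INT_MIN;
    return (int)v;
}

int main()
{
    int a = INT_MAX, b = 1;
    long long tmp = (long long)a + (long long)b;  // no intermediate overflow
    std::printf("%d\n", convert_int_sat(tmp));    // saturates to INT_MAX
    return 0;
}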
...
@@ -221,7 +226,7 @@ __kernel void arithm_add_D5 (__global float *src1, int src1_step, int src1_offse
     float data1 = *((__global float *)((__global char *)src1 + src1_index));
     float data2 = *((__global float *)((__global char *)src2 + src2_index));
-    float tmp = data1 + data2;
+    float tmp = ARITHM_OP(data1, data2);
     *((__global float *)((__global char *)dst + dst_index)) = tmp;
 }
...
@@ -245,7 +250,7 @@ __kernel void arithm_add_D6 (__global double *src1, int src1_step, int src1_offs
     double data1 = *((__global double *)((__global char *)src1 + src1_index));
     double data2 = *((__global double *)((__global char *)src2 + src2_index));
-    *((__global double *)((__global char *)dst + dst_index)) = data1 + data2;
+    *((__global double *)((__global char *)dst + dst_index)) = ARITHM_OP(data1, data2);
     }
 }
 #endif
...
@@ -302,7 +307,7 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
     }
     uchar4 data = *((__global uchar4 *)(dst + dst_index));
-    short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
+    short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
     uchar4 tmp_data = convert_uchar4_sat(tmp);
     data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
...
@@ -344,7 +349,7 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
     uchar2 mask_data = vload2(0, mask + mask_index);
     ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-    int2 tmp = convert_int2_sat(src1_data) + convert_int2_sat(src2_data);
+    int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
     ushort2 tmp_data = convert_ushort2_sat(tmp);
     data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
...
@@ -384,7 +389,7 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
     uchar2 mask_data = vload2(0, mask + mask_index);
     short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-    int2 tmp = convert_int2_sat(src1_data) + convert_int2_sat(src2_data);
+    int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
     short2 tmp_data = convert_short2_sat(tmp);
     data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
...
@@ -416,7 +421,7 @@ __kernel void arithm_add_with_mask_C1_D4 (__global int *src1, int src1_step, i
     int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
     int dst_data = *((__global int *)((__global char *)dst + dst_index));
-    int data = convert_int_sat((long)src_data1 + (long)src_data2);
+    int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
     data = mask_data ? data : dst_data;
     *((__global int *)((__global char *)dst + dst_index)) = data;
...
@@ -446,7 +451,7 @@ __kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, i
     float src_data2 = *((__global float *)((__global char *)src2 + src2_index));
     float dst_data = *((__global float *)((__global char *)dst + dst_index));
-    float data = src_data1 + src_data2;
+    float data = ARITHM_OP(src_data1, src_data2);
     data = mask_data ? data : dst_data;
     *((__global float *)((__global char *)dst + dst_index)) = data;
...
@@ -477,7 +482,7 @@ __kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step,
     double src_data2 = *((__global double *)((__global char *)src2 + src2_index));
     double dst_data = *((__global double *)((__global char *)dst + dst_index));
-    double data = src_data1 + src_data2;
+    double data = ARITHM_OP(src_data1, src_data2);
     data = mask_data ? data : dst_data;
     *((__global double *)((__global char *)dst + dst_index)) = data;
...
@@ -516,7 +521,7 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
     uchar2 mask_data = vload2(0, mask + mask_index);
     uchar4 data = *((__global uchar4 *)(dst + dst_index));
-    short4 tmp = convert_short4_sat(src1_data) + convert_short4_sat(src2_data);
+    short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
     uchar4 tmp_data = convert_uchar4_sat(tmp);
     data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
...
@@ -548,7 +553,7 @@ __kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step,
     ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
     ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
-    int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2);
+    int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
     ushort2 data = convert_ushort2_sat(tmp);
     data = mask_data ? data : dst_data;
...
@@ -578,7 +583,7 @@ __kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, i
     short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
     short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
-    int2 tmp = convert_int2_sat(src_data1) + convert_int2_sat(src_data2);
+    int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
     short2 data = convert_short2_sat(tmp);
     data = mask_data ? data : dst_data;
...
@@ -608,7 +613,7 @@ __kernel void arithm_add_with_mask_C2_D4 (__global int *src1, int src1_step, i
     int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
     int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
-    int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2));
+    int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
     data = mask_data ? data : dst_data;
     *((__global int2 *)((__global char *)dst + dst_index)) = data;
...
@@ -637,7 +642,7 @@ __kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, i
     float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index));
     float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
-    float2 data = src_data1 + src_data2;
+    float2 data = ARITHM_OP(src_data1, src_data2);
     data = mask_data ? data : dst_data;
     *((__global float2 *)((__global char *)dst + dst_index)) = data;
...
@@ -668,7 +673,7 @@ __kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step,
     double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index));
     double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
-    double2 data = src_data1 + src_data2;
+    double2 data = ARITHM_OP(src_data1, src_data2);
     data = mask_data ? data : dst_data;
     *((__global double2 *)((__global char *)dst + dst_index)) = data;
...
@@ -699,7 +704,7 @@ __kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, i
     uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
     uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-    uchar4 data = convert_uchar4_sat(convert_ushort4_sat(src_data1) + convert_ushort4_sat(src_data2));
+    uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_short4_sat(src_data1), convert_short4_sat(src_data2)));
     data = mask_data ? data : dst_data;
     *((__global uchar4 *)(dst + dst_index)) = data;
...
@@ -728,7 +733,7 @@ __kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step,
     ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
     ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-    ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2));
+    ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
     data = mask_data ? data : dst_data;
     *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
...
@@ -757,7 +762,7 @@ __kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, i
     short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
     short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-    short4 data = convert_short4_sat(convert_int4_sat(src_data1) + convert_int4_sat(src_data2));
+    short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
     data = mask_data ? data : dst_data;
     *((__global short4 *)((__global char *)dst + dst_index)) = data;
...
@@ -786,7 +791,7 @@ __kernel void arithm_add_with_mask_C4_D4 (__global int *src1, int src1_step, i
     int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
     int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
-    int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src_data2));
+    int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src_data2)));
     data = mask_data ? data : dst_data;
     *((__global int4 *)((__global char *)dst + dst_index)) = data;
...
@@ -815,7 +820,7 @@ __kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, i
     float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index));
     float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
-    float4 data = src_data1 + src_data2;
+    float4 data = ARITHM_OP(src_data1, src_data2);
     data = mask_data ? data : dst_data;
     *((__global float4 *)((__global char *)dst + dst_index)) = data;
...
@@ -846,7 +851,7 @@ __kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step,
     double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index));
     double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
-    double4 data = src_data1 + src_data2;
+    double4 data = ARITHM_OP(src_data1, src_data2);
     data = mask_data ? data : dst_data;
     *((__global double4 *)((__global char *)dst + dst_index)) = data;
...

modules/ocl/src/opencl/arithm_add_scalar.cl

...
@@ -49,7 +49,12 @@
 #elif defined (cl_amd_fp64)
 #pragma OPENCL EXTENSION cl_amd_fp64:enable
 #endif
 #endif
+#ifdef ARITHM_ADD
+#define ARITHM_OP(A,B) ((A) + (B))
+#elif defined ARITHM_SUB
+#define ARITHM_OP(A,B) ((A) - (B))
+#endif
 /**************************************add with scalar without mask**************************************/
 __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
...
@@ -83,7 +88,7 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src
     }
     uchar4 data = *((__global uchar4 *)(dst + dst_index));
-    int4 tmp = convert_int4_sat(src1_data) + src2_data;
+    int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
     uchar4 tmp_data = convert_uchar4_sat(tmp);
     data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
...
@@ -120,7 +125,7 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr
     int2 src2_data = (int2)(src2.x, src2.x);
     ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-    int2 tmp = convert_int2_sat(src1_data) + src2_data;
+    int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
     ushort2 tmp_data = convert_ushort2_sat(tmp);
     data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
...
@@ -155,7 +160,7 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src
     int2 src2_data = (int2)(src2.x, src2.x);
     short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-    int2 tmp = convert_int2_sat(src1_data) + src2_data;
+    int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
     short2 tmp_data = convert_short2_sat(tmp);
     data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
...
@@ -181,7 +186,7 @@ __kernel void arithm_s_add_C1_D4 (__global int *src1, int src1_step, int src1_
     int src_data2 = src2.x;
     int dst_data = *((__global int *)((__global char *)dst + dst_index));
-    int data = convert_int_sat((long)src_data1 + (long)src_data2);
+    int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
     *((__global int *)((__global char *)dst + dst_index)) = data;
 }
...
@@ -203,7 +208,7 @@ __kernel void arithm_s_add_C1_D5 (__global float *src1, int src1_step, int src
     float src_data2 = src2.x;
     float dst_data = *((__global float *)((__global char *)dst + dst_index));
-    float data = src_data1 + src_data2;
+    float data = ARITHM_OP(src_data1, src_data2);
     *((__global float *)((__global char *)dst + dst_index)) = data;
 }
...
@@ -227,7 +232,7 @@ __kernel void arithm_s_add_C1_D6 (__global double *src1, int src1_step, int sr
     double src2_data = src2.x;
     double dst_data = *((__global double *)((__global char *)dst + dst_index));
-    double data = src_data1 + src2_data;
+    double data = ARITHM_OP(src_data1, src2_data);
     *((__global double *)((__global char *)dst + dst_index)) = data;
 }
...
@@ -260,7 +265,7 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src
     int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
     uchar4 data = *((__global uchar4 *)(dst + dst_index));
-    int4 tmp = convert_int4_sat(src1_data) + src2_data;
+    int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
     uchar4 tmp_data = convert_uchar4_sat(tmp);
     data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
...
@@ -286,7 +291,7 @@ __kernel void arithm_s_add_C2_D2 (__global ushort *src1, int src1_step, int sr
     int2 src_data2 = (int2)(src2.x, src2.y);
     ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
-    int2 tmp = convert_int2_sat(src_data1) + src_data2;
+    int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
     ushort2 data = convert_ushort2_sat(tmp);
     *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
...
@@ -309,7 +314,7 @@ __kernel void arithm_s_add_C2_D3 (__global short *src1, int src1_step, int src
     int2 src_data2 = (int2)(src2.x, src2.y);
     short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
-    int2 tmp = convert_int2_sat(src_data1) + src_data2;
+    int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
     short2 data = convert_short2_sat(tmp);
     *((__global short2 *)((__global char *)dst + dst_index)) = data;
...
@@ -332,7 +337,7 @@ __kernel void arithm_s_add_C2_D4 (__global int *src1, int src1_step, int src1_
     int2 src_data2 = (int2)(src2.x, src2.y);
     int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
-    int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2));
+    int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
     *((__global int2 *)((__global char *)dst + dst_index)) = data;
     }
 }
...
@@ -353,7 +358,7 @@ __kernel void arithm_s_add_C2_D5 (__global float *src1, int src1_step, int src
     float2 src_data2 = (float2)(src2.x, src2.y);
     float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
-    float2 data = src_data1 + src_data2;
+    float2 data = ARITHM_OP(src_data1, src_data2);
     *((__global float2 *)((__global char *)dst + dst_index)) = data;
     }
 }
...
@@ -376,7 +381,7 @@ __kernel void arithm_s_add_C2_D6 (__global double *src1, int src1_step, int sr
     double2 src_data2 = (double2)(src2.x, src2.y);
     double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
-    double2 data = src_data1 + src_data2;
+    double2 data = ARITHM_OP(src_data1, src_data2);
     *((__global double2 *)((__global char *)dst + dst_index)) = data;
 }
...
@@ -398,7 +403,7 @@ __kernel void arithm_s_add_C4_D0 (__global uchar *src1, int src1_step, int src
     uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-    uchar4 data = convert_uchar4_sat(convert_int4_sat(src_data1) + src2);
+    uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
     *((__global uchar4 *)(dst + dst_index)) = data;
 }
...
@@ -418,7 +423,7 @@ __kernel void arithm_s_add_C4_D2 (__global ushort *src1, int src1_step, int sr
     ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-    ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + src2);
+    ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
     *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
 }
...
@@ -438,7 +443,7 @@ __kernel void arithm_s_add_C4_D3 (__global short *src1, int src1_step, int src
     short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-    short4 data = convert_short4_sat(convert_int4_sat(src_data1) + src2);
+    short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
     *((__global short4 *)((__global char *)dst + dst_index)) = data;
 }
...
@@ -458,7 +463,7 @@ __kernel void arithm_s_add_C4_D4 (__global int *src1, int src1_step, int src1_
     int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-    int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src2));
+    int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2)));
     *((__global int4 *)((__global char *)dst + dst_index)) = data;
 }
...
@@ -478,7 +483,7 @@ __kernel void arithm_s_add_C4_D5 (__global float *src1, int src1_step, int src
     float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
-    float4 data = src_data1 + src2;
+    float4 data = ARITHM_OP(src_data1, src2);
     *((__global float4 *)((__global char *)dst + dst_index)) = data;
 }
...
@@ -500,7 +505,7 @@ __kernel void arithm_s_add_C4_D6 (__global double *src1, int src1_step, int sr
     double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
-    double4 data = src_data1 + src2;
+    double4 data = ARITHM_OP(src_data1, src2);
     *((__global double4 *)((__global char *)dst + dst_index)) = data;
 }
...

modules/ocl/src/opencl/arithm_add_scalar_mask.cl

...
@@ -51,6 +51,11 @@
 #endif
 #endif
+#ifdef ARITHM_ADD
+#define ARITHM_OP(A,B) ((A) + (B))
+#elif defined ARITHM_SUB
+#define ARITHM_OP(A,B) ((A) - (B))
+#endif
 /**************************************add with scalar with mask**************************************/
 __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                             __global uchar *dst, int dst_step, int dst_offset,
...
@@ -94,7 +99,7 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste
     }
     uchar4 data = *((__global uchar4 *)(dst + dst_index));
-    int4 tmp = convert_int4_sat(src1_data) + src2_data;
+    int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
     uchar4 tmp_data = convert_uchar4_sat(tmp);
     data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
...
@@ -134,7 +139,7 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st
     uchar2 mask_data = vload2(0, mask + mask_index);
     ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-    int2 tmp = convert_int2_sat(src1_data) + src2_data;
+    int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
     ushort2 tmp_data = convert_ushort2_sat(tmp);
     data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
...
@@ -172,7 +177,7 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste
     uchar2 mask_data = vload2(0, mask + mask_index);
     short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-    int2 tmp = convert_int2_sat(src1_data) + src2_data;
+    int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
     short2 tmp_data = convert_short2_sat(tmp);
     data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
...
@@ -202,7 +207,7 @@ __kernel void arithm_s_add_with_mask_C1_D4 (__global int *src1, int src1_ste
     int src_data2 = src2.x;
     int dst_data = *((__global int *)((__global char *)dst + dst_index));
-    int data = convert_int_sat((long)src_data1 + (long)src_data2);
+    int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
     data = mask_data ? data : dst_data;
     *((__global int *)((__global char *)dst + dst_index)) = data;
...
@@ -230,7 +235,7 @@ __kernel void arithm_s_add_with_mask_C1_D5 (__global float *src1, int src1_s
     float src_data2 = src2.x;
     float dst_data = *((__global float *)((__global char *)dst + dst_index));
-    float data = src_data1 + src_data2;
+    float data = ARITHM_OP(src_data1, src_data2);
     data = mask_data ? data : dst_data;
     *((__global float *)((__global char *)dst + dst_index)) = data;
...
@@ -260,7 +265,7 @@ __kernel void arithm_s_add_with_mask_C1_D6 (__global double *src1, int src1_
     double src_data2 = src2.x;
     double dst_data = *((__global double *)((__global char *)dst + dst_index));
-    double data = src_data1 + src_data2;
+    double data = ARITHM_OP(src_data1, src_data2);
     data = mask_data ? data : dst_data;
     *((__global double *)((__global char *)dst + dst_index)) = data;
...
@@ -296,7 +301,7 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste
     uchar2 mask_data = vload2(0, mask + mask_index);
     uchar4 data = *((__global uchar4 *)(dst + dst_index));
-    int4 tmp = convert_int4_sat(src1_data) + src2_data;
+    int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
     uchar4 tmp_data = convert_uchar4_sat(tmp);
     data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
...
@@ -326,7 +331,7 @@ __kernel void arithm_s_add_with_mask_C2_D2 (__global ushort *src1, int src1_st
     int2 src_data2 = (int2)(src2.x, src2.y);
     ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
-    int2 tmp = convert_int2_sat(src_data1) + src_data2;
+    int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
     ushort2 data = convert_ushort2_sat(tmp);
     data = mask_data ? data : dst_data;
...
@@ -354,7 +359,7 @@ __kernel void arithm_s_add_with_mask_C2_D3 (__global short *src1, int src1_ste
     int2 src_data2 = (int2)(src2.x, src2.y);
     short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
-    int2 tmp = convert_int2_sat(src_data1) + src_data2;
+    int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
     short2 data = convert_short2_sat(tmp);
     data = mask_data ? data : dst_data;
...
@@ -382,7 +387,7 @@ __kernel void arithm_s_add_with_mask_C2_D4 (__global int *src1, int src1_step,
     int2 src_data2 = (int2)(src2.x, src2.y);
     int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
-    int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2));
+    int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
     data = mask_data ? data : dst_data;
     *((__global int2 *)((__global char *)dst + dst_index)) = data;
...
@@ -409,7 +414,7 @@ __kernel void arithm_s_add_with_mask_C2_D5 (__global float *src1, int src1_ste
     float2 src_data2 = (float2)(src2.x, src2.y);
     float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
-    float2 data = src_data1 + src_data2;
+    float2 data = ARITHM_OP(src_data1, src_data2);
     data = mask_data ? data : dst_data;
     *((__global float2 *)((__global char *)dst + dst_index)) = data;
...
@@ -438,7 +443,7 @@ __kernel void arithm_s_add_with_mask_C2_D6 (__global double *src1, int src1_st
     double2 src_data2 = (double2)(src2.x, src2.y);
     double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
-    double2 data = src_data1 + src_data2;
+    double2 data = ARITHM_OP(src_data1, src_data2);
     data = mask_data ? data : dst_data;
     *((__global double2 *)((__global char *)dst + dst_index)) = data;
...
@@ -451,7 +456,6 @@ __kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_ste
                                             __global uchar *mask, int mask_step, int mask_offset,
                                             int4 src2, int rows, int cols, int dst_step1)
-
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
...
@@ -466,7 +470,7 @@ __kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_ste
     uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
     uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-    uchar4 data = convert_uchar4_sat(convert_int4_sat(src_data1) + src2);
+    uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
     data = mask_data ? data : dst_data;
     *((__global uchar4 *)(dst + dst_index)) = data;
...
@@ -492,7 +496,7 @@ __kernel void arithm_s_add_with_mask_C4_D2 (__global ushort *src1, int src1_st
     ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
     ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-    ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + src2);
+    ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
     data = mask_data ? data : dst_data;
     *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
...
@@ -518,7 +522,7 @@ __kernel void arithm_s_add_with_mask_C4_D3 (__global short *src1, int src1_ste
     short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
     short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-    short4 data = convert_short4_sat(convert_int4_sat(src_data1) + src2);
+    short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
     data = mask_data ? data : dst_data;
     *((__global short4 *)((__global char *)dst + dst_index)) = data;
...
@@ -544,7 +548,7 @@ __kernel void arithm_s_add_with_mask_C4_D4 (__global int *src1, int src1_step,
     int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
     int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
-    int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src2));
+    int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2)));
     data = mask_data ? data : dst_data;
     *((__global int4 *)((__global char *)dst + dst_index)) = data;
...
@@ -570,7 +574,7 @@ __kernel void arithm_s_add_with_mask_C4_D5 (__global float *src1, int src1_ste
     float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
     float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
-    float4 data = src_data1 + src2;
+    float4 data = ARITHM_OP(src_data1, src2);
     data = mask_data ? data : dst_data;
     *((__global float4 *)((__global char *)dst + dst_index)) = data;
...
@@ -598,7 +602,7 @@ __kernel void arithm_s_add_with_mask_C4_D6 (__global double *src1, int src1_st
     double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
     double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
-    double4 data = src_data1 + src2;
+    double4 data = ARITHM_OP(src_data1, src2);
     data = mask_data ? data : dst_data;
     *((__global double4 *)((__global char *)dst + dst_index)) = data;
...

modules/ocl/src/opencl/arithm_sub.cl  deleted 100644 → 0
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other GpuMaterials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////SUB////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************sub without mask**************************************/
__kernel void arithm_sub_D0 (__global uchar *src1, int src1_step, int src1_offset,
                             __global uchar *src2, int src2_step, int src2_offset,
                             __global uchar *dst,  int dst_step,  int dst_offset,
                             int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        uchar4 src1_data = vload4(0, src1 + src1_index);
        uchar4 src2_data = vload4(0, src2 + src2_index);

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        short4 tmp      = convert_short4_sat(src1_data) - convert_short4_sat(src2_data);
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global uchar4 *)(dst + dst_index)) = dst_data;
    }
}
__kernel void arithm_sub_D2 (__global ushort *src1, int src1_step, int src1_offset,
                             __global ushort *src2, int src2_step, int src2_offset,
                             __global ushort *dst,  int dst_step,  int dst_offset,
                             int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));

        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        int4    tmp = convert_int4_sat(src1_data) - convert_int4_sat(src2_data);
        ushort4 tmp_data = convert_ushort4_sat(tmp);

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
__kernel void arithm_sub_D3 (__global short *src1, int src1_step, int src1_offset,
                             __global short *src2, int src2_step, int src2_offset,
                             __global short *dst,  int dst_step,  int dst_offset,
                             int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));

        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
        int4   tmp = convert_int4_sat(src1_data) - convert_int4_sat(src2_data);
        short4 tmp_data = convert_short4_sat(tmp);

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
__kernel void arithm_sub_D4 (__global int *src1, int src1_step, int src1_offset,
                             __global int *src2, int src2_step, int src2_offset,
                             __global int *dst,  int dst_step,  int dst_offset,
                             int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        int data1 = *((__global int *)((__global char *)src1 + src1_index));
        int data2 = *((__global int *)((__global char *)src2 + src2_index));
        long tmp  = (long)(data1) - (long)(data2);

        *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
    }
}
__kernel void arithm_sub_D5 (__global float *src1, int src1_step, int src1_offset,
                             __global float *src2, int src2_step, int src2_offset,
                             __global float *dst,  int dst_step,  int dst_offset,
                             int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        float data1 = *((__global float *)((__global char *)src1 + src1_index));
        float data2 = *((__global float *)((__global char *)src2 + src2_index));
        float tmp   = data1 - data2;

        *((__global float *)((__global char *)dst + dst_index)) = tmp;
    }
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_sub_D6 (__global double *src1, int src1_step, int src1_offset,
                             __global double *src2, int src2_step, int src2_offset,
                             __global double *dst,  int dst_step,  int dst_offset,
                             int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        double data1 = *((__global double *)((__global char *)src1 + src1_index));
        double data2 = *((__global double *)((__global char *)src2 + src2_index));

        *((__global double *)((__global char *)dst + dst_index)) = data1 - data2;
    }
}
#endif
/**************************************sub with mask**************************************/
__kernel void arithm_sub_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                          __global uchar *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global uchar *dst,  int dst_step,  int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        uchar4 src1_data = vload4(0, src1 + src1_index);
        uchar4 src2_data = vload4(0, src2 + src2_index);
        uchar4 mask_data = vload4(0, mask + mask_index);

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        short4 tmp = convert_short4_sat(src1_data) - convert_short4_sat(src2_data);
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_sub_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                          __global ushort *src2, int src2_step, int src2_offset,
                                          __global uchar  *mask, int mask_step, int mask_offset,
                                          __global ushort *dst,  int dst_step,  int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
        ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index));
        uchar2  mask_data = vload2(0, mask + mask_index);

        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
        int2    tmp = convert_int2_sat(src1_data) - convert_int2_sat(src2_data);
        ushort2 tmp_data = convert_ushort2_sat(tmp);

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end))   ? tmp_data.y : data.y;

        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
    }
}
__kernel void arithm_sub_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
                                          __global short *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global short *dst,  int dst_step,  int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
        short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
        uchar2 mask_data = vload2(0, mask + mask_index);

        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
        int2   tmp = convert_int2_sat(src1_data) - convert_int2_sat(src2_data);
        short2 tmp_data = convert_short2_sat(tmp);

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end))   ? tmp_data.y : data.y;

        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
    }
}
__kernel void arithm_sub_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
                                          __global int *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global int *dst,  int dst_step,  int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
        int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
        int dst_data  = *((__global int *)((__global char *)dst + dst_index));

        int data = convert_int_sat((long)src_data1 - (long)src_data2);
        data = mask_data ? data : dst_data;

        *((__global int *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_sub_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset,
                                          __global float *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global float *dst,  int dst_step,  int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
        float src_data2 = *((__global float *)((__global char *)src2 + src2_index));
        float dst_data  = *((__global float *)((__global char *)dst + dst_index));

        float data = src_data1 - src_data2;
        data = mask_data ? data : dst_data;

        *((__global float *)((__global char *)dst + dst_index)) = data;
    }
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_sub_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset,
                                          __global double *src2, int src2_step, int src2_offset,
                                          __global uchar  *mask, int mask_step, int mask_offset,
                                          __global double *dst,  int dst_step,  int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
        double src_data2 = *((__global double *)((__global char *)src2 + src2_index));
        double dst_data  = *((__global double *)((__global char *)dst + dst_index));

        double data = src_data1 - src_data2;
        data = mask_data ? data : dst_data;

        *((__global double *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
__kernel void arithm_sub_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                          __global uchar *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global uchar *dst, int dst_step, int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        uchar4 src1_data = vload4(0, src1 + src1_index);
        uchar4 src2_data = vload4(0, src2 + src2_index);
        uchar2 mask_data = vload2(0, mask + mask_index);

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        short4 tmp = convert_short4_sat(src1_data) - convert_short4_sat(src2_data);
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end))   ? tmp_data.zw : data.zw;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_sub_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                          __global ushort *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global ushort *dst, int dst_step, int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
        ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));

        int2 tmp = convert_int2_sat(src_data1) - convert_int2_sat(src_data2);
        ushort2 data = convert_ushort2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_sub_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
                                          __global short *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global short *dst, int dst_step, int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
        short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));

        int2 tmp = convert_int2_sat(src_data1) - convert_int2_sat(src_data2);
        short2 data = convert_short2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_sub_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
                                          __global int *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global int *dst, int dst_step, int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
        int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));

        int2 data = convert_int2_sat(convert_long2_sat(src_data1) - convert_long2_sat(src_data2));
        data = mask_data ? data : dst_data;

        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_sub_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset,
                                          __global float *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global float *dst, int dst_step, int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
        float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index));
        float2 dst_data  = *((__global float2 *)((__global char *)dst  + dst_index));

        float2 data = src_data1 - src_data2;
        data = mask_data ? data : dst_data;

        *((__global float2 *)((__global char *)dst + dst_index)) = data;
    }
}

#if defined (DOUBLE_SUPPORT)
__kernel void arithm_sub_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset,
                                          __global double *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global double *dst, int dst_step, int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
        double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index));
        double2 dst_data  = *((__global double2 *)((__global char *)dst  + dst_index));

        double2 data = src_data1 - src_data2;
        data = mask_data ? data : dst_data;

        *((__global double2 *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
__kernel void arithm_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                          __global uchar *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global uchar *dst, int dst_step, int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
        uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));

        uchar4 data = convert_uchar4_sat(convert_short4_sat(src_data1) - convert_short4_sat(src_data2));
        data = mask_data ? data : dst_data;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_sub_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                          __global ushort *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global ushort *dst, int dst_step, int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
        ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
        ushort4 dst_data  = *((__global ushort4 *)((__global char *)dst  + dst_index));

        ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) - convert_int4_sat(src_data2));
        data = mask_data ? data : dst_data;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_sub_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
                                          __global short *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global short *dst, int dst_step, int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
        short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
        short4 dst_data  = *((__global short4 *)((__global char *)dst  + dst_index));

        short4 data = convert_short4_sat(convert_int4_sat(src_data1) - convert_int4_sat(src_data2));
        data = mask_data ? data : dst_data;

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_sub_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
                                          __global int *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global int *dst, int dst_step, int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
        int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
        int4 dst_data  = *((__global int4 *)((__global char *)dst  + dst_index));

        int4 data = convert_int4_sat(convert_long4_sat(src_data1) - convert_long4_sat(src_data2));
        data = mask_data ? data : dst_data;

        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_sub_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
                                          __global float *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global float *dst, int dst_step, int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
        float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index));
        float4 dst_data  = *((__global float4 *)((__global char *)dst  + dst_index));

        float4 data = src_data1 - src_data2;
        data = mask_data ? data : dst_data;

        *((__global float4 *)((__global char *)dst + dst_index)) = data;
    }
}

#if defined (DOUBLE_SUPPORT)
__kernel void arithm_sub_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
                                          __global double *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global double *dst, int dst_step, int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 5) + src2_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
        double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index));
        double4 dst_data  = *((__global double4 *)((__global char *)dst  + dst_index));

        double4 data = src_data1 - src_data2;
        data = mask_data ? data : dst_data;

        *((__global double4 *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
modules/ocl/src/opencl/arithm_sub_scalar.cl
deleted 100644 → 0 (bee970ab)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other oclMaterials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
/**************************************sub with scalar without mask**************************************/
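Every kernel in this file widens the operands, subtracts, then conditionally negates the difference, so one kernel computes either mat - scalar or scalar - mat depending on the isMatSubScalar flag. A minimal sketch of the convention (the helper name is hypothetical, not part of the original source):

// Sketch only: the isMatSubScalar convention used throughout this file.
// a holds matrix pixels widened to int4, b the broadcast scalar.
inline int4 sub_either_order(int4 a, int4 b, int isMatSubScalar)
{
    int4 d = a - b;                 // mat - scalar, computed in the wide type
    return isMatSubScalar ? d : -d; // flip the sign to get scalar - mat
}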
__kernel void arithm_s_sub_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                  __global uchar *dst, int dst_step, int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        uchar4 src1_data = vload4(0, src1 + src1_index);
        int4   src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        int4 tmp = convert_int4_sat(src1_data) - src2_data;
        tmp = isMatSubScalar ? tmp : -tmp;
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                  __global ushort *dst, int dst_step, int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
        int2    src2_data = (int2)(src2.x, src2.x);

        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
        int2 tmp = convert_int2_sat(src1_data) - src2_data;
        tmp = isMatSubScalar ? tmp : -tmp;
        ushort2 tmp_data = convert_ushort2_sat(tmp);

        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
        data.y = (dst_index + 2 <  dst_end)   ? tmp_data.y : data.y;

        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_C1_D3 (__global short *src1, int src1_step, int src1_offset,
                                  __global short *dst, int dst_step, int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
        int2   src2_data = (int2)(src2.x, src2.x);

        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
        int2 tmp = convert_int2_sat(src1_data) - src2_data;
        tmp = isMatSubScalar ? tmp : -tmp;
        short2 tmp_data = convert_short2_sat(tmp);

        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
        data.y = (dst_index + 2 <  dst_end)   ? tmp_data.y : data.y;

        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_C1_D4 (__global int *src1, int src1_step, int src1_offset,
                                  __global int *dst, int dst_step, int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
        int src_data2 = src2.x;

        long tmp = (long)src_data1 - (long)src_data2;
        tmp = isMatSubScalar ? tmp : -tmp;
        int data = convert_int_sat(tmp);

        *((__global int *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_C1_D5 (__global float *src1, int src1_step, int src1_offset,
                                  __global float *dst, int dst_step, int dst_offset,
                                  float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
        float src_data2 = src2.x;

        float tmp = src_data1 - src_data2;
        tmp = isMatSubScalar ? tmp : -tmp;

        *((__global float *)((__global char *)dst + dst_index)) = tmp;
    }
}

#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_C1_D6 (__global double *src1, int src1_step, int src1_offset,
                                  __global double *dst, int dst_step, int dst_offset,
                                  double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
        double src2_data = src2.x;

        double data = src_data1 - src2_data;
        data = isMatSubScalar ? data : -data;

        *((__global double *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
__kernel void arithm_s_sub_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                  __global uchar *dst, int dst_step, int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        uchar4 src1_data = vload4(0, src1 + src1_index);
        int4   src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        int4 tmp = convert_int4_sat(src1_data) - src2_data;
        tmp = isMatSubScalar ? tmp : -tmp;
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
        data.zw = (dst_index + 2 <  dst_end)   ? tmp_data.zw : data.zw;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                  __global ushort *dst, int dst_step, int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
        int2    src_data2 = (int2)(src2.x, src2.y);
        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst + dst_index));

        int2 tmp = convert_int2_sat(src_data1) - src_data2;
        tmp = isMatSubScalar ? tmp : -tmp;
        ushort2 data = convert_ushort2_sat(tmp);

        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_C2_D3 (__global short *src1, int src1_step, int src1_offset,
                                  __global short *dst, int dst_step, int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
        int2   src_data2 = (int2)(src2.x, src2.y);
        short2 dst_data  = *((__global short2 *)((__global char *)dst + dst_index));

        int2 tmp = convert_int2_sat(src_data1) - src_data2;
        tmp = isMatSubScalar ? tmp : -tmp;
        short2 data = convert_short2_sat(tmp);

        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_C2_D4 (__global int *src1, int src1_step, int src1_offset,
                                  __global int *dst, int dst_step, int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
        int2 src_data2 = (int2)(src2.x, src2.y);
        int2 dst_data  = *((__global int2 *)((__global char *)dst + dst_index));

        long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2);
        tmp = isMatSubScalar ? tmp : -tmp;
        int2 data = convert_int2_sat(tmp);

        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_C2_D5 (__global float *src1, int src1_step, int src1_offset,
                                  __global float *dst, int dst_step, int dst_offset,
                                  float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
        float2 src_data2 = (float2)(src2.x, src2.y);
        float2 dst_data  = *((__global float2 *)((__global char *)dst + dst_index));

        float2 tmp = src_data1 - src_data2;
        tmp = isMatSubScalar ? tmp : -tmp;

        *((__global float2 *)((__global char *)dst + dst_index)) = tmp;
    }
}

#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_C2_D6 (__global double *src1, int src1_step, int src1_offset,
                                  __global double *dst, int dst_step, int dst_offset,
                                  double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
        double2 src_data2 = (double2)(src2.x, src2.y);
        double2 dst_data  = *((__global double2 *)((__global char *)dst + dst_index));

        double2 data = src_data1 - src_data2;
        data = isMatSubScalar ? data : -data;

        *((__global double2 *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
__kernel void arithm_s_sub_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                  __global uchar *dst, int dst_step, int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));

        int4 tmp = convert_int4_sat(src_data1) - src2;
        tmp = isMatSubScalar ? tmp : -tmp;
        uchar4 data = convert_uchar4_sat(tmp);

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                  __global ushort *dst, int dst_step, int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));

        int4 tmp = convert_int4_sat(src_data1) - src2;
        tmp = isMatSubScalar ? tmp : -tmp;
        ushort4 data = convert_ushort4_sat(tmp);

        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_C4_D3 (__global short *src1, int src1_step, int src1_offset,
                                  __global short *dst, int dst_step, int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));

        int4 tmp = convert_int4_sat(src_data1) - src2;
        tmp = isMatSubScalar ? tmp : -tmp;
        short4 data = convert_short4_sat(tmp);

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_C4_D4 (__global int *src1, int src1_step, int src1_offset,
                                  __global int *dst, int dst_step, int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));

        long4 tmp = convert_long4_sat(src_data1) - convert_long4_sat(src2);
        tmp = isMatSubScalar ? tmp : -tmp;
        int4 data = convert_int4_sat(tmp);

        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_C4_D5 (__global float *src1, int src1_step, int src1_offset,
                                  __global float *dst, int dst_step, int dst_offset,
                                  float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));

        float4 tmp = src_data1 - src2;
        tmp = isMatSubScalar ? tmp : -tmp;

        *((__global float4 *)((__global char *)dst + dst_index)) = tmp;
    }
}

#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_C4_D6 (__global double *src1, int src1_step, int src1_offset,
                                  __global double *dst, int dst_step, int dst_offset,
                                  double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);

        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));

        double4 data = src_data1 - src2;
        data = isMatSubScalar ? data : -data;

        *((__global double4 *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
modules/ocl/src/opencl/arithm_sub_scalar_mask.cl
deleted 100644 → 0 (bee970ab)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other GpuMaterials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
/**************************************sub with scalar with mask**************************************/
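The kernels below combine the sign-flip convention from the unmasked scalar file with a per-pixel mask: the freshly computed value replaces the destination only where the mask byte is non-zero, otherwise the old destination pixel is kept. A compact sketch of the select (the helper name is hypothetical):

// Sketch only: per-pixel mask select as used by the *_with_mask kernels.
inline int4 apply_mask(uchar mask_byte, int4 new_val, int4 old_val)
{
    return mask_byte ? new_val : old_val; // one mask byte gates all channels
}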
__kernel void arithm_s_sub_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                            __global uchar *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        uchar4 src1_data = vload4(0, src1 + src1_index);
        int4   src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
        uchar4 mask_data = vload4(0, mask + mask_index);

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        int4 tmp = convert_int4_sat(src1_data) - src2_data;
        tmp = isMatSubScalar ? tmp : -tmp;
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                            __global ushort *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
        int2    src2_data = (int2)(src2.x, src2.x);
        uchar2  mask_data = vload2(0, mask + mask_index);

        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
        int2 tmp = convert_int2_sat(src1_data) - src2_data;
        tmp = isMatSubScalar ? tmp : -tmp;
        ushort2 tmp_data = convert_ushort2_sat(tmp);

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end))   ? tmp_data.y : data.y;

        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
                                            __global short *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
        int2   src2_data = (int2)(src2.x, src2.x);
        uchar2 mask_data = vload2(0, mask + mask_index);

        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
        int2 tmp = convert_int2_sat(src1_data) - src2_data;
        tmp = isMatSubScalar ? tmp : -tmp;
        short2 tmp_data = convert_short2_sat(tmp);

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end))   ? tmp_data.y : data.y;

        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
                                            __global int *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
        int src_data2 = src2.x;
        int dst_data  = *((__global int *)((__global char *)dst + dst_index));

        long tmp = (long)src_data1 - (long)src_data2;
        tmp = isMatSubScalar ? tmp : -tmp;
        int data = convert_int_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global int *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset,
                                            __global float *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
        float src_data2 = src2.x;
        float dst_data  = *((__global float *)((__global char *)dst + dst_index));

        float data = src_data1 - src_data2;
        data = isMatSubScalar ? data : -data;
        data = mask_data ? data : dst_data;

        *((__global float *)((__global char *)dst + dst_index)) = data;
    }
}

#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset,
                                            __global double *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
        double src_data2 = src2.x;
        double dst_data  = *((__global double *)((__global char *)dst + dst_index));

        double data = src_data1 - src_data2;
        data = isMatSubScalar ? data : -data;
        data = mask_data ? data : dst_data;

        *((__global double *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
__kernel void arithm_s_sub_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                            __global uchar *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        uchar4 src1_data = vload4(0, src1 + src1_index);
        int4   src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
        uchar2 mask_data = vload2(0, mask + mask_index);

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        int4 tmp = convert_int4_sat(src1_data) - src2_data;
        tmp = isMatSubScalar ? tmp : -tmp;
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end))   ? tmp_data.zw : data.zw;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                            __global ushort *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
        int2    src_data2 = (int2)(src2.x, src2.y);
        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst + dst_index));

        int2 tmp = convert_int2_sat(src_data1) - src_data2;
        tmp = isMatSubScalar ? tmp : -tmp;
        ushort2 data = convert_ushort2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
                                            __global short *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
        int2   src_data2 = (int2)(src2.x, src2.y);
        short2 dst_data  = *((__global short2 *)((__global char *)dst + dst_index));

        int2 tmp = convert_int2_sat(src_data1) - src_data2;
        tmp = isMatSubScalar ? tmp : -tmp;
        short2 data = convert_short2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
                                            __global int *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
        int2 src_data2 = (int2)(src2.x, src2.y);
        int2 dst_data  = *((__global int2 *)((__global char *)dst + dst_index));

        long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2);
        tmp = isMatSubScalar ? tmp : -tmp;
        int2 data = convert_int2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset,
                                            __global float *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
        float2 src_data2 = (float2)(src2.x, src2.y);
        float2 dst_data  = *((__global float2 *)((__global char *)dst + dst_index));

        float2 data = src_data1 - src_data2;
        data = isMatSubScalar ? data : -data;
        data = mask_data ? data : dst_data;

        *((__global float2 *)((__global char *)dst + dst_index)) = data;
    }
}

#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset,
                                            __global double *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
        double2 src_data2 = (double2)(src2.x, src2.y);
        double2 dst_data  = *((__global double2 *)((__global char *)dst + dst_index));

        double2 data = src_data1 - src_data2;
        data = isMatSubScalar ? data : -data;
        data = mask_data ? data : dst_data;

        *((__global double2 *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
__kernel void arithm_s_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                            __global uchar *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));

        int4 tmp = convert_int4_sat(src_data1) - src2;
        tmp = isMatSubScalar ? tmp : -tmp;
        uchar4 data = convert_uchar4_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                            __global ushort *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
        ushort4 dst_data  = *((__global ushort4 *)((__global char *)dst + dst_index));

        int4 tmp = convert_int4_sat(src_data1) - src2;
        tmp = isMatSubScalar ? tmp : -tmp;
        ushort4 data = convert_ushort4_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
                                            __global short *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
        short4 dst_data  = *((__global short4 *)((__global char *)dst + dst_index));

        int4 tmp = convert_int4_sat(src_data1) - src2;
        tmp = isMatSubScalar ? tmp : -tmp;
        short4 data = convert_short4_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
                                            __global int *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
        int4 dst_data  = *((__global int4 *)((__global char *)dst + dst_index));

        long4 tmp = convert_long4_sat(src_data1) - convert_long4_sat(src2);
        tmp = isMatSubScalar ? tmp : -tmp;
        int4 data = convert_int4_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_sub_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
                                            __global float *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
        float4 dst_data  = *((__global float4 *)((__global char *)dst + dst_index));

        float4 data = src_data1 - src2;
        data = isMatSubScalar ? data : -data;
        data = mask_data ? data : dst_data;

        *((__global float4 *)((__global char *)dst + dst_index)) = data;
    }
}

#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
                                            __global double *dst, int dst_step, int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
        int mask_index = mad24(y, mask_step,  x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);

        uchar mask_data = *(mask + mask_index);
        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
        double4 dst_data  = *((__global double4 *)((__global char *)dst + dst_index));

        double4 data = src_data1 - src2;
        data = isMatSubScalar ? data : -data;
        data = mask_data ? data : dst_data;

        *((__global double4 *)((__global char *)dst + dst_index)) = data;
    }
}
#endif