Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
8e0e352d
Commit
8e0e352d
authored
Sep 24, 2013
by
Ilya Lavrenov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refactored and extended binary bitwise operations
parent
161674bf
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
106 additions
and
2565 deletions
+106
-2565
arithm.cpp
modules/ocl/src/arithm.cpp
+59
-266
arithm_bitwise_binary.cl
modules/ocl/src/opencl/arithm_bitwise_binary.cl
+10
-288
arithm_bitwise_binary_mask.cl
modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
+14
-750
arithm_bitwise_binary_scalar.cl
modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
+10
-580
arithm_bitwise_binary_scalar_mask.cl
modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
+13
-681
No files found.
modules/ocl/src/arithm.cpp
View file @
8e0e352d
...
...
@@ -1290,7 +1290,8 @@ int cv::ocl::countNonZero(const oclMat &src)
//////////////////////////////////////////////////////////////////////////////
////////////////////////////////bitwise_op////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
static
void
bitwise_run
(
const
oclMat
&
src1
,
oclMat
&
dst
,
string
kernelName
,
const
char
**
kernelString
)
static
void
bitwise_unary_run
(
const
oclMat
&
src1
,
oclMat
&
dst
,
string
kernelName
,
const
char
**
kernelString
)
{
dst
.
create
(
src1
.
size
(),
src1
.
type
());
...
...
@@ -1327,331 +1328,123 @@ static void bitwise_run(const oclMat &src1, oclMat &dst, string kernelName, cons
openCLExecuteKernel
(
clCxt
,
kernelString
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
depth
);
}
enum
{
AND
=
0
,
OR
,
XOR
};
template
<
typename
T
>
void
bitwise_run
(
const
oclMat
&
src1
,
const
oclMat
&
src2
,
oclMat
&
dst
,
string
kernelName
,
const
char
**
kernelString
,
void
*
_scalar
,
const
char
*
_opt
=
NULL
)
static
void
bitwise_binary_run
(
const
oclMat
&
src1
,
const
oclMat
&
src2
,
const
Scalar
&
src3
,
const
oclMat
&
mask
,
oclMat
&
dst
,
int
operationType
)
{
dst
.
create
(
src1
.
size
(),
src1
.
type
());
CV_Assert
(
src1
.
cols
==
src2
.
cols
&&
src2
.
cols
==
dst
.
cols
&&
src1
.
rows
==
src2
.
rows
&&
src2
.
rows
==
dst
.
rows
);
CV_Assert
(
src1
.
type
()
==
src2
.
type
()
&&
src1
.
type
()
==
dst
.
type
());
Context
*
clCxt
=
src1
.
clCxt
;
int
channels
=
dst
.
oclchannels
();
int
depth
=
dst
.
depth
();
int
vector_lengths
[
4
][
7
]
=
{{
4
,
4
,
4
,
4
,
1
,
1
,
1
},
{
4
,
4
,
4
,
4
,
1
,
1
,
1
},
{
4
,
4
,
4
,
4
,
1
,
1
,
1
},
{
4
,
4
,
4
,
4
,
1
,
1
,
1
}
};
size_t
vector_length
=
vector_lengths
[
channels
-
1
][
depth
];
int
offset_cols
=
(
dst
.
offset
/
dst
.
elemSize1
())
&
(
vector_length
-
1
);
int
cols
=
divUp
(
dst
.
cols
*
channels
+
offset_cols
,
vector_length
);
size_t
localThreads
[
3
]
=
{
64
,
4
,
1
};
size_t
globalThreads
[
3
]
=
{
cols
,
dst
.
rows
,
1
};
int
dst_step1
=
dst
.
cols
*
dst
.
elemSize
();
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src1
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src1
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src1
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src2
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src2
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src2
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src1
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_step1
));
T
scalar
;
if
(
_scalar
!=
NULL
)
if
(
!
clCxt
->
supportsFeature
(
Context
::
CL_DOUBLE
)
&&
src1
.
depth
()
==
CV_64F
)
{
double
scalar1
=
*
((
double
*
)
_scalar
);
scalar
=
(
T
)
scalar1
;
args
.
push_back
(
make_pair
(
sizeof
(
T
),
(
void
*
)
&
scalar
));
cout
<<
"Selected device does not support double"
<<
endl
;
return
;
}
openCLExecuteKernel
(
clCxt
,
kernelString
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
depth
,
_opt
);
}
static
void
bitwise_run
(
const
oclMat
&
src1
,
const
oclMat
&
src2
,
oclMat
&
dst
,
string
kernelName
,
const
char
**
kernelString
,
const
char
*
_opt
=
NULL
)
{
bitwise_run
<
char
>
(
src1
,
src2
,
dst
,
kernelName
,
kernelString
,
(
void
*
)
NULL
,
_opt
);
}
static
void
bitwise_run
(
const
oclMat
&
src1
,
const
oclMat
&
src2
,
oclMat
&
dst
,
const
oclMat
&
mask
,
string
kernelName
,
const
char
**
kernelString
,
const
char
*
_opt
=
NULL
)
{
dst
.
create
(
src1
.
size
(),
src1
.
type
());
CV_Assert
(
src1
.
cols
==
src2
.
cols
&&
src2
.
cols
==
dst
.
cols
&&
src1
.
rows
==
src2
.
rows
&&
src2
.
rows
==
dst
.
rows
&&
src1
.
rows
==
mask
.
rows
&&
src1
.
cols
==
mask
.
cols
);
CV_Assert
(
src1
.
type
()
==
src2
.
type
()
&&
src1
.
type
()
==
dst
.
type
());
CV_Assert
(
mask
.
type
()
==
CV_8U
);
CV_Assert
(
operationType
>=
AND
&&
operationType
<=
XOR
);
CV_Assert
(
src2
.
empty
()
||
(
!
src2
.
empty
()
&&
src1
.
type
()
==
src2
.
type
()
&&
src1
.
size
()
==
src2
.
size
()));
CV_Assert
(
mask
.
empty
()
||
(
!
mask
.
empty
()
&&
mask
.
type
()
==
CV_8UC1
&&
mask
.
size
()
==
src1
.
size
()));
Context
*
clCxt
=
src1
.
clCxt
;
int
channels
=
dst
.
oclchannels
();
int
depth
=
dst
.
depth
();
dst
.
create
(
src1
.
size
(),
src1
.
type
());
int
vector_lengths
[
4
][
7
]
=
{{
4
,
4
,
2
,
2
,
1
,
1
,
1
},
{
2
,
2
,
1
,
1
,
1
,
1
,
1
},
{
4
,
4
,
2
,
2
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
}
};
int
elemSize
=
dst
.
elemSize
();
int
cols1
=
dst
.
cols
*
elemSize
;
oclMat
m
;
size_t
vector_length
=
vector_lengths
[
channels
-
1
][
depth
]
;
int
offset_cols
=
((
dst
.
offset
%
dst
.
step
)
/
dst
.
elemSize
())
&
(
vector_length
-
1
);
int
cols
=
divUp
(
dst
.
cols
+
offset_cols
,
vector_length
);
const
char
operationMap
[]
=
{
'&'
,
'|'
,
'^'
}
;
std
::
string
kernelName
(
"arithm_bitwise_binary"
);
std
::
string
buildOptions
=
format
(
"-D Operation=%c"
,
operationMap
[
operationType
]
);
size_t
localThreads
[
3
]
=
{
64
,
4
,
1
};
size_t
globalThreads
[
3
]
=
{
cols
,
dst
.
rows
,
1
};
size_t
localThreads
[
3
]
=
{
16
,
16
,
1
};
size_t
globalThreads
[
3
]
=
{
cols
1
,
dst
.
rows
,
1
};
int
dst_step1
=
dst
.
cols
*
dst
.
elemSize
();
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src1
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src1
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src1
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src2
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src2
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src2
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
mask
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
mask
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
mask
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src1
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst_step1
));
openCLExecuteKernel
(
clCxt
,
kernelString
,
kernelName
,
globalThreads
,
localThreads
,
args
,
channels
,
depth
,
_opt
);
}
template
<
typename
WT
,
typename
CL_WT
>
void
bitwise_scalar_run
(
const
oclMat
&
src1
,
const
Scalar
&
src2
,
oclMat
&
dst
,
const
oclMat
&
mask
,
string
kernelName
,
const
char
**
kernelString
,
int
isMatSubScalar
,
const
char
*
opt
=
NULL
)
{
dst
.
create
(
src1
.
size
(),
src1
.
type
());
CV_Assert
(
src1
.
cols
==
dst
.
cols
&&
src1
.
rows
==
dst
.
rows
&&
src1
.
type
()
==
dst
.
type
());
if
(
mask
.
data
)
if
(
src2
.
empty
())
{
CV_Assert
(
mask
.
type
()
==
CV_8U
&&
src1
.
rows
==
mask
.
rows
&&
src1
.
cols
==
mask
.
cols
);
}
Context
*
clCxt
=
src1
.
clCxt
;
int
channels
=
dst
.
oclchannels
();
int
depth
=
dst
.
depth
();
WT
s
[
4
]
=
{
saturate_cast
<
WT
>
(
src2
.
val
[
0
]),
saturate_cast
<
WT
>
(
src2
.
val
[
1
]),
saturate_cast
<
WT
>
(
src2
.
val
[
2
]),
saturate_cast
<
WT
>
(
src2
.
val
[
3
])
};
int
vector_lengths
[
4
][
7
]
=
{{
4
,
4
,
2
,
2
,
1
,
1
,
1
},
{
2
,
2
,
1
,
1
,
1
,
1
,
1
},
{
4
,
4
,
2
,
2
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
}
};
m
.
create
(
1
,
1
,
dst
.
type
());
m
.
setTo
(
src3
);
size_t
vector_length
=
vector_lengths
[
channels
-
1
][
depth
];
int
offset_cols
=
((
dst
.
offset
%
dst
.
step
)
/
dst
.
elemSize
())
&
(
vector_length
-
1
);
int
cols
=
divUp
(
dst
.
cols
+
offset_cols
,
vector_length
);
size_t
localThreads
[
3
]
=
{
64
,
4
,
1
};
size_t
globalThreads
[
3
]
=
{
cols
,
dst
.
rows
,
1
};
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
m
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
elemSize
)
);
int
dst_step1
=
dst
.
cols
*
dst
.
elemSize
();
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
src1
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src1
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src1
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst
.
offset
));
if
(
mask
.
data
)
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
mask
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
mask
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
mask
.
offset
));
kernelName
+=
"_scalar"
;
}
args
.
push_back
(
make_pair
(
sizeof
(
CL_WT
)
,
(
void
*
)
&
s
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src1
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_step1
));
if
(
isMatSubScalar
!=
0
)
else
{
isMatSubScalar
=
isMatSubScalar
>
0
?
1
:
0
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
isMatSubScalar
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
src2
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src2
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src2
.
offset
));
}
openCLExecuteKernel
(
clCxt
,
kernelString
,
kernelName
,
globalThreads
,
localThreads
,
args
,
channels
,
depth
,
opt
);
}
if
(
!
mask
.
empty
())
{
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
mask
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
mask
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
mask
.
offset
));
if
(
!
src2
.
empty
())
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
elemSize
));
kernelName
+=
"_mask"
;
}
typedef
void
(
*
BitwiseFuncS
)(
const
oclMat
&
src1
,
const
Scalar
&
src2
,
oclMat
&
dst
,
const
oclMat
&
mask
,
string
kernelName
,
const
char
**
kernelString
,
int
isMatSubScalar
,
const
char
*
opt
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
dst
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
dst
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
cols1
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
src1
.
rows
));
static
void
bitwise_scalar
(
const
oclMat
&
src1
,
const
Scalar
&
src2
,
oclMat
&
dst
,
const
oclMat
&
mask
,
string
kernelName
,
const
char
**
kernelString
,
int
isMatSubScalar
,
const
char
*
opt
)
{
static
BitwiseFuncS
tab
[
8
]
=
{
#if 0
bitwise_scalar_run<unsigned char>,
bitwise_scalar_run<char>,
bitwise_scalar_run<unsigned short>,
bitwise_scalar_run<short>,
bitwise_scalar_run<int>,
bitwise_scalar_run<float>,
bitwise_scalar_run<double>,
0
#else
bitwise_scalar_run
<
unsigned
char
,
cl_uchar4
>
,
bitwise_scalar_run
<
char
,
cl_char4
>
,
bitwise_scalar_run
<
unsigned
short
,
cl_ushort4
>
,
bitwise_scalar_run
<
short
,
cl_short4
>
,
bitwise_scalar_run
<
int
,
cl_int4
>
,
bitwise_scalar_run
<
float
,
cl_float4
>
,
bitwise_scalar_run
<
double
,
cl_double4
>
,
0
#endif
};
BitwiseFuncS
func
=
tab
[
src1
.
depth
()];
if
(
func
==
0
)
cv
::
ocl
::
error
(
"Unsupported arithmetic operation"
,
__FILE__
,
__LINE__
);
func
(
src1
,
src2
,
dst
,
mask
,
kernelName
,
kernelString
,
isMatSubScalar
,
opt
);
}
static
void
bitwise_scalar
(
const
oclMat
&
src1
,
const
Scalar
&
src2
,
oclMat
&
dst
,
const
oclMat
&
mask
,
string
kernelName
,
const
char
**
kernelString
,
const
char
*
opt
=
NULL
)
{
bitwise_scalar
(
src1
,
src2
,
dst
,
mask
,
kernelName
,
kernelString
,
0
,
opt
);
openCLExecuteKernel
(
clCxt
,
mask
.
empty
()
?
(
!
src2
.
empty
()
?
&
arithm_bitwise_binary
:
&
arithm_bitwise_binary_scalar
)
:
(
!
src2
.
empty
()
?
&
arithm_bitwise_binary_mask
:
&
arithm_bitwise_binary_scalar_mask
),
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
buildOptions
.
c_str
());
}
void
cv
::
ocl
::
bitwise_not
(
const
oclMat
&
src
,
oclMat
&
dst
)
{
if
(
!
src
.
clCxt
->
supportsFeature
(
Context
::
CL_DOUBLE
)
&&
src
.
type
()
==
CV_64F
)
if
(
!
src
.
clCxt
->
supportsFeature
(
Context
::
CL_DOUBLE
)
&&
src
.
type
()
==
CV_64F
)
{
cout
<<
"Selected device do not support double"
<<
endl
;
cout
<<
"Selected device do
es
not support double"
<<
endl
;
return
;
}
dst
.
create
(
src
.
size
(),
src
.
type
());
string
kernelName
=
"arithm_bitwise_not"
;
bitwise_run
(
src
,
dst
,
kernelName
,
&
arithm_bitwise_not
);
bitwise_
unary_
run
(
src
,
dst
,
kernelName
,
&
arithm_bitwise_not
);
}
void
cv
::
ocl
::
bitwise_or
(
const
oclMat
&
src1
,
const
oclMat
&
src2
,
oclMat
&
dst
,
const
oclMat
&
mask
)
{
// dst.create(src1.size(),src1.type());
if
(
!
src1
.
clCxt
->
supportsFeature
(
Context
::
CL_DOUBLE
)
&&
src1
.
type
()
==
CV_64F
)
{
cout
<<
"Selected device do not support double"
<<
endl
;
return
;
}
string
kernelName
=
mask
.
empty
()
?
"arithm_bitwise_binary"
:
"arithm_bitwise_binary_with_mask"
;
static
const
char
opt
[]
=
"-D OP_BINARY=|"
;
if
(
mask
.
empty
())
bitwise_run
(
src1
,
src2
,
dst
,
kernelName
,
&
arithm_bitwise_binary
,
opt
);
else
bitwise_run
(
src1
,
src2
,
dst
,
mask
,
kernelName
,
&
arithm_bitwise_binary_mask
,
opt
);
bitwise_binary_run
(
src1
,
src2
,
Scalar
(),
mask
,
dst
,
OR
);
}
void
cv
::
ocl
::
bitwise_or
(
const
oclMat
&
src1
,
const
Scalar
&
src2
,
oclMat
&
dst
,
const
oclMat
&
mask
)
{
if
(
!
src1
.
clCxt
->
supportsFeature
(
Context
::
CL_DOUBLE
)
&&
src1
.
type
()
==
CV_64F
)
{
cout
<<
"Selected device do not support double"
<<
endl
;
return
;
}
static
const
char
opt
[]
=
"-D OP_BINARY=|"
;
string
kernelName
=
mask
.
data
?
"arithm_s_bitwise_binary_with_mask"
:
"arithm_s_bitwise_binary"
;
if
(
mask
.
data
)
bitwise_scalar
(
src1
,
src2
,
dst
,
mask
,
kernelName
,
&
arithm_bitwise_binary_scalar_mask
,
opt
);
else
bitwise_scalar
(
src1
,
src2
,
dst
,
mask
,
kernelName
,
&
arithm_bitwise_binary_scalar
,
opt
);
bitwise_binary_run
(
src1
,
oclMat
(),
src2
,
mask
,
dst
,
OR
);
}
void
cv
::
ocl
::
bitwise_and
(
const
oclMat
&
src1
,
const
oclMat
&
src2
,
oclMat
&
dst
,
const
oclMat
&
mask
)
{
// dst.create(src1.size(),src1.type());
if
(
!
src1
.
clCxt
->
supportsFeature
(
Context
::
CL_DOUBLE
)
&&
src1
.
type
()
==
CV_64F
)
{
cout
<<
"Selected device do not support double"
<<
endl
;
return
;
}
oclMat
emptyMat
;
string
kernelName
=
mask
.
empty
()
?
"arithm_bitwise_binary"
:
"arithm_bitwise_binary_with_mask"
;
static
const
char
opt
[]
=
"-D OP_BINARY=&"
;
if
(
mask
.
empty
())
bitwise_run
(
src1
,
src2
,
dst
,
kernelName
,
&
arithm_bitwise_binary
,
opt
);
else
bitwise_run
(
src1
,
src2
,
dst
,
mask
,
kernelName
,
&
arithm_bitwise_binary_mask
,
opt
);
bitwise_binary_run
(
src1
,
src2
,
Scalar
(),
mask
,
dst
,
AND
);
}
void
cv
::
ocl
::
bitwise_and
(
const
oclMat
&
src1
,
const
Scalar
&
src2
,
oclMat
&
dst
,
const
oclMat
&
mask
)
{
if
(
!
src1
.
clCxt
->
supportsFeature
(
Context
::
CL_DOUBLE
)
&&
src1
.
type
()
==
CV_64F
)
{
cout
<<
"Selected device do not support double"
<<
endl
;
return
;
}
static
const
char
opt
[]
=
"-D OP_BINARY=&"
;
string
kernelName
=
mask
.
data
?
"arithm_s_bitwise_binary_with_mask"
:
"arithm_s_bitwise_binary"
;
if
(
mask
.
data
)
bitwise_scalar
(
src1
,
src2
,
dst
,
mask
,
kernelName
,
&
arithm_bitwise_binary_scalar_mask
,
opt
);
else
bitwise_scalar
(
src1
,
src2
,
dst
,
mask
,
kernelName
,
&
arithm_bitwise_binary_scalar
,
opt
);
bitwise_binary_run
(
src1
,
oclMat
(),
src2
,
mask
,
dst
,
AND
);
}
void
cv
::
ocl
::
bitwise_xor
(
const
oclMat
&
src1
,
const
oclMat
&
src2
,
oclMat
&
dst
,
const
oclMat
&
mask
)
{
if
(
!
src1
.
clCxt
->
supportsFeature
(
Context
::
CL_DOUBLE
)
&&
src1
.
type
()
==
CV_64F
)
{
cout
<<
"Selected device do not support double"
<<
endl
;
return
;
}
string
kernelName
=
mask
.
empty
()
?
"arithm_bitwise_binary"
:
"arithm_bitwise_binary_with_mask"
;
static
const
char
opt
[]
=
"-D OP_BINARY=^"
;
if
(
mask
.
empty
())
bitwise_run
(
src1
,
src2
,
dst
,
kernelName
,
&
arithm_bitwise_binary
,
opt
);
else
bitwise_run
(
src1
,
src2
,
dst
,
mask
,
kernelName
,
&
arithm_bitwise_binary_mask
,
opt
);
bitwise_binary_run
(
src1
,
src2
,
Scalar
(),
mask
,
dst
,
XOR
);
}
void
cv
::
ocl
::
bitwise_xor
(
const
oclMat
&
src1
,
const
Scalar
&
src2
,
oclMat
&
dst
,
const
oclMat
&
mask
)
{
if
(
!
src1
.
clCxt
->
supportsFeature
(
Context
::
CL_DOUBLE
)
&&
src1
.
type
()
==
CV_64F
)
{
cout
<<
"Selected device do not support double"
<<
endl
;
return
;
}
string
kernelName
=
mask
.
data
?
"arithm_s_bitwise_binary_with_mask"
:
"arithm_s_bitwise_binary"
;
static
const
char
opt
[]
=
"-D OP_BINARY=^"
;
if
(
mask
.
data
)
bitwise_scalar
(
src1
,
src2
,
dst
,
mask
,
kernelName
,
&
arithm_bitwise_binary_scalar_mask
,
opt
);
else
bitwise_scalar
(
src1
,
src2
,
dst
,
mask
,
kernelName
,
&
arithm_bitwise_binary_scalar
,
opt
);
bitwise_binary_run
(
src1
,
oclMat
(),
src2
,
mask
,
dst
,
XOR
);
}
oclMat
cv
::
ocl
::
operator
~
(
const
oclMat
&
src
)
...
...
modules/ocl/src/opencl/arithm_bitwise_binary.cl
View file @
8e0e352d
...
...
@@ -43,303 +43,25 @@
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
//bitwise_binary
without
mask
for
and,
or,
xor
operators
/////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////
/bitwise_binary/
//////////////////////////////////////////
///////////////////////////////////////////
bitwise_binary
//////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////
#
ifndef
OP_BINARY
#
define
OP_BINARY
&
#
endif
__kernel
void
arithm_bitwise_binary_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
dst_data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
dst_data.x
;
dst_data.y
=
((
dst_index
+
1
>=
dst_start
)
&&
(
dst_index
+
1
<
dst_end
))
?
tmp_data.y
:
dst_data.y
;
dst_data.z
=
((
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.z
:
dst_data.z
;
dst_data.w
=
((
dst_index
+
3
>=
dst_start
)
&&
(
dst_index
+
3
<
dst_end
))
?
tmp_data.w
:
dst_data.w
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
dst_data
;
}
}
__kernel
void
arithm_bitwise_binary_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
char4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
{
char4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
char4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
char4
dst_data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
dst_data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
dst_data.x
;
dst_data.y
=
((
dst_index
+
1
>=
dst_start
)
&&
(
dst_index
+
1
<
dst_end
))
?
tmp_data.y
:
dst_data.y
;
dst_data.z
=
((
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.z
:
dst_data.z
;
dst_data.w
=
((
dst_index
+
3
>=
dst_start
)
&&
(
dst_index
+
3
<
dst_end
))
?
tmp_data.w
:
dst_data.w
;
*
((
__global
char4
*
)(
dst
+
dst_index
))
=
dst_data
;
}
}
__kernel
void
arithm_bitwise_binary_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__kernel
void
arithm_bitwise_binary
(
__global
uchar
*
src1,
int
src1_step,
int
src1_offset,
__global
uchar
*
src2,
int
src2_step,
int
src2_offset,
__global
uchar
*
dst,
int
dst_step,
int
dst_offset,
int
cols1,
int
rows
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
1
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
ushort4
src1_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
ushort4
src2_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
ushort4
dst_data
=
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
ushort4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
dst_data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
dst_data.x
;
dst_data.y
=
((
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.y
:
dst_data.y
;
dst_data.z
=
((
dst_index
+
4
>=
dst_start
)
&&
(
dst_index
+
4
<
dst_end
))
?
tmp_data.z
:
dst_data.z
;
dst_data.w
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data.w
:
dst_data.w
;
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
dst_data
;
}
}
__kernel
void
arithm_bitwise_binary_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*src2,
int
src2_step,
int
src2_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
short4
src1_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
short4
src2_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
short4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
dst_data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
dst_data.x
;
dst_data.y
=
((
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.y
:
dst_data.y
;
dst_data.z
=
((
dst_index
+
4
>=
dst_start
)
&&
(
dst_index
+
4
<
dst_end
))
?
tmp_data.z
:
dst_data.z
;
dst_data.w
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data.w
:
dst_data.w
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
dst_data
;
}
}
__kernel
void
arithm_bitwise_binary_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
2
)
+
src2_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
int
data1
=
*
((
__global
int
*
)((
__global
char
*
)
src1
+
src1_index
))
;
int
data2
=
*
((
__global
int
*
)((
__global
char
*
)
src2
+
src2_index
))
;
int
tmp
=
data1
OP_BINARY
data2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
))
=
tmp
;
}
}
__kernel
void
arithm_bitwise_binary_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
2
)
+
src2_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
char4
data1
=
*
((
__global
char4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char4
data2
=
*
((
__global
char4
*
)((
__global
char
*
)
src2
+
src2_index
))
;
char4
tmp
=
data1
OP_BINARY
data2
;
*
((
__global
char4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
tmp
;
}
}
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_bitwise_binary_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
3
)
+
src2_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
char8
data1
=
*
((
__global
char8
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char8
data2
=
*
((
__global
char8
*
)((
__global
char
*
)
src2
+
src2_index
))
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
)
;
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data1
OP_BINARY
data2
;
dst[dst_index]
=
src1[src1_index]
Operation
src2[src2_index]
;
}
}
#
endif
modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
View file @
8e0e352d
...
...
@@ -43,767 +43,31 @@
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
#
ifndef
OP_BINARY
#
define
OP_BINARY
&
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_binary
with
mask**************************************/
__kernel
void
arithm_bitwise_binary_with_mask_C1_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index
)
;
uchar4
mask_data
=
vload4
(
0
,
mask
+
mask_index
)
;
uchar4
data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
data.x
;
data.y
=
((
mask_data.y
)
&&
(
dst_index
+
1
>=
dst_start
)
&&
(
dst_index
+
1
<
dst_end
))
?
tmp_data.y
:
data.y
;
data.z
=
((
mask_data.z
)
&&
(
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.z
:
data.z
;
data.w
=
((
mask_data.w
)
&&
(
dst_index
+
3
>=
dst_start
)
&&
(
dst_index
+
3
<
dst_end
))
?
tmp_data.w
:
data.w
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C1_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__kernel
void
arithm_bitwise_binary_mask
(
__global
uchar
*
src1,
int
src1_step,
int
src1_offset,
__global
uchar
*
src2,
int
src2_step,
int
src2_offset,
__global
uchar
*
mask,
int
mask_step,
int
mask_offset,
int
elemSize,
__global
uchar
*
dst,
int
dst_step,
int
dst_offset,
int
cols1,
int
rows
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
1
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
char4
src2_data
=
vload4
(
0
,
src2
+
src2_index
)
;
uchar4
mask_data
=
vload4
(
0
,
mask
+
mask_index
)
;
char4
data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
convert_char
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
data.x
;
data.y
=
convert_char
((
mask_data.y
)
&&
(
dst_index
+
1
>=
dst_start
)
&&
(
dst_index
+
1
<
dst_end
))
?
tmp_data.y
:
data.y
;
data.z
=
convert_char
((
mask_data.z
)
&&
(
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.z
:
data.z
;
data.w
=
convert_char
((
mask_data.w
)
&&
(
dst_index
+
3
>=
dst_start
)
&&
(
dst_index
+
3
<
dst_end
))
?
tmp_data.w
:
data.w
;
*
((
__global
char4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C1_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
/
2
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
ushort2
src1_data
=
vload2
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index
))
;
ushort2
src2_data
=
vload2
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src2
+
src2_index
))
;
uchar2
mask_data
=
vload2
(
0
,
mask
+
mask_index
)
;
ushort2
data
=
*
((
__global
ushort2
*
)((
__global
uchar
*
)
dst
+
dst_index
))
;
ushort2
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
convert_ushort
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
data.x
;
data.y
=
convert_ushort
((
mask_data.y
)
&&
(
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.y
:
data.y
;
*
((
__global
ushort2
*
)((
__global
uchar
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C1_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
/
2
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
short2
src1_data
=
vload2
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short2
src2_data
=
vload2
(
0
,
(
__global
short
*
)((
__global
char
*
)
src2
+
src2_index
))
;
uchar2
mask_data
=
vload2
(
0
,
mask
+
mask_index
)
;
short2
data
=
*
((
__global
short2
*
)((
__global
uchar
*
)
dst
+
dst_index
))
;
short2
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
convert_short
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
data.x
;
data.y
=
convert_short
((
mask_data.y
)
&&
(
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.y
:
data.y
;
*
((
__global
short2
*
)((
__global
uchar
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C1_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
2
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
int
src_data1
=
*
((
__global
int
*
)((
__global
char
*
)
src1
+
src1_index
))
;
int
src_data2
=
*
((
__global
int
*
)((
__global
char
*
)
src2
+
src2_index
))
;
int
dst_data
=
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
))
;
int
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C1_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
2
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
char4
src_data1
=
*
((
__global
char4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char4
src_data2
=
*
((
__global
char4
*
)((
__global
char
*
)
src2
+
src2_index
))
;
char4
dst_data
=
*
((
__global
char4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
char4
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
char4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C1_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
3
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
char8
src_data1
=
*
((
__global
char8
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char8
src_data2
=
*
((
__global
char8
*
)((
__global
char
*
)
src2
+
src2_index
))
;
char8
dst_data
=
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
))
;
char8
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C2_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
/
2
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index
)
;
uchar2
mask_data
=
vload2
(
0
,
mask
+
mask_index
)
;
uchar4
data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data.xy
:
data.xy
;
data.zw
=
((
mask_data.y
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.zw
:
data.zw
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C2_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
/
2
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
char4
src2_data
=
vload4
(
0
,
src2
+
src2_index
)
;
uchar2
mask_data
=
vload2
(
0
,
mask
+
mask_index
)
;
char4
data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data.xy
:
data.xy
;
data.zw
=
((
mask_data.y
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.zw
:
data.zw
;
*
((
__global
char4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C2_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
2
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
ushort2
src_data1
=
*
((
__global
ushort2
*
)((
__global
char
*
)
src1
+
src1_index
))
;
ushort2
src_data2
=
*
((
__global
ushort2
*
)((
__global
char
*
)
src2
+
src2_index
))
;
ushort2
dst_data
=
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
))
;
ushort2
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C2_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
2
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
short2
src_data1
=
*
((
__global
short2
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short2
src_data2
=
*
((
__global
short2
*
)((
__global
char
*
)
src2
+
src2_index
))
;
short2
dst_data
=
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
))
;
short2
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C2_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
3
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
int2
src_data1
=
*
((
__global
int2
*
)((
__global
char
*
)
src1
+
src1_index
))
;
int2
src_data2
=
*
((
__global
int2
*
)((
__global
char
*
)
src2
+
src2_index
))
;
int2
dst_data
=
*
((
__global
int2
*
)((
__global
char
*
)
dst
+
dst_index
))
;
int2
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
int2
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C2_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
3
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
char8
src_data1
=
*
((
__global
char8
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char8
src_data2
=
*
((
__global
char8
*
)((
__global
char
*
)
src2
+
src2_index
))
;
char8
dst_data
=
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
))
;
char8
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C2_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
4
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
4
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
4
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
char16
src_data1
=
*
((
__global
char16
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char16
src_data2
=
*
((
__global
char16
*
)((
__global
char
*
)
src2
+
src2_index
))
;
char16
dst_data
=
*
((
__global
char16
*
)((
__global
char
*
)
dst
+
dst_index
))
;
char16
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
char16
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C4_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
2
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
uchar4
src_data1
=
*
((
__global
uchar4
*
)(
src1
+
src1_index
))
;
uchar4
src_data2
=
*
((
__global
uchar4
*
)(
src2
+
src2_index
))
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C4_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
2
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
char4
src_data1
=
*
((
__global
char4
*
)(
src1
+
src1_index
))
;
char4
src_data2
=
*
((
__global
char4
*
)(
src2
+
src2_index
))
;
char4
dst_data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
char4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C4_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
3
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
ushort4
src_data1
=
*
((
__global
ushort4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
ushort4
src_data2
=
*
((
__global
ushort4
*
)((
__global
char
*
)
src2
+
src2_index
))
;
ushort4
dst_data
=
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
ushort4
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C4_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
3
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
short4
src_data1
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short4
src_data2
=
*
((
__global
short4
*
)((
__global
char
*
)
src2
+
src2_index
))
;
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
short4
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C4_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
4
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
4
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
4
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
int4
src_data1
=
*
((
__global
int4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
int4
src_data2
=
*
((
__global
int4
*
)((
__global
char
*
)
src2
+
src2_index
))
;
int4
dst_data
=
*
((
__global
int4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
int4
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
int4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_bitwise_binary_with_mask_C4_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
4
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
4
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
4
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
char16
src_data1
=
*
((
__global
char16
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char16
src_data2
=
*
((
__global
char16
*
)((
__global
char
*
)
src2
+
src2_index
))
;
char16
dst_data
=
*
((
__global
char16
*
)((
__global
char
*
)
dst
+
dst_index
))
;
char16
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
char16
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_bitwise_binary_with_mask_C4_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
5
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
5
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
5
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
char8
src_data1_0
=
*
((
__global
char8
*
)((
__global
char
*
)
src1
+
src1_index
+
0
))
;
char8
src_data1_1
=
*
((
__global
char8
*
)((
__global
char
*
)
src1
+
src1_index
+
8
))
;
char8
src_data1_2
=
*
((
__global
char8
*
)((
__global
char
*
)
src1
+
src1_index
+
16
))
;
char8
src_data1_3
=
*
((
__global
char8
*
)((
__global
char
*
)
src1
+
src1_index
+
24
))
;
char8
src_data2_0
=
*
((
__global
char8
*
)((
__global
char
*
)
src2
+
src2_index
+
0
))
;
char8
src_data2_1
=
*
((
__global
char8
*
)((
__global
char
*
)
src2
+
src2_index
+
8
))
;
char8
src_data2_2
=
*
((
__global
char8
*
)((
__global
char
*
)
src2
+
src2_index
+
16
))
;
char8
src_data2_3
=
*
((
__global
char8
*
)((
__global
char
*
)
src2
+
src2_index
+
24
))
;
char8
dst_data_0
=
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
;
char8
dst_data_1
=
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
;
char8
dst_data_2
=
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
;
char8
dst_data_3
=
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
+
24
))
;
char8
data_0
=
src_data1_0
OP_BINARY
src_data2_0
;
char8
data_1
=
src_data1_1
OP_BINARY
src_data2_1
;
char8
data_2
=
src_data1_2
OP_BINARY
src_data2_2
;
char8
data_3
=
src_data1_3
OP_BINARY
src_data2_3
;
int
mask_index
=
mad24
(
y,
mask_step,
mask_offset
+
(
x
/
elemSize
))
;
data_0
=
mask_data
?
data_0
:
dst_data_0
;
data_1
=
mask_data
?
data_1
:
dst_data_1
;
data_2
=
mask_data
?
data_2
:
dst_data_2
;
data_3
=
mask_data
?
data_3
:
dst_data_3
;
if
(
mask[mask_index]
)
{
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
x
+
dst_offset
)
;
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_1
;
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
data_2
;
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
+
24
))
=
data_3
;
dst[dst_index]
=
src1[src1_index]
Operation
src2[src2_index]
;
}
}
}
#
endif
modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
View file @
8e0e352d
...
...
@@ -43,596 +43,26 @@
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
#
ifndef
OP_BINARY
#
define
OP_BINARY
&
#
endif
///////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////bitwise_binary/////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/******************************bitwise
binary
with
scalar
without
mask********************************/
__kernel
void
arithm_s_bitwise_binary_C1_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
uchar4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
uchar4
src2_data
=
(
uchar4
)(
src2.x,
src2.x,
src2.x,
src2.x
)
;
uchar4
data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
data.x
;
data.y
=
((
dst_index
+
1
>=
dst_start
)
&&
(
dst_index
+
1
<
dst_end
))
?
tmp_data.y
:
data.y
;
data.z
=
((
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.z
:
data.z
;
data.w
=
((
dst_index
+
3
>=
dst_start
)
&&
(
dst_index
+
3
<
dst_end
))
?
tmp_data.w
:
data.w
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C1_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
char4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
char4
src2_data
=
(
char4
)(
src2.x,
src2.x,
src2.x,
src2.x
)
;
char4
data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
data.x
;
data.y
=
((
dst_index
+
1
>=
dst_start
)
&&
(
dst_index
+
1
<
dst_end
))
?
tmp_data.y
:
data.y
;
data.z
=
((
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.z
:
data.z
;
data.w
=
((
dst_index
+
3
>=
dst_start
)
&&
(
dst_index
+
3
<
dst_end
))
?
tmp_data.w
:
data.w
;
*
((
__global
char4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C1_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
ushort4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
ushort2
src1_data
=
vload2
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index
))
;
ushort2
src2_data
=
(
ushort2
)(
src2.x,
src2.x
)
;
ushort2
data
=
*
((
__global
ushort2
*
)((
__global
uchar
*
)
dst
+
dst_index
))
;
ushort2
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
(
dst_index
+
0
>=
dst_start
)
?
tmp_data.x
:
data.x
;
data.y
=
(
dst_index
+
2
<
dst_end
)
?
tmp_data.y
:
data.y
;
*
((
__global
ushort2
*
)((
__global
uchar
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C1_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
short4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
short2
src1_data
=
vload2
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short2
src2_data
=
(
short2
)(
src2.x,
src2.x
)
;
short2
data
=
*
((
__global
short2
*
)((
__global
uchar
*
)
dst
+
dst_index
))
;
short2
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
(
dst_index
+
0
>=
dst_start
)
?
tmp_data.x
:
data.x
;
data.y
=
(
dst_index
+
2
<
dst_end
)
?
tmp_data.y
:
data.y
;
*
((
__global
short2
*
)((
__global
uchar
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C1_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int4
src2,
int
rows,
int
cols,
int
dst_step1
)
__kernel
void
arithm_bitwise_binary_scalar
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
elemSize,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
cols1,
int
rows
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
1
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
int
src_data1
=
*
((
__global
int
*
)((
__global
char
*
)
src1
+
src1_index
))
;
int
src_data2
=
src2.x
;
int
data
=
src_data1
OP_BINARY
src_data2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C1_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
char16
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
char4
src1_data
=
*
((
__global
char4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char4
src2_data
=
(
char4
)(
src2.s0,
src2.s1,
src2.s2,
src2.s3
)
;
char4
data
=
*
((
__global
char4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
char4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
data.x
;
data.y
=
((
dst_index
+
1
>=
dst_start
)
&&
(
dst_index
+
1
<
dst_end
))
?
tmp_data.y
:
data.y
;
data.z
=
((
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.z
:
data.z
;
data.w
=
((
dst_index
+
3
>=
dst_start
)
&&
(
dst_index
+
3
<
dst_end
))
?
tmp_data.w
:
data.w
;
*
((
__global
char4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_s_bitwise_binary_C1_D6
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
short16
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
short4
src1_data
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short4
src2_data
=
(
short4
)(
src2.s0,
src2.s1,
src2.s2,
src2.s3
)
;
short4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
tmp_data
;
}
}
#
endif
__kernel
void
arithm_s_bitwise_binary_C2_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
uchar4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
uchar4
src2_data
=
(
uchar4
)(
src2.x,
src2.y,
src2.x,
src2.y
)
;
uchar4
data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.xy
=
(
dst_index
+
0
>=
dst_start
)
?
tmp_data.xy
:
data.xy
;
data.zw
=
(
dst_index
+
2
<
dst_end
)
?
tmp_data.zw
:
data.zw
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C2_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
char4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
char4
src2_data
=
(
char4
)(
src2.x,
src2.y,
src2.x,
src2.y
)
;
char4
data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.xy
=
(
dst_index
+
0
>=
dst_start
)
?
tmp_data.xy
:
data.xy
;
data.zw
=
(
dst_index
+
2
<
dst_end
)
?
tmp_data.zw
:
data.zw
;
*
((
__global
char4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C2_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
ushort4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
ushort2
src_data1
=
*
((
__global
ushort2
*
)((
__global
char
*
)
src1
+
src1_index
))
;
ushort2
src_data2
=
(
ushort2
)(
src2.x,
src2.y
)
;
ushort2
data
=
src_data1
OP_BINARY
src_data2
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C2_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
short4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
short2
src_data1
=
*
((
__global
short2
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short2
src_data2
=
(
short2
)(
src2.x,
src2.y
)
;
short2
data
=
src_data1
OP_BINARY
src_data2
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C2_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
int2
src_data1
=
*
((
__global
int2
*
)((
__global
char
*
)
src1
+
src1_index
))
;
int2
src_data2
=
(
int2
)(
src2.x,
src2.y
)
;
int2
data
=
src_data1
OP_BINARY
src_data2
;
*
((
__global
int2
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C2_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
char16
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
char8
src1_data
=
*
((
__global
char8
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char8
src2_data
=
(
char8
)(
src2.s0,
src2.s1,
src2.s2,
src2.s3,
src2.s4,
src2.s5,
src2.s6,
src2.s7
)
;
char8
tmp_data
=
src1_data
OP_BINARY
src2_data
;
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
))
=
tmp_data
;
}
}
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_s_bitwise_binary_C2_D6
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
short16
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
4
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
4
)
+
dst_offset
)
;
short8
src1_data
=
*
((
__global
short8
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short8
src2_data
=
(
short8
)(
src2.s0,
src2.s1,
src2.s2,
src2.s3,
src2.s4,
src2.s5,
src2.s6,
src2.s7
)
;
short8
tmp_data
=
src1_data
OP_BINARY
src2_data
;
*
((
__global
short8
*
)((
__global
char
*
)
dst
+
dst_index
))
=
tmp_data
;
}
}
#
endif
__kernel
void
arithm_s_bitwise_binary_C4_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
uchar4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
uchar4
src_data1
=
*
((
__global
uchar4
*
)(
src1
+
src1_index
))
;
uchar4
data
=
src_data1
OP_BINARY
src2
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C4_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
char4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
char4
src_data1
=
*
((
__global
char4
*
)(
src1
+
src1_index
))
;
char4
data
=
src_data1
OP_BINARY
src2
;
*
((
__global
char4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C4_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
ushort4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
ushort4
src_data1
=
*
((
__global
ushort4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
ushort4
data
=
src_data1
OP_BINARY
src2
;
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C4_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
short4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
short4
src_data1
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short4
data
=
src_data1
OP_BINARY
src2
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C4_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
4
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
4
)
+
dst_offset
)
;
int4
src_data1
=
*
((
__global
int4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
int4
data
=
src_data1
OP_BINARY
src2
;
*
((
__global
int4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_C4_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
char16
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
4
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
4
)
+
dst_offset
)
;
char16
src1_data
=
*
((
__global
char16
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char16
src2_data
=
(
char16
)(
src2.s0,
src2.s1,
src2.s2,
src2.s3,
src2.s4,
src2.s5,
src2.s6,
src2.s7,
src2.s8,
src2.s9,
src2.sa,
src2.sb,
src2.sc,
src2.sd,
src2.se,
src2.sf
)
;
char16
tmp_data
=
src1_data
OP_BINARY
src2_data
;
*
((
__global
char16
*
)((
__global
char
*
)
dst
+
dst_index
))
=
tmp_data
;
}
}
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_s_bitwise_binary_C4_D6
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
short16
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
5
)
+
src1_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
5
)
+
dst_offset
)
;
short4
src1_data_0
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
+
0
))
;
short4
src1_data_1
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
+
8
))
;
short4
src1_data_2
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
+
16
))
;
short4
src1_data_3
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
+
24
))
;
short4
src2_data_0
=
(
short4
)(
src2.s0,
src2.s1,
src2.s2,
src2.s3
)
;
short4
src2_data_1
=
(
short4
)(
src2.s4,
src2.s5,
src2.s6,
src2.s7
)
;
short4
src2_data_2
=
(
short4
)(
src2.s8,
src2.s9,
src2.sa,
src2.sb
)
;
short4
src2_data_3
=
(
short4
)(
src2.sc,
src2.sd,
src2.se,
src2.sf
)
;
short4
tmp_data_0
=
src1_data_0
OP_BINARY
src2_data_0
;
short4
tmp_data_1
=
src1_data_1
OP_BINARY
src2_data_1
;
short4
tmp_data_2
=
src1_data_2
OP_BINARY
src2_data_2
;
short4
tmp_data_3
=
src1_data_3
OP_BINARY
src2_data_3
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_1
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
tmp_data_2
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
+
24
))
=
tmp_data_3
;
int
src1_index
=
mad24
(
y,
src1_step,
src1_offset
+
x
)
;
int
src2_index
=
x
%
elemSize
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
)
;
dst[dst_index]
=
src1[src1_index]
Operation
src2[src2_index]
;
}
}
#
endif
modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
View file @
8e0e352d
...
...
@@ -42,6 +42,7 @@
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
...
...
@@ -50,698 +51,29 @@
#
endif
#
endif
#
ifndef
OP_BINARY
#
define
OP_BINARY
&
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_binary
with
scalar
with
mask**************************************/
__kernel
void
arithm_s_bitwise_binary_with_mask_C1_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
uchar4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
uchar4
src2_data
=
(
uchar4
)(
src2.x,
src2.x,
src2.x,
src2.x
)
;
uchar4
mask_data
=
vload4
(
0
,
mask
+
mask_index
)
;
uchar4
data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
data.x
;
data.y
=
((
mask_data.y
)
&&
(
dst_index
+
1
>=
dst_start
)
&&
(
dst_index
+
1
<
dst_end
))
?
tmp_data.y
:
data.y
;
data.z
=
((
mask_data.z
)
&&
(
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.z
:
data.z
;
data.w
=
((
mask_data.w
)
&&
(
dst_index
+
3
>=
dst_start
)
&&
(
dst_index
+
3
<
dst_end
))
?
tmp_data.w
:
data.w
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C1_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
char4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
char4
src2_data
=
(
char4
)(
src2.x,
src2.x,
src2.x,
src2.x
)
;
uchar4
mask_data
=
vload4
(
0
,
mask
+
mask_index
)
;
char4
data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
data.x
;
data.y
=
((
mask_data.y
)
&&
(
dst_index
+
1
>=
dst_start
)
&&
(
dst_index
+
1
<
dst_end
))
?
tmp_data.y
:
data.y
;
data.z
=
((
mask_data.z
)
&&
(
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.z
:
data.z
;
data.w
=
((
mask_data.w
)
&&
(
dst_index
+
3
>=
dst_start
)
&&
(
dst_index
+
3
<
dst_end
))
?
tmp_data.w
:
data.w
;
*
((
__global
char4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C1_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
ushort4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
/
2
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
ushort2
src1_data
=
vload2
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index
))
;
ushort2
src2_data
=
(
ushort2
)(
src2.x,
src2.x
)
;
uchar2
mask_data
=
vload2
(
0
,
mask
+
mask_index
)
;
ushort2
data
=
*
((
__global
ushort2
*
)((
__global
uchar
*
)
dst
+
dst_index
))
;
ushort2
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data.x
:
data.x
;
data.y
=
((
mask_data.y
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.y
:
data.y
;
*
((
__global
ushort2
*
)((
__global
uchar
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C1_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
short4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
/
2
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
short2
src1_data
=
vload2
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short2
src2_data
=
(
short2
)(
src2.x,
src2.x
)
;
uchar2
mask_data
=
vload2
(
0
,
mask
+
mask_index
)
;
short2
data
=
*
((
__global
short2
*
)((
__global
uchar
*
)
dst
+
dst_index
))
;
short2
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data.x
:
data.x
;
data.y
=
((
mask_data.y
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.y
:
data.y
;
*
((
__global
short2
*
)((
__global
uchar
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C1_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
int4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
int
src_data1
=
*
((
__global
int
*
)((
__global
char
*
)
src1
+
src1_index
))
;
int
src_data2
=
src2.x
;
int
dst_data
=
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
))
;
int
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C1_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
char16
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
char4
src1_data
=
*
((
__global
char4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char4
src2_data
=
(
char4
)(
src2.s0,
src2.s1,
src2.s2,
src2.s3
)
;
char4
dst_data
=
*
((
__global
char4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
char4
data
=
src1_data
OP_BINARY
src2_data
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
char4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_s_bitwise_binary_with_mask_C1_D6
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
short16
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
short4
src1_data
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short4
src2_data
=
(
short4
)(
src2.s0,
src2.s1,
src2.s2,
src2.s3
)
;
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
short4
data
=
src1_data
OP_BINARY
src2_data
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
#
endif
__kernel
void
arithm_s_bitwise_binary_with_mask_C2_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
uchar4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
/
2
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
uchar4
src2_data
=
(
uchar4
)(
src2.x,
src2.y,
src2.x,
src2.y
)
;
uchar2
mask_data
=
vload2
(
0
,
mask
+
mask_index
)
;
uchar4
data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data.xy
:
data.xy
;
data.zw
=
((
mask_data.y
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.zw
:
data.zw
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C2_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
char4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
/
2
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffffc
)
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
char4
src2_data
=
(
char4
)(
src2.x,
src2.y,
src2.x,
src2.y
)
;
uchar2
mask_data
=
vload2
(
0
,
mask
+
mask_index
)
;
char4
data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
tmp_data
=
src1_data
OP_BINARY
src2_data
;
data.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data.xy
:
data.xy
;
data.zw
=
((
mask_data.y
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.zw
:
data.zw
;
*
((
__global
char4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C2_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
ushort4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
ushort2
src_data1
=
*
((
__global
ushort2
*
)((
__global
char
*
)
src1
+
src1_index
))
;
ushort2
src_data2
=
(
ushort2
)(
src2.x,
src2.y
)
;
ushort2
dst_data
=
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
))
;
ushort2
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C2_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
short4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
short2
src_data1
=
*
((
__global
short2
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short2
src_data2
=
(
short2
)(
src2.x,
src2.y
)
;
short2
dst_data
=
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
))
;
short2
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C2_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
int4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
int2
src_data1
=
*
((
__global
int2
*
)((
__global
char
*
)
src1
+
src1_index
))
;
int2
src_data2
=
(
int2
)(
src2.x,
src2.y
)
;
int2
dst_data
=
*
((
__global
int2
*
)((
__global
char
*
)
dst
+
dst_index
))
;
int2
data
=
src_data1
OP_BINARY
src_data2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
int2
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C2_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
char16
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
char8
src1_data
=
*
((
__global
char8
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char8
src2_data
=
(
char8
)(
src2.s0,
src2.s1,
src2.s2,
src2.s3,
src2.s4,
src2.s5,
src2.s6,
src2.s7
)
;
char8
dst_data
=
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
))
;
char8
data
=
src1_data
OP_BINARY
src2_data
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
char8
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_s_bitwise_binary_with_mask_C2_D6
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
__kernel
void
arithm_bitwise_binary_scalar_mask
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
elemSize,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
short16
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
4
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
4
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
short8
src1_data
=
*
((
__global
short8
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short8
src2_data
=
(
short8
)(
src2.s0,
src2.s1,
src2.s2,
src2.s3,
src2.s4,
src2.s5,
src2.s6,
src2.s7
)
;
short8
dst_data
=
*
((
__global
short8
*
)((
__global
char
*
)
dst
+
dst_index
))
;
short8
data
=
src1_data
OP_BINARY
src2_data
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
short8
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
#
endif
__kernel
void
arithm_s_bitwise_binary_with_mask_C4_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
uchar4
src2,
int
rows,
int
cols,
int
dst_step1
)
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
cols,
int
rows
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
(
x
/
elemSize
)
+
mask_offset
)
;
if
(
mask[mask_index]
)
{
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
)
;
int
src2_index
=
x
%
elemSize
;
int
dst_index
=
mad24
(
y,
dst_step,
x
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
uchar4
src_data1
=
*
((
__global
uchar4
*
)(
src1
+
src1_index
))
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
data
=
src_data1
OP_BINARY
src2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
data
;
dst[dst_index]
=
src1[src1_index]
Operation
src2[src2_index]
;
}
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C4_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
char4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
char4
src_data1
=
*
((
__global
char4
*
)(
src1
+
src1_index
))
;
char4
dst_data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
data
=
src_data1
OP_BINARY
src2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
char4
*
)(
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C4_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
ushort4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
ushort4
src_data1
=
*
((
__global
ushort4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
ushort4
dst_data
=
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
ushort4
data
=
src_data1
OP_BINARY
src2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C4_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
short4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
3
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
short4
src_data1
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
short4
data
=
src_data1
OP_BINARY
src2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C4_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
int4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
4
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
4
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
int4
src_data1
=
*
((
__global
int4
*
)((
__global
char
*
)
src1
+
src1_index
))
;
int4
dst_data
=
*
((
__global
int4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
int4
data
=
src_data1
OP_BINARY
src2
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
int4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
__kernel
void
arithm_s_bitwise_binary_with_mask_C4_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
char16
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
4
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
4
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
char16
src1_data
=
*
((
__global
char16
*
)((
__global
char
*
)
src1
+
src1_index
))
;
char16
src2_data
=
(
char16
)(
src2.s0,
src2.s1,
src2.s2,
src2.s3,
src2.s4,
src2.s5,
src2.s6,
src2.s7,
src2.s8,
src2.s9,
src2.sa,
src2.sb,
src2.sc,
src2.sd,
src2.se,
src2.sf
)
;
char16
dst_data
=
*
((
__global
char16
*
)((
__global
char
*
)
dst
+
dst_index
))
;
char16
data
=
src1_data
OP_BINARY
src2_data
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
char16
*
)((
__global
char
*
)
dst
+
dst_index
))
=
data
;
}
}
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_s_bitwise_binary_with_mask_C4_D6
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
short16
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
5
)
+
src1_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
5
)
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
short4
src1_data_0
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
+
0
))
;
short4
src1_data_1
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
+
8
))
;
short4
src1_data_2
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
+
16
))
;
short4
src1_data_3
=
*
((
__global
short4
*
)((
__global
char
*
)
src1
+
src1_index
+
24
))
;
short4
src2_data_0
=
(
short4
)(
src2.s0,
src2.s1,
src2.s2,
src2.s3
)
;
short4
src2_data_1
=
(
short4
)(
src2.s4,
src2.s5,
src2.s6,
src2.s7
)
;
short4
src2_data_2
=
(
short4
)(
src2.s8,
src2.s9,
src2.sa,
src2.sb
)
;
short4
src2_data_3
=
(
short4
)(
src2.sc,
src2.sd,
src2.se,
src2.sf
)
;
short4
dst_data_0
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
;
short4
dst_data_1
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
;
short4
dst_data_2
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
;
short4
dst_data_3
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
+
24
))
;
short4
data_0
=
src1_data_0
OP_BINARY
src2_data_0
;
short4
data_1
=
src1_data_1
OP_BINARY
src2_data_1
;
short4
data_2
=
src1_data_2
OP_BINARY
src2_data_2
;
short4
data_3
=
src1_data_3
OP_BINARY
src2_data_3
;
data_0
=
mask_data
?
data_0
:
dst_data_0
;
data_1
=
mask_data
?
data_1
:
dst_data_1
;
data_2
=
mask_data
?
data_2
:
dst_data_2
;
data_3
=
mask_data
?
data_3
:
dst_data_3
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_1
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
data_2
;
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
+
24
))
=
data_3
;
}
}
#
endif
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment