opencv / Commits

Commit 0ad03162, authored Sep 24, 2013 by Ilya Lavrenov

    refactored and extended arithm operations add/sub/mul/div/absdiff

parent 5ff5fdd7

Showing 8 changed files with 283 additions and 2832 deletions
    modules/ocl/include/opencv2/ocl/ocl.hpp              +8    -11
    modules/ocl/src/arithm.cpp                            +106  -321
    modules/ocl/src/opencl/arithm_add.cl                  +51   -755
    modules/ocl/src/opencl/arithm_add_mask.cl             +79   -0
    modules/ocl/src/opencl/arithm_add_scalar.cl           +28   -430
    modules/ocl/src/opencl/arithm_add_scalar_mask.cl      +11   -544
    modules/ocl/src/opencl/arithm_div.cl                  +0    -468
    modules/ocl/src/opencl/arithm_mul.cl                  +0    -303
modules/ocl/include/opencv2/ocl/ocl.hpp

@@ -409,40 +409,37 @@ namespace cv
    CV_EXPORTS void split(const oclMat &src, vector<oclMat> &dst);

    ////////////////////////////// Arithmetics ///////////////////////////////////
    //#if defined DOUBLE_SUPPORT
    //typedef double F;
    //#else
    //typedef float F;
    //#endif
    // CV_EXPORTS void addWeighted(const oclMat& a,F alpha, const oclMat& b,F beta,F gama, oclMat& c);
    CV_EXPORTS void addWeighted(const oclMat &a, double alpha, const oclMat &b, double beta, double gama, oclMat &c);

    //! adds one matrix to another (c = a + b)
    // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
    CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c);
    //! adds one matrix to another (c = a + b)
    // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
    CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask);
    CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask = oclMat());
    //! adds scalar to a matrix (c = a + s)
    // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
    CV_EXPORTS void add(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
    //! subtracts one matrix from another (c = a - b)
    // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
    CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c);
    //! subtracts one matrix from another (c = a - b)
    // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
    CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask);
    CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask = oclMat());
    //! subtracts scalar from a matrix (c = a - s)
    // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
    CV_EXPORTS void subtract(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
    //! subtracts scalar from a matrix (c = a - s)
    // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
    CV_EXPORTS void subtract(const Scalar &sc, const oclMat &a, oclMat &c, const oclMat &mask = oclMat());
    //! computes element-wise product of the two arrays (c = a * b)
    // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
    CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
    //! multiplies matrix to a number (dst = scalar * src)
    // supports CV_32FC1 only
    CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
    //! computes element-wise quotient of the two arrays (c = a / b)
    // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
    CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
...
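The header change above merges the unmasked and masked overloads of add/subtract into single declarations with a defaulted mask argument. A minimal sketch of how a caller would use the merged API after this commit; the function name example() and its arguments are illustrative, not taken from the commit:

    #include "opencv2/ocl/ocl.hpp"

    using namespace cv;

    void example(const ocl::oclMat &a, const ocl::oclMat &b, const ocl::oclMat &mask8u)
    {
        ocl::oclMat c;

        // Same entry point with and without a mask: the mask parameter
        // now defaults to an empty oclMat() instead of requiring a
        // separate overload.
        ocl::add(a, b, c);                     // c = a + b
        ocl::add(a, b, c, mask8u);             // c = a + b where mask8u != 0
        ocl::subtract(a, Scalar::all(10), c);  // c = a - 10 per channel
    }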
modules/ocl/src/arithm.cpp

@@ -62,11 +62,11 @@ namespace cv
{
    namespace ocl
    {
        ////////////////////////////////OpenCL kernel strings/////////////////////
        //////////////////////////////// OpenCL kernel strings /////////////////////
        extern const char *transpose_kernel;
        extern const char *arithm_nonzero;
        extern const char *arithm_sum;
        extern const char *arithm_2_mat;
        extern const char *arithm_sum_3;
        extern const char *arithm_minMax;
        extern const char *arithm_minMax_mask;
...
@@ -74,6 +74,7 @@ namespace cv
        extern const char *arithm_minMaxLoc_mask;
        extern const char *arithm_LUT;
        extern const char *arithm_add;
        extern const char *arithm_add_mask;
        extern const char *arithm_add_scalar;
        extern const char *arithm_add_scalar_mask;
        extern const char *arithm_bitwise_binary;
...
@@ -83,9 +84,7 @@ namespace cv
        extern const char *arithm_bitwise_not;
        extern const char *arithm_compare_eq;
        extern const char *arithm_compare_ne;
        extern const char *arithm_mul;
        extern const char *arithm_div;
        extern const char *arithm_absdiff;
        extern const char *arithm_magnitudeSqr;
        extern const char *arithm_transpose;
        extern const char *arithm_flip;
        extern const char *arithm_flip_rc;
...
@@ -97,390 +96,176 @@ namespace cv
        extern const char *arithm_addWeighted;
        extern const char *arithm_phase;
        extern const char *arithm_pow;
        extern const char *arithm_magnitudeSqr;
        extern const char *arithm_setidentity;
        //extern const char * jhp_transpose_kernel;
        int64 kernelrealtotal = 0;
        int64 kernelalltotal = 0;
        int64 reducetotal = 0;
        int64 downloadtotal = 0;
        int64 alltotal = 0;
    }
}

//////////////////////////////////////////////////////////////////////////////
/////////////////////// add subtract multiply divide /////////////////////////
//////////////////////////////////////////////////////////////////////////////

template<typename T>
void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
                    string kernelName, const char **kernelString, void *_scalar, int op_type = 0)
{
    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
        return;
    }

    dst.create(src1.size(), src1.type());
    CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
              src1.rows == src2.rows && src2.rows == dst.rows);
    CV_Assert(src1.type() == src2.type() && src1.type() == dst.type());
    CV_Assert(src1.depth() != CV_8S);

    Context *clCxt = src1.clCxt;
    int channels = dst.oclchannels();
    int depth = dst.depth();

    int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
                                {4, 0, 4, 4, 1, 1, 1},
                                {4, 0, 4, 4, 1, 1, 1},
                                {4, 0, 4, 4, 1, 1, 1}};

    size_t vector_length = vector_lengths[channels - 1][depth];
    int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
    int cols = divUp(dst.cols * channels + offset_cols, vector_length);

//////////////////////////////////////////////////////////////////////////////
/////////////////////// add subtract multiply divide /////////////////////////
//////////////////////////////////////////////////////////////////////////////

    size_t localThreads[3] = { 64, 4, 1 };
    size_t globalThreads[3] = { cols, dst.rows, 1 };

enum { ADD = 0, SUB, MUL, DIV, ABS_DIFF };

    int dst_step1 = dst.cols * dst.elemSize();
    vector<pair<size_t, const void *> > args;
    args.push_back(make_pair(sizeof(cl_mem), (void *)&src1.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1.step));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1.offset));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&src2.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src2.step));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src2.offset));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst.step));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1.rows));
    args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst_step1));

    T scalar;
    if (_scalar != NULL)
    {
        double scalar1 = *((double *)_scalar);
        scalar = (T)scalar1;
        args.push_back(make_pair(sizeof(T), (void *)&scalar));
    }

    switch (op_type)
    {
    case MAT_ADD:
        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, "-D ARITHM_ADD");
        break;
    case MAT_SUB:
        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, "-D ARITHM_SUB");
        break;
    default:
        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
    }
}

static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
                           string kernelName, const char **kernelString, int op_type = 0)
{
    arithmetic_run<char>(src1, src2, dst, kernelName, kernelString, (void *)NULL, op_type);
}

static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask,
                           string kernelName, const char **kernelString, int op_type = 0)
static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const Scalar &scalar,
                                   const oclMat &mask, oclMat &dst, int op_type, bool use_scalar = false)
{
    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
    Context *clCxt = src1.clCxt;
    bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE);
    if (!hasDouble && (src1.depth() == CV_64F || src2.depth() == CV_64F || dst.depth() == CV_64F))
    {
        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
        return;
    }

    dst.create(src1.size(), src1.type());
    CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
              src1.rows == src2.rows && src2.rows == dst.rows &&
              src1.rows == mask.rows && src1.cols == mask.cols);
    CV_Assert(src1.type() == src2.type() && src1.type() == dst.type());
    CV_Assert(src1.depth() != CV_8S);
    CV_Assert(mask.type() == CV_8U);
    CV_Assert(src2.empty() || (!src2.empty() && src1.type() == src2.type() && src1.size() == src2.size()));
    CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size()));
    CV_Assert(op_type >= ADD && op_type <= ABS_DIFF);

    Context *clCxt = src1.clCxt;
    int channels = dst.oclchannels();
    int depth = dst.depth();
    dst.create(src1.size(), src1.type());

    int vector_lengths[4][7] = {{4, 4, 2, 2, 1, 1, 1},
                                {2, 2, 1, 1, 1, 1, 1},
                                {4, 4, 2, 2, 1, 1, 1},
                                {1, 1, 1, 1, 1, 1, 1}};

    int oclChannels = src1.oclchannels(), depth = src1.depth();
    int src1step1 = src1.step / src1.elemSize(), src1offset1 = src1.offset / src1.elemSize();
    int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize();
    int maskstep1 = mask.step, maskoffset1 = mask.offset / mask.elemSize();
    int dststep1 = dst.step / dst.elemSize(), dstoffset1 = dst.offset / dst.elemSize();
    oclMat m;

    size_t vector_length = vector_lengths[channels - 1][depth];
    int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1);
    int cols = divUp(dst.cols + offset_cols, vector_length);

    size_t localThreads[3] = { 16, 16, 1 };
    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
    size_t localThreads[3] = { 64, 4, 1 };
    size_t globalThreads[3] = { cols, dst.rows, 1 };

    std::string kernelName = op_type == ABS_DIFF ? "arithm_absdiff" : "arithm_binary_op";

    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
    const char * const WTypeMap[] = { "short", "short", "int", "int", "int", "float", "double" };
    const char operationsMap[] = { '+', '-', '*', '/', '-' };
    const char * const channelMap[] = { "", "", "2", "4", "4" };
    bool haveScalar = use_scalar || src2.empty();

    int WDepth = depth;
    if (haveScalar)
        WDepth = hasDouble && WDepth == CV_64F ? CV_64F : CV_32F;
    if (op_type == DIV)
        WDepth = hasDouble ? CV_64F : CV_32F;
    else if (op_type == MUL)
        WDepth = hasDouble && (depth == CV_32S || depth == CV_64F) ? CV_64F : CV_32F;

    std::string buildOptions = format("-D T=%s%s -D WT=%s%s -D convertToT=convert_%s%s%s -D Operation=%c"
                                      " -D convertToWT=convert_%s%s",
                                      typeMap[depth], channelMap[oclChannels],
                                      WTypeMap[WDepth], channelMap[oclChannels],
                                      typeMap[depth], channelMap[oclChannels],
                                      (depth >= CV_32F ? "" : (depth == CV_32S ? "_rte" : "_sat_rte")),
                                      operationsMap[op_type], WTypeMap[WDepth], channelMap[oclChannels]);

    int dst_step1 = dst.cols * dst.elemSize();
    vector<pair<size_t, const void *> > args;
    args.push_back(make_pair(sizeof(cl_mem), (void *)&src1.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1.step));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1.offset));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&src2.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src2.step));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src2.offset));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&mask.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&mask.step));
    args.push_back(make_pair(sizeof(cl_int), (void *)&mask.offset));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst.step));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1.rows));
    args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst_step1));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1step1));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1offset1));

    switch (op_type)
    if (!src2.empty())
    {
    case MAT_ADD:
        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_ADD");
        break;
    case MAT_SUB:
        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_SUB");
        break;
    default:
        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth);
    }
}

void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst)
{
    arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add, MAT_ADD);
}
void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
{
    arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add, MAT_ADD);
}

void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst)
{
    arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add, MAT_SUB);
}
void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
{
    arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add, MAT_SUB);
}
typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst,
                           string kernelName, const char **kernelString, void *scalar);

void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
{
    if (src1.clCxt->supportsFeature(Context::CL_DOUBLE) && (src1.depth() == CV_64F))
        arithmetic_run<double>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar));
    else
        arithmetic_run<float>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar));
}
void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
{
    if (src1.clCxt->supportsFeature(Context::CL_DOUBLE))
        arithmetic_run<double>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
    else
        arithmetic_run<float>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));

        args.push_back(make_pair(sizeof(cl_mem), (void *)&src2.data));
        args.push_back(make_pair(sizeof(cl_int), (void *)&src2step1));
        args.push_back(make_pair(sizeof(cl_int), (void *)&src2offset1));
}

template<typename WT, typename CL_WT>
void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask,
                           string kernelName, const char **kernelString, int isMatSubScalar)
{
    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
    {
        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
        return;
        kernelName += "_mat";
    }

    dst.create(src1.size(), src1.type());
    CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows && src1.type() == dst.type());
    //CV_Assert(src1.depth() != CV_8S);

    if (mask.data)
    if (haveScalar)
    {
        CV_Assert(mask.type() == CV_8U && src1.rows == mask.rows && src1.cols == mask.cols);
    }

    Context *clCxt = src1.clCxt;
    int channels = dst.oclchannels();
    int depth = dst.depth();

    WT s[4] = { saturate_cast<WT>(src2.val[0]), saturate_cast<WT>(src2.val[1]),
                saturate_cast<WT>(src2.val[2]), saturate_cast<WT>(src2.val[3]) };

    int vector_lengths[4][7] = {{4, 0, 2, 2, 1, 1, 1},
                                {2, 0, 1, 1, 1, 1, 1},
                                {4, 0, 2, 2, 1, 1, 1},
                                {1, 0, 1, 1, 1, 1, 1}};

        const int WDepthMap[] = { CV_16S, CV_16S, CV_32S, CV_32S, CV_32S, CV_32F, CV_64F };
        m.create(1, 1, CV_MAKE_TYPE(WDepthMap[WDepth], oclChannels));
        m.setTo(scalar);

    size_t vector_length = vector_lengths[channels - 1][depth];
    int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1);
    int cols = divUp(dst.cols + offset_cols, vector_length);

    size_t localThreads[3] = { 64, 4, 1 };
    size_t globalThreads[3] = { cols, dst.rows, 1 };

    int dst_step1 = dst.cols * dst.elemSize();
    vector<pair<size_t, const void *> > args;
    args.push_back(make_pair(sizeof(cl_mem), (void *)&src1.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1.step));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1.offset));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst.step));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset));
        args.push_back(make_pair(sizeof(cl_mem), (void *)&m.data));

    if (mask.data)
    {
        args.push_back(make_pair(sizeof(cl_mem), (void *)&mask.data));
        args.push_back(make_pair(sizeof(cl_int), (void *)&mask.step));
        args.push_back(make_pair(sizeof(cl_int), (void *)&mask.offset));
        kernelName += "_scalar";
    }
    args.push_back(make_pair(sizeof(CL_WT), (void *)&s));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1.rows));
    args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst_step1));
    if (isMatSubScalar != 0)
        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_SUB");
    else
        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_ADD");
}

static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, const char **kernelString, double scalar)
{
    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
    if (!mask.empty())
    {
        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
        return;
    }

    dst.create(src.size(), src.type());
    CV_Assert(src.cols == dst.cols && src.rows == dst.rows);
    CV_Assert(src.type() == dst.type());
    CV_Assert(src.depth() != CV_8S);

    Context *clCxt = src.clCxt;
    int channels = dst.oclchannels();
    int depth = dst.depth();

    int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
                                {4, 0, 4, 4, 1, 1, 1},
                                {4, 0, 4, 4, 1, 1, 1},
                                {4, 0, 4, 4, 1, 1, 1}};

        args.push_back(make_pair(sizeof(cl_mem), (void *)&mask.data));
        args.push_back(make_pair(sizeof(cl_int), (void *)&maskstep1));
        args.push_back(make_pair(sizeof(cl_int), (void *)&maskoffset1));

    size_t vector_length = vector_lengths[channels - 1][depth];
    int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
    int cols = divUp(dst.cols * channels + offset_cols, vector_length);
        kernelName += "_mask";
    }

    size_t localThreads[3] = { 64, 4, 1 };
    size_t globalThreads[3] = { cols, dst.rows, 1 };

    if (op_type == DIV)
        kernelName += "_div";

    int dst_step1 = dst.cols * dst.elemSize();
    vector<pair<size_t, const void *> > args;
    args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src.step));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src.offset));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst.step));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
    args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dst_step1));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dststep1));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dstoffset1));

    float f_scalar = (float)scalar;
    if (src.clCxt->supportsFeature(Context::CL_DOUBLE))
        args.push_back(make_pair(sizeof(cl_double), (void *)&scalar));
    else
    {
        args.push_back(make_pair(sizeof(cl_float), (void *)&f_scalar));
    }
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1.cols));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src1.rows));

    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
    openCLExecuteKernel(clCxt, mask.empty() ?
                            (!src2.empty() ? &arithm_add : &arithm_add_scalar) :
                            (!src2.empty() ? &arithm_add_mask : &arithm_add_scalar_mask),
                        kernelName, globalThreads, localThreads, args, -1, -1, buildOptions.c_str());
}

typedef void (*ArithmeticFuncS)(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask,
                                string kernelName, const char **kernelString, int isMatSubScalar);

static void arithmetic_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask,
                              string kernelName, const char **kernelString, int isMatSubScalar)
{
    static ArithmeticFuncS tab[8] =
    {
        arithmetic_scalar_run<int, cl_int4>,
        arithmetic_scalar_run<int, cl_int4>,
        arithmetic_scalar_run<int, cl_int4>,
        arithmetic_scalar_run<int, cl_int4>,
        arithmetic_scalar_run<int, cl_int4>,
        arithmetic_scalar_run<float, cl_float4>,
        arithmetic_scalar_run<double, cl_double4>,
        0
    };
    ArithmeticFuncS func = tab[src1.depth()];
    if (func == 0)
        cv::ocl::error("Unsupported arithmetic operation", __FILE__, __LINE__);
    func(src1, src2, dst, mask, kernelName, kernelString, isMatSubScalar);
}
static void arithmetic_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask,
                              string kernelName, const char **kernelString)

void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
{
    arithmetic_scalar(src1, src2, dst, mask, kernelName, kernelString, 0);
    arithmetic_run_generic(src1, src2, Scalar(), mask, dst, ADD);
}

void cv::ocl::add(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
{
    string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add";
    const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
    arithmetic_run_generic(src1, oclMat(), src2, mask, dst, ADD);
}
    arithmetic_scalar(src1, src2, dst, mask, kernelName, kernelString);

void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
{
    arithmetic_run_generic(src1, src2, Scalar(), mask, dst, SUB);
}

void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
{
    string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add";
    const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
    arithmetic_scalar(src1, src2, dst, mask, kernelName, kernelString, 1);
    arithmetic_run_generic(src1, oclMat(), src2, mask, dst, SUB);
}

void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, const oclMat &mask)
void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
{
    string kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add";
    const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
    arithmetic_scalar(src1, src2, dst, mask, kernelName, kernelString, -1);
    const bool use_scalar = !(std::abs(scalar - 1.0) < std::numeric_limits<double>::epsilon());
    arithmetic_run_generic(src1, src2, Scalar::all(scalar), oclMat(), dst, MUL, use_scalar);
}

void cv::ocl::multiply(double scalar, const oclMat &src, oclMat &dst)
{
    string kernelName = "arithm_muls";
    arithmetic_scalar_run(src, dst, kernelName, &arithm_mul, scalar);
    arithmetic_run_generic(src, oclMat(), Scalar::all(scalar), oclMat(), dst, MUL);
}

void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)
void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
{
    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE))
    {
        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
        return;
    }
    const bool use_scalar = !(std::abs(scalar - 1.0) < std::numeric_limits<double>::epsilon());
    arithmetic_run_generic(src1, src2, Scalar::all(scalar), oclMat(), dst, DIV, use_scalar);
}

    string kernelName = "arithm_s_div";
    arithmetic_scalar_run(src, dst, kernelName, &arithm_div, scalar);
void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)
{
    arithmetic_run_generic(src, oclMat(), Scalar::all(scalar), oclMat(), dst, DIV);
}

//////////////////////////////////////////////////////////////////////////////
///////////////////////////////// Absdiff ///////////////////////////////////
//////////////////////////////////////////////////////////////////////////////

void cv::ocl::absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst)
{
    arithmetic_run(src1, src2, dst, "arithm_absdiff", &arithm_absdiff);
    arithmetic_run_generic(src1, src2, Scalar(), oclMat(), dst, ABS_DIFF);
}

void cv::ocl::absdiff(const oclMat &src1, const Scalar &src2, oclMat &dst)
{
    string kernelName = "arithm_s_absdiff";
    oclMat mask;
    arithmetic_scalar(src1, src2, dst, mask, kernelName, &arithm_absdiff);
    arithmetic_run_generic(src1, oclMat(), src2, oclMat(), dst, ABS_DIFF);
}

//////////////////////////////////////////////////////////////////////////////
///////////////////////////////// compare ///////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
...
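The core of the refactoring is visible in arithmetic_run_generic above: instead of one hand-written kernel per type/channel/operation combination, a single generic kernel is specialized at compile time through -D defines. A minimal standalone sketch of the kind of option string that format() call produces, worked out by hand for an unmasked 8-bit single-channel add; the values are illustrative, not output captured from this commit:

    #include <cstdio>

    // Hypothetical re-derivation of the build options computed by
    // arithmetic_run_generic for depth CV_8U (= 0), one channel, op ADD,
    // no scalar: WDepth stays 0, and WTypeMap[0] is "short".
    int main()
    {
        const char *typeMap[]  = { "uchar", "char", "ushort", "short", "int", "float", "double" };
        const char *WTypeMap[] = { "short", "short", "int", "int", "int", "float", "double" };
        const char operationsMap[] = { '+', '-', '*', '/', '-' };
        const char *channelMap[] = { "", "", "2", "4", "4" };

        int depth = 0, WDepth = 0, oclChannels = 1, op = 0; // CV_8U, C1, ADD

        // depth < CV_32F and != CV_32S, so the conversion suffix is "_sat_rte"
        std::printf("-D T=%s%s -D WT=%s%s -D convertToT=convert_%s%s%s -D Operation=%c -D convertToWT=convert_%s%s\n",
                    typeMap[depth], channelMap[oclChannels],
                    WTypeMap[WDepth], channelMap[oclChannels],
                    typeMap[depth], channelMap[oclChannels], "_sat_rte",
                    operationsMap[op], WTypeMap[WDepth], channelMap[oclChannels]);
        // prints: -D T=uchar -D WT=short -D convertToT=convert_uchar_sat_rte
        //         -D Operation=+ -D convertToWT=convert_short
        return 0;
    }

With src2 non-empty, kernelName becomes "arithm_binary_op" + "_mat", i.e. the arithm_binary_op_mat kernel added in arithm_add.cl below.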
modules/ocl/src/opencl/arithm_add.cl

@@ -52,809 +52,105 @@
#endif
#endif

#ifdef ARITHM_ADD
#define ARITHM_OP(A,B) ((A) + (B))
#elif defined ARITHM_SUB
#define ARITHM_OP(A,B) ((A) - (B))
#endif

//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////ADD////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************add without mask**************************************/
__kernel void arithm_add_D0(__global uchar *src1, int src1_step, int src1_offset,
                            __global uchar *src2, int src2_step, int src2_offset,
                            __global uchar *dst, int dst_step, int dst_offset,
                            int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;

        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        uchar4 src2_data = vload4(0, src2 + src2_index_fix);

        if (src1_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
        }
        if (src2_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
        }

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global uchar4 *)(dst + dst_index)) = dst_data;
    }
}

__kernel void arithm_add_D2(__global ushort *src1, int src1_step, int src1_offset,
                            __global ushort *src2, int src2_step, int src2_offset,
                            __global ushort *dst, int dst_step, int dst_offset,
                            int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));

        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
        ushort4 tmp_data = convert_ushort4_sat(tmp);

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}

__kernel void arithm_add_D3(__global short *src1, int src1_step, int src1_offset,
                            __global short *src2, int src2_step, int src2_offset,
                            __global short *dst, int dst_step, int dst_offset,
                            int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));

        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
        int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
        short4 tmp_data = convert_short4_sat(tmp);

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}

__kernel void arithm_add_D4(__global int *src1, int src1_step, int src1_offset,
                            __global int *src2, int src2_step, int src2_offset,
                            __global int *dst, int dst_step, int dst_offset,
                            int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);

        int data1 = *((__global int *)((__global char *)src1 + src1_index));
        int data2 = *((__global int *)((__global char *)src2 + src2_index));
        long tmp = ARITHM_OP((long)(data1), (long)(data2));

        *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
    }
}

__kernel void arithm_add_D5(__global float *src1, int src1_step, int src1_offset,
                            __global float *src2, int src2_step, int src2_offset,
                            __global float *dst, int dst_step, int dst_offset,
                            int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);

        float data1 = *((__global float *)((__global char *)src1 + src1_index));
        float data2 = *((__global float *)((__global char *)src2 + src2_index));
        float tmp = ARITHM_OP(data1, data2);

        *((__global float *)((__global char *)dst + dst_index)) = tmp;
    }
}

#if defined (DOUBLE_SUPPORT)
__kernel void arithm_add_D6(__global double *src1, int src1_step, int src1_offset,
                            __global double *src2, int src2_step, int src2_offset,
                            __global double *dst, int dst_step, int dst_offset,
                            int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
        int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);

        double data1 = *((__global double *)((__global char *)src1 + src1_index));
        double data2 = *((__global double *)((__global char *)src2 + src2_index));

        *((__global double *)((__global char *)dst + dst_index)) = ARITHM_OP(data1, data2);
    }
}
#endif

/**************************************add with mask**************************************/
__kernel void arithm_add_with_mask_C1_D0(__global uchar *src1, int src1_step, int src1_offset,
                                         __global uchar *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global uchar *dst, int dst_step, int dst_offset,
                                         int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        int mask_index_fix = mask_index < 0 ? 0 : mask_index;

        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
        uchar4 mask_data = vload4(0, mask + mask_index_fix);

        if (src1_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
        }
        if (src2_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
        }
        if (mask_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
            mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
        }

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}

__kernel void arithm_add_with_mask_C1_D2(__global ushort *src1, int src1_step, int src1_offset,
                                         __global ushort *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global ushort *dst, int dst_step, int dst_offset,
                                         int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
        ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index));
        uchar2 mask_data = vload2(0, mask + mask_index);

        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
        int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
        ushort2 tmp_data = convert_ushort2_sat(tmp);

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;

        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
    }
}

__kernel void arithm_add_with_mask_C1_D3(__global short *src1, int src1_step, int src1_offset,
                                         __global short *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global short *dst, int dst_step, int dst_offset,
                                         int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
        short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
        uchar2 mask_data = vload2(0, mask + mask_index);

        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
        int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
        short2 tmp_data = convert_short2_sat(tmp);

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;

        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
    }
}

__kernel void arithm_add_with_mask_C1_D4(__global int *src1, int src1_step, int src1_offset,
                                         __global int *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global int *dst, int dst_step, int dst_offset,
                                         int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
        int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
        int dst_data = *((__global int *)((__global char *)dst + dst_index));

        int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
        data = mask_data ? data : dst_data;

        *((__global int *)((__global char *)dst + dst_index)) = data;
    }
}

__kernel void arithm_add_with_mask_C1_D5(__global float *src1, int src1_step, int src1_offset,
                                         __global float *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global float *dst, int dst_step, int dst_offset,
                                         int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
        float src_data2 = *((__global float *)((__global char *)src2 + src2_index));
        float dst_data = *((__global float *)((__global char *)dst + dst_index));

        float data = ARITHM_OP(src_data1, src_data2);
        data = mask_data ? data : dst_data;

        *((__global float *)((__global char *)dst + dst_index)) = data;
    }
}

#if defined (DOUBLE_SUPPORT)
__kernel void arithm_add_with_mask_C1_D6(__global double *src1, int src1_step, int src1_offset,
                                         __global double *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global double *dst, int dst_step, int dst_offset,
                                         int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
        double src_data2 = *((__global double *)((__global char *)src2 + src2_index));
        double dst_data = *((__global double *)((__global char *)dst + dst_index));

        double data = ARITHM_OP(src_data1, src_data2);
        data = mask_data ? data : dst_data;

        *((__global double *)((__global char *)dst + dst_index)) = data;
    }
}
#endif

__kernel void arithm_add_with_mask_C2_D0(__global uchar *src1, int src1_step, int src1_offset,
                                         __global uchar *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global uchar *dst, int dst_step, int dst_offset,
                                         int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        uchar4 src1_data = vload4(0, src1 + src1_index);
        uchar4 src2_data = vload4(0, src2 + src2_index);
        uchar2 mask_data = vload2(0, mask + mask_index);

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
        data.zw = ((mask_data.y) && (dst_index + 2 < dst_end)) ? tmp_data.zw : data.zw;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}

__kernel void arithm_add_with_mask_C2_D2(__global ushort *src1, int src1_step, int src1_offset,
                                         __global ushort *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global ushort *dst, int dst_step, int dst_offset,
                                         int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
        ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
        ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));

        int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
        ushort2 data = convert_ushort2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
}

__kernel void arithm_add_with_mask_C2_D3(__global short *src1, int src1_step, int src1_offset,
                                         __global short *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global short *dst, int dst_step, int dst_offset,
                                         int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
        short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
        short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));

        int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
        short2 data = convert_short2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
}

__kernel void arithm_add_with_mask_C2_D4(__global int *src1, int src1_step, int src1_offset,
                                         __global int *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global int *dst, int dst_step, int dst_offset,
                                         int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
        int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
        int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));

        int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
        data = mask_data ? data : dst_data;

        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
}

__kernel void arithm_add_with_mask_C2_D5(__global float *src1, int src1_step, int src1_offset,
                                         __global float *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global float *dst, int dst_step, int dst_offset,
                                         int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
        float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index));
        float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));

        float2 data = ARITHM_OP(src_data1, src_data2);
        data = mask_data ? data : dst_data;

        *((__global float2 *)((__global char *)dst + dst_index)) = data;
    }
}

/////////////////////////////////////////////ADD////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////

#if defined (DOUBLE_SUPPORT)
__kernel void arithm_add_with_mask_C2_D6(__global double *src1, int src1_step, int src1_offset,
                                         __global double *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global double *dst, int dst_step, int dst_offset,
                                         int rows, int cols, int dst_step1)
__kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_offset,
                                   __global T *src2, int src2_step, int src2_offset,
                                   __global T *dst, int dst_step, int dst_offset,
                                   int cols, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
        int src1_index = mad24(y, src1_step, x + src1_offset);
        int src2_index = mad24(y, src2_step, x + src2_offset);
        int dst_index = mad24(y, dst_step, x + dst_offset);

        uchar mask_data = *(mask + mask_index);

        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
        double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index));
        double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));

        double2 data = ARITHM_OP(src_data1, src_data2);
        data = mask_data ? data : dst_data;

        *((__global double2 *)((__global char *)dst + dst_index)) = data;
        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index]));
    }
}
#endif
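The one-line body of the generic arithm_binary_op_mat kernel just above replaces all of the per-type kernels being deleted in this file. For illustration, this is how that kernel reads once the preprocessor substitutes the example build options from arithm.cpp (-D T=uchar -D WT=short -D convertToT=convert_uchar_sat_rte -D Operation=+ -D convertToWT=convert_short); the expansion is written out by hand, not taken from the commit:

    // arithm_binary_op_mat instantiated for a saturating uchar add
    __kernel void arithm_binary_op_mat(__global uchar *src1, int src1_step, int src1_offset,
                                       __global uchar *src2, int src2_step, int src2_offset,
                                       __global uchar *dst, int dst_step, int dst_offset,
                                       int cols, int rows)
    {
        int x = get_global_id(0);
        int y = get_global_id(1);

        if (x < cols && y < rows)
        {
            int src1_index = mad24(y, src1_step, x + src1_offset);
            int src2_index = mad24(y, src2_step, x + src2_offset);
            int dst_index  = mad24(y, dst_step,  x + dst_offset);

            // 8-bit add computed in 16 bits and saturated back to uchar,
            // which is what the explicit uchar4/short4 code in the old
            // arithm_add_D0 kernel did by hand.
            dst[dst_index] = convert_uchar_sat_rte(convert_short(src1[src1_index]) +
                                                   convert_short(src2[src2_index]));
        }
    }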
__kernel
void
arithm_add_with_mask_C4_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*mask,
int
mask_step,
int
mask_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__kernel
void
arithm_binary_op_mat_div
(
__global
T
*src1,
int
src1_step,
int
src1_offset,
__global
T
*src2,
int
src2_step,
int
src2_offset,
__global
T
*dst,
int
dst_step,
int
dst_offset,
int
cols,
int
rows
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
{
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
2
)
+
src2_offset
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
(
x
<<
2
)
+
dst_offset
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
)
;
int
dst_index
=
mad24
(
y,
dst_step,
x
+
dst_offset
)
;
uchar
mask_data
=
*
(
mask
+
mask_index
)
;
uchar4
src_data1
=
*
((
__global
uchar4
*
)(
src1
+
src1_index
))
;
uchar4
src_data2
=
*
((
__global
uchar4
*
)(
src2
+
src2_index
))
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
data
=
convert_uchar4_sat
(
ARITHM_OP
(
convert_short4_sat
(
src_data1
)
,
convert_short4_sat
(
src_data2
)))
;
data
=
mask_data
?
data
:
dst_data
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
data
;
T
zero
=
(
T
)(
0
)
;
dst[dst_index]
=
src2[src2_index]
==
zero
?
zero
:
convertToT
(
convertToWT
(
src1[src1_index]
)
/
convertToWT
(
src2[src2_index]
))
;
}
}
__kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                          __global ushort *src2, int src2_step, int src2_offset,
                                          __global uchar  *mask, int mask_step, int mask_offset,
                                          __global ushort *dst,  int dst_step,  int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
        ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
        ushort4 dst_data  = *((__global ushort4 *)((__global char *)dst  + dst_index));

        ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
        data = mask_data ? data : dst_data;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
                                          __global short *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global short *dst,  int dst_step,  int dst_offset,
                                          int rows, int cols, int dst_step1)
__kernel void arithm_absdiff_mat (__global T *src1, int src1_step, int src1_offset,
                                  __global T *src2, int src2_step, int src2_offset,
                                  __global T *dst, int dst_step, int dst_offset,
                                  int cols, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
        short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
        short4 dst_data  = *((__global short4 *)((__global char *)dst  + dst_index));

        short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
        data = mask_data ? data : dst_data;
        int src1_index = mad24(y, src1_step, x + src1_offset);
        int src2_index = mad24(y, src2_step, x + src2_offset);
        int dst_index  = mad24(y, dst_step, x + dst_offset);

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
        WT value = convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]);
        value = value > (WT)(0) ? value : -value;
        dst[dst_index] = convertToT(value);
    }
}
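arithm_absdiff_mat forms the difference in the wider working type WT before negating, so the absolute value cannot overflow at the signed extremes. A small illustration, assuming T=short and WT=int (hypothetical helper, not part of the diff):

// hypothetical expansion for T=short, WT=int (illustrative only):
// |(-32768) - 32767| = 65535 fits in int and saturates to 32767,
// whereas negating the difference directly in short would overflow.
inline short absdiff_s16(short a, short b)
{
    int value = convert_int(a) - convert_int(b);
    value = value > 0 ? value : -value;
    return convert_short_sat(value);
}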
__kernel void arithm_add_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
                                          __global int *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global int *dst,  int dst_step,  int dst_offset,
                                          int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
        int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
        int4 dst_data  = *((__global int4 *)((__global char *)dst  + dst_index));

        int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src_data2)));
        data = mask_data ? data : dst_data;

        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
                                          __global float *src2, int src2_step, int src2_offset,
                                          __global uchar *mask, int mask_step, int mask_offset,
                                          __global float *dst,  int dst_step,  int dst_offset,
                                          int rows, int cols, int dst_step1)
// add mat with scale for multiply
__kernel void arithm_binary_op_mat_scalar (__global T *src1, int src1_step, int src1_offset,
                                           __global T *src2, int src2_step, int src2_offset,
                                           __global WT *scalar,
                                           __global T *dst, int dst_step, int dst_offset,
                                           int cols, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
        int src1_index = mad24(y, src1_step, x + src1_offset);
        int src2_index = mad24(y, src2_step, x + src2_offset);
        int dst_index  = mad24(y, dst_step, x + dst_offset);

        uchar mask_data = *(mask + mask_index);

        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
        float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index));
        float4 dst_data  = *((__global float4 *)((__global char *)dst  + dst_index));

        float4 data = ARITHM_OP(src_data1, src_data2);
        data = mask_data ? data : dst_data;

        *((__global float4 *)((__global char *)dst + dst_index)) = data;
        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0] * convertToWT(src2[src2_index]));
    }
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
                                          __global double *src2, int src2_step, int src2_offset,
                                          __global uchar  *mask, int mask_step, int mask_offset,
                                          __global double *dst,  int dst_step,  int dst_offset,
                                          int rows, int cols, int dst_step1)
// add mat with scale for divide
__kernel void arithm_binary_op_mat_scalar_div (__global T *src1, int src1_step, int src1_offset,
                                               __global T *src2, int src2_step, int src2_offset,
                                               __global WT *scalar,
                                               __global T *dst, int dst_step, int dst_offset,
                                               int cols, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
        int src2_index = mad24(y, src2_step, (x << 5) + src2_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
        int src1_index = mad24(y, src1_step, x + src1_offset);
        int src2_index = mad24(y, src2_step, x + src2_offset);
        int dst_index  = mad24(y, dst_step, x + dst_offset);

        uchar mask_data = *(mask + mask_index);

        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
        double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index));
        double4 dst_data  = *((__global double4 *)((__global char *)dst  + dst_index));

        double4 data = ARITHM_OP(src_data1, src_data2);
        data = mask_data ? data : dst_data;

        *((__global double4 *)((__global char *)dst + dst_index)) = data;
        T zero = (T)(0);
        dst[dst_index] = src2[src2_index] == zero ? zero : convertToT(convertToWT(src1[src1_index]) * scalar[0] / convertToWT(src2[src2_index]));
    }
}
#endif
modules/ocl/src/opencl/arithm_add_mask.cl
0 → 100644
View file @ 0ad03162
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other oclMaterials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif

//////////////////////////////////////////////////////////////////////////////////
///////////////////////////////// add with mask //////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////

__kernel void arithm_binary_op_mat_mask (__global T *src1, int src1_step, int src1_offset,
                                         __global T *src2, int src2_step, int src2_offset,
                                         __global uchar *mask, int mask_step, int mask_offset,
                                         __global T *dst, int dst_step, int dst_offset,
                                         int cols, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int mask_index = mad24(y, mask_step, x + mask_offset);
        if (mask[mask_index])
        {
            int src1_index = mad24(y, src1_step, x + src1_offset);
            int src2_index = mad24(y, src2_step, x + src2_offset);
            int dst_index  = mad24(y, dst_step, dst_offset + x);

            dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index]));
        }
    }
}
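Worth noting: the removed per-type kernels always read the destination and blended with a ternary select on the mask byte, while this generic kernel branches on the mask and writes only covered pixels, so it never needs the destination read. A schematic contrast, assuming uchar data (illustrative helpers only, not from the diff):

// old style: unconditional read-modify-write with a select
inline void mask_by_select(__global uchar *dst, int i, uchar m, uchar v)
{
    uchar old = dst[i];      // always reads dst...
    dst[i] = m ? v : old;    // ...then blends and always writes back
}

// new style: conditional store, no destination read needed
inline void mask_by_branch(__global uchar *dst, int i, uchar m, uchar v)
{
    if (m)                   // touches dst only where the mask is set
        dst[i] = v;
}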
modules/ocl/src/opencl/arithm_add_scalar.cl
View file @ 0ad03162
... ...
@@ -51,463 +51,61 @@
#endif
#endif

#ifdef ARITHM_ADD
#define ARITHM_OP(A,B) ((A) + (B))
#elif defined ARITHM_SUB
#define ARITHM_OP(A,B) ((A) - (B))
#endif

/**************************************add with scalar without mask**************************************/
__kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                  __global uchar *dst,  int dst_step,  int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
        if (src1_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy : src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz : tmp.xyzw;
        }

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                  __global ushort *dst,  int dst_step,  int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
        int2 src2_data = (int2)(src2.x, src2.x);

        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
        int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
        ushort2 tmp_data = convert_ushort2_sat(tmp);

        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
        data.y = (dst_index + 2 <  dst_end)   ? tmp_data.y : data.y;

        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src1_offset,
                                  __global short *dst,  int dst_step,  int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
        int2 src2_data = (int2)(src2.x, src2.x);
        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));

        int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
        short2 tmp_data = convert_short2_sat(tmp);

        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
        data.y = (dst_index + 2 <  dst_end)   ? tmp_data.y : data.y;

        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_C1_D4 (__global int *src1, int src1_step, int src1_offset,
                                  __global int *dst,  int dst_step,  int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
        int src_data2 = src2.x;
        int dst_data  = *((__global int *)((__global char *)dst + dst_index));

        int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));

        *((__global int *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_C1_D5 (__global float *src1, int src1_step, int src1_offset,
                                  __global float *dst,  int dst_step,  int dst_offset,
                                  float4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
        float src_data2 = src2.x;
        float dst_data  = *((__global float *)((__global char *)dst + dst_index));

        float data = ARITHM_OP(src_data1, src_data2);

        *((__global float *)((__global char *)dst + dst_index)) = data;
    }
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_C1_D6 (__global double *src1, int src1_step, int src1_offset,
                                  __global double *dst,  int dst_step,  int dst_offset,
                                  double4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
        double src2_data = src2.x;
        double dst_data  = *((__global double *)((__global char *)dst + dst_index));

        double data = ARITHM_OP(src_data1, src2_data);

        *((__global double *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
__kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                  __global uchar *dst,  int dst_step,  int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        uchar4 src1_data = vload4(0, src1 + src1_index);
        int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
        data.zw = (dst_index + 2 <  dst_end)   ? tmp_data.zw : data.zw;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                  __global ushort *dst,  int dst_step,  int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1)
{
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////// Add with scalar /////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
        int2 src_data2 = (int2)(src2.x, src2.y);
        ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));

        int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
        ushort2 data = convert_ushort2_sat(tmp);

        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_C2_D3 (__global short *src1, int src1_step, int src1_offset,
                                  __global short *dst,  int dst_step,  int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
        int2 src_data2 = (int2)(src2.x, src2.y);
        short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));

        int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
        short2 data = convert_short2_sat(tmp);

        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_C2_D4 (__global int *src1, int src1_step, int src1_offset,
                                  __global int *dst,  int dst_step,  int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
        int2 src_data2 = (int2)(src2.x, src2.y);
        int2 dst_data  = *((__global int2 *)((__global char *)dst + dst_index));

        int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));

        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_C2_D5 (__global float *src1, int src1_step, int src1_offset,
                                  __global float *dst,  int dst_step,  int dst_offset,
                                  float4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
        float2 src_data2 = (float2)(src2.x, src2.y);
        float2 dst_data  = *((__global float2 *)((__global char *)dst + dst_index));

        float2 data = ARITHM_OP(src_data1, src_data2);

        *((__global float2 *)((__global char *)dst + dst_index)) = data;
    }
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_C2_D6 (__global double *src1, int src1_step, int src1_offset,
                                  __global double *dst,  int dst_step,  int dst_offset,
                                  double4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
        double2 src_data2 = (double2)(src2.x, src2.y);
        double2 dst_data  = *((__global double2 *)((__global char *)dst + dst_index));

        double2 data = ARITHM_OP(src_data1, src_data2);

        *((__global double2 *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
__kernel void arithm_s_add_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                  __global uchar *dst,  int dst_step,  int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_binary_op_scalar (__global T *src1, int src1_step, int src1_offset,
                                       __global WT *scalar,
                                       __global T *dst, int dst_step, int dst_offset,
                                       int cols, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
        int src1_index = mad24(y, src1_step, x + src1_offset);
        int dst_index  = mad24(y, dst_step, x + dst_offset);

        uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));

        *((__global uchar4 *)(dst + dst_index)) = data;
        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]);
    }
}
__kernel void arithm_s_add_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                  __global ushort *dst,  int dst_step,  int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));

        ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));

        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_C4_D3 (__global short *src1, int src1_step, int src1_offset,
                                  __global short *dst,  int dst_step,  int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_absdiff_scalar (__global T *src1, int src1_step, int src1_offset,
                                     __global WT *src2,
                                     __global T *dst, int dst_step, int dst_offset,
                                     int cols, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));

        short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
        int src1_index = mad24(y, src1_step, x + src1_offset);
        int dst_index  = mad24(y, dst_step, x + dst_offset);

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
        WT value = convertToWT(src1[src1_index]) - src2[0];
        value = value > (WT)(0) ? value : -value;
        dst[dst_index] = convertToT(value);
    }
}
__kernel void arithm_s_add_C4_D4 (__global int *src1, int src1_step, int src1_offset,
                                  __global int *dst,  int dst_step,  int dst_offset,
                                  int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));

        int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2)));

        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_C4_D5 (__global float *src1, int src1_step, int src1_offset,
                                  __global float *dst,  int dst_step,  int dst_offset,
                                  float4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));

        float4 data = ARITHM_OP(src_data1, src2);

        *((__global float4 *)((__global char *)dst + dst_index)) = data;
    }
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_C4_D6 (__global double *src1, int src1_step, int src1_offset,
                                  __global double *dst,  int dst_step,  int dst_offset,
                                  double4 src2, int rows, int cols, int dst_step1)
// scalar divide to matrix
__kernel void arithm_binary_op_scalar_div (__global T *src1, int src1_step, int src1_offset,
                                           __global WT *scalar,
                                           __global T *dst, int dst_step, int dst_offset,
                                           int cols, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
        int src1_index = mad24(y, src1_step, x + src1_offset);
        int dst_index  = mad24(y, dst_step, x + dst_offset);

        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));

        double4 data = ARITHM_OP(src_data1, src2);

        *((__global double4 *)((__global char *)dst + dst_index)) = data;
        T zero = (T)(0);
        dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar[0] / convertToWT(src1[src1_index]));
    }
}
#endif
modules/ocl/src/opencl/arithm_add_scalar_mask.cl
View file @ 0ad03162
... ...
@@ -51,561 +51,28 @@
#endif
#endif

#ifdef ARITHM_ADD
#define ARITHM_OP(A,B) ((A) + (B))
#elif defined ARITHM_SUB
#define ARITHM_OP(A,B) ((A) - (B))
#endif

/**************************************add with scalar with mask**************************************/
__kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                            __global uchar *dst,  int dst_step,  int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int mask_index_fix = mask_index < 0 ? 0 : mask_index;

        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
        uchar4 mask_data = vload4(0, mask + mask_index_fix);
        if (src1_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy : src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz : tmp.xyzw;
        }
        if (mask_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (mask_index == -2) ? mask_data.zwxy : mask_data.yzwx;
            mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz : tmp.xyzw;
        }

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                            __global ushort *dst,  int dst_step,  int dst_offset,
                                            __global uchar  *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
        int2 src2_data = (int2)(src2.x, src2.x);
        uchar2 mask_data = vload2(0, mask + mask_index);

        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
        int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
        ushort2 tmp_data = convert_ushort2_sat(tmp);

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end))   ? tmp_data.y : data.y;

        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
                                            __global short *dst,  int dst_step,  int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
        int2 src2_data = (int2)(src2.x, src2.x);
        uchar2 mask_data = vload2(0, mask + mask_index);

        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
        int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
        short2 tmp_data = convert_short2_sat(tmp);

        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end))   ? tmp_data.y : data.y;

        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
                                            __global int *dst,  int dst_step,  int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
        int src_data2 = src2.x;
        int dst_data  = *((__global int *)((__global char *)dst + dst_index));

        int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
        data = mask_data ? data : dst_data;

        *((__global int *)((__global char *)dst + dst_index)) = data;
    }
}
///////////////////////////////////////////////////////////////////////////////////
//////////////////////////// Add with scalar with mask ////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_s_add_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset,
                                            __global float *dst,  int dst_step,  int dst_offset,
__kernel void arithm_binary_op_scalar_mask (__global T *src1, int src1_step, int src1_offset,
                                            __global WT *scalar,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            float4 src2, int rows, int cols, int dst_step1)
                                            __global T *dst, int dst_step, int dst_offset,
                                            int cols, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
        float src_data2 = src2.x;
        float dst_data  = *((__global float *)((__global char *)dst + dst_index));

        float data = ARITHM_OP(src_data1, src_data2);
        data = mask_data ? data : dst_data;

        *((__global float *)((__global char *)dst + dst_index)) = data;
    }
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset,
                                            __global double *dst,  int dst_step,  int dst_offset,
                                            __global uchar  *mask, int mask_step, int mask_offset,
                                            double4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
        double src_data2 = src2.x;
        double dst_data  = *((__global double *)((__global char *)dst + dst_index));

        double data = ARITHM_OP(src_data1, src_data2);
        data = mask_data ? data : dst_data;

        *((__global double *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
__kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                            __global uchar *dst,  int dst_step,  int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        uchar4 src1_data = vload4(0, src1 + src1_index);
        int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
        uchar2 mask_data = vload2(0, mask + mask_index);

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
        uchar4 tmp_data = convert_uchar4_sat(tmp);

        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end))   ? tmp_data.zw : data.zw;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                            __global ushort *dst,  int dst_step,  int dst_offset,
                                            __global uchar  *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    if (mask[mask_index])
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
        int2 src_data2 = (int2)(src2.x, src2.y);
        ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
        int src1_index = mad24(y, src1_step, x + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + x);

        int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
        ushort2 data = convert_ushort2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]);
    }
}
__kernel void arithm_s_add_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
                                            __global short *dst,  int dst_step,  int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
        int2 src_data2 = (int2)(src2.x, src2.y);
        short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));

        int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
        short2 data = convert_short2_sat(tmp);
        data = mask_data ? data : dst_data;

        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
                                            __global int *dst,  int dst_step,  int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
        int2 src_data2 = (int2)(src2.x, src2.y);
        int2 dst_data  = *((__global int2 *)((__global char *)dst + dst_index));

        int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
        data = mask_data ? data : dst_data;

        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset,
                                            __global float *dst,  int dst_step,  int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            float4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
        float2 src_data2 = (float2)(src2.x, src2.y);
        float2 dst_data  = *((__global float2 *)((__global char *)dst + dst_index));

        float2 data = ARITHM_OP(src_data1, src_data2);
        data = mask_data ? data : dst_data;

        *((__global float2 *)((__global char *)dst + dst_index)) = data;
    }
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset,
                                            __global double *dst,  int dst_step,  int dst_offset,
                                            __global uchar  *mask, int mask_step, int mask_offset,
                                            double4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
        double2 src_data2 = (double2)(src2.x, src2.y);
        double2 dst_data  = *((__global double2 *)((__global char *)dst + dst_index));

        double2 data = ARITHM_OP(src_data1, src_data2);
        data = mask_data ? data : dst_data;

        *((__global double2 *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
__kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                            __global uchar *dst,  int dst_step,  int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
        uchar4 dst_data  = *((__global uchar4 *)(dst + dst_index));

        uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
        data = mask_data ? data : dst_data;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                            __global ushort *dst,  int dst_step,  int dst_offset,
                                            __global uchar  *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
        ushort4 dst_data  = *((__global ushort4 *)((__global char *)dst + dst_index));

        ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
        data = mask_data ? data : dst_data;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
                                            __global short *dst,  int dst_step,  int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
        short4 dst_data  = *((__global short4 *)((__global char *)dst + dst_index));

        short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
        data = mask_data ? data : dst_data;

        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
                                            __global int *dst,  int dst_step,  int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
        int4 dst_data  = *((__global int4 *)((__global char *)dst + dst_index));

        int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2)));
        data = mask_data ? data : dst_data;

        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
}
__kernel void arithm_s_add_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
                                            __global float *dst,  int dst_step,  int dst_offset,
                                            __global uchar *mask, int mask_step, int mask_offset,
                                            float4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
        float4 dst_data  = *((__global float4 *)((__global char *)dst + dst_index));

        float4 data = ARITHM_OP(src_data1, src2);
        data = mask_data ? data : dst_data;

        *((__global float4 *)((__global char *)dst + dst_index)) = data;
    }
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
                                            __global double *dst,  int dst_step,  int dst_offset,
                                            __global uchar  *mask, int mask_step, int mask_offset,
                                            double4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
        int mask_index = mad24(y, mask_step, x + mask_offset);
        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);

        uchar mask_data = *(mask + mask_index);

        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
        double4 dst_data  = *((__global double4 *)((__global char *)dst + dst_index));

        double4 data = ARITHM_OP(src_data1, src2);
        data = mask_data ? data : dst_data;

        *((__global double4 *)((__global char *)dst + dst_index)) = data;
    }
}
#endif
modules/ocl/src/opencl/arithm_div.cl
deleted 100644 → 0
View file @ 5ff5fdd7
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other oclMaterials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#define convert_F double
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
#define convert_F float
#endif
inline uchar round2_uchar(F v)
{
    return convert_uchar_sat(round(v));
}

inline ushort round2_ushort(F v)
{
    return convert_ushort_sat(round(v));
}

inline short round2_short(F v)
{
    return convert_short_sat(round(v));
}

inline int round2_int(F v)
{
    return convert_int_sat(round(v));
}
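These helpers round before saturating, using OpenCL C's round(), which rounds halfway cases away from zero, so the division results track CPU behaviour more closely than the default truncating conversion. A hypothetical self-check reusing the file's F typedef (illustrative only, not part of the original kernels):

// hypothetical self-check for the rounding helpers
__kernel void round2_selftest(__global int *ok)
{
    // round() in OpenCL C rounds half away from zero
    ok[0] = (round2_uchar((F)2.5) == 3)
         && (round2_uchar((F)-0.6) == 0)        // rounds to -1, saturates to 0
         && (round2_short((F)32768.4) == 32767); // saturating conversion
}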
///////////////////////////////////////////////////////////////////////////////////////
////////////////////////////divide///////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
/**********************************div*********************************************/
__kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offset,
                             __global uchar *src2, int src2_step, int src2_offset,
                             __global uchar *dst, int dst_step, int dst_offset,
                             int rows, int cols, int dst_step1, F scalar)
{
    int2 coor = (int2)(get_global_id(0), get_global_id(1));

    if (coor.x < cols && coor.y < rows)
    {
        coor.x = coor.x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        int2 src_index = (int2)(mad24(coor.y, src1_step, coor.x + src1_offset - dst_align),
                                mad24(coor.y, src2_step, coor.x + src2_offset - dst_align));

        int4 dst_args = (int4)(mad24(coor.y, dst_step, dst_offset),
                               mad24(coor.y, dst_step, dst_offset + dst_step1),
                               mad24(coor.y, dst_step, dst_offset + coor.x & (int)0xfffffffc),
                               0);

        uchar4 src1_data = vload4(0, src1 + src_index.x);
        uchar4 src2_data = vload4(0, src2 + src_index.y);
        uchar4 dst_data  = *((__global uchar4 *)(dst + dst_args.z));

        F4 tmp = convert_F4(src1_data) * scalar;

        uchar4 tmp_data;
        tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w);
dst_data.x = ((dst_args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_args.z)) = dst_data;
}
}
__kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
F4 tmp = convert_F4(src1_data) * scalar;
ushort4 tmp_data;
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_ushort(tmp.x / (F)src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_ushort(tmp.y / (F)src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_ushort(tmp.z / (F)src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_ushort(tmp.w / (F)src2_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
F4 tmp = convert_F4(src1_data) * scalar;
short4 tmp_data;
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_short(tmp.x / (F)src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_short(tmp.y / (F)src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_short(tmp.z / (F)src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_short(tmp.w / (F)src2_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
F tmp = (convert_F)(data1) * scalar;
int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / (convert_F)(data2));
*((__global int *)((__global char *)dst + dst_index)) = tmp_data;
}
}
__kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
F tmp = (convert_F)(data1) * scalar;
float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / (convert_F)(data2));
*((__global float *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
double tmp = data1 * scalar;
double tmp_data = (tmp == 0 || data2 == 0) ? 0 : (tmp / data2);
*((__global double *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
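All of the division kernels share one convention: if either the scaled numerator or the denominator is zero, the result is forced to 0 rather than dividing. A host-side reference for a single CV_8U element, handy for checking kernel output (the function name and shape are illustrative, not part of the module):
/* Reference computation for one uchar element (illustrative only). */
unsigned char div_ref_u8(unsigned char a, unsigned char b, double scale)
{
    double t = (double)a * scale;
    if (t == 0 || b == 0)
        return 0;                    /* kernel convention: x / 0 -> 0 */
    double r = t / (double)b;
    r += (r >= 0 ? 0.5 : -0.5);      /* round half away from zero */
    return r < 0 ? 0 : (r > 255 ? 255 : (unsigned char)r);
}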
/************************************div with scalar************************************/
__kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src_index = mad24(y, src_step, x + src_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src_data = vload4(0, src + src_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data;
tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_uchar(scalar / (F)src_data.x);
tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_uchar(scalar / (F)src_data.y);
tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_uchar(scalar / (F)src_data.z);
tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_uchar(scalar / (F)src_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src_data = vload4(0, (__global ushort *)((__global char *)src + src_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data;
tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_ushort(scalar / (F)src_data.x);
tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_ushort(scalar / (F)src_data.y);
tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_ushort(scalar / (F)src_data.z);
tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_ushort(scalar / (F)src_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src_data = vload4(0, (__global short *)((__global char *)src + src_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data;
tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_short(scalar / (F)src_data.x);
tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_short(scalar / (F)src_data.y);
tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_short(scalar / (F)src_data.z);
tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_short(scalar / (F)src_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 2) + src_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data = *((__global int *)((__global char *)src + src_index));
int tmp_data = (scalar == 0 || data == 0) ? 0 : round2_int(scalar / (convert_F)(data));
*((__global int *)((__global char *)dst + dst_index)) = tmp_data;
}
}
__kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 2) + src_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data = *((__global float *)((__global char *)src + src_index));
float tmp_data = (scalar == 0 || data == 0) ? 0 : convert_float(scalar / (convert_F)(data));
*((__global float *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 3) + src_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data = *((__global double *)((__global char *)src + src_index));
double tmp_data = (scalar == 0 || data == 0) ? 0 : (scalar / data);
*((__global double *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
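The kernels above implement the element-wise and scalar variants behind the ocl module's divide() entry points. A hedged host-side usage sketch; it assumes the cv::ocl::divide(const oclMat&, const oclMat&, oclMat&, double scale) overload declared in ocl.hpp:
// Hedged usage sketch; assumes cv::ocl::divide(src1, src2, dst, scale).
cv::Mat a(480, 640, CV_8UC1, cv::Scalar(200)), b(480, 640, CV_8UC1, cv::Scalar(7)), c;
cv::ocl::oclMat da(a), db(b), dc;
cv::ocl::divide(da, db, dc);   // for CV_8U inputs this should land in arithm_div_D0
dc.download(c);                // c is filled with 29 (200 / 7, rounded)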
modules/ocl/src/opencl/arithm_mul.cl
deleted 100644 → 0
View file @ 5ff5fdd7
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other GpuMaterials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
int4 round_int4 (float4 v)
{
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
return convert_int4_sat(v);
}
uint4 round_uint4 (float4 v)
{
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
return convert_uint4_sat(v);
}
long round_int (float v)
{
v = v + (v > 0 ? 0.5 : -0.5);
return convert_int_sat(v);
}
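These helpers round half away from zero before saturating, so positive and negative halves move symmetrically away from zero:
// Illustrative values:
//   round_int( 2.5f) ->  3      round_int(-2.5f) -> -3
//   round_int( 2.4f) ->  2      round_int(-2.4f) -> -2
// round_int4 / round_uint4 apply the same rule per lane, then saturate.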
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////multiply//////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************mul without mask**************************************/
__kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data, src2_data;
src1_data.x = src1_index + 0 >= 0 ? src1[src1_index + 0] : 0;
src1_data.y = src1_index + 1 >= 0 ? src1[src1_index + 1] : 0;
src1_data.z = src1_index + 2 >= 0 ? src1[src1_index + 2] : 0;
src1_data.w = src1_index + 3 >= 0 ? src1[src1_index + 3] : 0;
src2_data.x = src2_index + 0 >= 0 ? src2[src2_index + 0] : 0;
src2_data.y = src2_index + 1 >= 0 ? src2[src2_index + 1] : 0;
src2_data.z = src2_index + 2 >= 0 ? src2[src2_index + 2] : 0;
src2_data.w = src2_index + 3 >= 0 ? src2[src2_index + 3] : 0;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);
tmp = round_int4(convert_float4(tmp) * scalar);
uchar4 tmp_data = convert_uchar4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
uint4 tmp = convert_uint4_sat(src1_data) * convert_uint4_sat(src2_data);
tmp = round_uint4(convert_float4(tmp) * scalar);
ushort4 tmp_data = convert_ushort4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);
tmp = round_int4(convert_float4(tmp) * scalar);
short4 tmp_data = convert_short4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_mul_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
int tmp = data1 * data2;
tmp = round_int((float)tmp * scalar);
*((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
}
}
__kernel void arithm_mul_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = data1 * data2;
tmp = tmp * scalar;
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
double tmp = data1 * data2;
tmp = tmp * scalar;
*((__global double *)((__global char *)dst + dst_index)) = tmp;
}
}
#endif
#ifdef DOUBLE_SUPPORT
#define SCALAR_TYPE double
#else
#define SCALAR_TYPE float
#endif
__kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, SCALAR_TYPE scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float tmp = data1 * scalar;
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
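arithm_muls_D5 is the matrix-times-scalar variant for CV_32F; the scalar itself stays in double precision when the device supports it (SCALAR_TYPE), so precision is only dropped at the final float store. A hedged host-side sketch, assuming the module exposes a cv::ocl::multiply overload taking a scalar first argument:
// Hedged usage sketch; assumes cv::ocl::multiply(scalar, src, dst) exists.
cv::Mat a = cv::Mat::ones(4, 4, CV_32FC1);
cv::ocl::oclMat da(a), db;
cv::ocl::multiply(0.5, da, db);   // arithm_muls_D5: every element becomes 0.5f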