Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
dcce9d70
Commit
dcce9d70
authored
Dec 01, 2013
by
Ilya Lavrenov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added cv::warpAffine to T-API
parent
55af7857
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
189 additions
and
685 deletions
+189
-685
ocl.cpp
modules/core/src/ocl.cpp
+1
-2
imgwarp.cpp
modules/imgproc/src/imgwarp.cpp
+89
-57
warp_affine.cl
modules/imgproc/src/opencl/warp_affine.cl
+99
-626
No files found.
modules/core/src/ocl.cpp
View file @
dcce9d70
...
@@ -2041,7 +2041,6 @@ struct Kernel::Impl
...
@@ -2041,7 +2041,6 @@ struct Kernel::Impl
cl_int
retval
=
0
;
cl_int
retval
=
0
;
handle
=
ph
!=
0
?
handle
=
ph
!=
0
?
clCreateKernel
(
ph
,
kname
,
&
retval
)
:
0
;
clCreateKernel
(
ph
,
kname
,
&
retval
)
:
0
;
printf
(
"kernel creation error code: %d
\n
"
,
retval
);
for
(
int
i
=
0
;
i
<
MAX_ARRS
;
i
++
)
for
(
int
i
=
0
;
i
<
MAX_ARRS
;
i
++
)
u
[
i
]
=
0
;
u
[
i
]
=
0
;
haveTempDstUMats
=
false
;
haveTempDstUMats
=
false
;
...
@@ -2219,7 +2218,7 @@ int Kernel::set(int i, const KernelArg& arg)
...
@@ -2219,7 +2218,7 @@ int Kernel::set(int i, const KernelArg& arg)
else
if
(
arg
.
m
->
dims
<=
2
)
else
if
(
arg
.
m
->
dims
<=
2
)
{
{
UMat2D
u2d
(
*
arg
.
m
);
UMat2D
u2d
(
*
arg
.
m
);
clSetKernelArg
(
p
->
handle
,
(
cl_uint
)
i
,
sizeof
(
h
),
&
h
)
)
;
clSetKernelArg
(
p
->
handle
,
(
cl_uint
)
i
,
sizeof
(
h
),
&
h
);
clSetKernelArg
(
p
->
handle
,
(
cl_uint
)(
i
+
1
),
sizeof
(
u2d
.
step
),
&
u2d
.
step
);
clSetKernelArg
(
p
->
handle
,
(
cl_uint
)(
i
+
1
),
sizeof
(
u2d
.
step
),
&
u2d
.
step
);
clSetKernelArg
(
p
->
handle
,
(
cl_uint
)(
i
+
2
),
sizeof
(
u2d
.
offset
),
&
u2d
.
offset
);
clSetKernelArg
(
p
->
handle
,
(
cl_uint
)(
i
+
2
),
sizeof
(
u2d
.
offset
),
&
u2d
.
offset
);
i
+=
3
;
i
+=
3
;
...
...
modules/imgproc/src/imgwarp.cpp
View file @
dcce9d70
...
@@ -3789,6 +3789,87 @@ private:
...
@@ -3789,6 +3789,87 @@ private:
};
};
#endif
#endif
enum
{
OCL_OP_PERSPECTIVE
=
1
,
OCL_OP_AFFINE
=
0
};
static
bool
ocl_warpTransform
(
InputArray
_src
,
OutputArray
_dst
,
InputArray
_M0
,
Size
dsize
,
int
flags
,
int
borderType
,
const
Scalar
&
borderValue
,
int
op_type
)
{
CV_Assert
(
op_type
==
OCL_OP_AFFINE
||
op_type
==
OCL_OP_PERSPECTIVE
);
int
type
=
_src
.
type
(),
depth
=
CV_MAT_DEPTH
(
type
),
cn
=
CV_MAT_CN
(
type
),
wdepth
=
depth
;
double
doubleSupport
=
ocl
::
Device
::
getDefault
().
doubleFPConfig
()
>
0
;
int
interpolation
=
flags
&
INTER_MAX
;
if
(
interpolation
==
INTER_AREA
)
interpolation
=
INTER_LINEAR
;
if
(
!
(
borderType
==
cv
::
BORDER_CONSTANT
&&
(
interpolation
==
cv
::
INTER_NEAREST
||
interpolation
==
cv
::
INTER_LINEAR
||
interpolation
==
cv
::
INTER_CUBIC
))
||
(
!
doubleSupport
&&
depth
==
CV_64F
)
||
cn
>
4
||
cn
==
3
)
return
false
;
UMat
src
=
_src
.
getUMat
(),
M0
;
_dst
.
create
(
dsize
.
area
()
==
0
?
src
.
size
()
:
dsize
,
src
.
type
()
);
UMat
dst
=
_dst
.
getUMat
();
double
M
[
9
];
int
matRows
=
(
op_type
==
OCL_OP_AFFINE
?
2
:
3
);
Mat
matM
(
matRows
,
3
,
CV_64F
,
M
),
M1
=
_M0
.
getMat
();
CV_Assert
(
(
M1
.
type
()
==
CV_32F
||
M1
.
type
()
==
CV_64F
)
&&
M1
.
rows
==
matRows
&&
M1
.
cols
==
3
);
M1
.
convertTo
(
matM
,
matM
.
type
());
if
(
!
(
flags
&
WARP_INVERSE_MAP
)
)
{
if
(
op_type
==
OCL_OP_PERSPECTIVE
)
invert
(
matM
,
matM
);
else
{
double
D
=
M
[
0
]
*
M
[
4
]
-
M
[
1
]
*
M
[
3
];
D
=
D
!=
0
?
1.
/
D
:
0
;
double
A11
=
M
[
4
]
*
D
,
A22
=
M
[
0
]
*
D
;
M
[
0
]
=
A11
;
M
[
1
]
*=
-
D
;
M
[
3
]
*=
-
D
;
M
[
4
]
=
A22
;
double
b1
=
-
M
[
0
]
*
M
[
2
]
-
M
[
1
]
*
M
[
5
];
double
b2
=
-
M
[
3
]
*
M
[
2
]
-
M
[
4
]
*
M
[
5
];
M
[
2
]
=
b1
;
M
[
5
]
=
b2
;
}
}
matM
.
convertTo
(
M0
,
doubleSupport
?
CV_64F
:
CV_32F
);
const
char
*
const
interpolationMap
[
3
]
=
{
"NEAREST"
,
"LINEAR"
,
"CUBIC"
};
ocl
::
ProgramSource2
program
=
op_type
==
OCL_OP_AFFINE
?
ocl
::
imgproc
::
warp_affine_oclsrc
:
ocl
::
imgproc
::
warp_perspective_oclsrc
;
const
char
*
const
kernelName
=
op_type
==
OCL_OP_AFFINE
?
"warpAffine"
:
"warpPerspective"
;
ocl
::
Kernel
k
;
if
(
interpolation
==
INTER_NEAREST
)
{
k
.
create
(
kernelName
,
program
,
format
(
"-D INTER_NEAREST -D T=%s%s"
,
ocl
::
typeToStr
(
type
),
doubleSupport
?
" -D DOUBLE_SUPPORT"
:
""
));
}
else
{
char
cvt
[
2
][
50
];
wdepth
=
std
::
max
(
CV_32S
,
depth
);
k
.
create
(
kernelName
,
program
,
format
(
"-D INTER_%s -D T=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s"
,
interpolationMap
[
interpolation
],
ocl
::
typeToStr
(
type
),
ocl
::
typeToStr
(
CV_MAKE_TYPE
(
wdepth
,
cn
)),
depth
,
ocl
::
convertTypeStr
(
depth
,
wdepth
,
cn
,
cvt
[
0
]),
ocl
::
convertTypeStr
(
wdepth
,
depth
,
cn
,
cvt
[
1
]),
doubleSupport
?
" -D DOUBLE_SUPPORT"
:
""
));
}
k
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
src
),
ocl
::
KernelArg
::
WriteOnly
(
dst
),
ocl
::
KernelArg
::
PtrOnly
(
M0
),
ocl
::
KernelArg
::
Constant
(
Mat
(
1
,
1
,
CV_MAKE_TYPE
(
wdepth
,
cn
),
borderValue
)));
size_t
globalThreads
[
2
]
=
{
dst
.
cols
,
dst
.
rows
};
return
k
.
run
(
2
,
globalThreads
,
NULL
,
false
);
}
}
}
...
@@ -3796,6 +3877,11 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
...
@@ -3796,6 +3877,11 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
InputArray
_M0
,
Size
dsize
,
InputArray
_M0
,
Size
dsize
,
int
flags
,
int
borderType
,
const
Scalar
&
borderValue
)
int
flags
,
int
borderType
,
const
Scalar
&
borderValue
)
{
{
if
(
ocl
::
useOpenCL
()
&&
_dst
.
isUMat
()
&&
ocl_warpTransform
(
_src
,
_dst
,
_M0
,
dsize
,
flags
,
borderType
,
borderValue
,
OCL_OP_AFFINE
))
return
;
Mat
src
=
_src
.
getMat
(),
M0
=
_M0
.
getMat
();
Mat
src
=
_src
.
getMat
(),
M0
=
_M0
.
getMat
();
_dst
.
create
(
dsize
.
area
()
==
0
?
src
.
size
()
:
dsize
,
src
.
type
()
);
_dst
.
create
(
dsize
.
area
()
==
0
?
src
.
size
()
:
dsize
,
src
.
type
()
);
Mat
dst
=
_dst
.
getMat
();
Mat
dst
=
_dst
.
getMat
();
...
@@ -4030,62 +4116,6 @@ private:
...
@@ -4030,62 +4116,6 @@ private:
};
};
#endif
#endif
static
bool
ocl_warpPerspective
(
InputArray
_src
,
OutputArray
_dst
,
InputArray
_M0
,
Size
dsize
,
int
flags
,
int
borderType
,
const
Scalar
&
borderValue
)
{
int
type
=
_src
.
type
(),
depth
=
CV_MAT_DEPTH
(
type
),
cn
=
CV_MAT_CN
(
type
),
wdepth
=
depth
;
double
doubleSupport
=
ocl
::
Device
::
getDefault
().
doubleFPConfig
()
>
0
;
int
interpolation
=
flags
&
INTER_MAX
;
if
(
interpolation
==
INTER_AREA
)
interpolation
=
INTER_LINEAR
;
if
(
!
(
borderType
==
cv
::
BORDER_CONSTANT
&&
(
interpolation
==
cv
::
INTER_NEAREST
||
interpolation
==
cv
::
INTER_LINEAR
||
interpolation
==
cv
::
INTER_CUBIC
))
||
(
!
doubleSupport
&&
depth
==
CV_64F
)
||
cn
>
4
||
cn
==
3
)
return
false
;
UMat
src
=
_src
.
getUMat
(),
M0
;
_dst
.
create
(
dsize
.
area
()
==
0
?
src
.
size
()
:
dsize
,
src
.
type
()
);
UMat
dst
=
_dst
.
getUMat
();
double
M
[
9
];
Mat
matM
(
3
,
3
,
doubleSupport
?
CV_64F
:
CV_32F
,
M
),
M1
=
_M0
.
getMat
();
CV_Assert
(
(
M1
.
type
()
==
CV_32F
||
M1
.
type
()
==
CV_64F
)
&&
M1
.
rows
==
3
&&
M1
.
cols
==
3
);
M1
.
convertTo
(
matM
,
matM
.
type
());
if
(
!
(
flags
&
WARP_INVERSE_MAP
)
)
invert
(
matM
,
matM
);
matM
.
copyTo
(
M0
);
const
char
*
const
interpolationMap
[
3
]
=
{
"NEAREST"
,
"LINEAR"
,
"CUBIC"
};
ocl
::
Kernel
k
;
if
(
interpolation
==
INTER_NEAREST
)
{
k
.
create
(
"warpPerspective"
,
ocl
::
imgproc
::
warp_perspective_oclsrc
,
format
(
"-D INTER_NEAREST -D T=%s%s"
,
ocl
::
typeToStr
(
type
),
doubleSupport
?
" -D DOUBLE_SUPPORT"
:
""
));
}
else
{
char
cvt
[
2
][
50
];
wdepth
=
std
::
max
(
CV_32S
,
depth
);
k
.
create
(
"warpPerspective"
,
ocl
::
imgproc
::
warp_perspective_oclsrc
,
format
(
"-D INTER_%s -D T=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s"
,
interpolationMap
[
interpolation
],
ocl
::
typeToStr
(
type
),
ocl
::
typeToStr
(
CV_MAKE_TYPE
(
wdepth
,
cn
)),
depth
,
ocl
::
convertTypeStr
(
depth
,
wdepth
,
cn
,
cvt
[
0
]),
ocl
::
convertTypeStr
(
wdepth
,
depth
,
cn
,
cvt
[
1
]),
doubleSupport
?
" -D DOUBLE_SUPPORT"
:
""
));
}
k
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
src
),
ocl
::
KernelArg
::
WriteOnly
(
dst
),
ocl
::
KernelArg
::
PtrOnly
(
M0
),
ocl
::
KernelArg
::
Constant
(
Mat
(
1
,
1
,
CV_MAKE_TYPE
(
wdepth
,
cn
),
borderValue
)));
size_t
globalThreads
[
2
]
=
{
dst
.
cols
,
dst
.
rows
};
return
k
.
run
(
2
,
globalThreads
,
NULL
,
false
);
}
}
}
void
cv
::
warpPerspective
(
InputArray
_src
,
OutputArray
_dst
,
InputArray
_M0
,
void
cv
::
warpPerspective
(
InputArray
_src
,
OutputArray
_dst
,
InputArray
_M0
,
...
@@ -4093,7 +4123,9 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
...
@@ -4093,7 +4123,9 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
{
{
CV_Assert
(
_src
.
total
()
>
0
);
CV_Assert
(
_src
.
total
()
>
0
);
if
(
ocl
::
useOpenCL
()
&&
_dst
.
isUMat
()
&&
ocl_warpPerspective
(
_src
,
_dst
,
_M0
,
dsize
,
flags
,
borderType
,
borderValue
))
if
(
ocl
::
useOpenCL
()
&&
_dst
.
isUMat
()
&&
ocl_warpTransform
(
_src
,
_dst
,
_M0
,
dsize
,
flags
,
borderType
,
borderValue
,
OCL_OP_PERSPECTIVE
))
return
;
return
;
Mat
src
=
_src
.
getMat
(),
M0
=
_M0
.
getMat
();
Mat
src
=
_src
.
getMat
(),
M0
=
_M0
.
getMat
();
...
...
modules/imgproc/src/opencl/warp_affine.cl
View file @
dcce9d70
...
@@ -43,23 +43,15 @@
...
@@ -43,23 +43,15 @@
//
//
//M*/
//M*/
//warpAffine
kernel
//support
data
types:
CV_8UC1,
CV_8UC4,
CV_32FC1,
CV_32FC4,
and
three
interpolation
methods:
NN,
Linear,
Cubic.
#
ifdef
DOUBLE_SUPPORT
#
ifdef
DOUBLE_SUPPORT
#
ifdef
cl_amd_fp64
#
ifdef
cl_amd_fp64
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
elif
defined
(
cl_khr_fp64
)
#
elif
defined
(
cl_khr_fp64
)
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
endif
#
endif
typedef
double
F
;
#
define
CT
double
typedef
double4
F4
;
#
define
convert_F4
convert_double4
#
else
#
else
typedef
float
F
;
#
define
CT
float
typedef
float4
F4
;
#
define
convert_F4
convert_float4
#
endif
#
endif
#
define
INTER_BITS
5
#
define
INTER_BITS
5
...
@@ -70,286 +62,56 @@ typedef float4 F4;
...
@@ -70,286 +62,56 @@ typedef float4 F4;
#
define
INTER_REMAP_COEF_BITS
15
#
define
INTER_REMAP_COEF_BITS
15
#
define
INTER_REMAP_COEF_SCALE
(
1
<<
INTER_REMAP_COEF_BITS
)
#
define
INTER_REMAP_COEF_SCALE
(
1
<<
INTER_REMAP_COEF_BITS
)
inline
void
interpolateCubic
(
float
x,
float*
coeffs
)
#
define
noconvert
{
const
float
A
=
-0.75f
;
coeffs[0]
=
((
A*
(
x
+
1.f
)
-
5.0f*A
)
*
(
x
+
1.f
)
+
8.0f*A
)
*
(
x
+
1.f
)
-
4.0f*A
;
coeffs[1]
=
((
A
+
2.f
)
*x
-
(
A
+
3.f
))
*x*
x
+
1.f
;
coeffs[2]
=
((
A
+
2.f
)
*
(
1.f
-
x
)
-
(
A
+
3.f
))
*
(
1.f
-
x
)
*
(
1.f
-
x
)
+
1.f
;
coeffs[3]
=
1.f
-
coeffs[0]
-
coeffs[1]
-
coeffs[2]
;
}
/**********************************************8UC1*********************************************
***********************************************************************************************
/
__kernel
void
warpAffineNN_C1_D0
(
__global
uchar
const
*
restrict
src,
__global
uchar
*
dst,
int
src_cols,
int
src_rows,
int
dst_cols,
int
dst_rows,
int
srcStep,
int
dstStep,
int
src_offset,
int
dst_offset,
__constant
F
*
M,
int
threadCols
)
{
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
if
(
dx
<
threadCols
&&
dy
<
dst_rows
)
{
dx
=
(
dx<<2
)
-
(
dst_offset&3
)
;
int
round_delta
=
(
AB_SCALE>>1
)
;
int4
X,
Y
;
int4
sx,
sy
;
int4
DX
=
(
int4
)(
dx,
dx+1,
dx+2,
dx+3
)
;
DX
=
(
DX
<<
AB_BITS
)
;
F4
M0DX,
M3DX
;
M0DX
=
M[0]
*
convert_F4
(
DX
)
;
M3DX
=
M[3]
*
convert_F4
(
DX
)
;
X
=
convert_int4
(
rint
(
M0DX
))
;
Y
=
convert_int4
(
rint
(
M3DX
))
;
int
tmp1,
tmp2
;
tmp1
=
rint
((
M[1]*dy
+
M[2]
)
*
AB_SCALE
)
;
tmp2
=
rint
((
M[4]*dy
+
M[5]
)
*
AB_SCALE
)
;
X
+=
tmp1
+
round_delta
;
Y
+=
tmp2
+
round_delta
;
sx
=
convert_int4
(
convert_short4
(
X
>>
AB_BITS
))
;
sy
=
convert_int4
(
convert_short4
(
Y
>>
AB_BITS
))
;
__global
uchar4
*
d
=
(
__global
uchar4
*
)(
dst+dst_offset+dy*dstStep+dx
)
;
uchar4
dval
=
*d
;
DX
=
(
int4
)(
dx,
dx+1,
dx+2,
dx+3
)
;
int4
dcon
=
DX
>=
0
&&
DX
<
dst_cols
&&
dy
>=
0
&&
dy
<
dst_rows
;
int4
scon
=
sx
>=
0
&&
sx
<
src_cols
&&
sy
>=
0
&&
sy
<
src_rows
;
int4
spos
=
src_offset
+
sy
*
srcStep
+
sx
;
uchar4
sval
;
sval.s0
=
scon.s0
?
src[spos.s0]
:
0
;
sval.s1
=
scon.s1
?
src[spos.s1]
:
0
;
sval.s2
=
scon.s2
?
src[spos.s2]
:
0
;
sval.s3
=
scon.s3
?
src[spos.s3]
:
0
;
dval
=
convert_uchar4
(
dcon
)
!=
(
uchar4
)(
0
,
0
,
0
,
0
)
?
sval
:
dval
;
*d
=
dval
;
}
}
__kernel
void
warpAffineLinear_C1_D0
(
__global
const
uchar
*
restrict
src,
__global
uchar
*
dst,
int
src_cols,
int
src_rows,
int
dst_cols,
int
dst_rows,
int
srcStep,
int
dstStep,
int
src_offset,
int
dst_offset,
__constant
F
*
M,
int
threadCols
)
{
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
if
(
dx
<
threadCols
&&
dy
<
dst_rows
)
#
ifdef
INTER_NEAREST
{
dx
=
(
dx<<2
)
-
(
dst_offset&3
)
;
int
round_delta
=
((
AB_SCALE
>>
INTER_BITS
)
>>
1
)
;
int4
X,
Y
;
short4
ax,
ay
;
int4
sx,
sy
;
int4
DX
=
(
int4
)(
dx,
dx+1,
dx+2,
dx+3
)
;
DX
=
(
DX
<<
AB_BITS
)
;
F4
M0DX,
M3DX
;
M0DX
=
M[0]
*
convert_F4
(
DX
)
;
M3DX
=
M[3]
*
convert_F4
(
DX
)
;
X
=
convert_int4
(
rint
(
M0DX
))
;
Y
=
convert_int4
(
rint
(
M3DX
))
;
int
tmp1,
tmp2
;
tmp1
=
rint
((
M[1]*dy
+
M[2]
)
*
AB_SCALE
)
;
tmp2
=
rint
((
M[4]*dy
+
M[5]
)
*
AB_SCALE
)
;
X
+=
tmp1
+
round_delta
;
Y
+=
tmp2
+
round_delta
;
X
=
X
>>
(
AB_BITS
-
INTER_BITS
)
;
Y
=
Y
>>
(
AB_BITS
-
INTER_BITS
)
;
sx
=
convert_int4
(
convert_short4
(
X
>>
INTER_BITS
))
;
sy
=
convert_int4
(
convert_short4
(
Y
>>
INTER_BITS
))
;
ax
=
convert_short4
(
X
&
(
INTER_TAB_SIZE-1
))
;
ay
=
convert_short4
(
Y
&
(
INTER_TAB_SIZE-1
))
;
uchar4
v0,
v1,
v2,v3
;
int4
scon0,
scon1,
scon2,
scon3
;
int4
spos0,
spos1,
spos2,
spos3
;
scon0
=
(
sx
>=
0
&&
sx
<
src_cols
&&
sy
>=
0
&&
sy
<
src_rows
)
;
scon1
=
(
sx+1
>=
0
&&
sx+1
<
src_cols
&&
sy
>=
0
&&
sy
<
src_rows
)
;
scon2
=
(
sx
>=
0
&&
sx
<
src_cols
&&
sy+1
>=
0
&&
sy+1
<
src_rows
)
;
scon3
=
(
sx+1
>=
0
&&
sx+1
<
src_cols
&&
sy+1
>=
0
&&
sy+1
<
src_rows
)
;
spos0
=
src_offset
+
sy
*
srcStep
+
sx
;
spos1
=
src_offset
+
sy
*
srcStep
+
sx
+
1
;
spos2
=
src_offset
+
(
sy+1
)
*
srcStep
+
sx
;
spos3
=
src_offset
+
(
sy+1
)
*
srcStep
+
sx
+
1
;
v0.s0
=
scon0.s0
?
src[spos0.s0]
:
0
;
v1.s0
=
scon1.s0
?
src[spos1.s0]
:
0
;
v2.s0
=
scon2.s0
?
src[spos2.s0]
:
0
;
v3.s0
=
scon3.s0
?
src[spos3.s0]
:
0
;
v0.s1
=
scon0.s1
?
src[spos0.s1]
:
0
;
v1.s1
=
scon1.s1
?
src[spos1.s1]
:
0
;
v2.s1
=
scon2.s1
?
src[spos2.s1]
:
0
;
v3.s1
=
scon3.s1
?
src[spos3.s1]
:
0
;
v0.s2
=
scon0.s2
?
src[spos0.s2]
:
0
;
v1.s2
=
scon1.s2
?
src[spos1.s2]
:
0
;
v2.s2
=
scon2.s2
?
src[spos2.s2]
:
0
;
v3.s2
=
scon3.s2
?
src[spos3.s2]
:
0
;
v0.s3
=
scon0.s3
?
src[spos0.s3]
:
0
;
v1.s3
=
scon1.s3
?
src[spos1.s3]
:
0
;
v2.s3
=
scon2.s3
?
src[spos2.s3]
:
0
;
v3.s3
=
scon3.s3
?
src[spos3.s3]
:
0
;
short4
itab0,
itab1,
itab2,
itab3
;
float4
taby,
tabx
;
taby
=
INTER_SCALE
*
convert_float4
(
ay
)
;
tabx
=
INTER_SCALE
*
convert_float4
(
ax
)
;
itab0
=
convert_short4_sat
((
(
1.0f-taby
)
*
(
1.0f-tabx
)
*
(
float4
)
INTER_REMAP_COEF_SCALE
))
;
itab1
=
convert_short4_sat
((
(
1.0f-taby
)
*tabx
*
(
float4
)
INTER_REMAP_COEF_SCALE
))
;
itab2
=
convert_short4_sat
((
taby*
(
1.0f-tabx
)
*
(
float4
)
INTER_REMAP_COEF_SCALE
))
;
itab3
=
convert_short4_sat
((
taby*tabx
*
(
float4
)
INTER_REMAP_COEF_SCALE
))
;
int4
val
;
uchar4
tval
;
val
=
convert_int4
(
v0
)
*
convert_int4
(
itab0
)
+
convert_int4
(
v1
)
*
convert_int4
(
itab1
)
+
convert_int4
(
v2
)
*
convert_int4
(
itab2
)
+
convert_int4
(
v3
)
*
convert_int4
(
itab3
)
;
tval
=
convert_uchar4_sat
(
(
val
+
(
1
<<
(
INTER_REMAP_COEF_BITS-1
)))
>>
INTER_REMAP_COEF_BITS
)
;
__global
uchar4
*
d
=
(
__global
uchar4
*
)(
dst+dst_offset+dy*dstStep+dx
)
;
uchar4
dval
=
*d
;
DX
=
(
int4
)(
dx,
dx+1,
dx+2,
dx+3
)
;
int4
dcon
=
DX
>=
0
&&
DX
<
dst_cols
&&
dy
>=
0
&&
dy
<
dst_rows
;
dval
=
convert_uchar4
(
dcon
!=
0
)
?
tval
:
dval
;
*d
=
dval
;
}
}
__kernel
void
warpAffine
Cubic_C1_D0
(
__global
uchar
*
src,
__global
uchar
*
dst,
int
src_cols,
int
src_row
s,
__kernel
void
warpAffine
(
__global
const
uchar
*
srcptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_col
s,
int
dst_cols,
int
dst_rows,
int
srcStep,
int
dstStep
,
__global
uchar
*
dstptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols
,
int
src_offset,
int
dst_offset,
__constant
F
*
M,
int
threadCols
)
__constant
CT
*
M,
T
scalar
)
{
{
int
dx
=
get_global_id
(
0
)
;
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
int
dy
=
get_global_id
(
1
)
;
if
(
dx
<
threadC
ols
&&
dy
<
dst_rows
)
if
(
dx
<
dst_c
ols
&&
dy
<
dst_rows
)
{
{
int
round_delta
=
(
(
AB_SCALE>>INTER_BITS
)
>>
1
)
;
int
round_delta
=
(
AB_SCALE
>>
1
)
;
int
X0
=
rint
(
M[0]
*
dx
*
AB_SCALE
)
;
int
X0
=
rint
(
M[0]
*
dx
*
AB_SCALE
)
;
int
Y0
=
rint
(
M[3]
*
dx
*
AB_SCALE
)
;
int
Y0
=
rint
(
M[3]
*
dx
*
AB_SCALE
)
;
X0
+=
rint
((
M[1]*dy
+
M[2]
)
*
AB_SCALE
)
+
round_delta
;
X0
+=
rint
((
M[1]*dy
+
M[2]
)
*
AB_SCALE
)
+
round_delta
;
Y0
+=
rint
((
M[4]*dy
+
M[5]
)
*
AB_SCALE
)
+
round_delta
;
Y0
+=
rint
((
M[4]*dy
+
M[5]
)
*
AB_SCALE
)
+
round_delta
;
int
X
=
X0
>>
(
AB_BITS
-
INTER_BITS
)
;
int
Y
=
Y0
>>
(
AB_BITS
-
INTER_BITS
)
;
short
sx
=
(
short
)(
X
>>
INTER_BITS
)
-
1
;
short
sy
=
(
short
)(
Y
>>
INTER_BITS
)
-
1
;
short
ay
=
(
short
)(
Y
&
(
INTER_TAB_SIZE-1
))
;
short
ax
=
(
short
)(
X
&
(
INTER_TAB_SIZE-1
))
;
uchar
v[16]
;
int
i,
j
;
#
pragma
unroll
4
for
(
i=0
; i<4; i++)
for
(
j=0
; j<4; j++)
{
v[i*4+j]
=
(
sx+j
>=
0
&&
sx+j
<
src_cols
&&
sy+i
>=
0
&&
sy+i
<
src_rows
)
?
src[src_offset+
(
sy+i
)
*
srcStep
+
(
sx+j
)
]
:
0
;
}
short
itab[16]
;
float
tab1y[4],
tab1x[4]
;
float
axx,
ayy
;
ayy
=
1.f/INTER_TAB_SIZE
*
ay
;
short
sx
=
convert_short_sat
(
X0
>>
AB_BITS
)
;
axx
=
1.f/INTER_TAB_SIZE
*
ax
;
short
sy
=
convert_short_sat
(
Y0
>>
AB_BITS
)
;
interpolateCubic
(
ayy,
tab1y
)
;
interpolateCubic
(
axx,
tab1x
)
;
int
isum
=
0
;
#
pragma
unroll
16
int
dst_index
=
mad24
(
dy,
dst_step,
dst_offset
+
dx
*
(
int
)
sizeof
(
T
))
;
for
(
i=0
; i<16; i++ )
__global
T
*
dst
=
(
__global
T
*
)(
dstptr
+
dst_index
)
;
{
F
v
=
tab1y[
(
i>>2
)
]
*
tab1x[
(
i&3
)
]
;
isum
+=
itab[i]
=
convert_short_sat
(
rint
(
v
*
INTER_REMAP_COEF_SCALE
)
)
;
}
if
(
isum
!=
INTER_REMAP_COEF_SCALE
)
{
int
k1,
k2
;
int
diff
=
isum
-
INTER_REMAP_COEF_SCALE
;
int
Mk1=2,
Mk2=2,
mk1=2,
mk2=2
;
for
(
k1
=
2
; k1 < 4; k1++ )
for
(
k2
=
2
; k2 < 4; k2++ )
{
if
(
itab[
(
k1<<2
)
+k2]
<
itab[
(
mk1<<2
)
+mk2]
)
mk1
=
k1,
mk2
=
k2
;
else
if
(
itab[
(
k1<<2
)
+k2]
>
itab[
(
Mk1<<2
)
+Mk2]
)
Mk1
=
k1,
Mk2
=
k2
;
}
diff<0
?
(
itab[
(
Mk1<<2
)
+Mk2]=
(
short
)(
itab[
(
Mk1<<2
)
+Mk2]-diff
))
:
(
itab[
(
mk1<<2
)
+mk2]=
(
short
)(
itab[
(
mk1<<2
)
+mk2]-diff
))
;
}
if
(
dx
>=
0
&&
dx
<
dst_cols
&&
dy
>=
0
&&
dy
<
dst
_rows
)
if
(
sx
>=
0
&&
sx
<
src_cols
&&
sy
>=
0
&&
sy
<
src
_rows
)
{
{
int
sum=0
;
int
src_index
=
mad24
(
sy,
src_step,
src_offset
+
sx
*
(
int
)
sizeof
(
T
))
;
for
(
i
=0
; i<16; i++ )
__global
const
T
*
src
=
(
__global
const
T
*
)(
srcptr
+
src_index
)
;
{
dst[0]
=
src[0]
;
sum
+=
v[i]
*
itab[i]
;
}
dst[dst_offset+dy*dstStep+dx]
=
convert_uchar_sat
(
(
sum
+
(
1
<<
(
INTER_REMAP_COEF_BITS-1
)))
>>
INTER_REMAP_COEF_BITS
)
;
}
}
else
dst[0]
=
scalar
;
}
}
}
}
/**********************************************8UC4*********************************************
#
elif
defined
INTER_LINEAR
***********************************************************************************************
/
__kernel
void
warpAffine
NN_C4_D0
(
__global
uchar4
const
*
restrict
src,
__global
uchar4
*
dst,
int
src_cols,
int
src_row
s,
__kernel
void
warpAffine
(
__global
const
uchar
*
srcptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_col
s,
int
dst_cols,
int
dst_rows,
int
srcStep,
int
dstStep
,
__global
uchar
*
dstptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols
,
int
src_offset,
int
dst_offset,
__constant
F
*
M,
int
threadCols
)
__constant
CT
*
M,
WT
scalar
)
{
{
int
dx
=
get_global_id
(
0
)
;
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
int
dy
=
get_global_id
(
1
)
;
if
(
dx
<
threadCols
&&
dy
<
dst_rows
)
if
(
dx
<
dst_cols
&&
dy
<
dst_rows
)
{
int
round_delta
=
(
AB_SCALE
>>
1
)
;
int
X0
=
rint
(
M[0]
*
dx
*
AB_SCALE
)
;
int
Y0
=
rint
(
M[3]
*
dx
*
AB_SCALE
)
;
X0
+=
rint
((
M[1]*dy
+
M[2]
)
*
AB_SCALE
)
+
round_delta
;
Y0
+=
rint
((
M[4]*dy
+
M[5]
)
*
AB_SCALE
)
+
round_delta
;
int
sx0
=
(
short
)(
X0
>>
AB_BITS
)
;
int
sy0
=
(
short
)(
Y0
>>
AB_BITS
)
;
if
(
dx
>=
0
&&
dx
<
dst_cols
&&
dy
>=
0
&&
dy
<
dst_rows
)
dst[
(
dst_offset>>2
)
+dy*
(
dstStep>>2
)
+dx]=
(
sx0>=0
&&
sx0<src_cols
&&
sy0>=0
&&
sy0<src_rows
)
?
src[
(
src_offset>>2
)
+sy0*
(
srcStep>>2
)
+sx0]
:
(
uchar4
)
0
;
}
}
__kernel
void
warpAffineLinear_C4_D0
(
__global
uchar4
const
*
restrict
src,
__global
uchar4
*
dst,
int
src_cols,
int
src_rows,
int
dst_cols,
int
dst_rows,
int
srcStep,
int
dstStep,
int
src_offset,
int
dst_offset,
__constant
F
*
M,
int
threadCols
)
{
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
if
(
dx
<
threadCols
&&
dy
<
dst_rows
)
{
{
int
round_delta
=
AB_SCALE/INTER_TAB_SIZE/2
;
int
round_delta
=
AB_SCALE/INTER_TAB_SIZE/2
;
src_offset
=
(
src_offset>>2
)
;
srcStep
=
(
srcStep>>2
)
;
int
tmp
=
(
dx
<<
AB_BITS
)
;
int
tmp
=
(
dx
<<
AB_BITS
)
;
int
X0
=
rint
(
M[0]
*
tmp
)
;
int
X0
=
rint
(
M[0]
*
tmp
)
;
int
Y0
=
rint
(
M[3]
*
tmp
)
;
int
Y0
=
rint
(
M[3]
*
tmp
)
;
...
@@ -358,52 +120,65 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo
...
@@ -358,52 +120,65 @@ __kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __glo
X0
=
X0
>>
(
AB_BITS
-
INTER_BITS
)
;
X0
=
X0
>>
(
AB_BITS
-
INTER_BITS
)
;
Y0
=
Y0
>>
(
AB_BITS
-
INTER_BITS
)
;
Y0
=
Y0
>>
(
AB_BITS
-
INTER_BITS
)
;
short
sx0
=
(
short
)(
X0
>>
INTER_BITS
)
;
short
sx
=
convert_short_sat
(
X0
>>
INTER_BITS
)
;
short
sy0
=
(
short
)(
Y0
>>
INTER_BITS
)
;
short
sy
=
convert_short_sat
(
Y0
>>
INTER_BITS
)
;
short
ax0
=
(
short
)(
X0
&
(
INTER_TAB_SIZE-1
))
;
short
ax
=
convert_short
(
X0
&
(
INTER_TAB_SIZE-1
))
;
short
ay0
=
(
short
)(
Y0
&
(
INTER_TAB_SIZE-1
))
;
short
ay
=
convert_short
(
Y0
&
(
INTER_TAB_SIZE-1
))
;
int4
v0,
v1,
v2,
v3
;
WT
v0
=
(
sx
>=
0
&&
sx
<
src_cols
&&
sy
>=
0
&&
sy
<
src_rows
)
?
convertToWT
(
*
(
__global
const
T
*
)(
srcptr
+
mad24
(
sy,
src_step,
src_offset
+
sx
*
(
int
)
sizeof
(
T
))))
:
scalar
;
v0
=
(
sx0
>=
0
&&
sx0
<
src_cols
&&
sy0
>=
0
&&
sy0
<
src_rows
)
?
convert_int4
(
src[src_offset+sy0
*
srcStep
+
sx0]
)
:
0
;
WT
v1
=
(
sx+1
>=
0
&&
sx+1
<
src_cols
&&
sy
>=
0
&&
sy
<
src_rows
)
?
v1
=
(
sx0+1
>=
0
&&
sx0+1
<
src_cols
&&
sy0
>=
0
&&
sy0
<
src_rows
)
?
convert_int4
(
src[src_offset+sy0
*
srcStep
+
sx0+1]
)
:
0
;
convertToWT
(
*
(
__global
const
T
*
)(
srcptr
+
mad24
(
sy,
src_step,
src_offset
+
(
sx+1
)
*
(
int
)
sizeof
(
T
))))
:
scalar
;
v2
=
(
sx0
>=
0
&&
sx0
<
src_cols
&&
sy0+1
>=
0
&&
sy0+1
<
src_rows
)
?
convert_int4
(
src[src_offset+
(
sy0+1
)
*
srcStep
+
sx0]
)
:
0
;
WT
v2
=
(
sx
>=
0
&&
sx
<
src_cols
&&
sy+1
>=
0
&&
sy+1
<
src_rows
)
?
v3
=
(
sx0+1
>=
0
&&
sx0+1
<
src_cols
&&
sy0+1
>=
0
&&
sy0+1
<
src_rows
)
?
convert_int4
(
src[src_offset+
(
sy0+1
)
*
srcStep
+
sx0+1]
)
:
0
;
convertToWT
(
*
(
__global
const
T
*
)(
srcptr
+
mad24
(
sy+1,
src_step,
src_offset
+
sx
*
(
int
)
sizeof
(
T
))))
:
scalar
;
WT
v3
=
(
sx+1
>=
0
&&
sx+1
<
src_cols
&&
sy+1
>=
0
&&
sy+1
<
src_rows
)
?
int
itab0,
itab1,
itab2,
itab3
;
convertToWT
(
*
(
__global
const
T
*
)(
srcptr
+
mad24
(
sy+1,
src_step,
src_offset
+
(
sx+1
)
*
(
int
)
sizeof
(
T
))))
:
scalar
;
float
taby,
tabx
;
taby
=
1.f/INTER_TAB_SIZE*ay0
;
float
taby
=
1.f/INTER_TAB_SIZE*ay
;
tabx
=
1.f/INTER_TAB_SIZE*ax0
;
float
tabx
=
1.f/INTER_TAB_SIZE*ax
;
int
dst_index
=
mad24
(
dy,
dst_step,
dst_offset
+
dx
*
(
int
)
sizeof
(
T
))
;
__global
T
*
dst
=
(
__global
T
*
)(
dstptr
+
dst_index
)
;
#
if
depth
<=
4
int
itab0
=
convert_short_sat_rte
(
(
1.0f-taby
)
*
(
1.0f-tabx
)
*
INTER_REMAP_COEF_SCALE
)
;
int
itab1
=
convert_short_sat_rte
(
(
1.0f-taby
)
*tabx
*
INTER_REMAP_COEF_SCALE
)
;
int
itab2
=
convert_short_sat_rte
(
taby*
(
1.0f-tabx
)
*
INTER_REMAP_COEF_SCALE
)
;
int
itab3
=
convert_short_sat_rte
(
taby*tabx
*
INTER_REMAP_COEF_SCALE
)
;
WT
val
=
v0
*
itab0
+
v1
*
itab1
+
v2
*
itab2
+
v3
*
itab3
;
dst[0]
=
convertToT
((
val
+
(
1
<<
(
INTER_REMAP_COEF_BITS-1
)))
>>
INTER_REMAP_COEF_BITS
)
;
#
else
float
tabx2
=
1.0f
-
tabx,
taby2
=
1.0f
-
taby
;
WT
val
=
v0
*
tabx2
*
taby2
+
v1
*
tabx
*
taby2
+
v2
*
tabx2
*
taby
+
v3
*
tabx
*
taby
;
dst[0]
=
convertToT
(
val
)
;
#
endif
}
}
itab0
=
convert_short_sat
(
rint
(
(
1.0f-taby
)
*
(
1.0f-tabx
)
*
INTER_REMAP_COEF_SCALE
))
;
#
elif
defined
INTER_CUBIC
itab1
=
convert_short_sat
(
rint
(
(
1.0f-taby
)
*tabx
*
INTER_REMAP_COEF_SCALE
))
;
itab2
=
convert_short_sat
(
rint
(
taby*
(
1.0f-tabx
)
*
INTER_REMAP_COEF_SCALE
))
;
itab3
=
convert_short_sat
(
rint
(
taby*tabx
*
INTER_REMAP_COEF_SCALE
))
;
int4
val
;
inline
void
interpolateCubic
(
float
x,
float*
coeffs
)
val
=
v0
*
itab0
+
v1
*
itab1
+
v2
*
itab2
+
v3
*
itab3
;
{
const
float
A
=
-0.75f
;
if
(
dx
>=
0
&&
dx
<
dst_cols
&&
dy
>=
0
&&
dy
<
dst_rows
)
coeffs[0]
=
((
A*
(
x
+
1.f
)
-
5.0f*A
)
*
(
x
+
1.f
)
+
8.0f*A
)
*
(
x
+
1.f
)
-
4.0f*A
;
dst[
(
dst_offset>>2
)
+dy*
(
dstStep>>2
)
+dx]
=
convert_uchar4_sat
(
(
val
+
(
1
<<
(
INTER_REMAP_COEF_BITS-1
)))
>>
INTER_REMAP_COEF_BITS
)
;
coeffs[1]
=
((
A
+
2.f
)
*x
-
(
A
+
3.f
))
*x*
x
+
1.f
;
}
coeffs[2]
=
((
A
+
2.f
)
*
(
1.f
-
x
)
-
(
A
+
3.f
))
*
(
1.f
-
x
)
*
(
1.f
-
x
)
+
1.f
;
coeffs[3]
=
1.f
-
coeffs[0]
-
coeffs[1]
-
coeffs[2]
;
}
}
__kernel
void
warpAffine
Cubic_C4_D0
(
__global
uchar4
const
*
restrict
src,
__global
uchar4
*
dst,
int
src_cols,
int
src_row
s,
__kernel
void
warpAffine
(
__global
const
uchar
*
srcptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_col
s,
int
dst_cols,
int
dst_rows,
int
srcStep,
int
dstStep
,
__global
uchar
*
dstptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols
,
int
src_offset,
int
dst_offset,
__constant
F
*
M,
int
threadCols
)
__constant
CT
*
M,
WT
scalar
)
{
{
int
dx
=
get_global_id
(
0
)
;
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
int
dy
=
get_global_id
(
1
)
;
if
(
dx
<
threadC
ols
&&
dy
<
dst_rows
)
if
(
dx
<
dst_c
ols
&&
dy
<
dst_rows
)
{
{
int
round_delta
=
((
AB_SCALE>>INTER_BITS
)
>>1
)
;
int
round_delta
=
((
AB_SCALE>>INTER_BITS
)
>>1
)
;
src_offset
=
(
src_offset>>2
)
;
srcStep
=
(
srcStep>>2
)
;
dst_offset
=
(
dst_offset>>2
)
;
dstStep
=
(
dstStep>>2
)
;
int
tmp
=
(
dx
<<
AB_BITS
)
;
int
tmp
=
(
dx
<<
AB_BITS
)
;
int
X0
=
rint
(
M[0]
*
tmp
)
;
int
X0
=
rint
(
M[0]
*
tmp
)
;
int
Y0
=
rint
(
M[3]
*
tmp
)
;
int
Y0
=
rint
(
M[3]
*
tmp
)
;
...
@@ -417,345 +192,43 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
...
@@ -417,345 +192,43 @@ __kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __glob
int
ay
=
(
short
)(
Y0
&
(
INTER_TAB_SIZE-1
))
;
int
ay
=
(
short
)(
Y0
&
(
INTER_TAB_SIZE-1
))
;
int
ax
=
(
short
)(
X0
&
(
INTER_TAB_SIZE-1
))
;
int
ax
=
(
short
)(
X0
&
(
INTER_TAB_SIZE-1
))
;
uchar4
v[16]
;
WT
v[16]
;
int
i,j
;
#
pragma
unroll
#
pragma
unroll
4
for
(
int
y
=
0
; y < 4; y++)
for
(
i=0
; i<4; i++)
#
pragma
unroll
for
(
j=0
; j<4; j++)
for
(
int
x
=
0
; x < 4; x++)
{
v[mad24
(
y,
4
,
x
)
]
=
(
sx+x
>=
0
&&
sx+x
<
src_cols
&&
sy+y
>=
0
&&
sy+y
<
src_rows
)
?
v[i*4+j]
=
(
sx+j
>=
0
&&
sx+j
<
src_cols
&&
sy+i
>=
0
&&
sy+i
<
src_rows
)
?
(
src[src_offset+
(
sy+i
)
*
srcStep
+
(
sx+j
)
]
)
:
(
uchar4
)
0
;
convertToWT
(
*
(
__global
const
T
*
)(
srcptr
+
mad24
(
sy+y,
src_step,
src_offset
+
(
sx+x
)
*
(
int
)
sizeof
(
T
))))
:
scalar
;
}
int
itab[16]
;
float
tab1y[4],
tab1x[4]
;
float
axx,
ayy
;
ayy
=
INTER_SCALE
*
ay
;
axx
=
INTER_SCALE
*
ax
;
interpolateCubic
(
ayy,
tab1y
)
;
interpolateCubic
(
axx,
tab1x
)
;
int
isum
=
0
;
#
pragma
unroll
16
for
(
i=0
; i<16; i++ )
{
float
tmp
;
tmp
=
tab1y[
(
i>>2
)
]
*
tab1x[
(
i&3
)
]
*
INTER_REMAP_COEF_SCALE
;
itab[i]
=
rint
(
tmp
)
;
isum
+=
itab[i]
;
}
if
(
isum
!=
INTER_REMAP_COEF_SCALE
)
{
int
k1,
k2
;
int
diff
=
isum
-
INTER_REMAP_COEF_SCALE
;
int
Mk1=2,
Mk2=2,
mk1=2,
mk2=2
;
for
(
k1
=
2
; k1 < 4; k1++ )
for
(
k2
=
2
; k2 < 4; k2++ )
{
if
(
itab[
(
k1<<2
)
+k2]
<
itab[
(
mk1<<2
)
+mk2]
)
mk1
=
k1,
mk2
=
k2
;
else
if
(
itab[
(
k1<<2
)
+k2]
>
itab[
(
Mk1<<2
)
+Mk2]
)
Mk1
=
k1,
Mk2
=
k2
;
}
diff<0
?
(
itab[
(
Mk1<<2
)
+Mk2]=
(
short
)(
itab[
(
Mk1<<2
)
+Mk2]-diff
))
:
(
itab[
(
mk1<<2
)
+mk2]=
(
short
)(
itab[
(
mk1<<2
)
+mk2]-diff
))
;
}
if
(
dx
>=
0
&&
dx
<
dst_cols
&&
dy
>=
0
&&
dy
<
dst_rows
)
{
int4
sum=0
;
for
(
i
=0
; i<16; i++ )
{
sum
+=
convert_int4
(
v[i]
)
*
itab[i]
;
}
dst[dst_offset+dy*dstStep+dx]
=
convert_uchar4_sat
(
(
sum
+
(
1
<<
(
INTER_REMAP_COEF_BITS-1
)))
>>
INTER_REMAP_COEF_BITS
)
;
}
}
}
/**********************************************32FC1********************************************
***********************************************************************************************
/
__kernel
void
warpAffineNN_C1_D5
(
__global
float
*
src,
__global
float
*
dst,
int
src_cols,
int
src_rows,
int
dst_cols,
int
dst_rows,
int
srcStep,
int
dstStep,
int
src_offset,
int
dst_offset,
__constant
F
*
M,
int
threadCols
)
{
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
if
(
dx
<
threadCols
&&
dy
<
dst_rows
)
{
int
round_delta
=
AB_SCALE/2
;
int
X0
=
rint
(
M[0]
*
dx
*
AB_SCALE
)
;
int
Y0
=
rint
(
M[3]
*
dx
*
AB_SCALE
)
;
X0
+=
rint
((
M[1]*dy
+
M[2]
)
*
AB_SCALE
)
+
round_delta
;
Y0
+=
rint
((
M[4]*dy
+
M[5]
)
*
AB_SCALE
)
+
round_delta
;
short
sx0
=
(
short
)(
X0
>>
AB_BITS
)
;
short
sy0
=
(
short
)(
Y0
>>
AB_BITS
)
;
if
(
dx
>=
0
&&
dx
<
dst_cols
&&
dy
>=
0
&&
dy
<
dst_rows
)
dst[
(
dst_offset>>2
)
+dy*dstStep+dx]=
(
sx0>=0
&&
sx0<src_cols
&&
sy0>=0
&&
sy0<src_rows
)
?
src[
(
src_offset>>2
)
+sy0*srcStep+sx0]
:
0
;
}
}
__kernel
void
warpAffineLinear_C1_D5
(
__global
float
*
src,
__global
float
*
dst,
int
src_cols,
int
src_rows,
int
dst_cols,
int
dst_rows,
int
srcStep,
int
dstStep,
int
src_offset,
int
dst_offset,
__constant
F
*
M,
int
threadCols
)
{
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
if
(
dx
<
threadCols
&&
dy
<
dst_rows
)
{
int
round_delta
=
AB_SCALE/INTER_TAB_SIZE/2
;
src_offset
=
(
src_offset>>2
)
;
int
X0
=
rint
(
M[0]
*
dx
*
AB_SCALE
)
;
int
Y0
=
rint
(
M[3]
*
dx
*
AB_SCALE
)
;
X0
+=
rint
((
M[1]*dy
+
M[2]
)
*
AB_SCALE
)
+
round_delta
;
Y0
+=
rint
((
M[4]*dy
+
M[5]
)
*
AB_SCALE
)
+
round_delta
;
X0
=
X0
>>
(
AB_BITS
-
INTER_BITS
)
;
Y0
=
Y0
>>
(
AB_BITS
-
INTER_BITS
)
;
short
sx0
=
(
short
)(
X0
>>
INTER_BITS
)
;
short
sy0
=
(
short
)(
Y0
>>
INTER_BITS
)
;
short
ax0
=
(
short
)(
X0
&
(
INTER_TAB_SIZE-1
))
;
short
ay0
=
(
short
)(
Y0
&
(
INTER_TAB_SIZE-1
))
;
float
v0,
v1,
v2,
v3
;
v0
=
(
sx0
>=
0
&&
sx0
<
src_cols
&&
sy0
>=
0
&&
sy0
<
src_rows
)
?
src[src_offset+sy0
*
srcStep
+
sx0]
:
0
;
v1
=
(
sx0+1
>=
0
&&
sx0+1
<
src_cols
&&
sy0
>=
0
&&
sy0
<
src_rows
)
?
src[src_offset+sy0
*
srcStep
+
sx0+1]
:
0
;
v2
=
(
sx0
>=
0
&&
sx0
<
src_cols
&&
sy0+1
>=
0
&&
sy0+1
<
src_rows
)
?
src[src_offset+
(
sy0+1
)
*
srcStep
+
sx0]
:
0
;
v3
=
(
sx0+1
>=
0
&&
sx0+1
<
src_cols
&&
sy0+1
>=
0
&&
sy0+1
<
src_rows
)
?
src[src_offset+
(
sy0+1
)
*
srcStep
+
sx0+1]
:
0
;
float
tab[4]
;
float
taby[2],
tabx[2]
;
taby[0]
=
1.0f
-
1.f/INTER_TAB_SIZE*ay0
;
taby[1]
=
1.f/INTER_TAB_SIZE*ay0
;
tabx[0]
=
1.0f
-
1.f/INTER_TAB_SIZE*ax0
;
tabx[1]
=
1.f/INTER_TAB_SIZE*ax0
;
tab[0]
=
taby[0]
*
tabx[0]
;
tab[1]
=
taby[0]
*
tabx[1]
;
tab[2]
=
taby[1]
*
tabx[0]
;
tab[3]
=
taby[1]
*
tabx[1]
;
float
sum
=
0
;
sum
+=
v0
*
tab[0]
+
v1
*
tab[1]
+
v2
*
tab[2]
+
v3
*
tab[3]
;
if
(
dx
>=
0
&&
dx
<
dst_cols
&&
dy
>=
0
&&
dy
<
dst_rows
)
dst[
(
dst_offset>>2
)
+dy*dstStep+dx]
=
sum
;
}
}
__kernel
void
warpAffineCubic_C1_D5
(
__global
float
*
src,
__global
float
*
dst,
int
src_cols,
int
src_rows,
int
dst_cols,
int
dst_rows,
int
srcStep,
int
dstStep,
int
src_offset,
int
dst_offset,
__constant
F
*
M,
int
threadCols
)
{
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
if
(
dx
<
threadCols
&&
dy
<
dst_rows
)
{
int
round_delta
=
AB_SCALE/INTER_TAB_SIZE/2
;
src_offset
=
(
src_offset>>2
)
;
dst_offset
=
(
dst_offset>>2
)
;
int
X0
=
rint
(
M[0]
*
dx
*
AB_SCALE
)
;
int
Y0
=
rint
(
M[3]
*
dx
*
AB_SCALE
)
;
X0
+=
rint
((
M[1]*dy
+
M[2]
)
*
AB_SCALE
)
+
round_delta
;
Y0
+=
rint
((
M[4]*dy
+
M[5]
)
*
AB_SCALE
)
+
round_delta
;
X0
=
X0
>>
(
AB_BITS
-
INTER_BITS
)
;
Y0
=
Y0
>>
(
AB_BITS
-
INTER_BITS
)
;
short
sx
=
(
short
)(
X0
>>
INTER_BITS
)
-
1
;
short
sy
=
(
short
)(
Y0
>>
INTER_BITS
)
-
1
;
short
ay
=
(
short
)(
Y0
&
(
INTER_TAB_SIZE-1
))
;
short
ax
=
(
short
)(
X0
&
(
INTER_TAB_SIZE-1
))
;
float
v[16]
;
int
i
;
for
(
i=0
; i<16; i++)
v[i]
=
(
sx+
(
i&3
)
>=
0
&&
sx+
(
i&3
)
<
src_cols
&&
sy+
(
i>>2
)
>=
0
&&
sy+
(
i>>2
)
<
src_rows
)
?
src[src_offset+
(
sy+
(
i>>2
))
*
srcStep
+
(
sx+
(
i&3
))
]
:
0
;
float
tab[16]
;
float
tab1y[4],
tab1x[4]
;
float
tab1y[4],
tab1x[4]
;
float
axx,
ayy
;
ayy
=
1.f/INTER_TAB_SIZ
E
*
ay
;
float
ayy
=
INTER_SCAL
E
*
ay
;
axx
=
1.f/INTER_TAB_SIZ
E
*
ax
;
float
axx
=
INTER_SCAL
E
*
ax
;
interpolateCubic
(
ayy,
tab1y
)
;
interpolateCubic
(
ayy,
tab1y
)
;
interpolateCubic
(
axx,
tab1x
)
;
interpolateCubic
(
axx,
tab1x
)
;
#
pragma
unroll
4
int
dst_index
=
mad24
(
dy,
dst_step,
dst_offset
+
dx
*
(
int
)
sizeof
(
T
))
;
for
(
i=0
; i<16; i++ )
__global
T
*
dst
=
(
__global
T
*
)(
dstptr
+
dst_index
)
;
{
tab[i]
=
tab1y[
(
i>>2
)
]
*
tab1x[
(
i&3
)
]
;
}
if
(
dx
>=
0
&&
dx
<
dst_cols
&&
dy
>=
0
&&
dy
<
dst_rows
)
{
float
sum
=
0
;
#
pragma
unroll
4
for
(
i
=0
; i<16; i++ )
{
sum
+=
v[i]
*
tab[i]
;
}
dst[dst_offset+dy*dstStep+dx]
=
sum
;
}
}
}
/**********************************************32FC4********************************************
***********************************************************************************************
/
__kernel
void
warpAffineNN_C4_D5
(
__global
float4
*
src,
__global
float4
*
dst,
int
src_cols,
int
src_rows,
int
dst_cols,
int
dst_rows,
int
srcStep,
int
dstStep,
int
src_offset,
int
dst_offset,
__constant
F
*
M,
int
threadCols
)
{
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
if
(
dx
<
threadCols
&&
dy
<
dst_rows
)
{
int
round_delta
=
AB_SCALE/2
;
int
X0
=
rint
(
M[0]
*
dx
*
AB_SCALE
)
;
int
Y0
=
rint
(
M[3]
*
dx
*
AB_SCALE
)
;
X0
+=
rint
((
M[1]*dy
+
M[2]
)
*
AB_SCALE
)
+
round_delta
;
Y0
+=
rint
((
M[4]*dy
+
M[5]
)
*
AB_SCALE
)
+
round_delta
;
short
sx0
=
(
short
)(
X0
>>
AB_BITS
)
;
short
sy0
=
(
short
)(
Y0
>>
AB_BITS
)
;
if
(
dx
>=
0
&&
dx
<
dst_cols
&&
dy
>=
0
&&
dy
<
dst_rows
)
dst[
(
dst_offset>>4
)
+dy*
(
dstStep>>2
)
+dx]=
(
sx0>=0
&&
sx0<src_cols
&&
sy0>=0
&&
sy0<src_rows
)
?
src[
(
src_offset>>4
)
+sy0*
(
srcStep>>2
)
+sx0]
:
(
float4
)
0
;
}
}
__kernel
void
warpAffineLinear_C4_D5
(
__global
float4
*
src,
__global
float4
*
dst,
int
src_cols,
int
src_rows,
int
dst_cols,
int
dst_rows,
int
srcStep,
int
dstStep,
int
src_offset,
int
dst_offset,
__constant
F
*
M,
int
threadCols
)
{
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
if
(
dx
<
threadCols
&&
dy
<
dst_rows
)
{
int
round_delta
=
AB_SCALE/INTER_TAB_SIZE/2
;
src_offset
=
(
src_offset>>4
)
;
WT
sum
=
(
WT
)(
0
)
;
dst_offset
=
(
dst_offset>>4
)
;
#
if
depth
<=
4
srcStep
=
(
srcStep>>2
)
;
int
itab[16]
;
dstStep
=
(
dstStep>>2
)
;
int
X0
=
rint
(
M[0]
*
dx
*
AB_SCALE
)
;
#
pragma
unroll
int
Y0
=
rint
(
M[3]
*
dx
*
AB_SCALE
)
;
for
(
int
i
=
0
; i < 16; i++)
X0
+=
rint
((
M[1]*dy
+
M[2]
)
*
AB_SCALE
)
+
round_delta
;
itab[i]
=
rint
(
tab1y[
(
i>>2
)
]
*
tab1x[
(
i&3
)
]
*
INTER_REMAP_COEF_SCALE
)
;
Y0
+=
rint
((
M[4]*dy
+
M[5]
)
*
AB_SCALE
)
+
round_delta
;
X0
=
X0
>>
(
AB_BITS
-
INTER_BITS
)
;
Y0
=
Y0
>>
(
AB_BITS
-
INTER_BITS
)
;
short
sx0
=
(
short
)(
X0
>>
INTER_BITS
)
;
#
pragma
unroll
short
sy0
=
(
short
)(
Y0
>>
INTER_BITS
)
;
for
(
int
i
=
0
; i < 16; i++)
short
ax0
=
(
short
)(
X0
&
(
INTER_TAB_SIZE-1
))
;
sum
+=
v[i]
*
itab[i]
;
short
ay0
=
(
short
)(
Y0
&
(
INTER_TAB_SIZE-1
))
;
dst[0]
=
convertToT
(
(
sum
+
(
1
<<
(
INTER_REMAP_COEF_BITS-1
)))
>>
INTER_REMAP_COEF_BITS
)
;
#
else
float4
v0,
v1,
v2,
v3
;
#
pragma
unroll
for
(
int
i
=
0
; i < 16; i++)
v0
=
(
sx0
>=
0
&&
sx0
<
src_cols
&&
sy0
>=
0
&&
sy0
<
src_rows
)
?
src[src_offset+sy0
*
srcStep
+
sx0]
:
(
float4
)
0
;
sum
+=
v[i]
*
tab1y[
(
i>>2
)
]
*
tab1x[
(
i&3
)
]
;
v1
=
(
sx0+1
>=
0
&&
sx0+1
<
src_cols
&&
sy0
>=
0
&&
sy0
<
src_rows
)
?
src[src_offset+sy0
*
srcStep
+
sx0+1]
:
(
float4
)
0
;
dst[0]
=
convertToT
(
sum
)
;
v2
=
(
sx0
>=
0
&&
sx0
<
src_cols
&&
sy0+1
>=
0
&&
sy0+1
<
src_rows
)
?
src[src_offset+
(
sy0+1
)
*
srcStep
+
sx0]
:
(
float4
)
0
;
#
endif
v3
=
(
sx0+1
>=
0
&&
sx0+1
<
src_cols
&&
sy0+1
>=
0
&&
sy0+1
<
src_rows
)
?
src[src_offset+
(
sy0+1
)
*
srcStep
+
sx0+1]
:
(
float4
)
0
;
float
tab[4]
;
float
taby[2],
tabx[2]
;
taby[0]
=
1.0f
-
1.f/INTER_TAB_SIZE*ay0
;
taby[1]
=
1.f/INTER_TAB_SIZE*ay0
;
tabx[0]
=
1.0f
-
1.f/INTER_TAB_SIZE*ax0
;
tabx[1]
=
1.f/INTER_TAB_SIZE*ax0
;
tab[0]
=
taby[0]
*
tabx[0]
;
tab[1]
=
taby[0]
*
tabx[1]
;
tab[2]
=
taby[1]
*
tabx[0]
;
tab[3]
=
taby[1]
*
tabx[1]
;
float4
sum
=
0
;
sum
+=
v0
*
tab[0]
+
v1
*
tab[1]
+
v2
*
tab[2]
+
v3
*
tab[3]
;
if
(
dx
>=
0
&&
dx
<
dst_cols
&&
dy
>=
0
&&
dy
<
dst_rows
)
dst[dst_offset+dy*dstStep+dx]
=
sum
;
}
}
}
}
__kernel
void
warpAffineCubic_C4_D5
(
__global
float4
*
src,
__global
float4
*
dst,
int
src_cols,
int
src_rows,
#
endif
int
dst_cols,
int
dst_rows,
int
srcStep,
int
dstStep,
int
src_offset,
int
dst_offset,
__constant
F
*
M,
int
threadCols
)
{
int
dx
=
get_global_id
(
0
)
;
int
dy
=
get_global_id
(
1
)
;
if
(
dx
<
threadCols
&&
dy
<
dst_rows
)
{
int
round_delta
=
AB_SCALE/INTER_TAB_SIZE/2
;
src_offset
=
(
src_offset>>4
)
;
dst_offset
=
(
dst_offset>>4
)
;
srcStep
=
(
srcStep>>2
)
;
dstStep
=
(
dstStep>>2
)
;
int
X0
=
rint
(
M[0]
*
dx
*
AB_SCALE
)
;
int
Y0
=
rint
(
M[3]
*
dx
*
AB_SCALE
)
;
X0
+=
rint
((
M[1]*dy
+
M[2]
)
*
AB_SCALE
)
+
round_delta
;
Y0
+=
rint
((
M[4]*dy
+
M[5]
)
*
AB_SCALE
)
+
round_delta
;
X0
=
X0
>>
(
AB_BITS
-
INTER_BITS
)
;
Y0
=
Y0
>>
(
AB_BITS
-
INTER_BITS
)
;
short
sx
=
(
short
)(
X0
>>
INTER_BITS
)
-
1
;
short
sy
=
(
short
)(
Y0
>>
INTER_BITS
)
-
1
;
short
ay
=
(
short
)(
Y0
&
(
INTER_TAB_SIZE-1
))
;
short
ax
=
(
short
)(
X0
&
(
INTER_TAB_SIZE-1
))
;
float4
v[16]
;
int
i
;
for
(
i=0
; i<16; i++)
v[i]
=
(
sx+
(
i&3
)
>=
0
&&
sx+
(
i&3
)
<
src_cols
&&
sy+
(
i>>2
)
>=
0
&&
sy+
(
i>>2
)
<
src_rows
)
?
src[src_offset+
(
sy+
(
i>>2
))
*
srcStep
+
(
sx+
(
i&3
))
]
:
(
float4
)
0
;
float
tab[16]
;
float
tab1y[4],
tab1x[4]
;
float
axx,
ayy
;
ayy
=
1.f/INTER_TAB_SIZE
*
ay
;
axx
=
1.f/INTER_TAB_SIZE
*
ax
;
interpolateCubic
(
ayy,
tab1y
)
;
interpolateCubic
(
axx,
tab1x
)
;
#
pragma
unroll
4
for
(
i=0
; i<16; i++ )
{
tab[i]
=
tab1y[
(
i>>2
)
]
*
tab1x[
(
i&3
)
]
;
}
if
(
dx
>=
0
&&
dx
<
dst_cols
&&
dy
>=
0
&&
dy
<
dst_rows
)
{
float4
sum
=
0
;
#
pragma
unroll
4
for
(
i
=0
; i<16; i++ )
{
sum
+=
v[i]
*
tab[i]
;
}
dst[dst_offset+dy*dstStep+dx]
=
sum
;
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment