Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
beb14c70
Commit
beb14c70
authored
Nov 08, 2019
by
Anna Khakimova
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
GAPI Fluid: SIMD optimization for sep filters 5x5 kernel size (gaussBlur)
parent
7a52a4c1
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
640 additions
and
14 deletions
+640
-14
gfluidimgproc.cpp
modules/gapi/src/backends/fluid/gfluidimgproc.cpp
+12
-2
gfluidimgproc_func.dispatch.cpp
...s/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
+22
-0
gfluidimgproc_func.hpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+19
-0
gfluidimgproc_func.simd.hpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
+587
-12
No files found.
modules/gapi/src/backends/fluid/gfluidimgproc.cpp
View file @
beb14c70
...
...
@@ -599,6 +599,7 @@ static void run_sepfilter(Buffer& dst, const View& src,
{
constexpr
int
kMax
=
11
;
GAPI_Assert
(
kxLen
<=
kMax
&&
kyLen
<=
kMax
);
GAPI_Assert
(
kxLen
==
kyLen
);
const
SRC
*
in
[
kMax
];
DST
*
out
;
...
...
@@ -625,6 +626,13 @@ static void run_sepfilter(Buffer& dst, const View& src,
int
border
=
xborder
;
run_sepfilter3x3_impl
(
out
,
in
,
width
,
chan
,
kx
,
ky
,
border
,
scale
,
delta
,
buf
,
y
,
y0
);
}
else
if
(
kxLen
==
5
&&
kyLen
==
5
)
{
int
y
=
dst
.
y
();
int
y0
=
dst
.
priv
().
writeStart
();
run_sepfilter5x5_impl
(
out
,
in
,
width
,
chan
,
kx
,
ky
,
xborder
,
scale
,
delta
,
buf
,
y
,
y0
);
}
else
{
int
length
=
chan
*
width
;
...
...
@@ -788,7 +796,9 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
Buffer
&
dst
,
Buffer
&
scratch
)
{
int
kxsize
=
ksize
.
width
;
GAPI_Assert
(
ksize
.
height
==
ksize
.
width
);
GAPI_Assert
((
ksize
.
height
==
3
)
||
(
ksize
.
height
==
5
));
const
int
kxsize
=
ksize
.
width
;
int
kysize
=
ksize
.
height
;
auto
*
kx
=
scratch
.
OutLine
<
float
>
();
// cached kernX data
...
...
@@ -801,7 +811,7 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
constexpr
int
buffSize
=
5
;
GAPI_Assert
(
ksize
.
height
<=
buffSize
);
float
*
buf
[
buffSize
]
{
};
float
*
buf
[
buffSize
]
=
{
nullptr
};
buf
[
0
]
=
ky
+
kysize
;
for
(
int
i
=
1
;
i
<
ksize
.
height
;
++
i
)
...
...
modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
View file @
beb14c70
...
...
@@ -119,6 +119,28 @@ RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
#define RUN_SEPFILTER5x5_IMPL(DST, SRC) \
void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, \
const float kx[], const float ky[], int border, \
float scale, float delta, \
float *buf[], int y, int y0) \
{ \
CV_CPU_DISPATCH(run_sepfilter5x5_impl, \
(out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0), \
CV_CPU_DISPATCH_MODES_ALL); \
}
RUN_SEPFILTER5x5_IMPL
(
uchar
,
uchar
)
RUN_SEPFILTER5x5_IMPL
(
short
,
uchar
)
RUN_SEPFILTER5x5_IMPL
(
float
,
uchar
)
RUN_SEPFILTER5x5_IMPL
(
ushort
,
ushort
)
RUN_SEPFILTER5x5_IMPL
(
short
,
ushort
)
RUN_SEPFILTER5x5_IMPL
(
float
,
ushort
)
RUN_SEPFILTER5x5_IMPL
(
short
,
short
)
RUN_SEPFILTER5x5_IMPL
(
float
,
short
)
RUN_SEPFILTER5x5_IMPL
(
float
,
float
)
#undef RUN_SEPFILTER5x5_IMPL
//-------------------------
//
// Fluid kernels: Filter 2D
...
...
modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
View file @
beb14c70
...
...
@@ -78,6 +78,25 @@ RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
#define RUN_SEPFILTER5x5_IMPL(DST, SRC) \
void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, \
const float kx[], const float ky[], int border, \
float scale, float delta, \
float *buf[], int y, int y0);
RUN_SEPFILTER5x5_IMPL
(
uchar
,
uchar
)
RUN_SEPFILTER5x5_IMPL
(
short
,
uchar
)
RUN_SEPFILTER5x5_IMPL
(
float
,
uchar
)
RUN_SEPFILTER5x5_IMPL
(
ushort
,
ushort
)
RUN_SEPFILTER5x5_IMPL
(
short
,
ushort
)
RUN_SEPFILTER5x5_IMPL
(
float
,
ushort
)
RUN_SEPFILTER5x5_IMPL
(
short
,
short
)
RUN_SEPFILTER5x5_IMPL
(
float
,
short
)
RUN_SEPFILTER5x5_IMPL
(
float
,
float
)
#undef RUN_SEPFILTER5x5_IMPL
//-------------------------
//
// Fluid kernels: Filter 2D
...
...
modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
View file @
beb14c70
...
...
@@ -100,6 +100,23 @@ RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
#define RUN_SEPFILTER5x5_IMPL(DST, SRC) \
void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, \
const float kx[], const float ky[], int border, \
float scale, float delta, \
float *buf[], int y, int y0);
RUN_SEPFILTER5x5_IMPL
(
uchar
,
uchar
)
RUN_SEPFILTER5x5_IMPL
(
short
,
uchar
)
RUN_SEPFILTER5x5_IMPL
(
float
,
uchar
)
RUN_SEPFILTER5x5_IMPL
(
ushort
,
ushort
)
RUN_SEPFILTER5x5_IMPL
(
short
,
ushort
)
RUN_SEPFILTER5x5_IMPL
(
float
,
ushort
)
RUN_SEPFILTER5x5_IMPL
(
short
,
short
)
RUN_SEPFILTER5x5_IMPL
(
float
,
short
)
RUN_SEPFILTER5x5_IMPL
(
float
,
float
)
#undef RUN_SEPFILTER5x5_IMPL
//-------------------------
//
// Fluid kernels: Filter 2D
...
...
@@ -978,11 +995,11 @@ void run_rgb2yuv422_impl(uchar out[], const uchar in[], int width)
}
}
//-------------------------
//-------------------------
----
//
// Fluid kernels: sepFilter
// Fluid kernels: sepFilter
3x3
//
//-------------------------
//-------------------------
----
#if CV_SIMD
// this variant not using buf[] appears 15% faster than reference any-2-float code below
...
...
@@ -1322,7 +1339,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt
}
}
}
#endif
#endif
//USE_SEPFILTER3X3_CHAR2SHORT
#endif // CV_SIMD
...
...
@@ -1464,18 +1481,576 @@ void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
} \
}
RUN_SEPFILTER3X3_IMPL
(
uchar
,
uchar
)
RUN_SEPFILTER3X3_IMPL
(
short
,
uchar
)
RUN_SEPFILTER3X3_IMPL
(
float
,
uchar
)
RUN_SEPFILTER3X3_IMPL
(
uchar
,
uchar
)
RUN_SEPFILTER3X3_IMPL
(
short
,
uchar
)
RUN_SEPFILTER3X3_IMPL
(
float
,
uchar
)
RUN_SEPFILTER3X3_IMPL
(
ushort
,
ushort
)
RUN_SEPFILTER3X3_IMPL
(
short
,
ushort
)
RUN_SEPFILTER3X3_IMPL
(
float
,
ushort
)
RUN_SEPFILTER3X3_IMPL
(
short
,
short
)
RUN_SEPFILTER3X3_IMPL
(
float
,
short
)
RUN_SEPFILTER3X3_IMPL
(
float
,
float
)
RUN_SEPFILTER3X3_IMPL
(
short
,
ushort
)
RUN_SEPFILTER3X3_IMPL
(
float
,
ushort
)
RUN_SEPFILTER3X3_IMPL
(
short
,
short
)
RUN_SEPFILTER3X3_IMPL
(
float
,
short
)
RUN_SEPFILTER3X3_IMPL
(
float
,
float
)
#undef RUN_SEPFILTER3X3_IMPL
//-----------------------------
//
// Fluid kernels: sepFilter 5x5
//
//-----------------------------
#if CV_SIMD
// this code with manually vectored rounding to uchar
template
<
bool
noscale
,
typename
SRC
>
static
void
run_sepfilter5x5_any2char
(
uchar
out
[],
const
SRC
*
in
[],
int
width
,
int
chan
,
const
float
kx
[],
const
float
ky
[],
int
border
,
float
scale
,
float
delta
,
float
*
buf
[],
int
y
,
int
y0
)
{
constexpr
int
kxLen
=
5
;
constexpr
int
kyLen
=
kxLen
;
constexpr
int
buffSize
=
5
;
int
r
[
buffSize
];
for
(
int
n
=
0
;
n
<
buffSize
;
++
n
)
{
r
[
n
]
=
(
y
-
y0
+
n
)
%
5
;
// previous, this, next rows
}
const
int
length
=
width
*
chan
;
const
int
shift
=
chan
;
// horizontal pass
int
k0
=
(
y
==
y0
)
?
0
:
4
;
for
(
int
k
=
k0
;
k
<
kxLen
;
++
k
)
{
const
SRC
*
s
[
kxLen
]
=
{
nullptr
};
for
(
int
i
=
0
;
i
<
kxLen
;
++
i
)
{
// previous , this , next pixels
s
[
i
]
=
in
[
k
]
+
(
i
-
border
)
*
shift
;
}
// rely on compiler vectoring
for
(
int
l
=
0
;
l
<
length
;
++
l
)
{
float
sum
=
0
;
for
(
int
j
=
0
;
j
<
kxLen
;
++
j
)
{
sum
+=
s
[
j
][
l
]
*
kx
[
j
];
}
buf
[
r
[
k
]][
l
]
=
sum
;
}
}
// vertical pass
constexpr
int
nlanes
=
v_uint8
::
nlanes
;
for
(
int
l
=
0
;
l
<
length
;)
{
// main part of row
for
(;
l
<=
length
-
nlanes
;
l
+=
nlanes
)
{
v_float32
sum0
=
vx_load
(
&
buf
[
r
[
0
]][
l
])
*
vx_setall_f32
(
ky
[
0
]);
v_float32
sum1
=
vx_load
(
&
buf
[
r
[
0
]][
l
+
nlanes
/
4
])
*
vx_setall_f32
(
ky
[
0
]);
v_float32
sum2
=
vx_load
(
&
buf
[
r
[
0
]][
l
+
2
*
nlanes
/
4
])
*
vx_setall_f32
(
ky
[
0
]);
v_float32
sum3
=
vx_load
(
&
buf
[
r
[
0
]][
l
+
3
*
nlanes
/
4
])
*
vx_setall_f32
(
ky
[
0
]);
for
(
int
n
=
1
;
n
<
kyLen
;
++
n
)
{
sum0
=
v_fma
(
vx_load
(
&
buf
[
r
[
n
]][
l
]),
vx_setall_f32
(
ky
[
n
]),
sum0
);
sum1
=
v_fma
(
vx_load
(
&
buf
[
r
[
n
]][
l
+
nlanes
/
4
]),
vx_setall_f32
(
ky
[
n
]),
sum1
);
sum2
=
v_fma
(
vx_load
(
&
buf
[
r
[
n
]][
l
+
2
*
nlanes
/
4
]),
vx_setall_f32
(
ky
[
n
]),
sum2
);
sum3
=
v_fma
(
vx_load
(
&
buf
[
r
[
n
]][
l
+
3
*
nlanes
/
4
]),
vx_setall_f32
(
ky
[
n
]),
sum3
);
}
if
(
!
noscale
)
{
sum0
=
v_fma
(
sum0
,
vx_setall_f32
(
scale
),
vx_setall_f32
(
delta
));
sum1
=
v_fma
(
sum1
,
vx_setall_f32
(
scale
),
vx_setall_f32
(
delta
));
sum2
=
v_fma
(
sum2
,
vx_setall_f32
(
scale
),
vx_setall_f32
(
delta
));
sum3
=
v_fma
(
sum3
,
vx_setall_f32
(
scale
),
vx_setall_f32
(
delta
));
}
v_int32
isum0
=
v_round
(
sum0
),
isum1
=
v_round
(
sum1
),
isum2
=
v_round
(
sum2
),
isum3
=
v_round
(
sum3
);
v_int16
ires0
=
v_pack
(
isum0
,
isum1
),
ires1
=
v_pack
(
isum2
,
isum3
);
v_uint8
res
=
v_pack_u
(
ires0
,
ires1
);
v_store
(
reinterpret_cast
<
uchar
*>
(
&
out
[
l
]),
res
);
}
// tail (if any)
if
(
l
<
length
)
{
GAPI_DbgAssert
(
length
>=
nlanes
);
l
=
length
-
nlanes
;
}
}
return
;
}
// this variant with manually vectored rounding to short/ushort
template
<
bool
noscale
,
typename
DST
,
typename
SRC
>
static
void
run_sepfilter5x5_any2short
(
DST
out
[],
const
SRC
*
in
[],
int
width
,
int
chan
,
const
float
kx
[],
const
float
ky
[],
int
border
,
float
scale
,
float
delta
,
float
*
buf
[],
int
y
,
int
y0
)
{
constexpr
int
kxLen
=
5
;
constexpr
int
kyLen
=
kxLen
;
constexpr
int
buffSize
=
5
;
int
r
[
buffSize
];
for
(
int
n
=
0
;
n
<
buffSize
;
++
n
)
{
r
[
n
]
=
(
y
-
y0
+
n
)
%
5
;
// previous, this, next rows
}
const
int
length
=
width
*
chan
;
const
int
shift
=
chan
;
// horizontal pass
int
k0
=
(
y
==
y0
)
?
0
:
4
;
for
(
int
k
=
k0
;
k
<
kyLen
;
++
k
)
{
const
SRC
*
s
[
kxLen
]
=
{
nullptr
};
for
(
int
i
=
0
;
i
<
kxLen
;
++
i
)
{
// previous , this , next pixels
s
[
i
]
=
in
[
k
]
+
(
i
-
border
)
*
shift
;
}
// rely on compiler vectoring
for
(
int
l
=
0
;
l
<
length
;
++
l
)
{
float
sum
=
0
;
for
(
int
j
=
0
;
j
<
kxLen
;
++
j
)
{
sum
+=
s
[
j
][
l
]
*
kx
[
j
];
}
buf
[
r
[
k
]][
l
]
=
sum
;
}
}
// vertical pass
constexpr
int
nlanes
=
v_int16
::
nlanes
;
for
(
int
l
=
0
;
l
<
length
;)
{
//GAPI_Assert(length >= nlanes);
// main part of row
for
(;
l
<=
length
-
nlanes
;
l
+=
nlanes
)
{
v_float32
sum0
=
vx_load
(
&
buf
[
r
[
0
]][
l
])
*
vx_setall_f32
(
ky
[
0
]);
v_float32
sum1
=
vx_load
(
&
buf
[
r
[
0
]][
l
+
nlanes
/
2
])
*
vx_setall_f32
(
ky
[
0
]);
for
(
int
j
=
1
;
j
<
kyLen
;
++
j
)
{
sum0
=
v_fma
(
vx_load
(
&
buf
[
r
[
j
]][
l
]),
vx_setall_f32
(
ky
[
j
]),
sum0
);
sum1
=
v_fma
(
vx_load
(
&
buf
[
r
[
j
]][
l
+
nlanes
/
2
]),
vx_setall_f32
(
ky
[
j
]),
sum1
);
}
if
(
!
noscale
)
{
sum0
=
v_fma
(
sum0
,
vx_setall_f32
(
scale
),
vx_setall_f32
(
delta
));
sum1
=
v_fma
(
sum1
,
vx_setall_f32
(
scale
),
vx_setall_f32
(
delta
));
}
v_int32
isum0
=
v_round
(
sum0
),
isum1
=
v_round
(
sum1
);
if
(
std
::
is_same
<
DST
,
short
>::
value
)
{
// signed short
v_int16
res
=
v_pack
(
isum0
,
isum1
);
v_store
(
reinterpret_cast
<
short
*>
(
&
out
[
l
]),
res
);
}
else
{
// unsigned short
v_uint16
res
=
v_pack_u
(
isum0
,
isum1
);
v_store
(
reinterpret_cast
<
ushort
*>
(
&
out
[
l
]),
res
);
}
}
// tail (if any)
if
(
l
<
length
)
{
GAPI_DbgAssert
(
length
>=
nlanes
);
l
=
length
-
nlanes
;
}
}
return
;
}
// this variant not using buf[]
template
<
bool
noscale
,
typename
SRC
>
static
void
run_sepfilter5x5_any2float
(
float
out
[],
const
SRC
*
in
[],
int
width
,
int
chan
,
const
float
kx
[],
const
float
ky
[],
int
border
,
float
scale
,
float
delta
)
{
constexpr
int
kxLen
=
5
;
constexpr
int
kyLen
=
kxLen
;
constexpr
int
buffSize
=
5
;
const
int
length
=
width
*
chan
;
const
int
shift
=
chan
;
static
const
int
nlanes
=
v_float32
::
nlanes
;
for
(
int
l
=
0
;
l
<
length
;
)
{
//GAPI_Assert(length >= nlanes);
// main part
for
(;
l
<=
length
-
nlanes
;
l
+=
nlanes
)
{
auto
xsum
=
[
l
,
border
,
shift
,
kx
](
const
SRC
inp
[])
{
v_float32
t
[
5
];
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
t
[
i
]
=
vx_load_f32
(
&
inp
[
l
+
(
i
-
border
)
*
shift
]);
}
v_float32
sum
=
t
[
0
]
*
vx_setall_f32
(
kx
[
0
]);
for
(
int
j
=
1
;
j
<
5
;
++
j
)
{
sum
=
v_fma
(
t
[
j
],
vx_setall_f32
(
kx
[
j
]),
sum
);
}
return
sum
;
};
v_float32
s
[
buffSize
];
for
(
int
m
=
0
;
m
<
buffSize
;
++
m
)
{
s
[
m
]
=
xsum
(
in
[
m
]);
}
v_float32
sum
=
s
[
0
]
*
vx_setall_f32
(
ky
[
0
]);
for
(
int
n
=
1
;
n
<
kyLen
;
++
n
)
{
sum
=
v_fma
(
s
[
n
],
vx_setall_f32
(
ky
[
n
]),
sum
);
}
if
(
!
noscale
)
{
sum
=
v_fma
(
sum
,
vx_setall_f32
(
scale
),
vx_setall_f32
(
delta
));
}
v_store
(
&
out
[
l
],
sum
);
}
// tail (if any)
if
(
l
<
length
)
{
GAPI_DbgAssert
(
length
>=
nlanes
);
l
=
length
-
nlanes
;
}
}
return
;
}
#define USE_SEPFILTER5X5_CHAR2SHORT 1
#if USE_SEPFILTER5X5_CHAR2SHORT
template
<
bool
noscale
>
static
void
run_sepfilter5x5_char2short
(
short
out
[],
const
uchar
*
in
[],
int
width
,
int
chan
,
const
float
kx
[],
const
float
ky
[],
int
border
,
float
scale
,
float
delta
,
float
*
buf
[],
int
y
,
int
y0
)
{
constexpr
int
kxLen
=
5
;
constexpr
int
kyLen
=
kxLen
;
constexpr
int
buffSize
=
5
;
schar
ikx
[
kxLen
];
schar
iky
[
kyLen
];
for
(
int
i
=
0
;
i
<
kxLen
;
++
i
)
{
ikx
[
i
]
=
saturate
<
schar
>
(
kx
[
i
],
rintf
);
iky
[
i
]
=
saturate
<
schar
>
(
ky
[
i
],
rintf
);
}
const
short
iscale
=
saturate
<
short
>
(
scale
*
(
1
<<
15
),
rintf
);
const
short
idelta
=
saturate
<
short
>
(
delta
,
rintf
);
// check if this code is applicable
if
(
ikx
[
0
]
!=
kx
[
0
]
||
ikx
[
1
]
!=
kx
[
1
]
||
ikx
[
2
]
!=
kx
[
2
]
||
ikx
[
3
]
!=
kx
[
3
]
||
ikx
[
4
]
!=
kx
[
4
]
||
iky
[
0
]
!=
ky
[
0
]
||
iky
[
1
]
!=
ky
[
1
]
||
iky
[
2
]
!=
ky
[
2
]
||
iky
[
3
]
!=
ky
[
3
]
||
iky
[
4
]
!=
ky
[
4
]
||
idelta
!=
delta
||
std
::
abs
(
scale
)
>
1
||
std
::
abs
(
scale
)
<
0.01
)
{
run_sepfilter5x5_any2short
<
noscale
>
(
out
,
in
,
width
,
chan
,
kx
,
ky
,
border
,
scale
,
delta
,
buf
,
y
,
y0
);
return
;
}
short
*
ibuf
[
buffSize
];
int
r
[
buffSize
];
for
(
int
n
=
0
;
n
<
buffSize
;
++
n
)
{
ibuf
[
n
]
=
reinterpret_cast
<
short
*>
(
buf
[
n
]);
r
[
n
]
=
(
y
-
y0
+
n
)
%
5
;
// previous, this, next rows
}
const
int
length
=
width
*
chan
;
const
int
shift
=
chan
;
// horizontal pass
// full horizontal pass is needed only if the very 1st row in ROI is handled;
// for 2nd and further rows, it's enough to convolve only the
// "next" row - as we can reuse buffers from previous calls to
// this kernel (Fluid does rows consequently: y=y0, y0+1, ...)
int
k0
=
(
y
==
y0
)
?
0
:
4
;
constexpr
int
nlanes
=
v_int16
::
nlanes
;
for
(
int
k
=
k0
;
k
<
kyLen
;
++
k
)
{
for
(
int
l
=
0
;
l
<
length
;)
{
GAPI_Assert
(
length
>=
nlanes
);
// main part of output row
for
(;
l
<=
length
-
nlanes
;
l
+=
nlanes
)
{
v_uint16
t
[
kxLen
];
v_int16
sum
;
for
(
int
i
=
0
;
i
<
kxLen
;
++
i
)
{
// previous, current, next pixels
t
[
i
]
=
vx_load_expand
(
&
in
[
k
][
l
+
(
i
-
border
)
*
shift
]);
sum
+=
v_reinterpret_as_s16
(
t
[
i
])
*
vx_setall_s16
(
ikx
[
i
]);
}
v_store
(
&
ibuf
[
r
[
k
]][
l
],
sum
);
}
// tail (if any)
if
(
l
<
length
)
{
GAPI_DbgAssert
(
length
>=
nlanes
);
l
=
length
-
nlanes
;
}
}
}
// vertical pass
for
(
int
l
=
0
;
l
<
length
;)
{
//GAPI_Assert(length >= nlanes);
// main part of output row
for
(;
l
<=
length
-
nlanes
;
l
+=
nlanes
)
{
v_int16
s
[
buffSize
];
v_int16
sum
;
for
(
int
i
=
0
;
i
<
kyLen
;
++
i
)
{
// previous, current, next rows
s
[
i
]
=
vx_load
(
&
ibuf
[
r
[
i
]][
l
]);
sum
+=
s
[
i
]
*
vx_setall_s16
(
iky
[
i
]);
}
if
(
!
noscale
)
{
sum
=
v_mul_hi
(
sum
<<
1
,
vx_setall_s16
(
iscale
))
+
vx_setall_s16
(
idelta
);
}
v_store
(
&
out
[
l
],
sum
);
}
// tail (if any)
if
(
l
<
length
)
{
GAPI_DbgAssert
(
length
>=
nlanes
);
l
=
length
-
nlanes
;
}
}
return
;
}
#endif //USE_SEPFILTER5X5_CHAR2SHORT
#endif //CV_SIMD
template
<
bool
noscale
,
typename
DST
,
typename
SRC
>
static
void
run_sepfilter5x5_reference
(
DST
out
[],
const
SRC
*
in
[],
int
width
,
int
chan
,
const
float
kx
[],
const
float
ky
[],
int
border
,
float
scale
,
float
delta
,
float
*
buf
[],
int
y
,
int
y0
)
{
constexpr
int
kxLen
=
5
;
// kernel size
constexpr
int
kyLen
=
kxLen
;
int
r
[
kyLen
];
for
(
int
n
=
0
;
n
<
kyLen
;
++
n
)
{
r
[
n
]
=
(
y
-
y0
+
n
)
%
5
;
// previous, this, next rows
}
int
length
=
width
*
chan
;
int
shift
=
chan
;
// horizontal pass
// full horizontal pass is needed only if very 1st row in ROI;
// for 2nd and further rows, it is enough to convolve only the
// "next" row - as we can reuse buffers from previous calls to
// this kernel (Fluid does rows consequently: y=y0, y0+1, ...)
int
k0
=
(
y
==
y0
)
?
0
:
4
;
for
(
int
k
=
k0
;
k
<
kyLen
;
++
k
)
{
const
SRC
*
s
[
kxLen
]
=
{
nullptr
};
for
(
int
i
=
0
;
i
<
kxLen
;
++
i
)
{
// previous , this , next pixels
s
[
i
]
=
in
[
k
]
+
(
i
-
border
)
*
shift
;
}
// rely on compiler vectoring
for
(
int
l
=
0
;
l
<
length
;
++
l
)
{
float
sum
=
0
;
for
(
int
i
=
0
;
i
<
kxLen
;
++
i
)
{
sum
+=
s
[
i
][
l
]
*
kx
[
i
];
}
buf
[
r
[
k
]][
l
]
=
sum
;
}
}
// vertical pass
for
(
int
l
=
0
;
l
<
length
;
++
l
)
{
float
sum
=
0
;
for
(
int
j
=
0
;
j
<
kyLen
;
++
j
)
{
sum
+=
buf
[
r
[
j
]][
l
]
*
ky
[
j
];
}
if
(
!
noscale
)
{
sum
=
sum
*
scale
+
delta
;
}
out
[
l
]
=
saturate
<
DST
>
(
sum
,
rintf
);
}
return
;
}
template
<
bool
noscale
,
typename
DST
,
typename
SRC
>
static
void
run_sepfilter5x5_code
(
DST
out
[],
const
SRC
*
in
[],
int
width
,
int
chan
,
const
float
kx
[],
const
float
ky
[],
int
border
,
float
scale
,
float
delta
,
float
*
buf
[],
int
y
,
int
y0
)
{
#if CV_SIMD
int
length
=
width
*
chan
;
// length variable may be unused if types do not match at 'if' statements below
(
void
)
length
;
if
(
std
::
is_same
<
DST
,
short
>::
value
&&
std
::
is_same
<
SRC
,
uchar
>::
value
&&
length
>=
v_int16
::
nlanes
)
{
run_sepfilter5x5_char2short
<
noscale
>
(
reinterpret_cast
<
short
*>
(
out
),
reinterpret_cast
<
const
uchar
**>
(
in
),
width
,
chan
,
kx
,
ky
,
border
,
scale
,
delta
,
buf
,
y
,
y0
);
return
;
}
if
(
std
::
is_same
<
DST
,
float
>::
value
&&
std
::
is_same
<
SRC
,
float
>::
value
&&
length
>=
v_float32
::
nlanes
)
{
run_sepfilter5x5_any2float
<
noscale
>
(
reinterpret_cast
<
float
*>
(
out
),
in
,
width
,
chan
,
kx
,
ky
,
border
,
scale
,
delta
);
return
;
}
if
(
std
::
is_same
<
DST
,
short
>::
value
&&
length
>=
v_int16
::
nlanes
)
{
run_sepfilter5x5_any2short
<
noscale
>
(
reinterpret_cast
<
short
*>
(
out
),
in
,
width
,
chan
,
kx
,
ky
,
border
,
scale
,
delta
,
buf
,
y
,
y0
);
return
;
}
if
(
std
::
is_same
<
DST
,
ushort
>::
value
&&
length
>=
v_uint16
::
nlanes
)
{
run_sepfilter5x5_any2short
<
noscale
>
(
reinterpret_cast
<
ushort
*>
(
out
),
in
,
width
,
chan
,
kx
,
ky
,
border
,
scale
,
delta
,
buf
,
y
,
y0
);
return
;
}
if
(
std
::
is_same
<
DST
,
uchar
>::
value
&&
length
>=
v_uint8
::
nlanes
)
{
run_sepfilter5x5_any2char
<
noscale
>
(
reinterpret_cast
<
uchar
*>
(
out
),
in
,
width
,
chan
,
kx
,
ky
,
border
,
scale
,
delta
,
buf
,
y
,
y0
);
return
;
}
#endif // CV_SIMD
// reference code is quite fast for any-to-float case,
// but not for any-to-integral due to very slow rounding
run_sepfilter5x5_reference
<
noscale
>
(
out
,
in
,
width
,
chan
,
kx
,
ky
,
border
,
scale
,
delta
,
buf
,
y
,
y0
);
}
#define RUN_SEPFILTER5x5_IMPL(DST, SRC) \
void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, const float kx[], \
const float ky[], int border, float scale, float delta, \
float *buf[], int y, int y0) \
{ \
if (scale == 1 && delta == 0) \
{ \
constexpr bool noscale = true; \
run_sepfilter5x5_code<noscale>(out, in, width, chan, kx, ky, border, \
scale, delta, buf, y, y0); \
} \
else \
{ \
constexpr bool noscale = false; \
run_sepfilter5x5_code<noscale>(out, in, width, chan, kx, ky, border, \
scale, delta, buf, y, y0); \
} \
return; \
}
RUN_SEPFILTER5x5_IMPL
(
uchar
,
uchar
)
RUN_SEPFILTER5x5_IMPL
(
short
,
uchar
)
RUN_SEPFILTER5x5_IMPL
(
float
,
uchar
)
RUN_SEPFILTER5x5_IMPL
(
ushort
,
ushort
)
RUN_SEPFILTER5x5_IMPL
(
short
,
ushort
)
RUN_SEPFILTER5x5_IMPL
(
float
,
ushort
)
RUN_SEPFILTER5x5_IMPL
(
short
,
short
)
RUN_SEPFILTER5x5_IMPL
(
float
,
short
)
RUN_SEPFILTER5x5_IMPL
(
float
,
float
)
#undef RUN_SEPFILTER5x5_IMPL
//-------------------------
//
// Fluid kernels: Filter 2D
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment