Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
12656df1
Commit
12656df1
authored
Oct 03, 2010
by
Vadim Pisarevsky
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
much faster exp() and log() with SSE2
parent
46988ca6
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
443 additions
and
130 deletions
+443
-130
mathfuncs.cpp
modules/core/src/mathfuncs.cpp
+443
-130
No files found.
modules/core/src/mathfuncs.cpp
View file @
12656df1
...
...
@@ -719,29 +719,133 @@ static const double expTab[] = {
1.9571441241754002690183222516269
*
EXPPOLY_32F_A0
,
1.9784560263879509682582499181312
*
EXPPOLY_32F_A0
,
};
static
const
double
exp_prescale
=
1.4426950408889634073599246810019
*
(
1
<<
EXPTAB_SCALE
);
static
const
double
exp_postscale
=
1.
/
(
1
<<
EXPTAB_SCALE
);
static
const
double
exp_max_val
=
3000.
*
(
1
<<
EXPTAB_SCALE
);
// log10(DBL_MAX) < 3000
static
CvStatus
CV_STDCALL
Exp_32f
(
const
float
*
_x
,
float
*
y
,
int
n
)
{
static
const
double
EXPPOLY_32F_A4
=
1.000000000000002438532970795181890933776
/
EXPPOLY_32F_A0
,
EXPPOLY_32F_A3
=
.6931471805521448196800669615864773144641
/
EXPPOLY_32F_A0
,
EXPPOLY_32F_A2
=
.2402265109513301490103372422686535526573
/
EXPPOLY_32F_A0
,
EXPPOLY_32F_A1
=
.5550339366753125211915322047004666939128e-1
/
EXPPOLY_32F_A0
;
#undef EXPPOLY
#define EXPPOLY(x) \
(((((x) + EXPPOLY_32F_A1)*(x) + EXPPOLY_32F_A2)*(x) + EXPPOLY_32F_A3)*(x) + EXPPOLY_32F_
A4)
static
const
float
A4
=
(
float
)(
1.000000000000002438532970795181890933776
/
EXPPOLY_32F_A0
)
,
A3
=
(
float
)(
.6931471805521448196800669615864773144641
/
EXPPOLY_32F_A0
)
,
A2
=
(
float
)(
.2402265109513301490103372422686535526573
/
EXPPOLY_32F_A0
)
,
A1
=
(
float
)(
.5550339366753125211915322047004666939128e-1
/
EXPPOLY_32F_A0
)
;
#undef EXPPOLY
#define EXPPOLY(x) \
(((((x) + A1)*(x) + A2)*(x) + A3)*(x) +
A4)
int
i
=
0
;
DBLINT
buf
[
4
];
const
Cv32suf
*
x
=
(
const
Cv32suf
*
)
_x
;
Cv32suf
buf
[
4
];
buf
[
0
].
i
.
lo
=
buf
[
1
].
i
.
lo
=
buf
[
2
].
i
.
lo
=
buf
[
3
].
i
.
lo
=
0
;
#if CV_SSE2
if
(
n
>=
8
&&
checkHardwareSupport
(
CV_CPU_SSE
)
)
{
static
const
__m128d
prescale2
=
_mm_set1_pd
(
exp_prescale
);
static
const
__m128
postscale4
=
_mm_set1_ps
((
float
)
exp_postscale
);
static
const
__m128
maxval4
=
_mm_set1_ps
((
float
)(
exp_max_val
/
exp_prescale
));
static
const
__m128
minval4
=
_mm_set1_ps
((
float
)(
-
exp_max_val
/
exp_prescale
));
static
const
__m128
mA1
=
_mm_set1_ps
(
A1
);
static
const
__m128
mA2
=
_mm_set1_ps
(
A2
);
static
const
__m128
mA3
=
_mm_set1_ps
(
A3
);
static
const
__m128
mA4
=
_mm_set1_ps
(
A4
);
bool
y_aligned
=
(
size_t
)(
void
*
)
y
%
16
==
0
;
ushort
CV_DECL_ALIGNED
(
16
)
tab_idx
[
8
];
for
(
;
i
<=
n
-
8
;
i
+=
8
)
{
__m128
xf0
,
xf1
;
xf0
=
_mm_loadu_ps
(
&
x
[
i
].
f
);
xf1
=
_mm_loadu_ps
(
&
x
[
i
+
4
].
f
);
__m128i
xi0
,
xi1
,
xi2
,
xi3
;
xf0
=
_mm_min_ps
(
_mm_max_ps
(
xf0
,
minval4
),
maxval4
);
xf1
=
_mm_min_ps
(
_mm_max_ps
(
xf1
,
minval4
),
maxval4
);
__m128d
xd0
=
_mm_cvtps_pd
(
xf0
);
__m128d
xd2
=
_mm_cvtps_pd
(
_mm_movehl_ps
(
xf0
,
xf0
));
__m128d
xd1
=
_mm_cvtps_pd
(
xf1
);
__m128d
xd3
=
_mm_cvtps_pd
(
_mm_movehl_ps
(
xf1
,
xf1
));
xd0
=
_mm_mul_pd
(
xd0
,
prescale2
);
xd2
=
_mm_mul_pd
(
xd2
,
prescale2
);
xd1
=
_mm_mul_pd
(
xd1
,
prescale2
);
xd3
=
_mm_mul_pd
(
xd3
,
prescale2
);
xi0
=
_mm_cvtpd_epi32
(
xd0
);
xi2
=
_mm_cvtpd_epi32
(
xd2
);
xi1
=
_mm_cvtpd_epi32
(
xd1
);
xi3
=
_mm_cvtpd_epi32
(
xd3
);
xd0
=
_mm_sub_pd
(
xd0
,
_mm_cvtepi32_pd
(
xi0
));
xd2
=
_mm_sub_pd
(
xd2
,
_mm_cvtepi32_pd
(
xi2
));
xd1
=
_mm_sub_pd
(
xd1
,
_mm_cvtepi32_pd
(
xi1
));
xd3
=
_mm_sub_pd
(
xd3
,
_mm_cvtepi32_pd
(
xi3
));
xf0
=
_mm_movelh_ps
(
_mm_cvtpd_ps
(
xd0
),
_mm_cvtpd_ps
(
xd2
));
xf1
=
_mm_movelh_ps
(
_mm_cvtpd_ps
(
xd1
),
_mm_cvtpd_ps
(
xd3
));
xf0
=
_mm_mul_ps
(
xf0
,
postscale4
);
xf1
=
_mm_mul_ps
(
xf1
,
postscale4
);
xi0
=
_mm_unpacklo_epi64
(
xi0
,
xi2
);
xi1
=
_mm_unpacklo_epi64
(
xi1
,
xi3
);
xi0
=
_mm_packs_epi32
(
xi0
,
xi1
);
_mm_store_si128
((
__m128i
*
)
tab_idx
,
_mm_and_si128
(
xi0
,
_mm_set1_epi16
(
EXPTAB_MASK
)));
xi0
=
_mm_add_epi16
(
_mm_srai_epi16
(
xi0
,
EXPTAB_SCALE
),
_mm_set1_epi16
(
127
));
xi0
=
_mm_max_epi16
(
xi0
,
_mm_setzero_si128
());
xi0
=
_mm_min_epi16
(
xi0
,
_mm_set1_epi16
(
255
));
xi1
=
_mm_unpackhi_epi16
(
xi0
,
_mm_setzero_si128
());
xi0
=
_mm_unpacklo_epi16
(
xi0
,
_mm_setzero_si128
());
__m128d
yd0
=
_mm_unpacklo_pd
(
_mm_load_sd
(
expTab
+
tab_idx
[
0
]),
_mm_load_sd
(
expTab
+
tab_idx
[
1
]));
__m128d
yd1
=
_mm_unpacklo_pd
(
_mm_load_sd
(
expTab
+
tab_idx
[
2
]),
_mm_load_sd
(
expTab
+
tab_idx
[
3
]));
__m128d
yd2
=
_mm_unpacklo_pd
(
_mm_load_sd
(
expTab
+
tab_idx
[
4
]),
_mm_load_sd
(
expTab
+
tab_idx
[
5
]));
__m128d
yd3
=
_mm_unpacklo_pd
(
_mm_load_sd
(
expTab
+
tab_idx
[
6
]),
_mm_load_sd
(
expTab
+
tab_idx
[
7
]));
__m128
yf0
=
_mm_movelh_ps
(
_mm_cvtpd_ps
(
yd0
),
_mm_cvtpd_ps
(
yd1
));
__m128
yf1
=
_mm_movelh_ps
(
_mm_cvtpd_ps
(
yd2
),
_mm_cvtpd_ps
(
yd3
));
yf0
=
_mm_mul_ps
(
yf0
,
_mm_castsi128_ps
(
_mm_slli_epi32
(
xi0
,
23
)));
yf1
=
_mm_mul_ps
(
yf1
,
_mm_castsi128_ps
(
_mm_slli_epi32
(
xi1
,
23
)));
__m128
zf0
=
_mm_add_ps
(
xf0
,
mA1
);
__m128
zf1
=
_mm_add_ps
(
xf1
,
mA1
);
zf0
=
_mm_add_ps
(
_mm_mul_ps
(
zf0
,
xf0
),
mA2
);
zf1
=
_mm_add_ps
(
_mm_mul_ps
(
zf1
,
xf1
),
mA2
);
zf0
=
_mm_add_ps
(
_mm_mul_ps
(
zf0
,
xf0
),
mA3
);
zf1
=
_mm_add_ps
(
_mm_mul_ps
(
zf1
,
xf1
),
mA3
);
zf0
=
_mm_add_ps
(
_mm_mul_ps
(
zf0
,
xf0
),
mA4
);
zf1
=
_mm_add_ps
(
_mm_mul_ps
(
zf1
,
xf1
),
mA4
);
zf0
=
_mm_mul_ps
(
zf0
,
yf0
);
zf1
=
_mm_mul_ps
(
zf1
,
yf1
);
if
(
y_aligned
)
{
_mm_store_ps
(
y
+
i
,
zf0
);
_mm_store_ps
(
y
+
i
+
4
,
zf1
);
}
else
{
_mm_storeu_ps
(
y
+
i
,
zf0
);
_mm_storeu_ps
(
y
+
i
+
4
,
zf1
);
}
}
}
else
#endif
for
(
;
i
<=
n
-
4
;
i
+=
4
)
{
double
x0
=
x
[
i
].
f
*
exp_prescale
;
...
...
@@ -749,183 +853,252 @@ static CvStatus CV_STDCALL Exp_32f( const float *_x, float *y, int n )
double
x2
=
x
[
i
+
2
].
f
*
exp_prescale
;
double
x3
=
x
[
i
+
3
].
f
*
exp_prescale
;
int
val0
,
val1
,
val2
,
val3
,
t
;
if
(
((
x
[
i
].
i
>>
23
)
&
255
)
>
127
+
10
)
x0
=
x
[
i
].
i
<
0
?
-
exp_max_val
:
exp_max_val
;
if
(
((
x
[
i
+
1
].
i
>>
23
)
&
255
)
>
127
+
10
)
x1
=
x
[
i
+
1
].
i
<
0
?
-
exp_max_val
:
exp_max_val
;
if
(
((
x
[
i
+
2
].
i
>>
23
)
&
255
)
>
127
+
10
)
x2
=
x
[
i
+
2
].
i
<
0
?
-
exp_max_val
:
exp_max_val
;
if
(
((
x
[
i
+
3
].
i
>>
23
)
&
255
)
>
127
+
10
)
x3
=
x
[
i
+
3
].
i
<
0
?
-
exp_max_val
:
exp_max_val
;
val0
=
cvRound
(
x0
);
val1
=
cvRound
(
x1
);
val2
=
cvRound
(
x2
);
val3
=
cvRound
(
x3
);
x0
=
(
x0
-
val0
)
*
exp_postscale
;
x1
=
(
x1
-
val1
)
*
exp_postscale
;
x2
=
(
x2
-
val2
)
*
exp_postscale
;
x3
=
(
x3
-
val3
)
*
exp_postscale
;
t
=
(
val0
>>
EXPTAB_SCALE
)
+
1
023
;
t
=
(
t
|
((
t
<
2047
)
-
1
))
&
(((
t
<
0
)
-
1
)
&
2047
)
;
buf
[
0
].
i
.
hi
=
t
<<
20
;
t
=
(
val1
>>
EXPTAB_SCALE
)
+
1
023
;
t
=
(
t
|
((
t
<
2047
)
-
1
))
&
(((
t
<
0
)
-
1
)
&
2047
)
;
buf
[
1
].
i
.
hi
=
t
<<
20
;
t
=
(
val2
>>
EXPTAB_SCALE
)
+
1
023
;
t
=
(
t
|
((
t
<
2047
)
-
1
))
&
(((
t
<
0
)
-
1
)
&
2047
)
;
buf
[
2
].
i
.
hi
=
t
<<
20
;
t
=
(
val3
>>
EXPTAB_SCALE
)
+
1
023
;
t
=
(
t
|
((
t
<
2047
)
-
1
))
&
(((
t
<
0
)
-
1
)
&
2047
)
;
buf
[
3
].
i
.
hi
=
t
<<
20
;
x0
=
buf
[
0
].
d
*
expTab
[
val0
&
EXPTAB_MASK
]
*
EXPPOLY
(
x0
);
x1
=
buf
[
1
].
d
*
expTab
[
val1
&
EXPTAB_MASK
]
*
EXPPOLY
(
x1
);
y
[
i
]
=
(
float
)
x0
;
y
[
i
+
1
]
=
(
float
)
x1
;
x2
=
buf
[
2
].
d
*
expTab
[
val2
&
EXPTAB_MASK
]
*
EXPPOLY
(
x2
);
x3
=
buf
[
3
].
d
*
expTab
[
val3
&
EXPTAB_MASK
]
*
EXPPOLY
(
x3
);
y
[
i
+
2
]
=
(
float
)
x2
;
y
[
i
+
3
]
=
(
float
)
x3
;
t
=
(
val0
>>
EXPTAB_SCALE
)
+
1
27
;
t
=
!
(
t
&
~
255
)
?
t
:
t
<
0
?
0
:
255
;
buf
[
0
].
i
=
t
<<
23
;
t
=
(
val1
>>
EXPTAB_SCALE
)
+
1
27
;
t
=
!
(
t
&
~
255
)
?
t
:
t
<
0
?
0
:
255
;
buf
[
1
].
i
=
t
<<
23
;
t
=
(
val2
>>
EXPTAB_SCALE
)
+
1
27
;
t
=
!
(
t
&
~
255
)
?
t
:
t
<
0
?
0
:
255
;
buf
[
2
].
i
=
t
<<
23
;
t
=
(
val3
>>
EXPTAB_SCALE
)
+
1
27
;
t
=
!
(
t
&
~
255
)
?
t
:
t
<
0
?
0
:
255
;
buf
[
3
].
i
=
t
<<
23
;
x0
=
buf
[
0
].
f
*
expTab
[
val0
&
EXPTAB_MASK
]
*
EXPPOLY
(
x0
);
x1
=
buf
[
1
].
f
*
expTab
[
val1
&
EXPTAB_MASK
]
*
EXPPOLY
(
x1
);
y
[
i
]
=
x0
;
y
[
i
+
1
]
=
x1
;
x2
=
buf
[
2
].
f
*
expTab
[
val2
&
EXPTAB_MASK
]
*
EXPPOLY
(
x2
);
x3
=
buf
[
3
].
f
*
expTab
[
val3
&
EXPTAB_MASK
]
*
EXPPOLY
(
x3
);
y
[
i
+
2
]
=
x2
;
y
[
i
+
3
]
=
x3
;
}
for
(
;
i
<
n
;
i
++
)
{
double
x0
=
x
[
i
].
f
*
exp_prescale
;
int
val0
,
t
;
if
(
((
x
[
i
].
i
>>
23
)
&
255
)
>
127
+
10
)
x0
=
x
[
i
].
i
<
0
?
-
exp_max_val
:
exp_max_val
;
val0
=
cvRound
(
x0
);
t
=
(
val0
>>
EXPTAB_SCALE
)
+
1
023
;
t
=
(
t
|
((
t
<
2047
)
-
1
))
&
(((
t
<
0
)
-
1
)
&
2047
)
;
buf
[
0
].
i
.
hi
=
t
<<
20
;
t
=
(
val0
>>
EXPTAB_SCALE
)
+
1
27
;
t
=
!
(
t
&
~
255
)
?
t
:
t
<
0
?
0
:
255
;
buf
[
0
].
i
=
t
<<
23
;
x0
=
(
x0
-
val0
)
*
exp_postscale
;
y
[
i
]
=
(
float
)(
buf
[
0
].
d
*
expTab
[
val0
&
EXPTAB_MASK
]
*
EXPPOLY
(
x0
)
);
y
[
i
]
=
buf
[
0
].
f
*
expTab
[
val0
&
EXPTAB_MASK
]
*
EXPPOLY
(
x0
);
}
return
CV_OK
;
}
static
CvStatus
CV_STDCALL
Exp_64f
(
const
double
*
_x
,
double
*
y
,
int
n
)
{
static
const
double
A5
=
.99999999999999999998285227504999
/
EXPPOLY_32F_A0
,
A4
=
.69314718055994546743029643825322
/
EXPPOLY_32F_A0
,
A3
=
.24022650695886477918181338054308
/
EXPPOLY_32F_A0
,
A2
=
.55504108793649567998466049042729e-1
/
EXPPOLY_32F_A0
,
A1
=
.96180973140732918010002372686186e-2
/
EXPPOLY_32F_A0
,
A0
=
.13369713757180123244806654839424e-2
/
EXPPOLY_32F_A0
;
#undef EXPPOLY
#define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5)
A5
=
.99999999999999999998285227504999
/
EXPPOLY_32F_A0
,
A4
=
.69314718055994546743029643825322
/
EXPPOLY_32F_A0
,
A3
=
.24022650695886477918181338054308
/
EXPPOLY_32F_A0
,
A2
=
.55504108793649567998466049042729e-1
/
EXPPOLY_32F_A0
,
A1
=
.96180973140732918010002372686186e-2
/
EXPPOLY_32F_A0
,
A0
=
.13369713757180123244806654839424e-2
/
EXPPOLY_32F_A0
;
#undef EXPPOLY
#define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5)
int
i
=
0
;
DBLINT
buf
[
4
];
Cv64suf
buf
[
4
];
const
Cv64suf
*
x
=
(
const
Cv64suf
*
)
_x
;
buf
[
0
].
i
.
lo
=
buf
[
1
].
i
.
lo
=
buf
[
2
].
i
.
lo
=
buf
[
3
].
i
.
lo
=
0
;
#if CV_SSE2
if
(
checkHardwareSupport
(
CV_CPU_SSE
)
)
{
static
const
__m128d
prescale2
=
_mm_set1_pd
(
exp_prescale
);
static
const
__m128d
postscale2
=
_mm_set1_pd
(
exp_postscale
);
static
const
__m128d
maxval2
=
_mm_set1_pd
(
exp_max_val
);
static
const
__m128d
minval2
=
_mm_set1_pd
(
-
exp_max_val
);
static
const
__m128d
mA0
=
_mm_set1_pd
(
A0
);
static
const
__m128d
mA1
=
_mm_set1_pd
(
A1
);
static
const
__m128d
mA2
=
_mm_set1_pd
(
A2
);
static
const
__m128d
mA3
=
_mm_set1_pd
(
A3
);
static
const
__m128d
mA4
=
_mm_set1_pd
(
A4
);
static
const
__m128d
mA5
=
_mm_set1_pd
(
A5
);
int
CV_DECL_ALIGNED
(
16
)
tab_idx
[
4
];
for
(
;
i
<=
n
-
4
;
i
+=
4
)
{
__m128d
xf0
=
_mm_loadu_pd
(
&
x
[
i
].
f
),
xf1
=
_mm_loadu_pd
(
&
x
[
i
+
2
].
f
);
__m128i
xi0
,
xi1
;
xf0
=
_mm_min_pd
(
_mm_max_pd
(
xf0
,
minval2
),
maxval2
);
xf1
=
_mm_min_pd
(
_mm_max_pd
(
xf1
,
minval2
),
maxval2
);
xf0
=
_mm_mul_pd
(
xf0
,
prescale2
);
xf1
=
_mm_mul_pd
(
xf1
,
prescale2
);
xi0
=
_mm_cvtpd_epi32
(
xf0
);
xi1
=
_mm_cvtpd_epi32
(
xf1
);
xf0
=
_mm_mul_pd
(
_mm_sub_pd
(
xf0
,
_mm_cvtepi32_pd
(
xi0
)),
postscale2
);
xf1
=
_mm_mul_pd
(
_mm_sub_pd
(
xf1
,
_mm_cvtepi32_pd
(
xi1
)),
postscale2
);
xi0
=
_mm_unpacklo_epi64
(
xi0
,
xi1
);
_mm_store_si128
((
__m128i
*
)
tab_idx
,
_mm_and_si128
(
xi0
,
_mm_set1_epi32
(
EXPTAB_MASK
)));
xi0
=
_mm_add_epi32
(
_mm_srai_epi32
(
xi0
,
EXPTAB_SCALE
),
_mm_set1_epi32
(
1023
));
xi0
=
_mm_packs_epi32
(
xi0
,
xi0
);
xi0
=
_mm_max_epi16
(
xi0
,
_mm_setzero_si128
());
xi0
=
_mm_min_epi16
(
xi0
,
_mm_set1_epi16
(
2047
));
xi0
=
_mm_unpacklo_epi16
(
xi0
,
_mm_setzero_si128
());
xi1
=
_mm_unpackhi_epi32
(
xi0
,
_mm_setzero_si128
());
xi0
=
_mm_unpacklo_epi32
(
xi0
,
_mm_setzero_si128
());
__m128d
yf0
=
_mm_unpacklo_pd
(
_mm_load_sd
(
expTab
+
tab_idx
[
0
]),
_mm_load_sd
(
expTab
+
tab_idx
[
1
]));
__m128d
yf1
=
_mm_unpacklo_pd
(
_mm_load_sd
(
expTab
+
tab_idx
[
2
]),
_mm_load_sd
(
expTab
+
tab_idx
[
3
]));
yf0
=
_mm_mul_pd
(
yf0
,
_mm_castsi128_pd
(
_mm_slli_epi64
(
xi0
,
52
)));
yf1
=
_mm_mul_pd
(
yf1
,
_mm_castsi128_pd
(
_mm_slli_epi64
(
xi1
,
52
)));
__m128d
zf0
=
_mm_add_pd
(
_mm_mul_pd
(
mA0
,
xf0
),
mA1
);
__m128d
zf1
=
_mm_add_pd
(
_mm_mul_pd
(
mA0
,
xf1
),
mA1
);
zf0
=
_mm_add_pd
(
_mm_mul_pd
(
zf0
,
xf0
),
mA2
);
zf1
=
_mm_add_pd
(
_mm_mul_pd
(
zf1
,
xf1
),
mA2
);
zf0
=
_mm_add_pd
(
_mm_mul_pd
(
zf0
,
xf0
),
mA3
);
zf1
=
_mm_add_pd
(
_mm_mul_pd
(
zf1
,
xf1
),
mA3
);
zf0
=
_mm_add_pd
(
_mm_mul_pd
(
zf0
,
xf0
),
mA4
);
zf1
=
_mm_add_pd
(
_mm_mul_pd
(
zf1
,
xf1
),
mA4
);
zf0
=
_mm_add_pd
(
_mm_mul_pd
(
zf0
,
xf0
),
mA5
);
zf1
=
_mm_add_pd
(
_mm_mul_pd
(
zf1
,
xf1
),
mA5
);
zf0
=
_mm_mul_pd
(
zf0
,
yf0
);
zf1
=
_mm_mul_pd
(
zf1
,
yf1
);
_mm_storeu_pd
(
y
+
i
,
zf0
);
_mm_storeu_pd
(
y
+
i
+
2
,
zf1
);
}
}
else
#endif
for
(
;
i
<=
n
-
4
;
i
+=
4
)
{
double
x0
=
x
[
i
].
f
*
exp_prescale
;
double
x1
=
x
[
i
+
1
].
f
*
exp_prescale
;
double
x2
=
x
[
i
+
2
].
f
*
exp_prescale
;
double
x3
=
x
[
i
+
3
].
f
*
exp_prescale
;
double
y0
,
y1
,
y2
,
y3
;
int
val0
,
val1
,
val2
,
val3
,
t
;
t
=
(
int
)(
x
[
i
].
i
>>
52
);
if
(
(
t
&
2047
)
>
1023
+
10
)
x0
=
t
<
0
?
-
exp_max_val
:
exp_max_val
;
t
=
(
int
)(
x
[
i
+
1
].
i
>>
52
);
if
(
(
t
&
2047
)
>
1023
+
10
)
x1
=
t
<
0
?
-
exp_max_val
:
exp_max_val
;
t
=
(
int
)(
x
[
i
+
2
].
i
>>
52
);
if
(
(
t
&
2047
)
>
1023
+
10
)
x2
=
t
<
0
?
-
exp_max_val
:
exp_max_val
;
t
=
(
int
)(
x
[
i
+
3
].
i
>>
52
);
if
(
(
t
&
2047
)
>
1023
+
10
)
x3
=
t
<
0
?
-
exp_max_val
:
exp_max_val
;
val0
=
cvRound
(
x0
);
val1
=
cvRound
(
x1
);
val2
=
cvRound
(
x2
);
val3
=
cvRound
(
x3
);
x0
=
(
x0
-
val0
)
*
exp_postscale
;
x1
=
(
x1
-
val1
)
*
exp_postscale
;
x2
=
(
x2
-
val2
)
*
exp_postscale
;
x3
=
(
x3
-
val3
)
*
exp_postscale
;
t
=
(
val0
>>
EXPTAB_SCALE
)
+
1023
;
t
=
(
t
|
((
t
<
2047
)
-
1
))
&
(((
t
<
0
)
-
1
)
&
2047
)
;
buf
[
0
].
i
.
hi
=
t
<<
20
;
t
=
!
(
t
&
~
2047
)
?
t
:
t
<
0
?
0
:
2047
;
buf
[
0
].
i
=
(
int64
)
t
<<
52
;
t
=
(
val1
>>
EXPTAB_SCALE
)
+
1023
;
t
=
(
t
|
((
t
<
2047
)
-
1
))
&
(((
t
<
0
)
-
1
)
&
2047
)
;
buf
[
1
].
i
.
hi
=
t
<<
20
;
t
=
!
(
t
&
~
2047
)
?
t
:
t
<
0
?
0
:
2047
;
buf
[
1
].
i
=
(
int64
)
t
<<
52
;
t
=
(
val2
>>
EXPTAB_SCALE
)
+
1023
;
t
=
(
t
|
((
t
<
2047
)
-
1
))
&
(((
t
<
0
)
-
1
)
&
2047
)
;
buf
[
2
].
i
.
hi
=
t
<<
20
;
t
=
!
(
t
&
~
2047
)
?
t
:
t
<
0
?
0
:
2047
;
buf
[
2
].
i
=
(
int64
)
t
<<
52
;
t
=
(
val3
>>
EXPTAB_SCALE
)
+
1023
;
t
=
(
t
|
((
t
<
2047
)
-
1
))
&
(((
t
<
0
)
-
1
)
&
2047
)
;
buf
[
3
].
i
.
hi
=
t
<<
20
;
y0
=
buf
[
0
].
d
*
expTab
[
val0
&
EXPTAB_MASK
]
*
EXPPOLY
(
x0
);
y1
=
buf
[
1
].
d
*
expTab
[
val1
&
EXPTAB_MASK
]
*
EXPPOLY
(
x1
);
t
=
!
(
t
&
~
2047
)
?
t
:
t
<
0
?
0
:
2047
;
buf
[
3
].
i
=
(
int64
)
t
<<
52
;
y0
=
buf
[
0
].
f
*
expTab
[
val0
&
EXPTAB_MASK
]
*
EXPPOLY
(
x0
);
y1
=
buf
[
1
].
f
*
expTab
[
val1
&
EXPTAB_MASK
]
*
EXPPOLY
(
x1
);
y
[
i
]
=
y0
;
y
[
i
+
1
]
=
y1
;
y2
=
buf
[
2
].
d
*
expTab
[
val2
&
EXPTAB_MASK
]
*
EXPPOLY
(
x2
);
y3
=
buf
[
3
].
d
*
expTab
[
val3
&
EXPTAB_MASK
]
*
EXPPOLY
(
x3
);
y2
=
buf
[
2
].
f
*
expTab
[
val2
&
EXPTAB_MASK
]
*
EXPPOLY
(
x2
);
y3
=
buf
[
3
].
f
*
expTab
[
val3
&
EXPTAB_MASK
]
*
EXPPOLY
(
x3
);
y
[
i
+
2
]
=
y2
;
y
[
i
+
3
]
=
y3
;
}
for
(
;
i
<
n
;
i
++
)
{
double
x0
=
x
[
i
].
f
*
exp_prescale
;
int
val0
,
t
;
t
=
(
int
)(
x
[
i
].
i
>>
52
);
if
(
(
t
&
2047
)
>
1023
+
10
)
x0
=
t
<
0
?
-
exp_max_val
:
exp_max_val
;
val0
=
cvRound
(
x0
);
t
=
(
val0
>>
EXPTAB_SCALE
)
+
1023
;
t
=
(
t
|
((
t
<
2047
)
-
1
))
&
(((
t
<
0
)
-
1
)
&
2047
)
;
buf
[
0
].
i
.
hi
=
t
<<
20
;
t
=
!
(
t
&
~
2047
)
?
t
:
t
<
0
?
0
:
2047
;
buf
[
0
].
i
=
(
int64
)
t
<<
52
;
x0
=
(
x0
-
val0
)
*
exp_postscale
;
y
[
i
]
=
buf
[
0
].
d
*
expTab
[
val0
&
EXPTAB_MASK
]
*
EXPPOLY
(
x0
);
y
[
i
]
=
buf
[
0
].
f
*
expTab
[
val0
&
EXPTAB_MASK
]
*
EXPPOLY
(
x0
);
}
return
CV_OK
;
}
...
...
@@ -968,7 +1141,7 @@ void exp( const Mat& src, Mat& dst )
#define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1)
#define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1)
static
const
double
icvLogTab
[]
=
{
static
const
double
CV_DECL_ALIGNED
(
16
)
icvLogTab
[]
=
{
0.0000000000000000000000000000000000000000
,
1.000000000000000000000000000000000000000
,
.00389864041565732288852075271279318258166
,
.9961089494163424124513618677042801556420
,
.00778214044205494809292034119607706088573
,
.9922480620155038759689922480620155038760
,
...
...
@@ -1234,26 +1407,75 @@ static const double ln_2 = 0.69314718055994530941723212145818;
static
CvStatus
CV_STDCALL
Log_32f
(
const
float
*
_x
,
float
*
y
,
int
n
)
{
static
const
double
shift
[]
=
{
0
,
-
1.
/
512
};
static
const
double
A0
=
0.3333333333333333333333333
,
A1
=
-
0.5
,
A2
=
1
;
static
const
float
shift
[]
=
{
0
,
-
1.
f
/
512
};
static
const
float
A0
=
0.3333333333333333333333333
f
,
A1
=
-
0.5
f
,
A2
=
1
.
f
;
#undef LOGPOLY
#define LOGPOLY(x
,k) ((x)+=shift[k],
((A0*(x) + A1)*(x) + A2)*(x))
#define LOGPOLY(x
) (
((A0*(x) + A1)*(x) + A2)*(x))
int
i
=
0
;
union
{
int
i
;
float
f
;
}
buf
[
4
];
Cv32suf
buf
[
4
];
const
int
*
x
=
(
const
int
*
)
_x
;
for
(
i
=
0
;
i
<=
n
-
4
;
i
+=
4
)
#if CV_SSE2
if
(
checkHardwareSupport
(
CV_CPU_SSE
)
)
{
static
const
__m128d
ln2_2
=
_mm_set1_pd
(
ln_2
);
static
const
__m128
_1_4
=
_mm_set1_ps
(
1.
f
);
static
const
__m128
shift4
=
_mm_set1_ps
(
-
1.
f
/
512
);
static
const
__m128
mA0
=
_mm_set1_ps
(
A0
);
static
const
__m128
mA1
=
_mm_set1_ps
(
A1
);
static
const
__m128
mA2
=
_mm_set1_ps
(
A2
);
int
CV_DECL_ALIGNED
(
16
)
idx
[
4
];
for
(
;
i
<=
n
-
4
;
i
+=
4
)
{
__m128i
h0
=
_mm_loadu_si128
((
const
__m128i
*
)(
x
+
i
));
__m128i
yi0
=
_mm_sub_epi32
(
_mm_and_si128
(
_mm_srli_epi32
(
h0
,
23
),
_mm_set1_epi32
(
255
)),
_mm_set1_epi32
(
127
));
__m128d
yd0
=
_mm_mul_pd
(
_mm_cvtepi32_pd
(
yi0
),
ln2_2
);
__m128d
yd1
=
_mm_mul_pd
(
_mm_cvtepi32_pd
(
_mm_unpackhi_epi64
(
yi0
,
yi0
)),
ln2_2
);
__m128i
xi0
=
_mm_or_si128
(
_mm_and_si128
(
h0
,
_mm_set1_epi32
(
LOGTAB_MASK2_32F
)),
_mm_set1_epi32
(
127
<<
23
));
h0
=
_mm_and_si128
(
_mm_srli_epi32
(
h0
,
23
-
LOGTAB_SCALE
-
1
),
_mm_set1_epi32
(
LOGTAB_MASK
*
2
));
_mm_store_si128
((
__m128i
*
)
idx
,
h0
);
h0
=
_mm_cmpeq_epi32
(
h0
,
_mm_set1_epi32
(
510
));
__m128d
t0
,
t1
,
t2
,
t3
,
t4
;
t0
=
_mm_load_pd
(
icvLogTab
+
idx
[
0
]);
t2
=
_mm_load_pd
(
icvLogTab
+
idx
[
1
]);
t1
=
_mm_unpackhi_pd
(
t0
,
t2
);
t0
=
_mm_unpacklo_pd
(
t0
,
t2
);
t2
=
_mm_load_pd
(
icvLogTab
+
idx
[
2
]);
t4
=
_mm_load_pd
(
icvLogTab
+
idx
[
3
]);
t3
=
_mm_unpackhi_pd
(
t2
,
t4
);
t2
=
_mm_unpacklo_pd
(
t2
,
t4
);
yd0
=
_mm_add_pd
(
yd0
,
t0
);
yd1
=
_mm_add_pd
(
yd1
,
t2
);
__m128
yf0
=
_mm_movelh_ps
(
_mm_cvtpd_ps
(
yd0
),
_mm_cvtpd_ps
(
yd1
));
__m128
xf0
=
_mm_sub_ps
(
_mm_castsi128_ps
(
xi0
),
_1_4
);
xf0
=
_mm_mul_ps
(
xf0
,
_mm_movelh_ps
(
_mm_cvtpd_ps
(
t1
),
_mm_cvtpd_ps
(
t3
)));
xf0
=
_mm_add_ps
(
xf0
,
_mm_and_ps
(
_mm_castsi128_ps
(
h0
),
shift4
));
__m128
zf0
=
_mm_mul_ps
(
xf0
,
mA0
);
zf0
=
_mm_mul_ps
(
_mm_add_ps
(
zf0
,
mA1
),
xf0
);
zf0
=
_mm_mul_ps
(
_mm_add_ps
(
zf0
,
mA2
),
xf0
);
yf0
=
_mm_add_ps
(
yf0
,
zf0
);
_mm_storeu_ps
(
y
+
i
,
yf0
);
}
}
else
#endif
for
(
;
i
<=
n
-
4
;
i
+=
4
)
{
double
x0
,
x1
,
x2
,
x3
;
double
y0
,
y1
,
y2
,
y3
;
...
...
@@ -1294,14 +1516,18 @@ static CvStatus CV_STDCALL Log_32f( const float *_x, float *y, int n )
x2
=
LOGTAB_TRANSLATE
(
buf
[
2
].
f
,
h2
);
x3
=
LOGTAB_TRANSLATE
(
buf
[
3
].
f
,
h3
);
y0
+=
LOGPOLY
(
x0
,
h0
==
510
);
y1
+=
LOGPOLY
(
x1
,
h1
==
510
);
x0
+=
shift
[
h0
==
510
];
x1
+=
shift
[
h1
==
510
];
y0
+=
LOGPOLY
(
x0
);
y1
+=
LOGPOLY
(
x1
);
y
[
i
]
=
(
float
)
y0
;
y
[
i
+
1
]
=
(
float
)
y1
;
y2
+=
LOGPOLY
(
x2
,
h2
==
510
);
y3
+=
LOGPOLY
(
x3
,
h3
==
510
);
x2
+=
shift
[
h2
==
510
];
x3
+=
shift
[
h3
==
510
];
y2
+=
LOGPOLY
(
x2
);
y3
+=
LOGPOLY
(
x3
);
y
[
i
+
2
]
=
(
float
)
y2
;
y
[
i
+
3
]
=
(
float
)
y3
;
...
...
@@ -1310,7 +1536,8 @@ static CvStatus CV_STDCALL Log_32f( const float *_x, float *y, int n )
for
(
;
i
<
n
;
i
++
)
{
int
h0
=
x
[
i
];
double
x0
,
y0
;
double
y0
;
float
x0
;
y0
=
(((
h0
>>
23
)
&
0xff
)
-
127
)
*
ln_2
;
...
...
@@ -1319,7 +1546,8 @@ static CvStatus CV_STDCALL Log_32f( const float *_x, float *y, int n )
y0
+=
icvLogTab
[
h0
];
x0
=
LOGTAB_TRANSLATE
(
buf
[
0
].
f
,
h0
);
y0
+=
LOGPOLY
(
x0
,
h0
==
510
);
x0
+=
shift
[
h0
==
510
];
y0
+=
LOGPOLY
(
x0
);
y
[
i
]
=
(
float
)
y0
;
}
...
...
@@ -1350,6 +1578,91 @@ static CvStatus CV_STDCALL Log_64f( const double *x, double *y, int n )
DBLINT
buf
[
4
];
DBLINT
*
X
=
(
DBLINT
*
)
x
;
#if CV_SSE2
if
(
checkHardwareSupport
(
CV_CPU_SSE
)
)
{
static
const
__m128d
ln2_2
=
_mm_set1_pd
(
ln_2
);
static
const
__m128d
_1_2
=
_mm_set1_pd
(
1.
);
static
const
__m128d
shift2
=
_mm_set1_pd
(
-
1.
/
512
);
static
const
__m128i
log_and_mask2
=
_mm_set_epi32
(
LOGTAB_MASK2
,
0xffffffff
,
LOGTAB_MASK2
,
0xffffffff
);
static
const
__m128i
log_or_mask2
=
_mm_set_epi32
(
1023
<<
20
,
0
,
1023
<<
20
,
0
);
static
const
__m128d
mA0
=
_mm_set1_pd
(
A0
);
static
const
__m128d
mA1
=
_mm_set1_pd
(
A1
);
static
const
__m128d
mA2
=
_mm_set1_pd
(
A2
);
static
const
__m128d
mA3
=
_mm_set1_pd
(
A3
);
static
const
__m128d
mA4
=
_mm_set1_pd
(
A4
);
static
const
__m128d
mA5
=
_mm_set1_pd
(
A5
);
static
const
__m128d
mA6
=
_mm_set1_pd
(
A6
);
static
const
__m128d
mA7
=
_mm_set1_pd
(
A7
);
int
CV_DECL_ALIGNED
(
16
)
idx
[
4
];
for
(
;
i
<=
n
-
4
;
i
+=
4
)
{
__m128i
h0
=
_mm_loadu_si128
((
const
__m128i
*
)(
x
+
i
));
__m128i
h1
=
_mm_loadu_si128
((
const
__m128i
*
)(
x
+
i
+
2
));
__m128d
xd0
=
_mm_castsi128_pd
(
_mm_or_si128
(
_mm_and_si128
(
h0
,
log_and_mask2
),
log_or_mask2
));
__m128d
xd1
=
_mm_castsi128_pd
(
_mm_or_si128
(
_mm_and_si128
(
h1
,
log_and_mask2
),
log_or_mask2
));
h0
=
_mm_unpackhi_epi32
(
_mm_unpacklo_epi32
(
h0
,
h1
),
_mm_unpackhi_epi32
(
h0
,
h1
));
__m128i
yi0
=
_mm_sub_epi32
(
_mm_and_si128
(
_mm_srli_epi32
(
h0
,
20
),
_mm_set1_epi32
(
2047
)),
_mm_set1_epi32
(
1023
));
__m128d
yd0
=
_mm_mul_pd
(
_mm_cvtepi32_pd
(
yi0
),
ln2_2
);
__m128d
yd1
=
_mm_mul_pd
(
_mm_cvtepi32_pd
(
_mm_unpackhi_epi64
(
yi0
,
yi0
)),
ln2_2
);
h0
=
_mm_and_si128
(
_mm_srli_epi32
(
h0
,
20
-
LOGTAB_SCALE
-
1
),
_mm_set1_epi32
(
LOGTAB_MASK
*
2
));
_mm_store_si128
((
__m128i
*
)
idx
,
h0
);
h0
=
_mm_cmpeq_epi32
(
h0
,
_mm_set1_epi32
(
510
));
__m128d
t0
,
t1
,
t2
,
t3
,
t4
;
t0
=
_mm_load_pd
(
icvLogTab
+
idx
[
0
]);
t2
=
_mm_load_pd
(
icvLogTab
+
idx
[
1
]);
t1
=
_mm_unpackhi_pd
(
t0
,
t2
);
t0
=
_mm_unpacklo_pd
(
t0
,
t2
);
t2
=
_mm_load_pd
(
icvLogTab
+
idx
[
2
]);
t4
=
_mm_load_pd
(
icvLogTab
+
idx
[
3
]);
t3
=
_mm_unpackhi_pd
(
t2
,
t4
);
t2
=
_mm_unpacklo_pd
(
t2
,
t4
);
yd0
=
_mm_add_pd
(
yd0
,
t0
);
yd1
=
_mm_add_pd
(
yd1
,
t2
);
xd0
=
_mm_mul_pd
(
_mm_sub_pd
(
xd0
,
_1_2
),
t1
);
xd1
=
_mm_mul_pd
(
_mm_sub_pd
(
xd1
,
_1_2
),
t3
);
xd0
=
_mm_add_pd
(
xd0
,
_mm_and_pd
(
_mm_castsi128_pd
(
_mm_unpacklo_epi32
(
h0
,
h0
)),
shift2
));
xd1
=
_mm_add_pd
(
xd1
,
_mm_and_pd
(
_mm_castsi128_pd
(
_mm_unpackhi_epi32
(
h0
,
h0
)),
shift2
));
__m128d
zd0
=
_mm_mul_pd
(
xd0
,
mA0
);
__m128d
zd1
=
_mm_mul_pd
(
xd1
,
mA0
);
zd0
=
_mm_mul_pd
(
_mm_add_pd
(
zd0
,
mA1
),
xd0
);
zd1
=
_mm_mul_pd
(
_mm_add_pd
(
zd1
,
mA1
),
xd1
);
zd0
=
_mm_mul_pd
(
_mm_add_pd
(
zd0
,
mA2
),
xd0
);
zd1
=
_mm_mul_pd
(
_mm_add_pd
(
zd1
,
mA2
),
xd1
);
zd0
=
_mm_mul_pd
(
_mm_add_pd
(
zd0
,
mA3
),
xd0
);
zd1
=
_mm_mul_pd
(
_mm_add_pd
(
zd1
,
mA3
),
xd1
);
zd0
=
_mm_mul_pd
(
_mm_add_pd
(
zd0
,
mA4
),
xd0
);
zd1
=
_mm_mul_pd
(
_mm_add_pd
(
zd1
,
mA4
),
xd1
);
zd0
=
_mm_mul_pd
(
_mm_add_pd
(
zd0
,
mA5
),
xd0
);
zd1
=
_mm_mul_pd
(
_mm_add_pd
(
zd1
,
mA5
),
xd1
);
zd0
=
_mm_mul_pd
(
_mm_add_pd
(
zd0
,
mA6
),
xd0
);
zd1
=
_mm_mul_pd
(
_mm_add_pd
(
zd1
,
mA6
),
xd1
);
zd0
=
_mm_mul_pd
(
_mm_add_pd
(
zd0
,
mA7
),
xd0
);
zd1
=
_mm_mul_pd
(
_mm_add_pd
(
zd1
,
mA7
),
xd1
);
yd0
=
_mm_add_pd
(
yd0
,
zd0
);
yd1
=
_mm_add_pd
(
yd1
,
zd1
);
_mm_storeu_pd
(
y
+
i
,
yd0
);
_mm_storeu_pd
(
y
+
i
+
2
,
yd1
);
}
}
else
#endif
for
(
;
i
<=
n
-
4
;
i
+=
4
)
{
double
xq
;
...
...
@@ -1414,7 +1727,7 @@ static CvStatus CV_STDCALL Log_64f( const double *x, double *y, int n )
y
[
i
+
2
]
=
y2
;
y
[
i
+
3
]
=
y3
;
}
for
(
;
i
<
n
;
i
++
)
{
int
h0
=
X
[
i
].
i
.
hi
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment