Commit 91d80d03, authored Jul 15, 2016 by Maksim Shabunin
Merge pull request #6798 from tomoaki0705:featureSSEvideoAcc
Parents: 63d82904, 3c2f7ecc
Showing 2 changed files with 1367 additions and 3 deletions:

accum.cpp (modules/imgproc/src/accum.cpp): +1364, -0
test_accum.cpp (modules/video/test/test_accum.cpp): +3, -3
modules/imgproc/src/accum.cpp
@@ -83,6 +83,1370 @@ struct AccW_SIMD
}
};
#if CV_AVX
template <>
struct Acc_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256 v_src = _mm256_loadu_ps(src + x);
                __m256 v_dst = _mm256_loadu_ps(dst + x);
                v_dst = _mm256_add_ps(v_src, v_dst);
                _mm256_storeu_ps(dst + x, v_dst);
            }
        }
        return x;
    }
};
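All of these functors return the index of the first element they did not process, so the scalar code around them can finish the tail and any case a specialization skips (most masked paths here simply return 0). The surrounding `acc_` template is outside this hunk; a minimal editorial sketch of the calling pattern, with a simplified scalar tail standing in for the real implementation (`uchar` is OpenCV's unsigned char typedef):

// Editorial sketch, not part of this commit: how a caller typically
// consumes Acc_SIMD. The real acc_ template in accum.cpp follows this
// shape; the scalar bodies here are simplified.
template<typename T, typename AT>
void acc_(const T* src, AT* dst, const uchar* mask, int len, int cn)
{
    int i = Acc_SIMD<T, AT>()(src, dst, mask, len, cn);  // vectorized prefix
    if (!mask)
    {
        len *= cn;
        for (; i < len; i++)                             // scalar tail
            dst[i] += src[i];
    }
    // Masked input falls through here largely untouched (i == 0), so the
    // scalar masked loop (omitted) does the remaining work.
}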
template <>
struct Acc_SIMD<float, double>
{
    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256 v_src = _mm256_loadu_ps(src + x);
                __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0));
                __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1));
                __m256d v_dst0 = _mm256_loadu_pd(dst + x);
                __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
                v_dst0 = _mm256_add_pd(v_src0, v_dst0);
                v_dst1 = _mm256_add_pd(v_src1, v_dst1);
                _mm256_storeu_pd(dst + x, v_dst0);
                _mm256_storeu_pd(dst + x + 4, v_dst1);
            }
        }
        return x;
    }
};
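AVX has no single instruction that widens eight packed floats to eight doubles, so the conversion above goes through the two 128-bit halves: `_mm256_extractf128_ps(v, 0)` / `(v, 1)` pick the low and high four floats and `_mm256_cvtps_pd` widens each group of four. The idiom in isolation (editorial sketch; requires <immintrin.h>):

// Editorial sketch of the widening idiom used above.
static inline void widen8f_to_8d(const float* p, __m256d& lo, __m256d& hi)
{
    __m256 v = _mm256_loadu_ps(p);                      // f0..f7
    lo = _mm256_cvtps_pd(_mm256_extractf128_ps(v, 0));  // (double)f0..f3
    hi = _mm256_cvtps_pd(_mm256_extractf128_ps(v, 1));  // (double)f4..f7
}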
template <>
struct Acc_SIMD<double, double>
{
    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m256d v_src = _mm256_loadu_pd(src + x);
                __m256d v_dst = _mm256_loadu_pd(dst + x);
                v_dst = _mm256_add_pd(v_dst, v_src);
                _mm256_storeu_pd(dst + x, v_dst);
            }
        }
        return x;
    }
};
template <>
struct AccSqr_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256 v_src = _mm256_loadu_ps(src + x);
                __m256 v_dst = _mm256_loadu_ps(dst + x);
                v_src = _mm256_mul_ps(v_src, v_src);
                v_dst = _mm256_add_ps(v_src, v_dst);
                _mm256_storeu_ps(dst + x, v_dst);
            }
        }
        return x;
    }
};
template <>
struct AccSqr_SIMD<float, double>
{
    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256 v_src = _mm256_loadu_ps(src + x);
                __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0));
                __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1));
                __m256d v_dst0 = _mm256_loadu_pd(dst + x);
                __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
                v_src0 = _mm256_mul_pd(v_src0, v_src0);
                v_src1 = _mm256_mul_pd(v_src1, v_src1);
                v_dst0 = _mm256_add_pd(v_src0, v_dst0);
                v_dst1 = _mm256_add_pd(v_src1, v_dst1);
                _mm256_storeu_pd(dst + x, v_dst0);
                _mm256_storeu_pd(dst + x + 4, v_dst1);
            }
        }
        return x;
    }
};
template <>
struct AccSqr_SIMD<double, double>
{
    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m256d v_src = _mm256_loadu_pd(src + x);
                __m256d v_dst = _mm256_loadu_pd(dst + x);
                v_src = _mm256_mul_pd(v_src, v_src);
                v_dst = _mm256_add_pd(v_dst, v_src);
                _mm256_storeu_pd(dst + x, v_dst);
            }
        }
        return x;
    }
};
template <>
struct AccProd_SIMD<float, float>
{
    int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256 v_src0 = _mm256_loadu_ps(src1 + x);
                __m256 v_src1 = _mm256_loadu_ps(src2 + x);
                __m256 v_dst = _mm256_loadu_ps(dst + x);
                __m256 v_src = _mm256_mul_ps(v_src0, v_src1);
                v_dst = _mm256_add_ps(v_src, v_dst);
                _mm256_storeu_ps(dst + x, v_dst);
            }
        }
        return x;
    }
};
template <>
struct AccProd_SIMD<float, double>
{
    int operator() (const float * src1, const float * src2, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256 v_1src = _mm256_loadu_ps(src1 + x);
                __m256 v_2src = _mm256_loadu_ps(src2 + x);
                __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src, 0));
                __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src, 1));
                __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src, 0));
                __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src, 1));
                __m256d v_dst0 = _mm256_loadu_pd(dst + x);
                __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
                __m256d v_src0 = _mm256_mul_pd(v_src00, v_src10);
                __m256d v_src1 = _mm256_mul_pd(v_src01, v_src11);
                v_dst0 = _mm256_add_pd(v_src0, v_dst0);
                v_dst1 = _mm256_add_pd(v_src1, v_dst1);
                _mm256_storeu_pd(dst + x, v_dst0);
                _mm256_storeu_pd(dst + x + 4, v_dst1);
            }
        }
        return x;
    }
};
template <>
struct AccProd_SIMD<double, double>
{
    int operator() (const double * src1, const double * src2, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m256d v_src0 = _mm256_loadu_pd(src1 + x);
                __m256d v_src1 = _mm256_loadu_pd(src2 + x);
                __m256d v_dst = _mm256_loadu_pd(dst + x);
                v_src0 = _mm256_mul_pd(v_src0, v_src1);
                v_dst = _mm256_add_pd(v_dst, v_src0);
                _mm256_storeu_pd(dst + x, v_dst);
            }
        }
        return x;
    }
};
template <>
struct AccW_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        __m256 v_alpha = _mm256_set1_ps(alpha);
        __m256 v_beta = _mm256_set1_ps(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta),
                                                        _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha)));
                _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta),
                                                            _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha)));
            }
        }
        return x;
    }
};
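The `AccW` specializations implement the running average behind `cv::accumulateWeighted`: each destination element becomes `(1 - alpha) * dst + alpha * src`, with `v_beta` holding the precomputed `1 - alpha` so the update is two multiplies and one add per vector. The scalar equivalent of the loop body above, for reference:

// Scalar form of the weighted accumulation computed above.
for (; x < len; x++)
    dst[x] = (1.0f - alpha) * dst[x] + alpha * src[x];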
template <>
struct AccW_SIMD<float, double>
{
    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
    {
        int x = 0;
        __m256d v_alpha = _mm256_set1_pd(alpha);
        __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                __m256 v_src0 = _mm256_loadu_ps(src + x);
                __m256 v_src1 = _mm256_loadu_ps(src + x + 8);
                __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0, 0));
                __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0, 1));
                __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1, 0));
                __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1, 1));
                _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta),
                                                        _mm256_mul_pd(v_src00, v_alpha)));
                _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta),
                                                            _mm256_mul_pd(v_src01, v_alpha)));
                _mm256_storeu_pd(dst + x + 8, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta),
                                                            _mm256_mul_pd(v_src10, v_alpha)));
                _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta),
                                                             _mm256_mul_pd(v_src11, v_alpha)));
            }
        }
        return x;
    }
};
template <>
struct AccW_SIMD<double, double>
{
    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
    {
        int x = 0;
        __m256d v_alpha = _mm256_set1_pd(alpha);
        __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256d v_src0 = _mm256_loadu_pd(src + x);
                __m256d v_src1 = _mm256_loadu_pd(src + x + 4);
                _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta),
                                                        _mm256_mul_pd(v_src0, v_alpha)));
                _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta),
                                                            _mm256_mul_pd(v_src1, v_alpha)));
            }
        }
        return x;
    }
};
#elif CV_SSE2
template <>
struct Acc_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_loadu_ps(src + x)));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_loadu_ps(src + x + 4)));
            }
        }
        return x;
    }
};
template <>
struct Acc_SIMD<float, double>
{
    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m128 v_src = _mm_loadu_ps(src + x);
                __m128d v_src0 = _mm_cvtps_pd(v_src);
                __m128d v_src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_src, v_src, _MM_SHUFFLE(1, 0, 3, 2)));
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                v_dst0 = _mm_add_pd(v_dst0, v_src0);
                v_dst1 = _mm_add_pd(v_dst1, v_src1);
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
            }
        }
        return x;
    }
};
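The SSE2 counterpart of the float-to-double widening: `_mm_cvtps_pd` converts only the two low floats, so the high pair is first rotated down with `_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2))` (later specializations use `_MM_SHUFFLE(0, 0, 3, 2)`, which differs only in lanes the conversion discards). In isolation (editorial sketch):

// Editorial sketch: widen 4 packed floats into 2 x 2 doubles, SSE2 only.
static inline void widen4f_to_4d(const float* p, __m128d& lo, __m128d& hi)
{
    __m128 v = _mm_loadu_ps(p);                                        // f0..f3
    lo = _mm_cvtps_pd(v);                                              // (double)f0, f1
    hi = _mm_cvtps_pd(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2)));  // (double)f2, f3
}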
template <>
struct Acc_SIMD<double, double>
{
    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m128d v_src0 = _mm_loadu_pd(src + x);
                __m128d v_src1 = _mm_loadu_pd(src + x + 2);
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                v_dst0 = _mm_add_pd(v_dst0, v_src0);
                v_dst1 = _mm_add_pd(v_dst1, v_src1);
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
            }
        }
        return x;
    }
};
template <>
struct AccSqr_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128 v_src0 = _mm_loadu_ps(src + x);
                __m128 v_src1 = _mm_loadu_ps(src + x + 4);
                v_src0 = _mm_mul_ps(v_src0, v_src0);
                v_src1 = _mm_mul_ps(v_src1, v_src1);
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
            }
        }
        return x;
    }
};
template <>
struct AccSqr_SIMD<float, double>
{
    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m128 v_src = _mm_loadu_ps(src + x);
                __m128d v_src0 = _mm_cvtps_pd(v_src);
                __m128d v_src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_src, v_src, _MM_SHUFFLE(1, 0, 3, 2)));
                v_src0 = _mm_mul_pd(v_src0, v_src0);
                v_src1 = _mm_mul_pd(v_src1, v_src1);
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                v_dst0 = _mm_add_pd(v_dst0, v_src0);
                v_dst1 = _mm_add_pd(v_dst1, v_src1);
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
            }
        }
        return x;
    }
};
template <>
struct AccSqr_SIMD<double, double>
{
    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m128d v_src0 = _mm_loadu_pd(src + x);
                __m128d v_src1 = _mm_loadu_pd(src + x + 2);
                v_src0 = _mm_mul_pd(v_src0, v_src0);
                v_src1 = _mm_mul_pd(v_src1, v_src1);
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                v_dst0 = _mm_add_pd(v_dst0, v_src0);
                v_dst1 = _mm_add_pd(v_dst1, v_src1);
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
            }
        }
        return x;
    }
};
template <>
struct AccProd_SIMD<float, float>
{
    int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x),
                                                  _mm_mul_ps(_mm_loadu_ps(src1 + x), _mm_loadu_ps(src2 + x))));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4),
                                                      _mm_mul_ps(_mm_loadu_ps(src1 + x + 4), _mm_loadu_ps(src2 + x + 4))));
            }
        }
        return x;
    }
};
template <>
struct AccProd_SIMD<float, double>
{
    int operator() (const float * src1, const float * src2, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m128 v_1src = _mm_loadu_ps(src1 + x);
                __m128 v_2src = _mm_loadu_ps(src2 + x);
                __m128d v_1src0 = _mm_cvtps_pd(v_1src);
                __m128d v_1src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_1src, v_1src, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_2src0 = _mm_cvtps_pd(v_2src);
                __m128d v_2src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_2src, v_2src, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                v_dst0 = _mm_add_pd(v_dst0, _mm_mul_pd(v_1src0, v_2src0));
                v_dst1 = _mm_add_pd(v_dst1, _mm_mul_pd(v_1src1, v_2src1));
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
            }
        }
        return x;
    }
};
template <>
struct AccProd_SIMD<double, double>
{
    int operator() (const double * src1, const double * src2, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m128d v_src00 = _mm_loadu_pd(src1 + x);
                __m128d v_src01 = _mm_loadu_pd(src1 + x + 2);
                __m128d v_src10 = _mm_loadu_pd(src2 + x);
                __m128d v_src11 = _mm_loadu_pd(src2 + x + 2);
                __m128d v_src0 = _mm_mul_pd(v_src00, v_src10);
                __m128d v_src1 = _mm_mul_pd(v_src01, v_src11);
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                v_dst0 = _mm_add_pd(v_dst0, v_src0);
                v_dst1 = _mm_add_pd(v_dst1, v_src1);
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
            }
        }
        return x;
    }
};
template <>
struct AccW_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        __m128 v_alpha = _mm_set1_ps(alpha);
        __m128 v_beta = _mm_set1_ps(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(dst + x), v_beta),
                                                  _mm_mul_ps(_mm_loadu_ps(src + x), v_alpha)));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(dst + x + 4), v_beta),
                                                      _mm_mul_ps(_mm_loadu_ps(src + x + 4), v_alpha)));
            }
        }
        return x;
    }
};
template <>
struct AccW_SIMD<float, double>
{
    int operator() (const float * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
    {
        int x = 0;
        __m128d v_alpha = _mm_set1_pd(alpha);
        __m128d v_beta = _mm_set1_pd(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128 v_src0 = _mm_loadu_ps(src + x);
                __m128 v_src1 = _mm_loadu_ps(src + x + 4);
                __m128d v_src00 = _mm_cvtps_pd(v_src0);
                __m128d v_src01 = _mm_cvtps_pd(_mm_shuffle_ps(v_src0, v_src0, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_src10 = _mm_cvtps_pd(v_src1);
                __m128d v_src11 = _mm_cvtps_pd(_mm_shuffle_ps(v_src1, v_src1, _MM_SHUFFLE(0, 0, 3, 2)));
                _mm_storeu_pd(dst + x, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x), v_beta),
                                                  _mm_mul_pd(v_src00, v_alpha)));
                _mm_storeu_pd(dst + x + 2, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 2), v_beta),
                                                      _mm_mul_pd(v_src01, v_alpha)));
                _mm_storeu_pd(dst + x + 4, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 4), v_beta),
                                                      _mm_mul_pd(v_src10, v_alpha)));
                _mm_storeu_pd(dst + x + 6, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 6), v_beta),
                                                      _mm_mul_pd(v_src11, v_alpha)));
            }
        }
        return x;
    }
};
template <>
struct AccW_SIMD<double, double>
{
    int operator() (const double * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
    {
        int x = 0;
        __m128d v_alpha = _mm_set1_pd(alpha);
        __m128d v_beta = _mm_set1_pd(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m128d v_src0 = _mm_loadu_pd(src + x);
                __m128d v_src1 = _mm_loadu_pd(src + x + 2);
                _mm_storeu_pd(dst + x, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x), v_beta),
                                                  _mm_mul_pd(v_src0, v_alpha)));
                _mm_storeu_pd(dst + x + 2, _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(dst + x + 2), v_beta),
                                                      _mm_mul_pd(v_src1, v_alpha)));
            }
        }
        return x;
    }
};
#endif
#if CV_SSE2
template <>
struct Acc_SIMD<uchar, float>
{
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        __m128i v_0 = _mm_setzero_si128();
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
                __m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
                __m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x),
                                                  _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0))));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4),
                                                      _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0))));
                _mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8),
                                                      _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0))));
                _mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12),
                                                       _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0))));
            }
        }
        else if (cn == 1)
        {
            __m128i v_255 = _mm_set1_epi8(-1);
            for ( ; x <= len - 16; x += 16)
            {
                __m128i v_src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src + x)),
                                              _mm_xor_si128(v_255, _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(mask + x)), v_0)));
                __m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
                __m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x),
                                                  _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0))));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4),
                                                      _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0))));
                _mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8),
                                                      _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0))));
                _mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12),
                                                       _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0))));
            }
        }
        return x;
    }
};
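The masked `cn == 1` branch above zeroes out unselected pixels before accumulating: `_mm_cmpeq_epi8(mask, 0)` produces 0xFF exactly where the mask byte is zero, the XOR with all-ones (`v_255`) flips that into 0xFF-where-selected, and the AND applies it to the source. The mask expansion on its own (editorial sketch):

// Editorial sketch: expand a uchar mask (0 / nonzero) to 0x00 / 0xFF bytes.
static inline __m128i expand_mask_epi8(const unsigned char* mask)
{
    __m128i v_0   = _mm_setzero_si128();
    __m128i v_255 = _mm_set1_epi8(-1);                      // all bits set
    __m128i m     = _mm_loadu_si128((const __m128i*)mask);
    return _mm_xor_si128(v_255, _mm_cmpeq_epi8(m, v_0));    // 0xFF where mask != 0
}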
template <>
struct Acc_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        __m128i v_0 = _mm_setzero_si128();
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
                __m128i v_src0 = _mm_unpacklo_epi16(v_src, v_0);
                __m128i v_src1 = _mm_unpackhi_epi16(v_src, v_0);
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(v_src0)));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(v_src1)));
            }
        }
        return x;
    }
};
template <>
struct Acc_SIMD<uchar, double>
{
    int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            __m128i v_0 = _mm_setzero_si128();
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
                __m128i v_int0 = _mm_unpacklo_epi8(v_src, v_0);
                __m128i v_int1 = _mm_unpackhi_epi8(v_src, v_0);
                __m128i v_int00 = _mm_unpacklo_epi16(v_int0, v_0);
                __m128i v_int01 = _mm_unpackhi_epi16(v_int0, v_0);
                __m128i v_int10 = _mm_unpacklo_epi16(v_int1, v_0);
                __m128i v_int11 = _mm_unpackhi_epi16(v_int1, v_0);
                __m128d v_src0 = _mm_cvtepi32_pd(v_int00);
                __m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int00, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_src2 = _mm_cvtepi32_pd(v_int01);
                __m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int01, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_src4 = _mm_cvtepi32_pd(v_int10);
                __m128d v_src5 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int10, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_src6 = _mm_cvtepi32_pd(v_int11);
                __m128d v_src7 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int11, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
                __m128d v_dst4 = _mm_loadu_pd(dst + x + 8);
                __m128d v_dst5 = _mm_loadu_pd(dst + x + 10);
                __m128d v_dst6 = _mm_loadu_pd(dst + x + 12);
                __m128d v_dst7 = _mm_loadu_pd(dst + x + 14);
                v_dst0 = _mm_add_pd(v_dst0, v_src0);
                v_dst1 = _mm_add_pd(v_dst1, v_src1);
                v_dst2 = _mm_add_pd(v_dst2, v_src2);
                v_dst3 = _mm_add_pd(v_dst3, v_src3);
                v_dst4 = _mm_add_pd(v_dst4, v_src4);
                v_dst5 = _mm_add_pd(v_dst5, v_src5);
                v_dst6 = _mm_add_pd(v_dst6, v_src6);
                v_dst7 = _mm_add_pd(v_dst7, v_src7);
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
                _mm_storeu_pd(dst + x + 4, v_dst2);
                _mm_storeu_pd(dst + x + 6, v_dst3);
                _mm_storeu_pd(dst + x + 8, v_dst4);
                _mm_storeu_pd(dst + x + 10, v_dst5);
                _mm_storeu_pd(dst + x + 12, v_dst6);
                _mm_storeu_pd(dst + x + 14, v_dst7);
            }
        }
        return x;
    }
};
template <>
struct Acc_SIMD<ushort, double>
{
    int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            __m128i v_0 = _mm_setzero_si128();
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
                __m128i v_int0 = _mm_unpacklo_epi16(v_src, v_0);
                __m128i v_int1 = _mm_unpackhi_epi16(v_src, v_0);
                __m128d v_src0 = _mm_cvtepi32_pd(v_int0);
                __m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_src2 = _mm_cvtepi32_pd(v_int1);
                __m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
                v_dst0 = _mm_add_pd(v_dst0, v_src0);
                v_dst1 = _mm_add_pd(v_dst1, v_src1);
                v_dst2 = _mm_add_pd(v_dst2, v_src2);
                v_dst3 = _mm_add_pd(v_dst3, v_src3);
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
                _mm_storeu_pd(dst + x + 4, v_dst2);
                _mm_storeu_pd(dst + x + 6, v_dst3);
            }
        }
        return x;
    }
};
template <>
struct AccSqr_SIMD<uchar, float>
{
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        __m128i v_0 = _mm_setzero_si128();
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
                __m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
                __m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
                v_src0 = _mm_mullo_epi16(v_src0, v_src0);
                v_src1 = _mm_mullo_epi16(v_src1, v_src1);
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x),
                                                  _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0))));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4),
                                                      _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0))));
                _mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8),
                                                      _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0))));
                _mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12),
                                                       _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0))));
            }
        }
        else if (cn == 1)
        {
            __m128i v_255 = _mm_set1_epi8(-1);
            for ( ; x <= len - 16; x += 16)
            {
                __m128i v_src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src + x)),
                                              _mm_xor_si128(v_255, _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(mask + x)), v_0)));
                __m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
                __m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
                v_src0 = _mm_mullo_epi16(v_src0, v_src0);
                v_src1 = _mm_mullo_epi16(v_src1, v_src1);
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x),
                                                  _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0))));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4),
                                                      _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0))));
                _mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8),
                                                      _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0))));
                _mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12),
                                                       _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0))));
            }
        }
        return x;
    }
};
template <>
struct AccSqr_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        __m128i v_0 = _mm_setzero_si128();
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
                __m128i v_src0 = _mm_unpacklo_epi16(v_src, v_0);
                __m128i v_src1 = _mm_unpackhi_epi16(v_src, v_0);
                v_src0 = _mm_mullo_epi16(v_src0, v_src0);
                v_src1 = _mm_mullo_epi16(v_src1, v_src1);
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(v_src0)));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(v_src1)));
            }
        }
        return x;
    }
};
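One point worth flagging in `AccSqr_SIMD<ushort, float>`: `_mm_mullo_epi16` keeps only the low 16 bits of each product. That is exact in the `uchar` kernels (255 * 255 = 65025 still fits in an unsigned 16-bit lane), but a CV_16U sample of 256 or more has its square truncated here; the `<ushort, double>` variant below converts to double before multiplying and does not have this issue. A hedged sketch (not part of this commit) of an overflow-free float path, mirroring what `AccProd_SIMD<ushort, float>` further down already does:

// Convert to float first, then square: no 16-bit truncation, exact up to
// float rounding. Drop-in for the loop body above.
__m128i v_0   = _mm_setzero_si128();
__m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
__m128 f0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_0));
__m128 f1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_0));
f0 = _mm_mul_ps(f0, f0);
f1 = _mm_mul_ps(f1, f1);
_mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), f0));
_mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), f1));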
template <>
struct AccSqr_SIMD<uchar, double>
{
    int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            __m128i v_0 = _mm_setzero_si128();
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_src = _mm_loadl_epi64((const __m128i*)(src + x));
                __m128i v_int = _mm_unpacklo_epi8(v_src, v_0);
                __m128i v_int0 = _mm_unpacklo_epi16(v_int, v_0);
                __m128i v_int1 = _mm_unpackhi_epi16(v_int, v_0);
                __m128d v_src0 = _mm_cvtepi32_pd(v_int0);
                __m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_src2 = _mm_cvtepi32_pd(v_int1);
                __m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
                v_src0 = _mm_mul_pd(v_src0, v_src0);
                v_src1 = _mm_mul_pd(v_src1, v_src1);
                v_src2 = _mm_mul_pd(v_src2, v_src2);
                v_src3 = _mm_mul_pd(v_src3, v_src3);
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
                v_dst0 = _mm_add_pd(v_dst0, v_src0);
                v_dst1 = _mm_add_pd(v_dst1, v_src1);
                v_dst2 = _mm_add_pd(v_dst2, v_src2);
                v_dst3 = _mm_add_pd(v_dst3, v_src3);
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
                _mm_storeu_pd(dst + x + 4, v_dst2);
                _mm_storeu_pd(dst + x + 6, v_dst3);
            }
        }
        return x;
    }
};
template <>
struct AccSqr_SIMD<ushort, double>
{
    int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            __m128i v_0 = _mm_setzero_si128();
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
                __m128i v_int0 = _mm_unpacklo_epi16(v_src, v_0);
                __m128i v_int1 = _mm_unpackhi_epi16(v_src, v_0);
                __m128d v_src0 = _mm_cvtepi32_pd(v_int0);
                __m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_src2 = _mm_cvtepi32_pd(v_int1);
                __m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
                v_src0 = _mm_mul_pd(v_src0, v_src0);
                v_src1 = _mm_mul_pd(v_src1, v_src1);
                v_src2 = _mm_mul_pd(v_src2, v_src2);
                v_src3 = _mm_mul_pd(v_src3, v_src3);
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
                v_dst0 = _mm_add_pd(v_dst0, v_src0);
                v_dst1 = _mm_add_pd(v_dst1, v_src1);
                v_dst2 = _mm_add_pd(v_dst2, v_src2);
                v_dst3 = _mm_add_pd(v_dst3, v_src3);
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
                _mm_storeu_pd(dst + x + 4, v_dst2);
                _mm_storeu_pd(dst + x + 6, v_dst3);
            }
        }
        return x;
    }
};
template <>
struct AccProd_SIMD<uchar, float>
{
    int operator() (const uchar * src1, const uchar * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        __m128i v_0 = _mm_setzero_si128();
        len *= cn;
        if (!mask)
        {
            for ( ; x <= len - 16; x += 16)
            {
                __m128i v_1src = _mm_loadu_si128((const __m128i*)(src1 + x));
                __m128i v_2src = _mm_loadu_si128((const __m128i*)(src2 + x));
                __m128i v_1src0 = _mm_unpacklo_epi8(v_1src, v_0);
                __m128i v_1src1 = _mm_unpackhi_epi8(v_1src, v_0);
                __m128i v_2src0 = _mm_unpacklo_epi8(v_2src, v_0);
                __m128i v_2src1 = _mm_unpackhi_epi8(v_2src, v_0);
                __m128i v_src0 = _mm_mullo_epi16(v_1src0, v_2src0);
                __m128i v_src1 = _mm_mullo_epi16(v_1src1, v_2src1);
                __m128 v_src00 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0));
                __m128 v_src01 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0));
                __m128 v_src10 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0));
                __m128 v_src11 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0));
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src00));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src01));
                _mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), v_src10));
                _mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), v_src11));
            }
        }
        else if (cn == 1)
        {
            __m128i v_255 = _mm_set1_epi8(-1);
            for ( ; x <= len - 16; x += 16)
            {
                __m128i v_mask = _mm_loadu_si128((const __m128i*)(mask + x));
                v_mask = _mm_xor_si128(v_255, _mm_cmpeq_epi8(v_mask, v_0));
                __m128i v_1src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src1 + x)), v_mask);
                __m128i v_2src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src2 + x)), v_mask);
                __m128i v_1src0 = _mm_unpacklo_epi8(v_1src, v_0);
                __m128i v_1src1 = _mm_unpackhi_epi8(v_1src, v_0);
                __m128i v_2src0 = _mm_unpacklo_epi8(v_2src, v_0);
                __m128i v_2src1 = _mm_unpackhi_epi8(v_2src, v_0);
                __m128i v_src0 = _mm_mullo_epi16(v_1src0, v_2src0);
                __m128i v_src1 = _mm_mullo_epi16(v_1src1, v_2src1);
                __m128 v_src00 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0));
                __m128 v_src01 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0));
                __m128 v_src10 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0));
                __m128 v_src11 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0));
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src00));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src01));
                _mm_storeu_ps(dst + x + 8, _mm_add_ps(_mm_loadu_ps(dst + x + 8), v_src10));
                _mm_storeu_ps(dst + x + 12, _mm_add_ps(_mm_loadu_ps(dst + x + 12), v_src11));
            }
        }
        return x;
    }
};
template <>
struct AccProd_SIMD<ushort, float>
{
    int operator() (const ushort * src1, const ushort * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        __m128i v_0 = _mm_setzero_si128();
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_1src = _mm_loadu_si128((const __m128i*)(src1 + x));
                __m128i v_2src = _mm_loadu_si128((const __m128i*)(src2 + x));
                __m128 v_1src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_1src, v_0));
                __m128 v_1src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_1src, v_0));
                __m128 v_2src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_2src, v_0));
                __m128 v_2src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_2src, v_0));
                __m128 v_src0 = _mm_mul_ps(v_1src0, v_2src0);
                __m128 v_src1 = _mm_mul_ps(v_1src1, v_2src1);
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
            }
        }
        else if (cn == 1)
        {
            __m128i v_65535 = _mm_set1_epi16(-1);
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_mask = _mm_loadl_epi64((const __m128i*)(mask + x));
                __m128i v_mask0 = _mm_unpacklo_epi8(v_mask, v_0);
                v_mask0 = _mm_xor_si128(v_65535, _mm_cmpeq_epi16(v_mask0, v_0));
                __m128i v_1src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src1 + x)), v_mask0);
                __m128i v_2src = _mm_and_si128(_mm_loadu_si128((const __m128i*)(src2 + x)), v_mask0);
                __m128 v_1src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_1src, v_0));
                __m128 v_1src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_1src, v_0));
                __m128 v_2src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_2src, v_0));
                __m128 v_2src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_2src, v_0));
                __m128 v_src0 = _mm_mul_ps(v_1src0, v_2src0);
                __m128 v_src1 = _mm_mul_ps(v_1src1, v_2src1);
                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
            }
        }
        return x;
    }
};
template <>
struct AccProd_SIMD<uchar, double>
{
    int operator() (const uchar * src1, const uchar * src2, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            __m128i v_0 = _mm_setzero_si128();
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_1src = _mm_loadl_epi64((const __m128i*)(src1 + x));
                __m128i v_2src = _mm_loadl_epi64((const __m128i*)(src2 + x));
                __m128i v_1int = _mm_unpacklo_epi8(v_1src, v_0);
                __m128i v_2int = _mm_unpacklo_epi8(v_2src, v_0);
                __m128i v_1int0 = _mm_unpacklo_epi16(v_1int, v_0);
                __m128i v_1int1 = _mm_unpackhi_epi16(v_1int, v_0);
                __m128i v_2int0 = _mm_unpacklo_epi16(v_2int, v_0);
                __m128i v_2int1 = _mm_unpackhi_epi16(v_2int, v_0);
                __m128d v_src0 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int0), _mm_cvtepi32_pd(v_2int0));
                __m128d v_src1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int0, _MM_SHUFFLE(0, 0, 3, 2))),
                                            _mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int0, _MM_SHUFFLE(0, 0, 3, 2))));
                __m128d v_src2 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int1), _mm_cvtepi32_pd(v_2int1));
                __m128d v_src3 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int1, _MM_SHUFFLE(0, 0, 3, 2))),
                                            _mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int1, _MM_SHUFFLE(0, 0, 3, 2))));
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
                v_dst0 = _mm_add_pd(v_dst0, v_src0);
                v_dst1 = _mm_add_pd(v_dst1, v_src1);
                v_dst2 = _mm_add_pd(v_dst2, v_src2);
                v_dst3 = _mm_add_pd(v_dst3, v_src3);
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
                _mm_storeu_pd(dst + x + 4, v_dst2);
                _mm_storeu_pd(dst + x + 6, v_dst3);
            }
        }
        return x;
    }
};
template <>
struct AccProd_SIMD<ushort, double>
{
    int operator() (const ushort * src1, const ushort * src2, double * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            __m128i v_0 = _mm_setzero_si128();
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_1src = _mm_loadu_si128((const __m128i*)(src1 + x));
                __m128i v_2src = _mm_loadu_si128((const __m128i*)(src2 + x));
                __m128i v_1int0 = _mm_unpacklo_epi16(v_1src, v_0);
                __m128i v_1int1 = _mm_unpackhi_epi16(v_1src, v_0);
                __m128i v_2int0 = _mm_unpacklo_epi16(v_2src, v_0);
                __m128i v_2int1 = _mm_unpackhi_epi16(v_2src, v_0);
                __m128d v_src0 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int0), _mm_cvtepi32_pd(v_2int0));
                __m128d v_src1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int0, _MM_SHUFFLE(0, 0, 3, 2))),
                                            _mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int0, _MM_SHUFFLE(0, 0, 3, 2))));
                __m128d v_src2 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int1), _mm_cvtepi32_pd(v_2int1));
                __m128d v_src3 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int1, _MM_SHUFFLE(0, 0, 3, 2))),
                                            _mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int1, _MM_SHUFFLE(0, 0, 3, 2))));
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
                v_dst0 = _mm_add_pd(v_dst0, v_src0);
                v_dst1 = _mm_add_pd(v_dst1, v_src1);
                v_dst2 = _mm_add_pd(v_dst2, v_src2);
                v_dst3 = _mm_add_pd(v_dst3, v_src3);
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
                _mm_storeu_pd(dst + x + 4, v_dst2);
                _mm_storeu_pd(dst + x + 6, v_dst3);
            }
        }
        return x;
    }
};
template <>
struct AccW_SIMD<uchar, float>
{
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        __m128 v_alpha = _mm_set1_ps(alpha);
        __m128 v_beta = _mm_set1_ps(1.0f - alpha);
        __m128i v_0 = _mm_setzero_si128();
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
                __m128i v_src0 = _mm_unpacklo_epi8(v_src, v_0);
                __m128i v_src1 = _mm_unpackhi_epi8(v_src, v_0);
                __m128 v_src00 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src0, v_0));
                __m128 v_src01 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src0, v_0));
                __m128 v_src10 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_0));
                __m128 v_src11 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_0));
                __m128 v_dst00 = _mm_loadu_ps(dst + x);
                __m128 v_dst01 = _mm_loadu_ps(dst + x + 4);
                __m128 v_dst10 = _mm_loadu_ps(dst + x + 8);
                __m128 v_dst11 = _mm_loadu_ps(dst + x + 12);
                v_dst00 = _mm_add_ps(_mm_mul_ps(v_dst00, v_beta), _mm_mul_ps(v_src00, v_alpha));
                v_dst01 = _mm_add_ps(_mm_mul_ps(v_dst01, v_beta), _mm_mul_ps(v_src01, v_alpha));
                v_dst10 = _mm_add_ps(_mm_mul_ps(v_dst10, v_beta), _mm_mul_ps(v_src10, v_alpha));
                v_dst11 = _mm_add_ps(_mm_mul_ps(v_dst11, v_beta), _mm_mul_ps(v_src11, v_alpha));
                _mm_storeu_ps(dst + x, v_dst00);
                _mm_storeu_ps(dst + x + 4, v_dst01);
                _mm_storeu_ps(dst + x + 8, v_dst10);
                _mm_storeu_ps(dst + x + 12, v_dst11);
            }
        }
        return x;
    }
};
template <>
struct AccW_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        __m128 v_alpha = _mm_set1_ps(alpha);
        __m128 v_beta = _mm_set1_ps(1.0f - alpha);
        __m128i v_0 = _mm_setzero_si128();
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
                __m128 v_src0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_0));
                __m128 v_src1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_0));
                v_src0 = _mm_mul_ps(v_src0, v_alpha);
                v_src1 = _mm_mul_ps(v_src1, v_alpha);
                __m128 v_dst0 = _mm_mul_ps(_mm_loadu_ps(dst + x), v_beta);
                __m128 v_dst1 = _mm_mul_ps(_mm_loadu_ps(dst + x + 4), v_beta);
                _mm_storeu_ps(dst + x, _mm_add_ps(v_dst0, v_src0));
                _mm_storeu_ps(dst + x + 4, _mm_add_ps(v_dst1, v_src1));
            }
        }
        return x;
    }
};
template <>
struct AccW_SIMD<uchar, double>
{
    int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
    {
        int x = 0;
        __m128d v_alpha = _mm_set1_pd(alpha);
        __m128d v_beta = _mm_set1_pd(1.0f - alpha);
        __m128i v_0 = _mm_setzero_si128();
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_src = _mm_loadl_epi64((const __m128i*)(src + x));
                __m128i v_int = _mm_unpacklo_epi8(v_src, v_0);
                __m128i v_int0 = _mm_unpacklo_epi16(v_int, v_0);
                __m128i v_int1 = _mm_unpackhi_epi16(v_int, v_0);
                __m128d v_src0 = _mm_cvtepi32_pd(v_int0);
                __m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_src2 = _mm_cvtepi32_pd(v_int1);
                __m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_dst0 = _mm_loadu_pd(dst + x);
                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
                v_dst0 = _mm_add_pd(_mm_mul_pd(v_dst0, v_beta), _mm_mul_pd(v_src0, v_alpha));
                v_dst1 = _mm_add_pd(_mm_mul_pd(v_dst1, v_beta), _mm_mul_pd(v_src1, v_alpha));
                v_dst2 = _mm_add_pd(_mm_mul_pd(v_dst2, v_beta), _mm_mul_pd(v_src2, v_alpha));
                v_dst3 = _mm_add_pd(_mm_mul_pd(v_dst3, v_beta), _mm_mul_pd(v_src3, v_alpha));
                _mm_storeu_pd(dst + x, v_dst0);
                _mm_storeu_pd(dst + x + 2, v_dst1);
                _mm_storeu_pd(dst + x + 4, v_dst2);
                _mm_storeu_pd(dst + x + 6, v_dst3);
            }
        }
        return x;
    }
};
template <>
struct AccW_SIMD<ushort, double>
{
    int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
    {
        int x = 0;
        __m128d v_alpha = _mm_set1_pd(alpha);
        __m128d v_beta = _mm_set1_pd(1.0f - alpha);
        __m128i v_0 = _mm_setzero_si128();
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
                __m128i v_int0 = _mm_unpacklo_epi16(v_src, v_0);
                __m128i v_int1 = _mm_unpackhi_epi16(v_src, v_0);
                __m128d v_src00 = _mm_cvtepi32_pd(v_int0);
                __m128d v_src01 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_src10 = _mm_cvtepi32_pd(v_int1);
                __m128d v_src11 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
                __m128d v_dst00 = _mm_loadu_pd(dst + x);
                __m128d v_dst01 = _mm_loadu_pd(dst + x + 2);
                __m128d v_dst10 = _mm_loadu_pd(dst + x + 4);
                __m128d v_dst11 = _mm_loadu_pd(dst + x + 6);
                v_dst00 = _mm_add_pd(_mm_mul_pd(v_dst00, v_beta), _mm_mul_pd(v_src00, v_alpha));
                v_dst01 = _mm_add_pd(_mm_mul_pd(v_dst01, v_beta), _mm_mul_pd(v_src01, v_alpha));
                v_dst10 = _mm_add_pd(_mm_mul_pd(v_dst10, v_beta), _mm_mul_pd(v_src10, v_alpha));
                v_dst11 = _mm_add_pd(_mm_mul_pd(v_dst11, v_beta), _mm_mul_pd(v_src11, v_alpha));
                _mm_storeu_pd(dst + x, v_dst00);
                _mm_storeu_pd(dst + x + 2, v_dst01);
                _mm_storeu_pd(dst + x + 4, v_dst10);
                _mm_storeu_pd(dst + x + 6, v_dst11);
            }
        }
        return x;
    }
};
#endif //CV_SSE2
#if CV_NEON
template <>
...
modules/video/test/test_accum.cpp
@@ -72,11 +72,11 @@ void CV_AccumBaseTest::get_test_array_types_and_sizes( int test_case_idx,
                                                        vector<vector<Size> >& sizes,
                                                        vector<vector<int> >& types )
 {
     RNG& rng = ts->get_rng();
-    int depth = cvtest::randInt(rng) % 3, cn = cvtest::randInt(rng) & 1 ? 3 : 1;
-    int accdepth = std::max((int)(cvtest::randInt(rng) % 2 + 1), depth);
+    int depth = cvtest::randInt(rng) % 4, cn = cvtest::randInt(rng) & 1 ? 3 : 1;
+    int accdepth = (int)(cvtest::randInt(rng) % 2 + 1);
     int i, input_count = (int)test_array[INPUT].size();
     cvtest::ArrayTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
-    depth = depth == 0 ? CV_8U : depth == 1 ? CV_32F : CV_64F;
+    depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? CV_32F : CV_64F;
     accdepth = accdepth == 1 ? CV_32F : CV_64F;
+    accdepth = MAX(accdepth, depth);
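The test-side change extends coverage for the new kernels: the input depth is now drawn from four types instead of three (adding CV_16U), the accumulator depth is drawn independently, and the final `MAX(accdepth, depth)` keeps the accumulator at least as wide as the input. A restatement of the resulting combinations (editorial summary, not part of the commit):

// depth    : randInt % 4 -> CV_8U | CV_16U | CV_32F | CV_64F
// accdepth : randInt % 2 -> CV_32F | CV_64F
// then accdepth = MAX(accdepth, depth), so e.g. a CV_64F input is
// always accumulated into a CV_64F buffer.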