Commit b444bae8 authored by fbarchard@google.com

ARGBToI400 and ARGBToI411 use AVX2. YUY2ToI420 and UYVYToI420 use AVX2.…

ARGBToI400 and ARGBToI411 use AVX2. YUY2ToI420 and UYVYToI420 use AVX2. CopyPlane uses rep movsb for AVX2. CopyPlane2 uses rep movsb for AVX2, and X420ToI420 now calls CopyPlane when the two Y strides match, which will do a single rep movsb for the entire image if stride == width. MergeUV for I420ToNV12 uses AVX2.
BUG=181
TESTED=unittests pass
Review URL: https://webrtc-codereview.appspot.com/1103007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@569 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c9562334
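As a rough illustration of the stride == width fast path described above (a minimal sketch, not code from this commit; CopyPlaneCoalesced is a hypothetical name, and CopyRow stands for whichever row copier was selected, e.g. the rep movsb CopyRow_AVX2):

static void CopyPlaneCoalesced(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride,
                               int width, int height) {
  // When both strides equal the width, the plane is one contiguous
  // block, so all rows collapse into a single CopyRow call - with
  // CopyRow_AVX2 selected, one rep movsb for the entire image.
  if (src_stride == width && dst_stride == width) {
    CopyRow(src, dst, width * height);
    return;
  }
  for (int y = 0; y < height; ++y) {  // Otherwise copy row by row.
    CopyRow(src, dst, width);
    src += src_stride;
    dst += dst_stride;
  }
}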
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 568
Version: 569
License: BSD
License File: LICENSE
@@ -123,12 +123,20 @@ extern "C" {
// TODO(fbarchard): Port to gcc.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBCOLORTABLEROW_X86
#define HAS_COPYROW_AVX2
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// TODO(fbarchard): Hook these up to all functions. e.g. format conversion.
#define HAS_ARGBTOYROW_AVX2
#define HAS_ARGBTOUVROW_AVX2
#define HAS_SPLITUVROW_AVX2
#define HAS_MERGEUVROW_AVX2
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
#define HAS_UYVYTOUV422ROW_AVX2
#define HAS_UYVYTOUVROW_AVX2
#define HAS_UYVYTOYROW_AVX2
#endif
#endif
@@ -375,7 +383,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
int width);
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToYRow_Unaligned_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
@@ -449,8 +456,6 @@ void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
@@ -570,8 +575,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int pix);
void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int pix);
void SplitUVRow_Unaligned_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int pix);
void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
uint8* dst_v, int pix);
void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
@@ -593,8 +596,6 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
uint8* dst_uv, int width);
void MergeUVRow_Unaligned_AVX2(const uint8* src_u, const uint8* src_v,
uint8* dst_uv, int width);
void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
@@ -603,6 +604,7 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX2(const uint8* src, uint8* dst, int count);
void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
@@ -1154,6 +1156,11 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
uint8* dst_argb,
int width);
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
@@ -1175,6 +1182,11 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_C(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
@@ -1185,7 +1197,11 @@ void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
@@ -1197,6 +1213,11 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
@@ -1208,6 +1229,11 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_C(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 568
#define LIBYUV_VERSION 569
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -290,24 +290,31 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
}
static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
uint8* dst, int dst_stride_frame,
uint8* dst, int dst_stride,
int width, int height) {
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#elif defined(HAS_COPYROW_X86)
if (IS_ALIGNED(width, 4)) {
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 32) && IS_ALIGNED(src, 16) &&
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
IS_ALIGNED(src, 16) &&
IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
CopyRow = CopyRow_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX2)
// TODO(fbarchard): Detect Fast String support.
if (TestCpuFlag(kCpuHasAVX2)) {
CopyRow = CopyRow_AVX2;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
@@ -319,9 +326,9 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
// Copy plane
for (int y = 0; y < height - 1; y += 2) {
CopyRow(src, dst, width);
CopyRow(src + src_stride_0, dst + dst_stride_frame, width);
CopyRow(src + src_stride_0, dst + dst_stride, width);
src += src_stride_0 + src_stride_1;
dst += dst_stride_frame * 2;
dst += dst_stride * 2;
}
if (height & 1) {
CopyRow(src, dst, width);
@@ -381,14 +388,9 @@ static int X420ToI420(const uint8* src_y,
if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
SplitUVRow = SplitUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
SplitUVRow = SplitUVRow_Unaligned_AVX2;
if (IS_ALIGNED(src_uv, 32) && IS_ALIGNED(src_stride_uv, 32) &&
IS_ALIGNED(dst_u, 32) && IS_ALIGNED(dst_stride_u, 32) &&
IS_ALIGNED(dst_v, 32) && IS_ALIGNED(dst_stride_v, 32)) {
SplitUVRow = SplitUVRow_AVX2;
}
}
}
#endif
#if defined(HAS_SPLITUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
@@ -413,9 +415,13 @@ static int X420ToI420(const uint8* src_y,
#endif
if (dst_y) {
if (src_stride_y0 == src_stride_y1) {
CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
} else {
CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
width, height);
}
}
int halfheight = (height + 1) >> 1;
for (int y = 0; y < halfheight; ++y) {
@@ -519,6 +525,11 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
CopyRow = CopyRow_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
CopyRow = CopyRow_AVX2;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
@@ -544,7 +555,20 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
}
}
}
#elif defined(HAS_YUY2TOYROW_NEON)
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
YUY2ToYRow = YUY2ToYRow_AVX2;
}
}
#endif
#if defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width >= 16) {
@@ -573,6 +597,11 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
CopyRow(src_y, dst_y, width);
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
}
#if defined(HAS_YUY2TOYROW_AVX2)
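// AVX2 row code leaves the upper ymm state set; vzeroupper clears it to
// avoid AVX-SSE transition penalties in subsequent SSE code.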
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -610,7 +639,20 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
}
}
}
#elif defined(HAS_YUY2TOYROW_NEON)
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
YUY2ToUVRow = YUY2ToUVRow_AVX2;
YUY2ToYRow = YUY2ToYRow_AVX2;
}
}
#endif
#if defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width >= 16) {
@@ -636,6 +678,12 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
YUY2ToYRow(src_yuy2, dst_y, width);
}
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -673,7 +721,20 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
}
}
}
#elif defined(HAS_UYVYTOYROW_NEON)
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
UYVYToUVRow = UYVYToUVRow_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
UYVYToUVRow = UYVYToUVRow_AVX2;
UYVYToYRow = UYVYToYRow_AVX2;
}
}
#endif
#if defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
UYVYToYRow = UYVYToYRow_Any_NEON;
if (width >= 16) {
@@ -699,6 +760,12 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
UYVYToYRow(src_uyvy, dst_y, width);
}
#if defined(HAS_UYVYTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -747,16 +814,10 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_Unaligned_AVX2;
ARGBToYRow = ARGBToYRow_Unaligned_AVX2;
if (IS_ALIGNED(src_argb, 32) && IS_ALIGNED(src_stride_argb, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
if (IS_ALIGNED(dst_y, 32) && IS_ALIGNED(dst_stride_y, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
@@ -451,17 +451,14 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
clear = true;
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_Unaligned_AVX2;
if (IS_ALIGNED(src_u, 32) && IS_ALIGNED(src_stride_u, 32) &&
IS_ALIGNED(src_v, 32) && IS_ALIGNED(src_stride_v, 32) &&
IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
}
}
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
@@ -481,6 +478,12 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
src_v += src_stride_v;
dst_uv += dst_stride_uv;
}
#if defined(HAS_MERGEUVROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -183,7 +183,18 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
}
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
@@ -206,6 +217,11 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -277,12 +293,9 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_Unaligned_AVX2;
if (IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
}
}
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
@@ -383,12 +396,9 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_Unaligned_AVX2;
if (IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
}
}
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
@@ -624,7 +634,18 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
}
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
@@ -638,6 +659,11 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb;
dst_y += dst_stride_y;
}
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -34,11 +34,6 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
}
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
@@ -51,6 +46,17 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
CopyRow = CopyRow_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX2)
// TODO(fbarchard): Detect Fast String support.
if (TestCpuFlag(kCpuHasAVX2)) {
CopyRow = CopyRow_AVX2;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
@@ -154,11 +160,9 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
YUY2ToYRow = YUY2ToYRow_C;
YUY2ToUV422Row = YUY2ToUV422Row_C;
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width > 16) {
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
@@ -170,14 +174,25 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
}
}
}
#elif defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
YUY2ToYRow = YUY2ToYRow_AVX2;
}
}
#endif
#if defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width > 16) {
if (width >= 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
}
}
if (IS_ALIGNED(width, 16)) {
YUY2ToYRow = YUY2ToYRow_NEON;
YUY2ToUV422Row = YUY2ToUV422Row_NEON;
@@ -193,6 +208,12 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -216,11 +237,9 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
UYVYToYRow = UYVYToYRow_C;
UYVYToUV422Row = UYVYToUV422Row_C;
#if defined(HAS_UYVYTOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width > 16) {
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
@@ -232,14 +251,25 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
}
}
}
#elif defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
UYVYToUV422Row = UYVYToUV422Row_AVX2;
UYVYToYRow = UYVYToYRow_AVX2;
}
}
#endif
#if defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
UYVYToYRow = UYVYToYRow_Any_NEON;
if (width > 16) {
if (width >= 16) {
UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
}
}
if (IS_ALIGNED(width, 16)) {
UYVYToYRow = UYVYToYRow_NEON;
UYVYToUV422Row = UYVYToUV422Row_NEON;
@@ -255,6 +285,12 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
#if defined(HAS_UYVYTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
@@ -196,7 +196,9 @@ BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
}
#ifdef HAS_ARGBTOYROW_AVX2
YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_Unaligned_AVX2, 4, 1, 32)
YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
#endif
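The ANY wrappers let a SIMD kernel handle widths that are not a multiple of its step; roughly, the kernel covers the bulk and a fallback covers the tail. A sketch of the idea for YUY2ToYRow_Any_AVX2 (illustrative only; the YANY macro's exact tail handling differs):

static void YUY2ToYRow_Any_AVX2_sketch(const uint8* src_yuy2,
                                       uint8* dst_y, int pix) {
  int n = pix & ~31;  // Largest multiple of 32, the AVX2 kernel's step.
  YUY2ToYRow_AVX2(src_yuy2, dst_y, n);
  // Finish the remaining 0..31 pixels in C (YUY2 is 2 bytes per pixel).
  YUY2ToYRow_C(src_yuy2 + n * 2, dst_y + n, pix & 31);
}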
#ifdef HAS_ARGBTOYROW_SSSE3
YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
@@ -266,7 +268,9 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
}
#ifdef HAS_ARGBTOYROW_AVX2
UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_Unaligned_AVX2, ARGBToUVRow_C, 4, 31)
UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
@@ -306,6 +310,12 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
ARGBToUV444Row_C, 4, 15, 0)
#endif
#ifdef HAS_YUY2TOUV422ROW_AVX2
UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
YUY2ToUV422Row_C, 2, 31, 1)
UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
UYVYToUV422Row_C, 2, 31, 1)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
ARGBToUV422Row_C, 4, 15, 1)
@@ -343,7 +353,7 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
#endif
#ifdef HAS_SPLITUVROW_AVX2
SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_Unaligned_AVX2, SplitUVRow_C, 31)
SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
#endif
#ifdef HAS_SPLITUVROW_NEON
SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
@@ -369,7 +379,7 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
#endif
#ifdef HAS_MERGEUVROW_AVX2
MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_Unaligned_AVX2, MergeUVRow_C, 31)
MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
#endif
#ifdef HAS_MERGEUVROW_NEON
MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
@@ -776,10 +776,10 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
align 16
convertloop:
vmovdqa ymm0, [eax]
vmovdqa ymm1, [eax + 32]
vmovdqa ymm2, [eax + 64]
vmovdqa ymm3, [eax + 96]
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + 64]
vmovdqu ymm3, [eax + 96]
vpmaddubsw ymm0, ymm0, ymm4
vpmaddubsw ymm1, ymm1, ymm4
vpmaddubsw ymm2, ymm2, ymm4
@@ -793,7 +793,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
vpaddb ymm0, ymm0, ymm5
sub ecx, 32
vmovdqa [edx], ymm0
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
ret
@@ -835,44 +835,6 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
}
#ifdef HAS_ARGBTOYROW_AVX2
__declspec(naked) __declspec(align(32))
void ARGBToYRow_Unaligned_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
vmovdqa ymm6, kShufARGBToY_AVX
vmovdqa ymm5, kAddY16_AVX
vmovdqa ymm4, kARGBToY_AVX
align 16
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + 64]
vmovdqu ymm3, [eax + 96]
vpmaddubsw ymm0, ymm0, ymm4
vpmaddubsw ymm1, ymm1, ymm4
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
lea eax, [eax + 128]
vphaddw ymm0, ymm0, ymm1
vphaddw ymm2, ymm2, ymm3
vpsrlw ymm0, ymm0, 7
vpsrlw ymm2, ymm2, 7
vpackuswb ymm0, ymm0, ymm2
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
vpaddb ymm0, ymm0, ymm5
sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
ret
}
}
#endif // HAS_ARGBTOYROW_AVX2
__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
@@ -1162,11 +1124,11 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
vmovdqa ymm0, [eax]
vmovdqa ymm1, [eax + 32]
vmovdqa ymm2, [eax + 64]
vmovdqa ymm3, [eax + 96]
/* step 1 - subsample 32x2 argb pixels to 16x1 */
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + 64]
vmovdqu ymm3, [eax + 96]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
vpavgb ymm2, ymm2, [eax + esi + 64]
@@ -1200,8 +1162,8 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
// step 3 - store 16 U and 16 V values
sub ecx, 32
vextractf128 qword ptr [edx], ymm0, 0 // U
vextractf128 qword ptr [edx + edi], ymm0, 1 // V
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
jg convertloop
@@ -1282,75 +1244,6 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
}
#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(32))
void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
vmovdqa ymm7, kARGBToU_AVX
vmovdqa ymm6, kARGBToV_AVX
vmovdqa ymm5, kAddUV128_AVX
sub edi, edx // stride from u to v
align 16
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + 64]
vmovdqu ymm3, [eax + 96]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
vpavgb ymm2, ymm2, [eax + esi + 64]
vpavgb ymm3, ymm3, [eax + esi + 96]
lea eax, [eax + 128]
vshufps ymm4, ymm0, ymm1, 0x88
vshufps ymm0, ymm0, ymm1, 0xdd
vpavgb ymm0, ymm0, ymm4
vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove.
vshufps ymm4, ymm2, ymm3, 0x88
vshufps ymm2, ymm2, ymm3, 0xdd
vpavgb ymm2, ymm2, ymm4
vpermq ymm2, ymm2, 0xd8 // TODO(fbarchard): Remove.
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 32 different pixels, its 16 pixels of U and 16 of V
vpmaddubsw ymm1, ymm0, ymm7 // U
vpmaddubsw ymm3, ymm2, ymm7
vpmaddubsw ymm0, ymm0, ymm6 // V
vpmaddubsw ymm2, ymm2, ymm6
vphaddw ymm1, ymm1, ymm3
vpermq ymm1, ymm1, 0xd8 // TODO(fbarchard): Remove.
vphaddw ymm0, ymm0, ymm2
vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove.
vpsraw ymm1, ymm1, 8
vpsraw ymm0, ymm0, 8
vpacksswb ymm0, ymm1, ymm0
vpermq ymm0, ymm0, 0xd8
vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values
sub ecx, 32
vextractf128 qword ptr [edx], ymm0, 0 // U
vextractf128 qword ptr [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
jg convertloop
pop edi
pop esi
ret
}
}
#endif // HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) {
@@ -3207,37 +3100,6 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
}
#endif // HAS_SPLITUVROW_SSE2
#ifdef HAS_MERGEUVROW_SSE2
__declspec(naked) __declspec(align(16))
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_u
mov edx, [esp + 4 + 8] // src_v
mov edi, [esp + 4 + 12] // dst_uv
mov ecx, [esp + 4 + 16] // width
sub edx, eax
align 16
convertloop:
movdqa xmm0, [eax] // read 16 U's
movdqa xmm1, [eax + edx] // and 16 V's
lea eax, [eax + 16]
movdqa xmm2, xmm0
punpcklbw xmm0, xmm1 // first 8 UV pairs
punpckhbw xmm2, xmm1 // next 8 UV pairs
movdqa [edi], xmm0
movdqa [edi + 16], xmm2
lea edi, [edi + 32]
sub ecx, 16
jg convertloop
pop edi
ret
}
}
#ifdef HAS_SPLITUVROW_AVX2
__declspec(naked) __declspec(align(16))
void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
@@ -3253,8 +3115,8 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
align 16
convertloop:
vmovdqa ymm0, [eax]
vmovdqa ymm1, [eax + 32]
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpsrlw ymm2, ymm0, 8 // odd bytes
vpsrlw ymm3, ymm1, 8
@@ -3264,8 +3126,8 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
vpackuswb ymm2, ymm2, ymm3
vpermq ymm0, ymm0, 0xd8
vpermq ymm2, ymm2, 0xd8
vmovdqa [edx], ymm0
vmovdqa [edx + edi], ymm2
vmovdqu [edx], ymm0
vmovdqu [edx + edi], ymm2
lea edx, [edx + 32]
sub ecx, 32
jg convertloop
@@ -3274,44 +3136,38 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
ret
}
}
#endif // HAS_SPLITUVROW_AVX2
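For orientation, the scalar de-interleave that SplitUVRow_AVX2 vectorizes, modeled on libyuv's C fallback (a sketch, not part of this diff):

void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  // Split interleaved UV bytes into separate U and V planes; the vpand
  // (even bytes) and vpsrlw (odd bytes) above do this 32 pixels at a time.
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[0];
    dst_v[x] = src_uv[1];
    src_uv += 2;
  }
}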
#ifdef HAS_MERGEUVROW_SSE2
__declspec(naked) __declspec(align(16))
void SplitUVRow_Unaligned_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int pix) {
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_uv
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
mov eax, [esp + 4 + 4] // src_u
mov edx, [esp + 4 + 8] // src_v
mov edi, [esp + 4 + 12] // dst_uv
mov ecx, [esp + 4 + 16] // width
sub edx, eax
align 16
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpsrlw ymm2, ymm0, 8 // odd bytes
vpsrlw ymm3, ymm1, 8
vpand ymm0, ymm0, ymm5 // even bytes
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpackuswb ymm2, ymm2, ymm3
vpermq ymm0, ymm0, 0xd8
vpermq ymm2, ymm2, 0xd8
vmovdqu [edx], ymm0
vmovdqu [edx + edi], ymm2
lea edx, [edx + 32]
sub ecx, 32
movdqa xmm0, [eax] // read 16 U's
movdqa xmm1, [eax + edx] // and 16 V's
lea eax, [eax + 16]
movdqa xmm2, xmm0
punpcklbw xmm0, xmm1 // first 8 UV pairs
punpckhbw xmm2, xmm1 // next 8 UV pairs
movdqa [edi], xmm0
movdqa [edi + 16], xmm2
lea edi, [edi + 32]
sub ecx, 16
jg convertloop
pop edi
ret
}
}
#endif // HAS_SPLITUVROW_AVX2
__declspec(naked) __declspec(align(16))
void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
@@ -3344,6 +3200,39 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
}
#endif // HAS_MERGEUVROW_SSE2
#ifdef HAS_MERGEUVROW_AVX2
__declspec(naked) __declspec(align(16))
void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_u
mov edx, [esp + 4 + 8] // src_v
mov edi, [esp + 4 + 12] // dst_uv
mov ecx, [esp + 4 + 16] // width
sub edx, eax
align 16
convertloop:
vmovdqu ymm0, [eax] // read 32 U's
vmovdqu ymm1, [eax + edx] // and 32 V's
lea eax, [eax + 32]
vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
vmovdqu [edi], ymm1
vmovdqu [edi + 32], ymm2
lea edi, [edi + 64]
sub ecx, 32
jg convertloop
pop edi
ret
}
}
#endif // HAS_MERGEUVROW_AVX2
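For comparison, the scalar interleave that MergeUVRow_AVX2 implements, modeled on libyuv's MergeUVRow_C (a sketch, not part of this diff):

void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                  int width) {
  // One U byte and one V byte per pixel; vpunpcklbw/vpunpckhbw above do
  // this 32 pixels at a time, with vperm2i128 repairing the lane order
  // that the 256-bit unpacks leave mutated.
  for (int x = 0; x < width; ++x) {
    dst_uv[0] = src_u[x];
    dst_uv[1] = src_v[x];
    dst_uv += 2;
  }
}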
#ifdef HAS_COPYROW_SSE2
// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
__declspec(naked) __declspec(align(16))
@@ -3368,6 +3257,24 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_SSE2
#ifdef HAS_COPYROW_AVX2
// Unaligned; handles any count (multiple of 1).
__declspec(naked) __declspec(align(16))
void CopyRow_AVX2(const uint8* src, uint8* dst, int count) {
__asm {
mov eax, esi
mov edx, edi
mov esi, [esp + 4] // src
mov edi, [esp + 8] // dst
mov ecx, [esp + 12] // count
rep movsb
mov edi, edx
mov esi, eax
ret
}
}
#endif // HAS_COPYROW_AVX2
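The row header carries a TODO to port these to gcc; a hedged sketch of what a GCC inline-asm equivalent of this rep movsb copy could look like (hypothetical, not in this commit; note the MSVC version above parks esi/edi in the caller-saved eax/edx instead of pushing them):

static void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  // rep movsb reads from esi, writes to edi and counts in ecx; the
  // "+S", "+D", "+c" constraints hand those registers to the compiler.
  asm volatile("rep movsb"
               : "+S"(src), "+D"(dst), "+c"(count)
               :
               : "memory");
}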
#ifdef HAS_COPYROW_X86
__declspec(naked) __declspec(align(16))
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
@@ -3434,6 +3341,226 @@ void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
}
#endif // HAS_SETROW_X86
#ifdef HAS_YUY2TOYROW_AVX2
__declspec(naked) __declspec(align(16))
void YUY2ToYRow_AVX2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_yuy2
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
align 16
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpand ymm0, ymm0, ymm5 // even bytes are Y
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
ret
}
}
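In scalar terms, YUY2 packs two pixels as Y0 U Y1 V, so luma is the even bytes, which is exactly what the 0x00ff00ff vpand mask selects. A reference loop in the shape of libyuv's YUY2ToYRow_C:

void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix) {
  // Every other byte, starting at offset 0, is a Y sample.
  for (int x = 0; x < pix; ++x) {
    dst_y[x] = src_yuy2[x * 2];
  }
}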
__declspec(naked) __declspec(align(16))
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_yuy2
mov esi, [esp + 8 + 8] // stride_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
align 16
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
lea eax, [eax + 64]
vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
vpand ymm1, ymm0, ymm5 // U
vpsrlw ymm0, ymm0, 8 // V
vpackuswb ymm1, ymm1, ymm1 // mutates.
vpackuswb ymm0, ymm0, ymm0 // mutates.
vpermq ymm1, ymm1, 0xd8
vpermq ymm0, ymm0, 0xd8
vextractf128 [edx], ymm1, 0 // U
vextractf128 [edx + edi], ymm0, 0 // V
lea edx, [edx + 16]
sub ecx, 32
jg convertloop
pop edi
pop esi
ret
}
}
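The chroma path in scalar form, modeled on libyuv's YUY2ToUVRow_C (a sketch, not part of this diff): U and V sit at the odd bytes, and two source rows are averaged for 4:2:0 subsampling, matching the rounding vpavgb above:

void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
                   uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* next = src_yuy2 + stride_yuy2;  // Row below, for 2x2 average.
  for (int x = 0; x < pix; x += 2) {
    dst_u[0] = (src_yuy2[1] + next[1] + 1) >> 1;  // Rounded vertical average.
    dst_v[0] = (src_yuy2[3] + next[3] + 1) >> 1;
    src_yuy2 += 4;
    next += 4;
    dst_u += 1;
    dst_v += 1;
  }
}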
__declspec(naked) __declspec(align(16))
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
align 16
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
vpand ymm1, ymm0, ymm5 // U
vpsrlw ymm0, ymm0, 8 // V
vpackuswb ymm1, ymm1, ymm1 // mutates.
vpackuswb ymm0, ymm0, ymm0 // mutates.
vpermq ymm1, ymm1, 0xd8
vpermq ymm0, ymm0, 0xd8
vextractf128 [edx], ymm1, 0 // U
vextractf128 [edx + edi], ymm0, 0 // V
lea edx, [edx + 16]
sub ecx, 32
jg convertloop
pop edi
ret
}
}
__declspec(naked) __declspec(align(16))
void UYVYToYRow_AVX2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_uyvy
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
align 16
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpsrlw ymm0, ymm0, 8 // odd bytes are Y
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
ret
}
}
__declspec(naked) __declspec(align(16))
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_uyvy
mov esi, [esp + 8 + 8] // stride_uyvy
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
align 16
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
lea eax, [eax + 64]
vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
vpand ymm1, ymm0, ymm5 // U
vpsrlw ymm0, ymm0, 8 // V
vpackuswb ymm1, ymm1, ymm1 // mutates.
vpackuswb ymm0, ymm0, ymm0 // mutates.
vpermq ymm1, ymm1, 0xd8
vpermq ymm0, ymm0, 0xd8
vextractf128 [edx], ymm1, 0 // U
vextractf128 [edx + edi], ymm0, 0 // V
lea edx, [edx + 16]
sub ecx, 32
jg convertloop
pop edi
pop esi
ret
}
}
__declspec(naked) __declspec(align(16))
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_uyvy
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
align 16
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
vpand ymm1, ymm0, ymm5 // U
vpsrlw ymm0, ymm0, 8 // V
vpackuswb ymm1, ymm1, ymm1 // mutates.
vpackuswb ymm0, ymm0, ymm0 // mutates.
vpermq ymm1, ymm1, 0xd8
vpermq ymm0, ymm0, 0xd8
vextractf128 [edx], ymm1, 0 // U
vextractf128 [edx + edi], ymm0, 0 // V
lea edx, [edx + 16]
sub ecx, 32
jg convertloop
pop edi
ret
}
}
#endif // HAS_YUY2TOYROW_AVX2
#ifdef HAS_YUY2TOYROW_SSE2
__declspec(naked) __declspec(align(16))
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
@@ -64,9 +64,7 @@ YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
@@ -107,7 +105,6 @@ SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,
SplitUVRow u,_Unaligned
; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
; int width);
@@ -121,11 +118,17 @@ cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
mov%1 m0, [src_uq]
mov%1 m1, [src_vq]
lea src_uq, [src_uq + mmsize]
mova m2, m0
punpcklbw m0, m0, m1 ; first 8 UV pairs
punpckhbw m2, m2, m1 ; next 8 UV pairs
mov%1 [dst_uvq], m0
punpcklbw m2, m0, m1 ; first 8 UV pairs
punpckhbw m0, m0, m1 ; next 8 UV pairs
%if cpuflag(AVX2)
vperm2i128 m1, m2, m0, 0x20 ; low 128 of ymm2 and low 128 of ymm0
vperm2i128 m2, m2, m0, 0x31 ; high 128 of ymm2 and high 128 of ymm0
mov%1 [dst_uvq], m1
mov%1 [dst_uvq + mmsize], m2
%else
mov%1 [dst_uvq], m2
mov%1 [dst_uvq + mmsize], m0
%endif
lea dst_uvq, [dst_uvq + mmsize * 2]
sub pixd, mmsize
jg .convertloop
@@ -140,4 +143,4 @@ MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned