Commit 551d2b29 authored by fbarchard@google.com

AVX2 version of ARGBToI420

BUG=181
TEST=unittest
Review URL: https://webrtc-codereview.appspot.com/1090005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@566 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8e26eada
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 565
Version: 566
License: BSD
License File: LICENSE
@@ -123,6 +123,12 @@ extern "C" {
// TODO(fbarchard): Port to gcc.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBCOLORTABLEROW_X86
// Visual C++ 2012 or later is required for AVX2.
#if _MSC_VER >= 1700
// TODO(fbarchard): Hook these up to all functions, e.g. format conversion.
#define HAS_ARGBTOYROW_AVX2
#define HAS_ARGBTOUVROW_AVX2
#endif
#endif
// The following are Yasm x86 only.
@@ -258,6 +264,13 @@ typedef __declspec(align(16)) int16 vec16[8];
typedef __declspec(align(16)) uint16 uvec16[8];
typedef __declspec(align(16)) int32 vec32[4];
typedef __declspec(align(16)) uint32 uvec32[4];
typedef __declspec(align(32)) int8 lvec8[32];
typedef __declspec(align(32)) uint8 ulvec8[32];
typedef __declspec(align(32)) int16 lvec16[16];
typedef __declspec(align(32)) uint16 ulvec16[16];
typedef __declspec(align(32)) int32 lvec32[8];
typedef __declspec(align(32)) uint32 ulvec32[8];
#elif defined(__GNUC__)
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
typedef int8 __attribute__((vector_size(16))) vec8;
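The new 32-byte lvec/ulvec typedefs in the MSVC branch above mirror the existing 16-byte vec/uvec types at ymm width: 32 bytes of storage with 32-byte alignment, so AVX2 constants can be fetched with the aligned vmovdqa form. A minimal usage sketch (kExampleCoeffs is hypothetical, for illustration only):

  // 32 signed bytes with 32-byte alignment: exactly one ymm register.
  static const lvec8 kExampleCoeffs = {
    13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0,
    13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
  };
  // In MSVC inline asm: vmovdqa ymm4, kExampleCoeffs
  // vmovdqa faults on a misaligned operand, which is why align(32) matters;
  // vmovdqu tolerates misalignment at some cost on older cores.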
@@ -360,6 +373,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
int width);
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToYRow_Unaligned_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
@@ -430,6 +446,12 @@ void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 565
#define LIBYUV_VERSION 566
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -739,7 +739,26 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
}
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_Unaligned_AVX2;
ARGBToYRow = ARGBToYRow_Unaligned_AVX2;
if (IS_ALIGNED(src_argb, 32) && IS_ALIGNED(src_stride_argb, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
if (IS_ALIGNED(dst_y, 32) && IS_ALIGNED(dst_stride_y, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
@@ -767,6 +786,12 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
ARGBToYRow(src_argb, dst_y, width);
}
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0;
}
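For context, a hedged usage sketch of the public entry point this change accelerates (buffer variables are illustrative and assumed allocated; the I420 U and V planes are half width and half height):

  // On an AVX2-capable CPU, this call now dispatches to the AVX2 row
  // functions selected above and issues vzeroupper before returning, so
  // later SSE code does not pay an AVX-to-SSE state-transition penalty.
  int r = ARGBToI420(argb, width * 4,           // src_argb, src_stride_argb
                     y_plane, width,            // dst_y, dst_stride_y
                     u_plane, (width + 1) / 2,  // dst_u, dst_stride_u
                     v_plane, (width + 1) / 2,  // dst_v, dst_stride_v
                     width, height);            // returns 0 on success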
@@ -195,6 +195,9 @@ BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
dst_y + (width - NUM) * BPP, NUM); \
}
#ifdef HAS_ARGBTOYROW_AVX2
YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_Unaligned_AVX2, 4, 1, 32)
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
@@ -251,37 +254,40 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
#endif
// RGB/YUV to UV does multiple of 16 with SIMD and remainder with C.
-#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP) \
#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \
void NAMEANY(const uint8* src_argb, int src_stride_argb, \
uint8* dst_u, uint8* dst_v, int width) { \
-int n = width & ~15; \
int n = width & ~MASK; \
ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \
ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
dst_u + (n >> 1), \
dst_v + (n >> 1), \
-width & 15); \
width & MASK); \
}
#ifdef HAS_ARGBTOUVROW_AVX2
UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_Unaligned_AVX2, ARGBToUVRow_C, 4, 31)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
-UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
-UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
-UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
-UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
-UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
-UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGBTOUVROW_NEON
-UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4)
-UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4)
-UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4)
-UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4)
-UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3)
-UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3)
-UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2)
-UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2)
-UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2)
-UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2)
-UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2)
UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
#endif
#undef UVANY
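Hand-expanding one instantiation shows what the widened MASK parameter buys: the AVX2 body consumes pixels 32 at a time and the C fallback mops up the remainder. For example, UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_Unaligned_AVX2, ARGBToUVRow_C, 4, 31) expands to roughly:

  void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int width) {
    int n = width & ~31;  // multiple-of-32 main body, done with AVX2
    ARGBToUVRow_Unaligned_AVX2(src_argb, src_stride_argb, dst_u, dst_v, n);
    ARGBToUVRow_C(src_argb + n * 4, src_stride_argb,   // BPP == 4 for ARGB
                  dst_u + (n >> 1), dst_v + (n >> 1),  // 2x2 subsampled
                  width & 31);                         // 0..31 leftover pixels
  }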
@@ -25,14 +25,35 @@ static const vec8 kARGBToY = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
static const lvec8 kARGBToY_AVX = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0,
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
static const vec8 kARGBToU = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
// TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version.
static const lvec8 kARGBToU_AVX = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0,
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
static const vec8 kARGBToV = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
static const lvec8 kARGBToV_AVX = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};
// Shuffle for vpermd to undo the per-lane interleave left by vphaddw + vpackuswb.
static const lvec32 kShufARGBToY_AVX = {
0, 4, 1, 5, 2, 6, 3, 7
};
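The selector is needed because vphaddw and vpackuswb operate within each 128-bit lane of a ymm register, so the 32 packed Y bytes come out in 4-byte groups ordered {0, 2, 4, 6} in the low lane and {1, 3, 5, 7} in the high lane. A small self-check (hypothetical, not part of libyuv) that the {0, 4, 1, 5, 2, 6, 3, 7} dword permutation restores linear order:

  #include <assert.h>
  int main() {
    const int packed[8] = {0, 2, 4, 6, 1, 3, 5, 7};  // Y group in each dword slot
    const int shuf[8] = {0, 4, 1, 5, 2, 6, 3, 7};    // kShufARGBToY_AVX
    for (int i = 0; i < 8; ++i) {
      assert(packed[shuf[i]] == i);  // vpermd output dword i holds Y group i
    }
    return 0;
  }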
// Constants for BGRA.
static const vec8 kBGRAToY = {
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
@@ -76,11 +97,25 @@ static const uvec8 kAddY16 = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
static const ulvec8 kAddY16_AVX = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
static const uvec8 kAddUV128 = {
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
static const ulvec8 kAddUV128_AVX = {
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
@@ -727,6 +762,49 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
}
#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
vmovdqa ymm6, kShufARGBToY_AVX
vmovdqa ymm5, kAddY16_AVX
vmovdqa ymm4, kARGBToY_AVX
align 16
convertloop:
vmovdqa ymm0, [eax]
vmovdqa ymm1, [eax + 32]
vmovdqa ymm2, [eax + 64]
vmovdqa ymm3, [eax + 96]
vpmaddubsw ymm0, ymm0, ymm4
vpmaddubsw ymm1, ymm1, ymm4
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
lea eax, [eax + 128]
vphaddw ymm0, ymm0, ymm1
vphaddw ymm2, ymm2, ymm3
vpsrlw ymm0, ymm0, 7
vpsrlw ymm2, ymm2, 7
vpackuswb ymm0, ymm0, ymm2
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
vpaddb ymm0, ymm0, ymm5
sub ecx, 32
vmovdqa [edx], ymm0
lea edx, [edx + 32]
jg convertloop
ret
}
}
#endif // HAS_ARGBTOYROW_AVX2
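For reference, a scalar sketch (not part of this patch) of the per-pixel arithmetic the loop above performs 32 pixels at a time; ARGB is little-endian, so each pixel's bytes arrive as B, G, R, A:

  static void ARGBToYRow_Reference(const uint8* src_argb, uint8* dst_y,
                                   int pix) {
    for (int i = 0; i < pix; ++i) {
      int b = src_argb[0];
      int g = src_argb[1];
      int r = src_argb[2];
      // vpmaddubsw + vphaddw compute 13*B + 65*G + 33*R + 0*A per pixel;
      // vpsrlw shifts right by 7 and vpaddb adds the +16 luma offset.
      dst_y[i] = (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
      src_argb += 4;
    }
  }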
__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
@@ -761,6 +839,44 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
}
#ifdef HAS_ARGBTOYROW_AVX2
__declspec(naked) __declspec(align(32))
void ARGBToYRow_Unaligned_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
vmovdqa ymm6, kShufARGBToY_AVX
vmovdqa ymm5, kAddY16_AVX
vmovdqa ymm4, kARGBToY_AVX
align 16
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + 64]
vmovdqu ymm3, [eax + 96]
vpmaddubsw ymm0, ymm0, ymm4
vpmaddubsw ymm1, ymm1, ymm4
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
lea eax, [eax + 128]
vphaddw ymm0, ymm0, ymm1
vphaddw ymm2, ymm2, ymm3
vpsrlw ymm0, ymm0, 7
vpsrlw ymm2, ymm2, 7
vpackuswb ymm0, ymm0, ymm2
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
vpaddb ymm0, ymm0, ymm5
sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
jg convertloop
ret
}
}
#endif // HAS_ARGBTOYROW_AVX2
__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
@@ -1031,6 +1147,80 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
}
#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(32))
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
vmovdqa ymm7, kARGBToU_AVX
vmovdqa ymm6, kARGBToV_AVX
vmovdqa ymm5, kAddUV128_AVX
sub edi, edx // stride from u to v
align 16
convertloop:
/* step 1 - subsample 32x2 argb pixels to 16x1 */
vmovdqa ymm0, [eax]
vmovdqa ymm1, [eax + 32]
vmovdqa ymm2, [eax + 64]
vmovdqa ymm3, [eax + 96]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
vpavgb ymm2, ymm2, [eax + esi + 64]
vpavgb ymm3, ymm3, [eax + esi + 96]
lea eax, [eax + 128]
vmovdqa ymm4, ymm0 // TODO(fbarchard): Remove.
vshufps ymm0, ymm0, ymm1, 0x88
vshufps ymm4, ymm4, ymm1, 0xdd
vpavgb ymm0, ymm0, ymm4
vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove.
vmovdqa ymm4, ymm2 // TODO(fbarchard): Remove.
vshufps ymm2, ymm2, ymm3, 0x88
vshufps ymm4, ymm4, ymm3, 0xdd
vpavgb ymm2, ymm2, ymm4
vpermq ymm2, ymm2, 0xd8 // TODO(fbarchard): Remove.
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 32 different pixels, it's 16 pixels of U and 16 of V
vmovdqa ymm1, ymm0 // TODO(fbarchard): Remove.
vmovdqa ymm3, ymm2 // TODO(fbarchard): Remove.
vpmaddubsw ymm0, ymm0, ymm7 // U
vpmaddubsw ymm2, ymm2, ymm7
vpmaddubsw ymm1, ymm1, ymm6 // V
vpmaddubsw ymm3, ymm3, ymm6
vphaddw ymm0, ymm0, ymm2
vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove.
vphaddw ymm1, ymm1, ymm3
vpermq ymm1, ymm1, 0xd8 // TODO(fbarchard): Remove.
vpsraw ymm0, ymm0, 8
vpsraw ymm1, ymm1, 8
vpacksswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8
vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values
sub ecx, 32
vextractf128 [edx], ymm0, 0 // store 16 U values
vextractf128 [edx + edi], ymm0, 1 // store 16 V values
lea edx, [edx + 16]
jg convertloop
pop edi
pop esi
ret
}
}
#endif // HAS_ARGBTOUVROW_AVX2
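A scalar sketch (again not part of the patch) of one pass of the U/V math: average each 2x2 ARGB block, then apply the kARGBToU/kARGBToV coefficients and the +128 bias from kAddUV128. The SIMD path averages with vpavgb twice, rounding at each step, so its output can differ from this sum-then-shift version by one LSB:

  static void ARGBToUVRow_Reference(const uint8* src_argb, int src_stride_argb,
                                    uint8* dst_u, uint8* dst_v, int width) {
    for (int i = 0; i < width; i += 2) {
      const uint8* t = src_argb + i * 4;     // top row: pixels i, i + 1
      const uint8* m = t + src_stride_argb;  // bottom row of the 2x2 block
      int b = (t[0] + t[4] + m[0] + m[4] + 2) >> 2;
      int g = (t[1] + t[5] + m[1] + m[5] + 2) >> 2;
      int r = (t[2] + t[6] + m[2] + m[6] + 2) >> 2;
      *dst_u++ = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
      *dst_v++ = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
    }
  }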
__declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
@@ -1101,6 +1291,80 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
}
#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(32))
void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
vmovdqa ymm7, kARGBToU_AVX
vmovdqa ymm6, kARGBToV_AVX
vmovdqa ymm5, kAddUV128_AVX
sub edi, edx // stride from u to v
align 16
convertloop:
/* step 1 - subsample 32x2 argb pixels to 16x1 */
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + 64]
vmovdqu ymm3, [eax + 96]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
vpavgb ymm2, ymm2, [eax + esi + 64]
vpavgb ymm3, ymm3, [eax + esi + 96]
lea eax, [eax + 128]
vmovdqa ymm4, ymm0
vshufps ymm0, ymm0, ymm1, 0x88
vshufps ymm4, ymm4, ymm1, 0xdd
vpavgb ymm0, ymm0, ymm4
vpermq ymm0, ymm0, 0xd8
vmovdqa ymm4, ymm2
vshufps ymm2, ymm2, ymm3, 0x88
vshufps ymm4, ymm4, ymm3, 0xdd
vpavgb ymm2, ymm2, ymm4
vpermq ymm2, ymm2, 0xd8
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 32 different pixels, it's 16 pixels of U and 16 of V
vmovdqa ymm1, ymm0
vmovdqa ymm3, ymm2
vpmaddubsw ymm0, ymm0, ymm7 // U
vpmaddubsw ymm2, ymm2, ymm7
vpmaddubsw ymm1, ymm1, ymm6 // V
vpmaddubsw ymm3, ymm3, ymm6
vphaddw ymm0, ymm0, ymm2
vpermq ymm0, ymm0, 0xd8
vphaddw ymm1, ymm1, ymm3
vpermq ymm1, ymm1, 0xd8
vpsraw ymm0, ymm0, 8
vpsraw ymm1, ymm1, 8
vpacksswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8
vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values
sub ecx, 32
vextractf128 [edx], ymm0, 0 // store 16 U values
vextractf128 [edx + edi], ymm0, 1 // store 16 V values
lea edx, [edx + 16]
jg convertloop
pop edi
pop esi
ret
}
}
#endif // HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) {