Commit 22e062a4 authored by Frank Barchard's avatar Frank Barchard

Port ARGBToJ420 to AVX2

ARGBToJ420 had an SSSE3 version, but not AVX2.
ARGBToI420 had an AVX2, so adapt that code to J420.

TBR=harryjin@google.com
BUG=libyuv:553

Review URL: https://codereview.chromium.org/1702373004 .
parent 127ff512
...@@ -1023,6 +1023,67 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, ...@@ -1023,6 +1023,67 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
} }
#endif // HAS_ARGBTOUVROW_AVX2 #endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
"vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
"lea " MEMLEA(0x80,0) ",%0 \n"
"vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
"vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
"vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
"vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
"vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
"vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
"vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
"vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
"vpsraw $0x8,%%ymm1,%%ymm1 \n"
"vpsraw $0x8,%%ymm0,%%ymm0 \n"
"vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpshufb %8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(kAddUVJ128), // %5
"m"(kARGBToVJ), // %6
"m"(kARGBToUJ), // %7
"m"(kShufARGBToUV_AVX) // %8
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_ARGBTOUVJROW_AVX2
#ifdef HAS_ARGBTOUVJROW_SSSE3 #ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) { uint8* dst_u, uint8* dst_v, int width) {
...@@ -1475,7 +1536,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, ...@@ -1475,7 +1536,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
#define READYUV411_TEMP \ #define READYUV411_TEMP \
"movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \ "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \
"movd %[temp],%%xmm0 \n" \ "movd %[temp],%%xmm0 \n" \
MEMOPARG(movzwl,0x00,[u_buf],[v_buf],1,[temp]) " \n" \ MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \
"movd %[temp],%%xmm1 \n" \ "movd %[temp],%%xmm1 \n" \
"lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \
...@@ -2032,7 +2093,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, ...@@ -2032,7 +2093,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
#define YUVTORGB_REGS_AVX2 \ #define YUVTORGB_REGS_AVX2 \
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
#else// Convert 16 pixels: 16 UV and 16 Y. #else // Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_SETUP_AVX2(yuvconstants) #define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB_AVX2(yuvconstants) \ #define YUVTORGB_AVX2(yuvconstants) \
"vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
......
...@@ -1505,7 +1505,7 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1505,7 +1505,7 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
pmaddubsw xmm3, xmm6 pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2 phaddw xmm0, xmm2
phaddw xmm1, xmm3 phaddw xmm1, xmm3
paddw xmm0, xmm5 // +.5 rounding -> unsigned paddw xmm0, xmm5 // +.5 rounding -> unsigned
paddw xmm1, xmm5 paddw xmm1, xmm5
psraw xmm0, 8 psraw xmm0, 8
psraw xmm1, 8 psraw xmm1, 8
...@@ -1590,6 +1590,73 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, ...@@ -1590,6 +1590,73 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
} }
#endif // HAS_ARGBTOUVROW_AVX2 #endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
__declspec(naked)
void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
vbroadcastf128 ymm5, xmmword ptr kAddUV128
vbroadcastf128 ymm6, xmmword ptr kARGBToV
vbroadcastf128 ymm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
convertloop:
/* step 1 - subsample 32x2 argb pixels to 16x1 */
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + 64]
vmovdqu ymm3, [eax + 96]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
vpavgb ymm2, ymm2, [eax + esi + 64]
vpavgb ymm3, ymm3, [eax + esi + 96]
lea eax, [eax + 128]
vshufps ymm4, ymm0, ymm1, 0x88
vshufps ymm0, ymm0, ymm1, 0xdd
vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
vshufps ymm4, ymm2, ymm3, 0x88
vshufps ymm2, ymm2, ymm3, 0xdd
vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 32 different pixels, its 16 pixels of U and 16 of V
vpmaddubsw ymm1, ymm0, ymm7 // U
vpmaddubsw ymm3, ymm2, ymm7
vpmaddubsw ymm0, ymm0, ymm6 // V
vpmaddubsw ymm2, ymm2, ymm6
vphaddw ymm1, ymm1, ymm3 // mutates
vphaddw ymm0, ymm0, ymm2
vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
vpaddw ymm0, ymm0, ymm5
vpsraw ymm1, ymm1, 8
vpsraw ymm0, ymm0, 8
vpacksswb ymm0, ymm1, ymm0 // mutates
vpermq ymm0, ymm0, 0xd8 // For vpacksswb
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
// step 3 - store 16 U and 16 V values
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
sub ecx, 32
jg convertloop
pop edi
pop esi
vzeroupper
ret
}
}
#endif // HAS_ARGBTOUVJROW_AVX2
__declspec(naked) __declspec(naked)
void ARGBToUV444Row_SSSE3(const uint8* src_argb0, void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) { uint8* dst_u, uint8* dst_v, int width) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment