Commit 6825b161 authored by Frank Barchard, committed by Commit Bot

HalfFloat SSE2/AVX2 optimized port scheduling.

Uses 1 add instead of 2 leas to reduce pressure on ports 1 and 5,
which are used for SIMD instructions.

BUG=libyuv:670
TEST=~/iaca-lin64/bin/iaca.sh -arch HSW out/Release/obj/libyuv/row_gcc.o

Change-Id: I3965ee5dcb49941a535efa611b5988d977f5b65c
Reviewed-on: https://chromium-review.googlesource.com/433391
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 7a54d0a3
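Every hunk below applies the same transformation: subtract the source pointer from the destination pointer once before the loop, then form each store address from the single advancing source pointer plus that constant difference. On Haswell a two-operand lea competes for ports 1 and 5, which the SIMD shuffle and multiply uops also need, while a plain add can issue on ports 0, 1, 5 and 6. A minimal scalar sketch of the addressing trick (illustrative C, not libyuv code; the function name is hypothetical, and the pointer subtraction mirrors what the asm does with raw addresses):

#include <stddef.h>
#include <stdint.h>

/* One pointer update per iteration: the destination is reached through the
 * constant (dst - src) difference instead of a second lea on its own
 * register. */
void CopyRow_OneIndex(const uint8_t* src, uint8_t* dst, int width) {
  ptrdiff_t dst_minus_src = dst - src;  /* the "sub %0,%1" before the loop */
  while (width > 0) {
    uint8_t v = *src;                   /* load through src */
    src += 1;                           /* the single "add" per iteration */
    *(uint8_t*)(src + dst_minus_src - 1) = v;  /* store to the matching dst */
    width -= 1;
  }
}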
@@ -202,9 +202,8 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
     uint8 b1 = src_argb[4] >> 3;
     uint8 g1 = src_argb[5] >> 2;
     uint8 r1 = src_argb[6] >> 3;
-    WRITEWORD(
-        dst_rgb,
-        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27));
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
@@ -238,9 +237,8 @@ void ARGBToRGB565DitherRow_C(const uint8* src_argb,
     uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
     uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
     uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
-    WRITEWORD(
-        dst_rgb,
-        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27));
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
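Both hunks above are formatting-only rewraps of the WRITEWORD call. For reference, the value being stored packs two RGB565 pixels into one little-endian 32-bit word; a standalone sketch of that packing (hypothetical helper, not in libyuv):

#include <stdint.h>

/* The b/g/r inputs are already reduced to 5/6/5 bits by the >>3 and >>2
 * shifts in the rows above; pixel 0 occupies the low 16 bits. */
static uint32_t PackTwoRGB565(uint8_t b0, uint8_t g0, uint8_t r0,
                              uint8_t b1, uint8_t g1, uint8_t r1) {
  return (uint32_t)b0 | ((uint32_t)g0 << 5) | ((uint32_t)r0 << 11) |
         ((uint32_t)b1 << 16) | ((uint32_t)g1 << 21) | ((uint32_t)r1 << 27);
}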
...
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
@@ -5457,12 +5456,13 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
   asm volatile (
     "pshufd $0x0,%3,%%xmm4 \n"
     "pxor %%xmm5,%%xmm5 \n"
+    "sub %0,%1 \n"

     // 16 pixel loop.
     LABELALIGN
   "1: \n"
     "movdqu " MEMACCESS(0) ",%%xmm2 \n"  // 8 shorts
-    "lea " MEMLEA(0x10,0) ",%0 \n"
+    "add $0x10,%0 \n"
     "movdqa %%xmm2,%%xmm3 \n"
     "punpcklwd %%xmm5,%%xmm2 \n"  // 8 ints in xmm2/1
     "cvtdq2ps %%xmm2,%%xmm2 \n"  // 8 floats
@@ -5473,8 +5473,7 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
     "psrld $0xd,%%xmm2 \n"
     "psrld $0xd,%%xmm3 \n"
     "packssdw %%xmm3,%%xmm2 \n"
-    "movdqu %%xmm2," MEMACCESS(1) " \n"
-    "lea " MEMLEA(0x10,1) ",%1 \n"
+    MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
     "sub $0x8,%2 \n"
     "jg 1b \n"
   : "+r"(src),  // %0
@@ -5488,17 +5487,17 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
 #endif  // HAS_HALFFLOATROW_SSE2

 #ifdef HAS_HALFFLOATROW_AVX2
-// TODO(fbarchard): consider vadddw instead of vmulps
 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
   asm volatile (
     "vbroadcastss %3, %%ymm4 \n"
     "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+    "sub %0,%1 \n"

     // 16 pixel loop.
     LABELALIGN
   "1: \n"
     "vmovdqu " MEMACCESS(0) ",%%ymm2 \n"  // 16 shorts
-    "lea " MEMLEA(0x20,0) ",%0 \n"
+    "add $0x20,%0 \n"
     "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n"  // mutates
     "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
     "vcvtdq2ps %%ymm3,%%ymm3 \n"
@@ -5508,10 +5507,10 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
     "vpsrld $0xd,%%ymm3,%%ymm3 \n"
     "vpsrld $0xd,%%ymm2,%%ymm2 \n"
     "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n"  // unmutates
-    "vmovdqu %%ymm2," MEMACCESS(1) " \n"
-    "lea " MEMLEA(0x20,1) ",%1 \n"
+    MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
     "sub $0x10,%2 \n"
     "jg 1b \n"
     "vzeroupper \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
@@ -5526,26 +5525,25 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
 #ifdef HAS_HALFFLOATROW_F16C
 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
   asm volatile (
     "vbroadcastss %3, %%ymm4 \n"
+    "sub %0,%1 \n"

     // 16 pixel loop.
     LABELALIGN
   "1: \n"
     "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n"  // 16 shorts -> 16 ints
     "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
-    "lea " MEMLEA(0x20,0) ",%0 \n"
     "vcvtdq2ps %%ymm2,%%ymm2 \n"
     "vcvtdq2ps %%ymm3,%%ymm3 \n"
     "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
     "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
     "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
     "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
-    "vmovdqu %%xmm2," MEMACCESS(1) " \n"
-    "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
-    "lea " MEMLEA(0x20,1) ",%1 \n"
+    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
+    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
+    "add $0x20,%0 \n"
     "sub $0x10,%2 \n"
     "jg 1b \n"
     "vzeroupper \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
@@ -5560,22 +5558,21 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
 #ifdef HAS_HALFFLOATROW_F16C
 void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
   asm volatile (
+    "sub %0,%1 \n"

     // 16 pixel loop.
     LABELALIGN
   "1: \n"
     "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n"  // 16 shorts -> 16 ints
     "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
-    "lea " MEMLEA(0x20,0) ",%0 \n"
     "vcvtdq2ps %%ymm2,%%ymm2 \n"
     "vcvtdq2ps %%ymm3,%%ymm3 \n"
     "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
     "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
-    "vmovdqu %%xmm2," MEMACCESS(1) " \n"
-    "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
-    "lea " MEMLEA(0x20,1) ",%1 \n"
+    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
+    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
+    "add $0x20,%0 \n"
     "sub $0x10,%2 \n"
     "jg 1b \n"
     "vzeroupper \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
...
@@ -6070,11 +6070,12 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
     mulss xmm4, kExpBias
     pshufd xmm4, xmm4, 0
     pxor xmm5, xmm5
+    sub edx, eax

     // 8 pixel loop.
  convertloop:
     movdqu xmm2, xmmword ptr [eax]  // 8 shorts
-    lea eax, [eax + 16]
+    add eax, 16
     movdqa xmm3, xmm2
     punpcklwd xmm2, xmm5
     cvtdq2ps xmm2, xmm2  // convert 8 ints to floats
@@ -6085,8 +6086,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
     psrld xmm2, 13
     psrld xmm3, 13
     packssdw xmm2, xmm3
-    movdqu [edx], xmm2
-    lea edx, [edx + 16]
+    movdqu [eax + edx - 16], xmm2
     sub ecx, 8
     jg convertloop
     ret
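The kExpBias factor folded into xmm4 above is what makes a plain 13-bit shift produce half floats: multiplying by 2^-112 moves the value from the float exponent domain (bias 127) into the half domain (bias 15), after which the upper float bits, shifted right by 13, are the half-float bit pattern. A scalar sketch of the same trick (valid for the non-negative, non-overflowing values these rows produce; assumes kExpBias is 1.9259299444e-34f, i.e. 2^-112, as in libyuv's row files):

#include <stdint.h>
#include <string.h>

/* Per element, "psrld xmm, 13" on the biased product is equivalent to: */
static uint16_t FloatToHalfViaBias(float value, float scale) {
  float biased = value * scale * 1.9259299444e-34f;  /* kExpBias = 2^-112 */
  uint32_t bits;
  memcpy(&bits, &biased, sizeof(bits));  /* reinterpret float as its bits */
  return (uint16_t)(bits >> 13);         /* truncate into half layout */
}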
@@ -6108,11 +6108,12 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
     vmulss xmm4, xmm4, kExpBias
     vbroadcastss ymm4, xmm4
     vpxor ymm5, ymm5, ymm5
+    sub edx, eax

     // 16 pixel loop.
  convertloop:
     vmovdqu ymm2, [eax]  // 16 shorts
-    lea eax, [eax + 32]
+    add eax, 32
     vpunpckhwd ymm3, ymm2, ymm5  // convert 16 shorts to 16 ints
     vpunpcklwd ymm2, ymm2, ymm5
     vcvtdq2ps ymm3, ymm3  // convert 16 ints to floats
@@ -6122,8 +6123,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
     vpsrld ymm3, ymm3, 13  // float convert to 8 half floats truncate
     vpsrld ymm2, ymm2, 13
     vpackssdw ymm2, ymm2, ymm3
-    vmovdqu [edx], ymm2
-    lea edx, [edx + 32]
+    vmovdqu [eax + edx - 32], ymm2
     sub ecx, 16
     jg convertloop
     vzeroupper
@@ -6142,21 +6142,21 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
     mov edx, [esp + 8]  /* dst */
     vbroadcastss ymm4, [esp + 12]  /* scale */
     mov ecx, [esp + 16]  /* width */
+    sub edx, eax

     // 16 pixel loop.
  convertloop:
     vpmovzxwd ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
     vpmovzxwd ymm3, xmmword ptr [eax + 16]  // 8 more shorts
-    lea eax, [eax + 32]
+    add eax, 32
     vcvtdq2ps ymm2, ymm2  // convert 8 ints to floats
     vcvtdq2ps ymm3, ymm3
     vmulps ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
     vmulps ymm3, ymm3, ymm4
     vcvtps2ph xmm2, ymm2, 3  // float convert to 8 half floats truncate
     vcvtps2ph xmm3, ymm3, 3
-    vmovdqu [edx], xmm2
-    vmovdqu [edx + 16], xmm3
-    lea edx, [edx + 32]
+    vmovdqu [eax + edx - 32], xmm2
+    vmovdqu [eax + edx + 16 - 32], xmm3
     sub ecx, 16
     jg convertloop
     vzeroupper
...
@@ -45,9 +45,10 @@ static void ScalePlaneDown2(int src_width,
   int y;
   void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) =
-      filtering == kFilterNone ? ScaleRowDown2_C : (filtering == kFilterLinear
-                                                        ? ScaleRowDown2Linear_C
-                                                        : ScaleRowDown2Box_C);
+      filtering == kFilterNone
+          ? ScaleRowDown2_C
+          : (filtering == kFilterLinear ? ScaleRowDown2Linear_C
+                                        : ScaleRowDown2Box_C);
   int row_stride = src_stride << 1;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
...