Commit 6825b161 authored by Frank Barchard, committed by Commit Bot

HalfFloat SSE2/AVX2 optimized port scheduling.

Uses 1 add instead of 2 leas to reduce port pressure on ports 1 and 5
used for SIMD instructions.

BUG=libyuv:670
TEST=~/iaca-lin64/bin/iaca.sh -arch HSW out/Release/obj/libyuv/row_gcc.o

Change-Id: I3965ee5dcb49941a535efa611b5988d977f5b65c
Reviewed-on: https://chromium-review.googlesource.com/433391Reviewed-by: 's avatarFrank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 7a54d0a3
......@@ -202,9 +202,8 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
uint8 b1 = src_argb[4] >> 3;
uint8 g1 = src_argb[5] >> 2;
uint8 r1 = src_argb[6] >> 3;
WRITEWORD(
dst_rgb,
b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27));
WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
(r1 << 27));
dst_rgb += 4;
src_argb += 8;
}
......@@ -238,9 +237,8 @@ void ARGBToRGB565DitherRow_C(const uint8* src_argb,
uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
WRITEWORD(
dst_rgb,
b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27));
WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
(r1 << 27));
dst_rgb += 4;
src_argb += 8;
}
......
// VERSION 2
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
......@@ -5457,12 +5456,13 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
asm volatile (
"pshufd $0x0,%3,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n" // 8 shorts
"lea " MEMLEA(0x10,0) ",%0 \n"
"add $0x10,%0 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
"cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
......@@ -5473,8 +5473,7 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
"psrld $0xd,%%xmm2 \n"
"psrld $0xd,%%xmm3 \n"
"packssdw %%xmm3,%%xmm2 \n"
"movdqu %%xmm2," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
......@@ -5488,17 +5487,17 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
#endif // HAS_HALFFLOATROW_SSE2
#ifdef HAS_HALFFLOATROW_AVX2
// TODO(fbarchard): consider vadddw instead of vmulps
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
asm volatile (
"vbroadcastss %3, %%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
"sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts
"lea " MEMLEA(0x20,0) ",%0 \n"
"add $0x20,%0 \n"
"vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
"vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
"vcvtdq2ps %%ymm3,%%ymm3 \n"
......@@ -5508,10 +5507,10 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
"vpsrld $0xd,%%ymm3,%%ymm3 \n"
"vpsrld $0xd,%%ymm2,%%ymm2 \n"
"vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
"vmovdqu %%ymm2," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
......@@ -5526,26 +5525,25 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
#ifdef HAS_HALFFLOATROW_F16C
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
asm volatile (
"vbroadcastss %3, %%ymm4 \n"
"vbroadcastss %3, %%ymm4 \n"
"sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
"vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
"vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"vcvtdq2ps %%ymm2,%%ymm2 \n"
"vcvtdq2ps %%ymm3,%%ymm3 \n"
"vmulps %%ymm2,%%ymm4,%%ymm2 \n"
"vmulps %%ymm3,%%ymm4,%%ymm3 \n"
"vcvtps2ph $3, %%ymm2, %%xmm2 \n"
"vcvtps2ph $3, %%ymm3, %%xmm3 \n"
"vmovdqu %%xmm2," MEMACCESS(1) " \n"
"vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
"add $0x20,%0 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
......@@ -5560,22 +5558,21 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
asm volatile (
"sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
"vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
"vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"vcvtdq2ps %%ymm2,%%ymm2 \n"
"vcvtdq2ps %%ymm3,%%ymm3 \n"
"vcvtps2ph $3, %%ymm2, %%xmm2 \n"
"vcvtps2ph $3, %%ymm3, %%xmm3 \n"
"vmovdqu %%xmm2," MEMACCESS(1) " \n"
"vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
"add $0x20,%0 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
......
......@@ -6070,11 +6070,12 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
mulss xmm4, kExpBias
pshufd xmm4, xmm4, 0
pxor xmm5, xmm5
sub edx, eax
// 8 pixel loop.
convertloop:
movdqu xmm2, xmmword ptr [eax] // 8 shorts
lea eax, [eax + 16]
add eax, 16
movdqa xmm3, xmm2
punpcklwd xmm2, xmm5
cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
......@@ -6085,8 +6086,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
psrld xmm2, 13
psrld xmm3, 13
packssdw xmm2, xmm3
movdqu [edx], xmm2
lea edx, [edx + 16]
movdqu [eax + edx - 16], xmm2
sub ecx, 8
jg convertloop
ret
......@@ -6108,11 +6108,12 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
vmulss xmm4, xmm4, kExpBias
vbroadcastss ymm4, xmm4
vpxor ymm5, ymm5, ymm5
sub edx, eax
// 16 pixel loop.
convertloop:
vmovdqu ymm2, [eax] // 16 shorts
lea eax, [eax + 32]
add eax, 32
vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
vpunpcklwd ymm2, ymm2, ymm5
vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
......@@ -6122,8 +6123,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
vpsrld ymm2, ymm2, 13
vpackssdw ymm2, ymm2, ymm3
vmovdqu [edx], ymm2
lea edx, [edx + 32]
vmovdqu [eax + edx - 32], ymm2
sub ecx, 16
jg convertloop
vzeroupper
......@@ -6142,21 +6142,21 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
mov edx, [esp + 8] /* dst */
vbroadcastss ymm4, [esp + 12] /* scale */
mov ecx, [esp + 16] /* width */
sub edx, eax
// 16 pixel loop.
convertloop:
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
lea eax, [eax + 32]
add eax, 32
vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
vcvtdq2ps ymm3, ymm3
vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
vmulps ymm3, ymm3, ymm4
vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
vcvtps2ph xmm3, ymm3, 3
vmovdqu [edx], xmm2
vmovdqu [edx + 16], xmm3
lea edx, [edx + 32]
vmovdqu [eax + edx + 32], xmm2
vmovdqu [eax + edx + 32 + 16], xmm3
sub ecx, 16
jg convertloop
vzeroupper
......
......@@ -45,9 +45,10 @@ static void ScalePlaneDown2(int src_width,
int y;
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
filtering == kFilterNone ? ScaleRowDown2_C : (filtering == kFilterLinear
? ScaleRowDown2Linear_C
: ScaleRowDown2Box_C);
filtering == kFilterNone
? ScaleRowDown2_C
: (filtering == kFilterLinear ? ScaleRowDown2Linear_C
: ScaleRowDown2Box_C);
int row_stride = src_stride << 1;
if (!filtering) {
src_ptr += src_stride; // Point to odd rows.
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment