Commit 2d80fc31 authored by Frank Barchard

Port HalfFloatRow_SSE2 to AVX2 but not using F16C.

R=wangcheng@google.com, hubbe@chromium.org
BUG=libyuv:560

Review URL: https://codereview.chromium.org/2421993002 .
parent fdcf524a
......@@ -201,6 +201,7 @@ extern "C" {
#define HAS_COPYROW_AVX
#define HAS_H422TOARGBROW_AVX2
#define HAS_HALFFLOATROW_AVX2
// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast
#define HAS_I400TOARGBROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
......@@ -1931,11 +1932,14 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
// Scale and convert to half float.
void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width);
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width);
void HalfFloatRow_Any_SSE2(const uint16* src, uint16* dst, float scale,
int width);
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width);
void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
int width);
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width);
void HalfFloatRow_Any_SSE2(const uint16* src, uint16* dst, float scale,
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale,
int width);
void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
......
......@@ -2570,12 +2570,20 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
}
#endif
#if defined(HAS_HALFFLOATROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
if (TestCpuFlag(kCpuHasAVX2)) {
HalfFloatRow = HalfFloatRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
HalfFloatRow = HalfFloatRow_AVX2;
}
}
#endif
#if defined(HAS_HALFFLOATROW_F16C)
if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
HalfFloatRow = HalfFloatRow_Any_F16C;
if (IS_ALIGNED(width, 16)) {
HalfFloatRow = HalfFloatRow_F16C;
}
}
#endif
for (y = 0; y < height; ++y) {
HalfFloatRow(src_y, dst_y, scale, width);
......
......@@ -576,9 +576,11 @@ ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 15)
// "Any" width wrappers for the half float row functions. The ANY11P16 macro
// definition is not shown here; the final argument (15) is presumably the
// width mask for the 16-values-per-iteration SIMD kernels — confirm against
// the macro definition.
#ifdef HAS_HALFFLOATROW_AVX2
ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
#endif
#ifdef HAS_HALFFLOATROW_F16C
ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15)
#endif
#undef ANY11P16
// Any 1 to 1 with yuvconstants
#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
......
......@@ -5341,7 +5341,43 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
#endif // HAS_HALFFLOATROW_SSE2
#ifdef HAS_HALFFLOATROW_AVX2
// TODO(fbarchard): consider vadddw instead of vmulps
// Scale a row of 16-bit integers and convert it to 16-bit half floats using
// AVX2 integer/float instructions only (no F16C).
//
// Per element: zero-extend the uint16 to 32 bits, convert to float, multiply
// by scale * kScaleBias, then shift the float's bit pattern right by 13 and
// pack the result down to 16 bits.
//
//   src   - input row of uint16 values.
//   dst   - output row of 16-bit results.
//   scale - caller-supplied multiplier; combined with kScaleBias (defined
//           elsewhere in this file; presumably rebiases the float exponent
//           for the half float format - confirm at its definition).
//   width - number of values. The loop consumes 16 values per iteration and
//           always executes at least once, so callers are expected to pass a
//           positive multiple of 16.
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "vbroadcastss  %3, %%ymm4                  \n"  // ymm4 = 8 x multiplier
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = 0 (zero extend)

    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm2        \n"  // 16 shorts
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    // AT&T operand order is (src2, src1, dst). The zero register must be
    // src2 so that each 32-bit lane is (0 << 16) | value. With the sources
    // the other way around the value lands in the high word, i.e. it is
    // multiplied by 65536 before the int to float conversion.
    "vpunpckhwd %%ymm5,%%ymm2,%%ymm3           \n"  // shorts 4-7, 12-15 -> ints
    "vpunpcklwd %%ymm5,%%ymm2,%%ymm2           \n"  // shorts 0-3, 8-11 -> ints
    "vcvtdq2ps  %%ymm3,%%ymm3                  \n"  // ints -> floats
    "vcvtdq2ps  %%ymm2,%%ymm2                  \n"
    "vmulps     %%ymm3,%%ymm4,%%ymm3           \n"  // apply scale * kScaleBias
    "vmulps     %%ymm2,%%ymm4,%%ymm2           \n"
    "vpsrld     $0xd,%%ymm3,%%ymm3             \n"  // drop low 13 bits (truncate)
    "vpsrld     $0xd,%%ymm2,%%ymm2             \n"
    // Pack works per 128-bit lane, as the unpacks above did, so the original
    // element order is restored. Signed saturation: 32-bit results above
    // 0x7fff are clamped to 0x7fff.
    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"
    "vmovdqu    %%ymm2," MEMACCESS(1) "        \n"  // store 16 results
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"  // avoid AVX/SSE transition cost
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "x"(scale * kScaleBias)  // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_HALFFLOATROW_AVX2
#ifdef HAS_HALFFLOATROW_F16C
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
asm volatile (
"vbroadcastss %3, %%ymm4 \n"
......@@ -5362,6 +5398,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
......@@ -5371,7 +5408,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
"xmm2", "xmm3", "xmm4"
);
}
#endif // HAS_HALFFLOATROW_AVX2
#endif // HAS_HALFFLOATROW_F16C
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
......
......@@ -6056,13 +6056,49 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
#ifdef HAS_HALFFLOATROW_AVX2
// Scale a row of 16-bit integers and convert it to 16-bit half floats using
// AVX2 without F16C. Naked function: arguments are read directly from the
// stack relative to esp and the asm block performs its own ret.
// Register use: eax = src, edx = dst, ecx = remaining width,
// ymm4 = broadcast multiplier (scale * kExpBias), ymm5 = zero.
// The loop handles 16 values per iteration and runs at least once.
__declspec(naked)
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
__asm {
mov eax, [esp + 4] /* src */
mov edx, [esp + 8] /* dst */
movd xmm4, dword ptr [esp + 12] /* scale */
mov ecx, [esp + 16] /* width */
// kExpBias is defined elsewhere in the file; presumably it rebiases the
// float exponent for the half float format - confirm at its definition.
vmulss xmm4, xmm4, kExpBias
vbroadcastss ymm4, xmm4 // replicate the multiplier to all 8 float lanes
vpxor ymm5, ymm5, ymm5 // zero, used as the high word when widening
// 16 pixel loop.
convertloop:
vmovdqu ymm2, [eax] // 16 shorts
lea eax, [eax + 32]
// Interleaving with zero as the second source zero-extends each short to a
// 32 bit int. The unpacks work per 128 bit lane.
vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
vpunpcklwd ymm2, ymm2, ymm5
vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
vcvtdq2ps ymm2, ymm2
vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
vmulps ymm2, ymm2, ymm4
vpsrld ymm3, ymm3, 13 // shift float bits right by 13; low bits are truncated
vpsrld ymm2, ymm2, 13
// Pack 32 bit results to 16 bits with signed saturation. Pack also works
// per 128 bit lane, so the element order from before the unpacks is restored.
vpackssdw ymm2, ymm2, ymm3
vmovdqu [edx], ymm2 // store 16 results
lea edx, [edx + 32]
sub ecx, 16
jg convertloop
vzeroupper
ret
}
}
#endif // HAS_HALFFLOATROW_AVX2
#ifdef HAS_HALFFLOATROW_F16C
__declspec(naked)
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
__asm {
mov eax, [esp + 4] /* src */
mov edx, [esp + 8] /* dst */
vbroadcastss ymm4, [esp + 12] /* scale */
mov ecx, [esp + 16] /* width */
// 8 pixel loop.
// 16 pixel loop.
convertloop:
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
......@@ -6082,7 +6118,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
ret
}
}
#endif // HAS_HALFFLOATROW_AVX2
#endif // HAS_HALFFLOATROW_F16C
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment