Commit aa197ee1 authored by Frank Barchard

HalfFloat_SSE2 for Visual C

Low level support for 12 bit 420, 422 and 444 YUV video frame conversion.

BUG=libyuv:560, chromium:445071
TEST=LibYUVPlanarTest.TestHalfFloatPlane on windows
R=hubbe@chromium.org, wangcheng@google.com

Review URL: https://codereview.chromium.org/2387713002 .
parent 4a14cb2e
......@@ -140,6 +140,7 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2
#define HAS_HALFFLOATROW_SSE2
// Effects:
#define HAS_ARGBADDROW_SSE2
......@@ -262,13 +263,6 @@ extern "C" {
#define HAS_I422TOARGBROW_SSSE3
#endif
// The following are available on gcc x86 platforms:
// TODO(fbarchard): Port to Visual C.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_HALFFLOATROW_SSE2
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
......
......@@ -2486,15 +2486,6 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_HALFFLOATROW_AVX)
if (TestCpuFlag(kCpuHasAVX)) {
// HalfFloatRow = HalfFloatRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
HalfFloatRow = HalfFloatRow_AVX;
}
}
#endif
for (y = 0; y < height; ++y) {
HalfFloatRow(src_y, dst_y, scale, width);
src_y += src_stride_y;
......
......@@ -5367,38 +5367,37 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
#ifdef HAS_HALFFLOATROW_SSE2
// Exponent-bias constant: multiplying a float by this value shifts its
// exponent down so that the half-float (fp16) bit pattern can be extracted
// from the single-precision bits with a plain 13-bit right shift
// (truncating conversion, no rounding).
static float kScaleBias = 1.9259299444e-34f;

// Convert 'width' uint16 pixels to half floats, each scaled by 'scale'.
// Processes 8 pixels per loop iteration; assumes width is a positive
// multiple of 8 -- TODO(review): confirm via the Any-row dispatch in callers.
// NOTE: the scraped diff had old (xmm0/xmm1) and new (xmm2/xmm3) lines
// interleaved; this is the reconstructed post-commit version.
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    // Broadcast (scale * kScaleBias) from operand %3 to all 4 lanes of xmm4.
    "pshufd      $0x0,%3,%%xmm4                \n"
    "pxor        %%xmm5,%%xmm5                 \n"  // zero for u16->u32 unpack

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu      " MEMACCESS(0) ",%%xmm2      \n"  // 8 shorts
    "lea         " MEMLEA(0x10,0) ",%0        \n"
    "movdqa      %%xmm2,%%xmm3                \n"
    "punpcklwd   %%xmm5,%%xmm2                \n"  // 8 ints in xmm2/3
    "cvtdq2ps    %%xmm2,%%xmm2                \n"  // 8 floats
    "punpckhwd   %%xmm5,%%xmm3                \n"
    "cvtdq2ps    %%xmm3,%%xmm3                \n"
    "mulps       %%xmm4,%%xmm2                \n"  // apply scale + exponent bias
    "mulps       %%xmm4,%%xmm3                \n"
    "psrld       $0xd,%%xmm2                  \n"  // single -> half float bits
    "psrld       $0xd,%%xmm3                  \n"
    "packssdw    %%xmm3,%%xmm2                \n"  // pack 8 results
    "movdqu      %%xmm2," MEMACCESS(1) "      \n"
    "lea         " MEMLEA(0x10,1) ",%1        \n"
    "sub         $0x8,%2                      \n"
    "jg          1b                           \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "x"(scale * kScaleBias)  // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_HALFFLOATROW_SSE2
......@@ -5411,17 +5410,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
// 16 pixel loop.
LABELALIGN
"1: \n"
"vpmovzxwd " MEMACCESS(0) ",%%ymm0 \n" // 8 shorts -> 8 ints
"vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm1 \n" // 8 more
"vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts -> 8 ints
"vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" // 8 more
"lea " MEMLEA(0x20,0) ",%0 \n"
"vcvtdq2ps %%ymm0,%%ymm0 \n"
"vcvtdq2ps %%ymm1,%%ymm1 \n"
"vmulps %%ymm0,%%ymm4,%%ymm0 \n"
"vmulps %%ymm1,%%ymm4,%%ymm1 \n"
"vcvtps2ph $3, %%ymm0, %%xmm0 \n"
"vcvtps2ph $3, %%ymm1, %%xmm1 \n"
"vmovdqu %%xmm0," MEMACCESS(1) " \n"
"vmovdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"vcvtdq2ps %%ymm2,%%ymm2 \n"
"vcvtdq2ps %%ymm3,%%ymm3 \n"
"vmulps %%ymm2,%%ymm4,%%ymm2 \n"
"vmulps %%ymm3,%%ymm4,%%ymm3 \n"
"vcvtps2ph $3, %%ymm2, %%xmm2 \n"
"vcvtps2ph $3, %%ymm3, %%xmm3 \n"
"vmovdqu %%xmm2," MEMACCESS(1) " \n"
"vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
......@@ -5431,7 +5430,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
"+r"(width) // %2
: "x"(scale) // %3
: "memory", "cc",
"xmm0", "xmm1", "xmm4"
"xmm2", "xmm3", "xmm4"
);
}
#endif // HAS_HALFFLOATROW_AVX2
......
......@@ -6095,6 +6095,42 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
#ifdef HAS_HALFFLOATROW_SSE2
// Exponent-bias constant: multiplying a float by this value shifts its
// exponent down so that the half-float (fp16) bit pattern can be extracted
// from the single-precision bits with a plain 13-bit right shift
// (truncating conversion, no rounding).
static float kExpBias = 1.9259299444e-34f;
// Convert 'width' uint16 pixels to half floats, each scaled by 'scale'.
// cdecl, 32-bit x86; __declspec(naked), so argument access and 'ret' are
// written by hand and no prologue/epilogue is emitted by the compiler.
// Processes 8 pixels per iteration; assumes width is a positive multiple
// of 8 -- TODO(review): confirm via the Any-row dispatch in callers.
__declspec(naked)
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
__asm {
mov eax, [esp + 4] /* src */
mov edx, [esp + 8] /* dst */
movd xmm4, dword ptr [esp + 12] /* scale */
mov ecx, [esp + 16] /* width */
mulss xmm4, kExpBias // fold exponent bias into the scale factor
pshufd xmm4, xmm4, 0 // broadcast combined scale to all 4 float lanes
pxor xmm5, xmm5 // zero register for widening unpacks
// 8 pixel loop.
convertloop:
movdqu xmm2, xmmword ptr [eax] // 8 shorts
lea eax, [eax + 16]
movdqa xmm3, xmm2
punpcklwd xmm2, xmm5 // low 4 shorts zero-extended to uint32
cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
punpckhwd xmm3, xmm5 // high 4 shorts zero-extended to uint32
cvtdq2ps xmm3, xmm3
mulps xmm2, xmm4 // apply scale + exponent bias
mulps xmm3, xmm4
psrld xmm2, 13 // shift single-precision bits into half position
psrld xmm3, 13
packssdw xmm2, xmm3 // pack 8 half floats into one register
movdqu [edx], xmm2
lea edx, [edx + 16]
sub ecx, 8
jg convertloop
ret
}
}
#endif // HAS_HALFFLOATROW_SSE2
#ifdef HAS_HALFFLOATROW_AVX2
__declspec(naked)
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
......@@ -6106,17 +6142,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
// 8 pixel loop.
convertloop:
vpmovzxwd ymm0, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm1, xmmword ptr [eax + 16] // 8 more shorts
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
lea eax, [eax + 32]
vcvtdq2ps ymm0, ymm0 // convert 8 ints to floats
vcvtdq2ps ymm1, ymm1
vmulps ymm0, ymm0, ymm4 // scale to normalized range 0 to 1
vmulps ymm1, ymm1, ymm4
vcvtps2ph xmm0, ymm0, 3 // float convert to 8 half floats truncate
vcvtps2ph xmm1, ymm1, 3
vmovdqu [edx], xmm0
vmovdqu [edx + 16], xmm1
vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
vcvtdq2ps ymm3, ymm3
vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
vmulps ymm3, ymm3, ymm4
vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
vcvtps2ph xmm3, ymm3, 3
vmovdqu [edx], xmm2
vmovdqu [edx + 16], xmm3
lea edx, [edx + 32]
sub ecx, 16
jg convertloop
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment