Commit 70445ef2 authored by Frank Barchard's avatar Frank Barchard

avx2 scale down by 2 for gcc

R=dhrosa@google.com, harryjin@google.com
BUG=libyuv:527

Review URL: https://codereview.chromium.org/1520423003 .
parent 77346fcb
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1554
Version: 1555
License: BSD
License File: LICENSE
......
......@@ -69,12 +69,12 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2
#endif
// The following are available for Visual C and clangcl 32 bit:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2
#endif
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1554
#define LIBYUV_VERSION 1555
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -188,6 +188,104 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1"
);
}
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
);
}
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vpsrlw $0x1,%%ymm0,%%ymm0 \n"
"vpsrlw $0x1,%%ymm1,%%ymm1 \n"
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
......
......@@ -298,18 +298,18 @@ static int TestFilter_16(int src_width, int src_height,
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtering is different fixed point implementations for SSSE3, Neon and C.
#define TEST_FACTOR(name, nom, denom) \
#define TEST_FACTOR(name, nom, denom, maxdiff) \
TEST_FACTOR1(name, None, nom, denom, 0) \
TEST_FACTOR1(name, Linear, nom, denom, 3) \
TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
TEST_FACTOR1(name, Box, nom, denom, 3)
TEST_FACTOR(2, 1, 2)
TEST_FACTOR(4, 1, 4)
TEST_FACTOR(8, 1, 8)
TEST_FACTOR(3by4, 3, 4)
TEST_FACTOR(3by8, 3, 8)
TEST_FACTOR(3, 1, 3)
TEST_FACTOR1(name, Linear, nom, denom, maxdiff) \
TEST_FACTOR1(name, Bilinear, nom, denom, maxdiff) \
TEST_FACTOR1(name, Box, nom, denom, maxdiff)
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 3)
TEST_FACTOR(8, 1, 8, 3)
TEST_FACTOR(3by4, 3, 4, 3)
TEST_FACTOR(3by8, 3, 8, 3)
TEST_FACTOR(3, 1, 3, 3)
#undef TEST_FACTOR1
#undef TEST_FACTOR
#undef SX
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment