Commit 91000425 authored by fbarchard@google.com

ARGBUnattenuate_AVX2 ported to GCC. Minor cleanup of constants to use broadcast, so a 16-byte constant replaces each 32-byte one.
BUG=269
TESTED=try bots
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/30999004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1163 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f8c33447
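
A sketch of the constant cleanup the message describes, shown with AVX2 intrinsics rather than the commit's inline assembly (the mask value is kUnattenShuffleAlpha_AVX2 from the diff below; the function name is illustrative):

#include <immintrin.h>
#include <stdint.h>

// 16-byte (uvec8) shuffle mask, stored once in the data section.
static const uint8_t kShuffle16[16] = {
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};

// One 16-byte load replicated into both 128-bit lanes (vbroadcastf128)
// yields the same ymm value a 32-byte (ulvec8) constant would, at half
// the data-section footprint.
static inline __m256i LoadShuffleMask(void) {
  return _mm256_broadcastsi128_si256(
      _mm_loadu_si128((const __m128i*)kShuffle16));
}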
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1162
Version: 1163
License: BSD
License File: LICENSE
@@ -205,6 +205,7 @@ extern "C" {
#define HAS_ARGBSUBTRACTROW_AVX2
#define HAS_ARGBMULTIPLYROW_AVX2
#define HAS_ARGBATTENUATEROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
#endif
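
These HAS_ defines only advertise compile-time availability; a row function still has to pass a runtime CPU check before it is selected. A sketch of the usual dispatch pattern (as in libyuv's planar_functions.cc; the exact guard at this revision may differ):

#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
    ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
  }
#endif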
// The following require VS2012.
@@ -218,10 +219,7 @@ extern "C" {
#define HAS_I422TOABGRROW_AVX2
#define HAS_INTERPOLATEROW_AVX2
#define HAS_MIRRORROW_AVX2
// Effects:
#define HAS_ARGBMIRRORROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
#endif // defined(VISUALC_HAS_AVX2)
// The following are Yasm x86 only:
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1162
#define LIBYUV_VERSION 1163
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -3379,7 +3379,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"jge 40b \n"
"jmp 49f \n"
// 4 pixel unaligned loop.
// 4 pixel loop.
LABELALIGN
"41: \n"
"movdqu " MEMACCESS(0) ",%%xmm3 \n"
@@ -3449,7 +3449,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time.
// aligned to 16 bytes
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
@@ -3497,14 +3496,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
static uvec8 kShuffleAlpha0 = {
3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
};
static uvec8 kShuffleAlpha1 = {
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
};
// Attenuate 4 pixels at a time.
// aligned to 16 bytes
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm3,%%xmm3 \n"
@@ -3551,9 +3549,8 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const ulvec8 kShuffleAlpha_AVX2 = {
6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
static const uvec8 kShuffleAlpha_AVX2 = {
6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
};
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -3597,7 +3594,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// aligned to 16 bytes
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
uintptr_t alpha = 0;
@@ -3647,6 +3643,80 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
}
#endif // HAS_ARGBUNATTENUATEROW_SSE2
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};
// Unattenuate 8 pixels at a time.
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
int width) {
uintptr_t alpha = 0;
asm volatile (
"sub %0,%1 \n"
"vbroadcastf128 %5,%%ymm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
    // replace VPGATHER: load the 8 fixed_invtbl8 reciprocals one alpha index at a time.
"movzb " MEMACCESS2(0x03,0) ",%3 \n"
MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
"movzb " MEMACCESS2(0x07,0) ",%3 \n"
MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
"movzb " MEMACCESS2(0x0b,0) ",%3 \n"
"vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
"movzb " MEMACCESS2(0x0f,0) ",%3 \n"
MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
"movzb " MEMACCESS2(0x13,0) ",%3 \n"
"vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
"movzb " MEMACCESS2(0x17,0) ",%3 \n"
MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
"movzb " MEMACCESS2(0x1b,0) ",%3 \n"
"vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
"movzb " MEMACCESS2(0x1f,0) ",%3 \n"
MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
"vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
"vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
"vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
"vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
// end of VPGATHER
"vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
"vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
"vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
"vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
"vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
"vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
"vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"sub $0x8,%2 \n"
MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
"lea " MEMLEA(0x20,0) ",%0 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width), // %2
"+r"(alpha) // %3
: "r"(fixed_invtbl8), // %4
"m"(kUnattenShuffleAlpha_AVX2) // %5
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
#endif // HAS_ARGBUNATTENUATEROW_AVX2
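
For orientation, a scalar sketch of what the AVX2 path computes per channel, assuming the usual unpremultiply definition (a reference sketch, not libyuv's row_common.cc code):

static inline uint8_t UnattenuateChannel(uint8_t c, uint8_t a) {
  if (a == 0) return c;                 // fully transparent: nothing to recover
  uint32_t v = (c * 255u) / a;          // undo c' = c * a / 255
  return v > 255u ? 255u : (uint8_t)v;  // clamp the overflow cases
}

The vector code reaches the same result by looking up an 8.8 fixed-point reciprocal of alpha in fixed_invtbl8 (the movzb/vmovd sequence above standing in for VPGATHER), duplicating it across the channel words with vpshufb, and multiplying with vpmulhuw.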
#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -3841,7 +3911,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// aligned to 16 bytes
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
asm volatile (
@@ -3894,7 +3963,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
asm volatile (
@@ -72,7 +72,6 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
// 64 bit
#if defined(_M_X64)
// Aligned destination version.
__declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -165,7 +164,7 @@ static const lvec32 kPermdARGBToY_AVX = {
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};
// Constants for BGRA.
@@ -1845,7 +1844,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm packuswb xmm2, xmm2 /* R */ \
}
// 8 pixels, dest aligned 16.
// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_SSSE3(const uint8* y_buf,
@@ -1888,7 +1887,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
}
}
// 8 pixels, dest aligned 16.
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
@@ -1935,7 +1934,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
}
}
// 8 pixels, dest aligned 16.
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRAWRow_SSSE3(const uint8* y_buf,
@@ -2055,7 +2054,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
}
}
// 8 pixels, dest aligned 16.
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
@@ -2098,7 +2097,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
}
}
// 8 pixels, dest aligned 16.
// 8 pixels.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16))
@@ -2144,7 +2143,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
}
}
// 8 pixels, dest aligned 16.
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
@@ -2182,7 +2181,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
}
}
// 8 pixels, dest aligned 16.
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
@@ -2423,8 +2422,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
#ifdef HAS_MIRRORROW_AVX2
// Shuffle table for reversing the bytes.
static const ulvec8 kShuffleMirror_AVX2 = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
static const uvec8 kShuffleMirror_AVX2 = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
@@ -2434,7 +2432,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
vmovdqa ymm5, kShuffleMirror_AVX2
vbroadcastf128 ymm5, kShuffleMirror_AVX2
lea eax, [eax - 32]
align 4
@@ -3711,7 +3709,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
@@ -3805,8 +3802,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha_AVX2 = {
6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
};
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -3846,7 +3842,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
@@ -3896,7 +3891,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
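
Were USE_GATHER enabled, the eight scalar table loads would collapse into a single instruction. A hedged sketch using the AVX2 gather intrinsic (helper name and input layout are illustrative):

#include <immintrin.h>
#include <stdint.h>

// alphas32: the eight alpha bytes zero-extended into 32-bit lanes.
// Scale is 4 because fixed_invtbl8 entries are 4-byte values.
static inline __m256i GatherInvAlpha(const uint32_t* fixed_invtbl8,
                                     __m256i alphas32) {
  return _mm256_i32gather_epi32((const int*)fixed_invtbl8, alphas32, 4);
}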
@@ -4185,7 +4180,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
@@ -4232,7 +4226,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
@@ -4738,8 +4731,7 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
// aligned.
// Does 4 pixels at a time.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
int width, int area, uint8* dst,
int count) {
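
The SSE2 body is truncated in this hunk. For orientation, a scalar sketch of the same computation under the standard summed-area-table identity (box sum = TL - TR - BL + BR per channel; the indexing here is illustrative, not libyuv's exact layout):

#include <stdint.h>

void CumulativeSumToAverageRow_C(const int32_t* topleft,
                                 const int32_t* botleft,
                                 int width, int area, uint8_t* dst,
                                 int count) {
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {  // 4 channels per ARGB pixel
      // Box sum from the integral image rows above and below the box.
      int32_t sum = topleft[c] - topleft[width * 4 + c] -
                    botleft[c] + botleft[width * 4 + c];
      dst[c] = (uint8_t)(sum / area);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}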