Commit 373cdbdc authored by fbarchard@google.com

reorder stores for FastConvertYUVToABGRRow_SSSE3 and…

Reorder stores for FastConvertYUVToABGRRow_SSSE3 and FastConvertYUVToBGRARow_SSSE3 so the two 16-byte stores are issued back to back. Add ReverseRow_SSE2. Allow CPU detection to be overridden via environment variables for testing: set LIBYUV_DISABLE_SSSE3=1 or set LIBYUV_DISABLE_SSE2=1. Also reorder the stores in rotate's ReverseRowUV for Core 2.
BUG=none
TEST=none
Review URL: http://webrtc-codereview.appspot.com/317010

git-svn-id: http://libyuv.googlecode.com/svn/trunk@107 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8b9759c4
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 106
+Version: 107
 License: BSD
 License File: LICENSE
...
@@ -10,6 +10,7 @@
 #include "libyuv/cpu_id.h"
+#include <stdlib.h>  // for getenv
 #ifdef _MSC_VER
 #include <intrin.h>
 #endif
@@ -55,6 +56,15 @@ int InitCpuFlags() {
   cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
               (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
               kCpuInitialized;
+  // environment variable overrides for testing.
+  if (getenv("LIBYUV_DISABLE_SSE2")) {
+    cpu_info_ &= ~kCpuHasSSE2;
+  }
+  // environment variable overrides for testing.
+  if (getenv("LIBYUV_DISABLE_SSSE3")) {
+    cpu_info_ &= ~kCpuHasSSSE3;
+  }
 #elif defined(__ANDROID__) && defined(__ARM_NEON__)
   uint64_t features = android_getCpuFeatures();
   cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
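The override above is read inside InitCpuFlags() and cached in cpu_info_, so the variable has to be in the environment before the first CPU-flag query. A minimal sketch of exercising it from C++ (hypothetical test code, not part of this commit; assumes cpu_id.h declares InitCpuFlags() and TestCpuFlag() in namespace libyuv, and that setenv is available — on Windows _putenv would be used instead):

#include <stdlib.h>
#include "libyuv/cpu_id.h"

int main() {
  // Must happen before the first InitCpuFlags()/TestCpuFlag() call,
  // because the flags are computed once and cached in cpu_info_.
  setenv("LIBYUV_DISABLE_SSSE3", "1", 1);
  libyuv::InitCpuFlags();
  // With the override in place, callers such as I420Mirror and
  // RotatePlane180 fall back to the SSE2 or C row functions.
  return libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) ? 1 : 0;
}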
...
@@ -340,6 +340,18 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
     ReverseRow = ReverseRow_SSSE3;
   } else
+#endif
+#if defined(HAS_REVERSE_ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(width, 32) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+      IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16) &&
+      IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+      IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+    ReverseRow = ReverseRow_SSE2;
+  } else
 #endif
   {
     ReverseRow = ReverseRow_C;
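The SIMD path above is only taken when every pointer and stride is 16-byte aligned and width is a multiple of 32, presumably so that the half-width U and V rows of the I420 buffers are still a multiple of 16 bytes, matching the 16-byte blocks the movdqa-based row functions process. A rough sketch of what an alignment test like IS_ALIGNED amounts to (the library's actual macro may differ in detail):

#include <stdint.h>

// Hypothetical restatement of IS_ALIGNED: the value (pointer or integer)
// must be a multiple of 'a', where 'a' is a power of two.
#define IS_ALIGNED_SKETCH(p, a) ((((uintptr_t)(p)) & ((a) - 1)) == 0)

// e.g. IS_ALIGNED_SKETCH(width, 32) or IS_ALIGNED_SKETCH(src_y, 16)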
...
@@ -867,6 +867,14 @@ void RotatePlane180(const uint8* src, int src_stride,
       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
     ReverseRow = ReverseRow_SSSE3;
   } else
+#endif
+#if defined(HAS_REVERSE_ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    ReverseRow = ReverseRow_SSE2;
+  } else
 #endif
   {
     ReverseRow = ReverseRow_C;
@@ -1019,8 +1027,8 @@ __asm {
     lea       eax, [eax - 16]
     pshufb    xmm0, xmm5
     movlpd    qword ptr [edx], xmm0
-    lea       edx, [edx + 8]
     movhpd    qword ptr [edi], xmm0
+    lea       edx, [edx + 8]
     lea       edi, [edi + 8]
     sub       ecx, 8
     ja        convertloop
@@ -1044,8 +1052,8 @@ void ReverseRowUV_SSSE3(const uint8* src,
   "lea -16(%0),%0 \n"
   "pshufb %%xmm5,%%xmm0 \n"
   "movlpd %%xmm0,(%1) \n"
-  "lea 8(%1),%1 \n"
   "movhpd %%xmm0,(%2) \n"
+  "lea 8(%1),%1 \n"
   "lea 8(%2),%2 \n"
   "sub $8,%3 \n"
   "ja 1b \n"
...
@@ -65,6 +65,7 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
 #define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
 #define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
 #define HAS_REVERSE_ROW_SSSE3
+#define HAS_REVERSE_ROW_SSE2
 #endif
 // The following are available on Neon platforms
@@ -102,6 +103,9 @@ void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 #ifdef HAS_REVERSE_ROW_SSSE3
 void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
 #endif
+#ifdef HAS_REVERSE_ROW_SSE2
+void ReverseRow_SSE2(const uint8* src, uint8* dst, int width);
+#endif
 #ifdef HAS_REVERSE_ROW_NEON
 void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
 #endif
...
@@ -17,16 +17,22 @@ namespace libyuv {
 extern "C" {
 #endif
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
 #ifdef HAS_ARGBTOUVROW_SSSE3
-vec8 kARGBToU = {
+CONST vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };
-uvec8 kARGBToV = {
+CONST uvec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
 };
-uvec8 kAddUV128 = {
+CONST uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
@@ -35,31 +41,31 @@ uvec8 kAddUV128 = {
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Constant multiplication table for converting ARGB to I400.
-vec8 kARGBToY = {
+CONST vec8 kARGBToY = {
   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
 };
-uvec8 kAddY16 = {
+CONST uvec8 kAddY16 = {
   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };
 // Shuffle table for converting BG24 to ARGB.
-uvec8 kShuffleMaskBG24ToARGB = {
+CONST uvec8 kShuffleMaskBG24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 };
 // Shuffle table for converting RAW to ARGB.
-uvec8 kShuffleMaskRAWToARGB = {
+CONST uvec8 kShuffleMaskRAWToARGB = {
   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
 // Shuffle table for converting ABGR to ARGB.
-uvec8 kShuffleMaskABGRToARGB = {
+CONST uvec8 kShuffleMaskABGRToARGB = {
   2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
 };
 // Shuffle table for converting BGRA to ARGB.
-uvec8 kShuffleMaskBGRAToARGB = {
+CONST uvec8 kShuffleMaskBGRAToARGB = {
   3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 };
@@ -352,7 +358,7 @@ struct {
   vec16 kUVBiasR;
   vec16 kYSub16;
   vec16 kYToRgb;
-} SIMD_ALIGNED(kYuvConstants) = {
+} CONST SIMD_ALIGNED(kYuvConstants) = {
   { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
   { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
   { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
@@ -445,8 +451,8 @@ void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,  // rdi
   "punpcklbw %%xmm2,%%xmm5 \n"
   "movdqa %%xmm5,%%xmm0 \n"
   "punpcklwd %%xmm1,%%xmm5 \n"
-  "movdqa %%xmm5,(%3) \n"
   "punpckhwd %%xmm1,%%xmm0 \n"
+  "movdqa %%xmm5,(%3) \n"
   "movdqa %%xmm0,0x10(%3) \n"
   "lea 0x20(%3),%3 \n"
   "sub $0x8,%4 \n"
@@ -480,8 +486,8 @@ void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,  // rdi
   "punpcklbw %%xmm5,%%xmm0 \n"
   "movdqa %%xmm2,%%xmm1 \n"
   "punpcklwd %%xmm0,%%xmm2 \n"
-  "movdqa %%xmm2,(%3) \n"
   "punpckhwd %%xmm0,%%xmm1 \n"
+  "movdqa %%xmm2,(%3) \n"
   "movdqa %%xmm1,0x10(%3) \n"
   "lea 0x20(%3),%3 \n"
   "sub $0x8,%4 \n"
@@ -640,11 +646,8 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
 #ifdef HAS_REVERSE_ROW_SSSE3
-// TODO(fbarchard): define CONST macro that is static const for linux, but
-// does nothing for gcc on OSX (which has an internal compiler fault)
 // Shuffle table for reversing the bytes.
-uvec8 kShuffleReverse = {
+CONST uvec8 kShuffleReverse = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
@@ -653,14 +656,14 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
   asm volatile (
   "movdqa %3,%%xmm5 \n"
   "lea -0x10(%0,%2,1),%0 \n"
 "1: \n"
   "movdqa (%0),%%xmm0 \n"
   "lea -0x10(%0),%0 \n"
   "pshufb %%xmm5,%%xmm0 \n"
   "movdqa %%xmm0,(%1) \n"
   "lea 0x10(%1),%1 \n"
   "sub $0x10,%2 \n"
   "ja 1b \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(temp_width)  // %2
@@ -673,6 +676,38 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
 }
 #endif
+#ifdef HAS_REVERSE_ROW_SSE2
+void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = static_cast<intptr_t>(width);
+  asm volatile (
+  "lea -0x10(%0,%2,1),%0 \n"
+"1: \n"
+  "movdqa (%0),%%xmm0 \n"
+  "lea -0x10(%0),%0 \n"
+  "movdqa %%xmm0,%%xmm1 \n"
+  "psllw $0x8,%%xmm0 \n"
+  "psrlw $0x8,%%xmm1 \n"
+  "por %%xmm1,%%xmm0 \n"
+  "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+  "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+  "pshufd $0x4e,%%xmm0,%%xmm0 \n"
+  "movdqa %%xmm0,(%1) \n"
+  "lea 0x10(%1),%1 \n"
+  "sub $0x10,%2 \n"
+  "ja 1b \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(temp_width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+#endif
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
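The SSE2 path added above reverses 16 bytes per iteration without pshufb: psllw/psrlw/por swap the two bytes inside each 16-bit word, pshuflw/pshufhw with imm 0x1b reverse the four words in each 64-bit half, and pshufd with imm 0x4e swaps the two halves. A scalar model of one iteration, showing that the three stages compose to a full byte reversal (illustrative helper only, not part of libyuv):

#include <stdint.h>

static void Reverse16Bytes(const uint8_t src[16], uint8_t dst[16]) {
  uint8_t stage1[16], stage2[16];
  // Stage 1 (psllw/psrlw/por): swap the two bytes within each 16-bit word.
  for (int i = 0; i < 16; i += 2) {
    stage1[i] = src[i + 1];
    stage1[i + 1] = src[i];
  }
  // Stage 2 (pshuflw/pshufhw, imm 0x1b): reverse the four words in each half.
  for (int half = 0; half < 16; half += 8) {
    for (int w = 0; w < 4; ++w) {
      stage2[half + 2 * w] = stage1[half + 2 * (3 - w)];
      stage2[half + 2 * w + 1] = stage1[half + 2 * (3 - w) + 1];
    }
  }
  // Stage 3 (pshufd, imm 0x4e): swap the two 64-bit halves.
  for (int i = 0; i < 8; ++i) {
    dst[i] = stage2[i + 8];
    dst[i + 8] = stage2[i];
  }
  // Net effect: dst[i] == src[15 - i] for every i.
}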
...
@@ -654,8 +654,8 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
     punpcklbw  xmm5, xmm2          // AR
     movdqa     xmm0, xmm5
     punpcklwd  xmm5, xmm1          // BGRA first 4 pixels
-    movdqa     [edx], xmm5
     punpckhwd  xmm0, xmm1          // BGRA next 4 pixels
+    movdqa     [edx], xmm5
     movdqa     [edx + 16], xmm0
     lea        edx, [edx + 32]
@@ -694,8 +694,8 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
     punpcklbw  xmm0, xmm5          // BA
     movdqa     xmm1, xmm2
     punpcklwd  xmm2, xmm0          // RGBA first 4 pixels
-    movdqa     [edx], xmm2
     punpckhwd  xmm1, xmm0          // RGBA next 4 pixels
+    movdqa     [edx], xmm2
     movdqa     [edx + 16], xmm1
     lea        edx, [edx + 32]
@@ -794,7 +794,7 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
  convertloop:
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-    movq       xmm0, [eax]
+    movq       xmm0, qword ptr [eax]
     lea        eax, [eax + 8]
     punpcklbw  xmm0, xmm0          // Y.Y
     psubusw    xmm0, xmm3
@@ -849,6 +849,33 @@ __asm {
 }
 #endif
+#ifdef HAS_REVERSE_ROW_SSE2
+__declspec(naked)
+void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
+__asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    lea        eax, [eax + ecx - 16]
+ convertloop:
+    movdqa     xmm0, [eax]
+    lea        eax, [eax - 16]
+    movdqa     xmm1, xmm0       // swap bytes
+    psllw      xmm0, 8
+    psrlw      xmm1, 8
+    por        xmm0, xmm1
+    pshuflw    xmm0, xmm0, 0x1b // swap words
+    pshufhw    xmm0, xmm0, 0x1b
+    pshufd     xmm0, xmm0, 0x4e
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    ja         convertloop
+    ret
+  }
+}
+#endif
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
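Since the commit ships with TEST=none, here is a throwaway check of the new ReverseRow_SSE2 (hypothetical code, not part of this commit). It assumes the function is linked in with C linkage as declared in row.h, that the machine supports SSE2, and that both buffers are 16-byte aligned with width a multiple of 16, as the movdqa loads and stores require:

#include <stdint.h>
#include <stdio.h>

typedef uint8_t uint8;  // stand-in for libyuv's uint8 typedef

extern "C" void ReverseRow_SSE2(const uint8* src, uint8* dst, int width);

int main() {
  // alignas needs C++11; libyuv itself uses its SIMD_ALIGNED macro instead.
  alignas(16) static uint8 src[64];
  alignas(16) static uint8 dst[64];
  for (int i = 0; i < 64; ++i) src[i] = static_cast<uint8>(i);
  ReverseRow_SSE2(src, dst, 64);
  for (int i = 0; i < 64; ++i) {
    if (dst[i] != src[63 - i]) {
      printf("mismatch at %d\n", i);
      return 1;
    }
  }
  printf("ReverseRow_SSE2 ok\n");
  return 0;
}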