Commit 373cdbdc authored by fbarchard@google.com

reorder stores for FastConvertYUVToABGRRow_SSSE3 and…

Reorder stores for FastConvertYUVToABGRRow_SSSE3 and FastConvertYUVToBGRARow_SSSE3. Add ReverseRow_SSE2. Allow environment variable overrides in CPU detection for testing: set LIBYUV_DISABLE_SSSE3=1 or set LIBYUV_DISABLE_SSE2=1. Reorder stores in rotate for Core 2.
BUG=none
TEST=none
Review URL: http://webrtc-codereview.appspot.com/317010

git-svn-id: http://libyuv.googlecode.com/svn/trunk@107 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8b9759c4
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 106
Version: 107
License: BSD
License File: LICENSE
......
@@ -10,6 +10,7 @@
#include "libyuv/cpu_id.h"
#include <stdlib.h> // for getenv
#ifdef _MSC_VER
#include <intrin.h>
#endif
@@ -55,6 +56,15 @@ int InitCpuFlags() {
cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
(cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
kCpuInitialized;
// environment variable overrides for testing.
if (getenv("LIBYUV_DISABLE_SSE2")) {
cpu_info_ &= ~kCpuHasSSE2;
}
// environment variable overrides for testing.
if (getenv("LIBYUV_DISABLE_SSSE3")) {
cpu_info_ &= ~kCpuHasSSSE3;
}
#elif defined(__ANDROID__) && defined(__ARM_NEON__)
uint64_t features = android_getCpuFeatures();
cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
......
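For testing, the new overrides can be exercised before any conversion call. A minimal sketch, assuming a POSIX setenv and the public TestCpuFlag/kCpuHasSSSE3 names from libyuv/cpu_id.h; note that InitCpuFlags() reads the environment only once and caches the result in cpu_info_:

```cpp
#include <cstdio>
#include <cstdlib>  // POSIX setenv
#include "libyuv/cpu_id.h"

int main() {
  // Must run before the first TestCpuFlag() call, which caches cpu_info_.
  setenv("LIBYUV_DISABLE_SSSE3", "1", 1 /* overwrite */);
  if (!libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3)) {
    std::printf("SSSE3 disabled; SSSE3 row functions will not be selected.\n");
  }
  return 0;
}
```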
@@ -340,6 +340,18 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
ReverseRow = ReverseRow_SSSE3;
} else
#endif
#if defined(HAS_REVERSE_ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 32) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16) &&
IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
ReverseRow = ReverseRow_SSE2;
} else
#endif
{
ReverseRow = ReverseRow_C;
......
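Both dispatch sites (I420Mirror above, RotatePlane180 below) gate the SIMD row on the same conditions: width a multiple of the vector step and every pointer and stride 16-byte aligned, since movdqa faults on unaligned addresses. The IS_ALIGNED test is the usual power-of-two mask, roughly as follows (a sketch; the actual macro lives elsewhere in libyuv):

```cpp
#include <stdint.h>

// True when p (pointer or integer stride) is a multiple of a, a power of two.
// e.g. IS_ALIGNED(dst_y, 16) checks for a 16-byte boundary.
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
```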
@@ -867,6 +867,14 @@ void RotatePlane180(const uint8* src, int src_stride,
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
ReverseRow = ReverseRow_SSSE3;
} else
#endif
#if defined(HAS_REVERSE_ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
ReverseRow = ReverseRow_SSE2;
} else
#endif
{
ReverseRow = ReverseRow_C;
@@ -1019,8 +1027,8 @@ __asm {
lea eax, [eax - 16]
pshufb xmm0, xmm5
movlpd qword ptr [edx], xmm0
lea edx, [edx + 8]
movhpd qword ptr [edi], xmm0
lea edx, [edx + 8]
lea edi, [edi + 8]
sub ecx, 8
ja convertloop
@@ -1044,8 +1052,8 @@ void ReverseRowUV_SSSE3(const uint8* src,
"lea -16(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n"
"lea 8(%1),%1 \n"
"movhpd %%xmm0,(%2) \n"
"lea 8(%1),%1 \n"
"lea 8(%2),%2 \n"
"sub $8,%3 \n"
"ja 1b \n"
......
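The store reordering in this file and in the FastConvertYUVTo*Row_SSSE3 hunks below follows one pattern: the second shuffle and the pointer arithmetic are moved out from between the two stores, so the stores issue back-to-back. Presumably this is the Core 2 tuning the commit message mentions, as consecutive writes to the same cache line can combine in the store buffer. In intrinsics form the reordered pattern looks roughly like this (a sketch; the function and variable names are illustrative, not from the source):

```cpp
#include <emmintrin.h>  // SSE2
#include <stdint.h>

// Compute BOTH 16-byte halves before issuing either store, mirroring the
// reordered asm (the punpckhwd hoisted above the first movdqa).
static inline void StoreEightPixels(__m128i a, __m128i b, uint8_t* dst) {
  __m128i lo = _mm_unpacklo_epi16(a, b);  // first 4 pixels
  __m128i hi = _mm_unpackhi_epi16(a, b);  // next 4 pixels
  _mm_store_si128(reinterpret_cast<__m128i*>(dst), lo);      // adjacent,
  _mm_store_si128(reinterpret_cast<__m128i*>(dst) + 1, hi);  // back-to-back
}
```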
@@ -65,6 +65,7 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
#define HAS_REVERSE_ROW_SSSE3
#define HAS_REVERSE_ROW_SSE2
#endif
// The following are available on Neon platforms
@@ -102,6 +103,9 @@ void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
#ifdef HAS_REVERSE_ROW_SSSE3
void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
#endif
#ifdef HAS_REVERSE_ROW_SSE2
void ReverseRow_SSE2(const uint8* src, uint8* dst, int width);
#endif
#ifdef HAS_REVERSE_ROW_NEON
void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
#endif
......
@@ -17,16 +17,22 @@ namespace libyuv {
extern "C" {
#endif
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
vec8 kARGBToU = {
CONST vec8 kARGBToU = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
uvec8 kARGBToV = {
CONST uvec8 kARGBToV = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};
uvec8 kAddUV128 = {
CONST uvec8 kAddUV128 = {
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
@@ -35,31 +41,31 @@ uvec8 kAddUV128 = {
#ifdef HAS_ARGBTOYROW_SSSE3
// Constant multiplication table for converting ARGB to I400.
vec8 kARGBToY = {
CONST vec8 kARGBToY = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
uvec8 kAddY16 = {
CONST uvec8 kAddY16 = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
// Shuffle table for converting BG24 to ARGB.
uvec8 kShuffleMaskBG24ToARGB = {
CONST uvec8 kShuffleMaskBG24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
uvec8 kShuffleMaskRAWToARGB = {
CONST uvec8 kShuffleMaskRAWToARGB = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
// Shuffle table for converting ABGR to ARGB.
uvec8 kShuffleMaskABGRToARGB = {
CONST uvec8 kShuffleMaskABGRToARGB = {
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};
// Shuffle table for converting BGRA to ARGB.
uvec8 kShuffleMaskBGRAToARGB = {
CONST uvec8 kShuffleMaskBGRAToARGB = {
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
@@ -352,7 +358,7 @@ struct {
vec16 kUVBiasR;
vec16 kYSub16;
vec16 kYToRgb;
} SIMD_ALIGNED(kYuvConstants) = {
} CONST SIMD_ALIGNED(kYuvConstants) = {
{ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
{ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
{ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
@@ -445,8 +451,8 @@ void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, // rdi
"punpcklbw %%xmm2,%%xmm5 \n"
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"movdqa %%xmm5,(%3) \n"
"punpckhwd %%xmm1,%%xmm0 \n"
"movdqa %%xmm5,(%3) \n"
"movdqa %%xmm0,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
@@ -480,8 +486,8 @@ void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, // rdi
"punpcklbw %%xmm5,%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n"
"movdqa %%xmm2,(%3) \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,(%3) \n"
"movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
@@ -640,11 +646,8 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
#ifdef HAS_REVERSE_ROW_SSSE3
// TODO(fbarchard): define CONST macro that is static const for linux, but
// does nothing for gcc on OSX (which has an internal compiler fault)
// Shuffle table for reversing the bytes.
uvec8 kShuffleReverse = {
CONST uvec8 kShuffleReverse = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
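With SSSE3, the whole 16-byte reversal is a single pshufb through the descending-index table above. An equivalent intrinsics sketch (illustrative only; the library shuffles against its kShuffleReverse constant directly):

```cpp
#include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8 (pshufb)

// dst byte i = src byte 15 - i.
static inline __m128i Reverse16Bytes(__m128i v) {
  const __m128i kRev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                    8, 9, 10, 11, 12, 13, 14, 15);
  return _mm_shuffle_epi8(v, kRev);
}
```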
@@ -653,14 +656,14 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
asm volatile (
"movdqa %3,%%xmm5 \n"
"lea -0x10(%0,%2,1),%0 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -673,6 +676,38 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
}
#endif
#ifdef HAS_REVERSE_ROW_SSE2
void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"lea -0x10(%0,%2,1),%0 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -0x10(%0),%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"psllw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm1,%%xmm0 \n"
"pshuflw $0x1b,%%xmm0,%%xmm0 \n"
"pshufhw $0x1b,%%xmm0,%%xmm0 \n"
"pshufd $0x4e,%%xmm0,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
#endif
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
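The new SSE2 path cannot use pshufb, so it composes the reversal from three lane swaps: bytes within each word (psllw/psrlw/por), words within each qword (pshuflw/pshufhw with 0x1b), then the two qwords (pshufd with 0x4e). A standalone intrinsics sketch of the same loop (assumes 16-byte-aligned src/dst and width a multiple of 16, exactly what the dispatch guards enforce):

```cpp
#include <emmintrin.h>  // SSE2
#include <stdint.h>

void ReverseRow_SSE2_sketch(const uint8_t* src, uint8_t* dst, int width) {
  src += width - 16;  // start at the last 16-byte block
  for (int x = 0; x < width; x += 16) {
    __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(src));
    src -= 16;
    // Swap bytes within each 16-bit lane: (v << 8) | (v >> 8).
    v = _mm_or_si128(_mm_slli_epi16(v, 8), _mm_srli_epi16(v, 8));
    v = _mm_shufflelo_epi16(v, 0x1b);  // reverse the low four words
    v = _mm_shufflehi_epi16(v, 0x1b);  // reverse the high four words
    v = _mm_shuffle_epi32(v, 0x4e);    // swap the two 64-bit halves
    _mm_store_si128(reinterpret_cast<__m128i*>(dst), v);
    dst += 16;
  }
}
```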
@@ -654,8 +654,8 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
punpcklbw xmm5, xmm2 // AR
movdqa xmm0, xmm5
punpcklwd xmm5, xmm1 // BGRA first 4 pixels
movdqa [edx], xmm5
punpckhwd xmm0, xmm1 // BGRA next 4 pixels
movdqa [edx], xmm5
movdqa [edx + 16], xmm0
lea edx, [edx + 32]
@@ -694,8 +694,8 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
punpcklbw xmm0, xmm5 // BA
movdqa xmm1, xmm2
punpcklwd xmm2, xmm0 // RGBA first 4 pixels
movdqa [edx], xmm2
punpckhwd xmm1, xmm0 // RGBA next 4 pixels
movdqa [edx], xmm2
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
@@ -794,7 +794,7 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
convertloop:
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
movq xmm0, [eax]
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
punpcklbw xmm0, xmm0 // Y.Y
psubusw xmm0, xmm3
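The comment above spells out the Y-only path: G = (y - 16) * 1.164. In 8.8 fixed point that constant is roughly 298/256; a scalar sketch of the same scaling with clamping (the asm gets the low clamp for free from the unsigned saturating psubusw):

```cpp
#include <stdint.h>

// (y - 16) * 1.164 with 1.164 ~= 298/256, clamped to [0, 255].
static inline uint8_t ScaleY(uint8_t y) {
  int g = ((static_cast<int>(y) - 16) * 298) >> 8;
  if (g < 0) g = 0;
  if (g > 255) g = 255;
  return static_cast<uint8_t>(g);
}
```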
@@ -849,6 +849,33 @@ __asm {
}
#endif
#ifdef HAS_REVERSE_ROW_SSE2
__declspec(naked)
void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
lea eax, [eax + ecx - 16]
convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
movdqa xmm1, xmm0 // swap bytes
psllw xmm0, 8
psrlw xmm1, 8
por xmm0, xmm1
pshuflw xmm0, xmm0, 0x1b // swap words
pshufhw xmm0, xmm0, 0x1b
pshufd xmm0, xmm0, 0x4e
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
#endif
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......