Commit d33bf86b authored by fbarchard@google.com

Add CopyRow_AVX, which supports unaligned pointers, for Sandy Bridge CPUs.

BUG=363
TESTED=out\release\libyuv_unittest --gtest_filter=*ARGBToARGB_*
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/31489004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1097 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c379d171
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1096
+Version: 1097
License: BSD
License File: LICENSE
...
@@ -122,6 +122,7 @@ extern "C" {
#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
+#define HAS_COPYROW_AVX
#define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
#define HAS_HALFROW_SSE2
@@ -891,6 +892,7 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                         int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
...
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1096
+#define LIBYUV_VERSION 1097

#endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -201,6 +201,11 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
    CopyRow = CopyRow_SSE2;
  }
#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
+    CopyRow = CopyRow_AVX;
+  }
+#endif
#if defined(HAS_COPYROW_ERMS)
  if (TestCpuFlag(kCpuHasERMS)) {
    CopyRow = CopyRow_ERMS;
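
The same five-line HAS_COPYROW_AVX block is added to every call site that picks a row copier (CopyPlane2, Q420ToI420, CopyPlane, RotatePlane180, ARGBRotate180). For orientation, here is a minimal sketch of how that selection plays out at runtime. It is hypothetical code, not a libyuv function: the helper name CopyRows is invented, the SSE2 condition is simplified (the real callers also check source and destination pointer/stride alignment for the SSE2 path), and it assumes libyuv's basic_types.h, cpu_id.h and row.h headers.

#include "libyuv/basic_types.h"  // uint8 and related typedefs
#include "libyuv/cpu_id.h"       // TestCpuFlag and the kCpuHas* flags
#include "libyuv/row.h"          // CopyRow_* kernels and HAS_COPYROW_* macros

using namespace libyuv;  // the real call sites live inside this namespace

// Hypothetical helper for illustration; the real callers inline this logic.
static void CopyRows(const uint8* src, int src_stride,
                     uint8* dst, int dst_stride,
                     int width, int height) {
  // Start from the portable C kernel, then upgrade when the CPU allows it.
  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {  // simplified
    CopyRow = CopyRow_SSE2;  // 16-byte ops, 32 bytes per iteration
  }
#endif
#if defined(HAS_COPYROW_AVX)
  // New in this commit: 32-byte unaligned ops, 64 bytes per iteration, so
  // width must be a multiple of 64 but the pointers may be unaligned.
  if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
    CopyRow = CopyRow_AVX;
  }
#endif
#if defined(HAS_COPYROW_ERMS)
  if (TestCpuFlag(kCpuHasERMS)) {
    CopyRow = CopyRow_ERMS;  // rep movsb; handles any width
  }
#endif
  for (int y = 0; y < height; ++y) {
    CopyRow(src, dst, width);
    src += src_stride;
    dst += dst_stride;
  }
}
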
@@ -441,6 +446,11 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
    CopyRow = CopyRow_SSE2;
  }
#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
+    CopyRow = CopyRow_AVX;
+  }
+#endif
#if defined(HAS_COPYROW_ERMS)
  if (TestCpuFlag(kCpuHasERMS)) {
    CopyRow = CopyRow_ERMS;
...
@@ -53,6 +53,11 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
    CopyRow = CopyRow_SSE2;
  }
#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
+    CopyRow = CopyRow_AVX;
+  }
+#endif
#if defined(HAS_COPYROW_ERMS)
  if (TestCpuFlag(kCpuHasERMS)) {
    CopyRow = CopyRow_ERMS;
...
@@ -946,6 +946,11 @@ void RotatePlane180(const uint8* src, int src_stride,
    CopyRow = CopyRow_SSE2;
  }
#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
+    CopyRow = CopyRow_AVX;
+  }
+#endif
#if defined(HAS_COPYROW_ERMS)
  if (TestCpuFlag(kCpuHasERMS)) {
    CopyRow = CopyRow_ERMS;
...
@@ -136,6 +136,11 @@ void ARGBRotate180(const uint8* src, int src_stride,
    CopyRow = CopyRow_SSE2;
  }
#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
+    CopyRow = CopyRow_AVX;
+  }
+#endif
#if defined(HAS_COPYROW_ERMS)
  if (TestCpuFlag(kCpuHasERMS)) {
    CopyRow = CopyRow_ERMS;
...
@@ -3266,6 +3266,31 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
}
#endif  // HAS_COPYROW_SSE2

+#ifdef HAS_COPYROW_AVX
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x40,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(count)   // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "ymm0", "ymm1"
+#endif
+  );
+}
+#endif  // HAS_COPYROW_AVX
+
#ifdef HAS_COPYROW_X86
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
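
For readers who prefer intrinsics to AT&T-syntax inline assembly, the following is a rough equivalent of the loop above. It is an illustrative sketch only (the function name is invented and it is not part of libyuv); like the real kernel it assumes count is a positive multiple of 64, which the dispatch code guarantees via IS_ALIGNED(width, 64). Build with AVX enabled (e.g. -mavx).

#include <immintrin.h>  // _mm256_loadu_si256 / _mm256_storeu_si256 (AVX)
#include <stdint.h>

// Illustrative stand-in for CopyRow_AVX: copy 'count' bytes, 64 per
// iteration, using unaligned 32-byte loads and stores.
static void CopyRow_AVX_sketch(const uint8_t* src, uint8_t* dst, int count) {
  while (count > 0) {
    __m256i lo = _mm256_loadu_si256((const __m256i*)(src + 0));
    __m256i hi = _mm256_loadu_si256((const __m256i*)(src + 32));
    _mm256_storeu_si256((__m256i*)(dst + 0), lo);
    _mm256_storeu_si256((__m256i*)(dst + 32), hi);
    src += 64;
    dst += 64;
    count -= 64;
  }
  _mm256_zeroupper();  // leave the upper ymm state clean for legacy SSE code
}

Note that, unlike the Windows version further down, the inline-assembly kernel above does not execute vzeroupper before returning; on Sandy Bridge class CPUs that can cause AVX/SSE transition penalties in the SSE2 code that runs afterwards.
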
...
@@ -3687,6 +3687,32 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
}
#endif  // HAS_COPYROW_SSE2

+#ifdef HAS_COPYROW_AVX
+// CopyRow copies 'count' bytes using 32-byte unaligned loads/stores, 64 bytes at a time.
+__declspec(naked) __declspec(align(16))
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+
+    align      4
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx,  [edx + 64]
+    sub        ecx, 64
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_COPYROW_AVX
+
// Unaligned Multiple of 1.
__declspec(naked) __declspec(align(16))
void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
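
A quick way to exercise the new kernel with either toolchain is a tiny harness like the one below. It is not part of libyuv's test suite (the buffer size and the harness itself are made up for illustration); it deliberately misaligns both pointers, which is exactly the case the commit message calls out, and it only runs the kernel when both the build and the CPU support AVX.

#include <stdio.h>
#include <string.h>
#include "libyuv/cpu_id.h"  // TestCpuFlag, kCpuHasAVX
#include "libyuv/row.h"     // CopyRow_AVX prototype, HAS_COPYROW_AVX

int main() {
  enum { kWidth = 64 * 4 };  // a multiple of 64, as the dispatcher requires
  static unsigned char src[kWidth + 1];
  static unsigned char dst[kWidth + 1];
  for (int i = 0; i < kWidth + 1; ++i) src[i] = (unsigned char)(i * 7);
  memset(dst, 0, sizeof(dst));
#if defined(HAS_COPYROW_AVX)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasAVX)) {
    // The +1 offsets misalign both pointers on purpose; the kernel must still
    // copy correctly because it uses vmovdqu (unaligned) loads and stores.
    libyuv::CopyRow_AVX(src + 1, dst + 1, kWidth);
    printf("CopyRow_AVX: %s\n",
           memcmp(src + 1, dst + 1, kWidth) == 0 ? "ok" : "MISMATCH");
    return 0;
  }
#endif
  printf("AVX path not available in this build/CPU\n");
  return 0;
}
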
@@ -3704,6 +3730,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
}

#ifdef HAS_COPYROW_X86
+// Unaligned Multiple of 4.
__declspec(naked) __declspec(align(16))
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
  __asm {
...