Commit abfeea9b authored by fbarchard@google.com

Math functions - add, subtract, multiply and shade adapted to NaCl friendly addressing.

BUG=253
TEST=out\release\libyuv_unittest --gtest_filter=*Add*
R=dingkai@google.com, nfullagar@chromium.org

Review URL: https://webrtc-codereview.appspot.com/1972004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@746 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 008ecea4
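Background: "NaCl friendly addressing" refers to the Native Client x86-64 sandbox, which rejects the two-register base+index memory operands (e.g. "movdqa %%xmm0,(%0,%1,1)") these row functions previously used. The rewritten loops drop the "sub %0,%1" pointer-differencing trick, advance each pointer separately with lea, and wrap every load/store in the MEMACCESS()/MEMLEA() macros from row.h. As a sketch of the idea (paraphrasing the macros, not quoting them verbatim):

    // Non-NaCl builds: the macros expand to ordinary operands.
    #define MEMACCESS(base) "(%" #base ")"               // e.g. (%0)
    #define MEMLEA(offset, base) #offset "(%" #base ")"  // e.g. 0x10(%0)

    // NaCl x86-64 builds route every access through the r15 sandbox
    // base so the validator can prove it stays in bounds, roughly:
    // #define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"

so the same asm string assembles to valid code in both environments.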
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 745
+Version: 746
 License: BSD
 License File: LICENSE
......
@@ -38,8 +38,17 @@ extern "C" {
 // The following are available on all x86 platforms, including NaCL:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_ARGBBLENDROW_SSSE3
+// Effects:
+#define HAS_ARGBADDROW_SSE2
 #define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+// Conversions:
+#define HAS_FIXEDDIV_X86
 #endif
 // The following are available on all x86 platforms except NaCL x64:
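Note: each HAS_ macro both compiles the SIMD row in and lets the high-level API select it at runtime. A hypothetical picker in the style libyuv uses in source/planar_functions.cc (PickARGBAddRow and AddRowFn are illustrative names, not from this commit; the row functions and TestCpuFlag/kCpuHasSSE2 are real):

    #include "libyuv/basic_types.h"
    #include "libyuv/cpu_id.h"
    #include "libyuv/row.h"

    typedef void (*AddRowFn)(const uint8* src0, const uint8* src1,
                             uint8* dst, int width);

    static AddRowFn PickARGBAddRow(int width) {
      AddRowFn fn = ARGBAddRow_C;  // portable fallback, always present
    #if defined(HAS_ARGBADDROW_SSE2)
      // Taken only when the macro above compiled the SSE2 row in AND
      // the CPU actually reports SSE2 at runtime.
      if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
        fn = ARGBAddRow_SSE2;
      }
    #endif
      return fn;
    }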
@@ -47,7 +56,7 @@ extern "C" {
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
     !(defined(__native_client__) && defined(__x86_64__))
-// Conversions.
+// Conversions:
 #define HAS_ABGRTOUVROW_SSSE3
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
@@ -110,19 +119,14 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
-#define HAS_FIXEDDIV
-// Effects
-#define HAS_ARGBADDROW_SSE2
+// Effects:
 #define HAS_ARGBAFFINEROW_SSE2
 #define HAS_ARGBCOLORMATRIXROW_SSSE3
 #define HAS_ARGBGRAYROW_SSSE3
 #define HAS_ARGBMIRRORROW_SSSE3
-#define HAS_ARGBMULTIPLYROW_SSE2
 #define HAS_ARGBQUANTIZEROW_SSE2
 #define HAS_ARGBSEPIAROW_SSSE3
-#define HAS_ARGBSHADEROW_SSE2
-#define HAS_ARGBSUBTRACTROW_SSE2
 #define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
@@ -134,12 +138,12 @@ extern "C" {
 #define HAS_SOBELYROW_SSSE3
 #endif
-// The following are Windows only.
+// The following are Windows only:
 // TODO(fbarchard): Port to gcc.
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_RGBCOLORTABLEROW_X86
-// Visual C 2012 required for AVX2.
+// Caveat: Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 #define HAS_ARGBSHUFFLEROW_AVX2
 #define HAS_ARGBTOUVROW_AVX2
@@ -157,7 +161,7 @@ extern "C" {
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2
-// Effects
+// Effects:
 #define HAS_ARGBADDROW_AVX2
 #define HAS_ARGBATTENUATEROW_AVX2
 #define HAS_ARGBMIRRORROW_AVX2
@@ -167,7 +171,7 @@ extern "C" {
 #endif
 #endif
-// The following are Yasm x86 only.
+// The following are Yasm x86 only:
 // TODO(fbarchard): Port AVX2 to inline.
 #if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
     (defined(_M_IX86) || defined(_M_X64) || \
@@ -194,7 +198,7 @@ extern "C" {
 #endif
 #endif
-// The following are available on Neon platforms
+// The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_ABGRTOUVROW_NEON
@@ -267,7 +271,7 @@ extern "C" {
 #define HAS_YUY2TOUVROW_NEON
 #define HAS_YUY2TOYROW_NEON
-// Effects
+// Effects:
 #define HAS_ARGBADDROW_NEON
 #define HAS_ARGBATTENUATEROW_NEON
 #define HAS_ARGBBLENDROW_NEON
@@ -286,7 +290,7 @@ extern "C" {
 #define HAS_INTERPOLATEROW_NEON
 #endif
-// The following are available on Mips platforms
+// The following are available on Mips platforms:
 #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
 #define HAS_COPYROW_MIPS
 #if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
@@ -1534,8 +1538,9 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv_C(int num, int div);
-#ifdef HAS_FIXEDDIV
-int FixedDiv(int num, int div);
+int FixedDiv_X86(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
 #else
 #define FixedDiv FixedDiv_C
 #endif
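For reference, the portable FixedDiv_C that the #else branch selects is a one-liner (a sketch consistent with the comment above, using int64 from libyuv/basic_types.h; see source/row_common.cc for the real thing):

    // Divide num by div and return as 16.16 fixed point result.
    int FixedDiv_C(int num, int div) {
      return (int)(((int64)(num) << 16) / div);
    }

    // Example: FixedDiv(1, 2) yields 0x00008000, i.e. 0.5 in 16.16;
    // FixedDiv(640, 480) yields 0x00015555, i.e. ~1.3333.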
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 745 #define LIBYUV_VERSION 746
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -30,7 +30,9 @@ extern "C" {
 uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
 // This module is for Visual C x86
-#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
+#if !defined(LIBYUV_DISABLE_X86) && \
+    !(defined(__native_client__) && defined(__x86_64__)) && \
+    (defined(_M_IX86) || \
     (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
 #define HAS_HASHDJB2_SSE41
@@ -73,8 +75,9 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
 #define HAS_SUMSQUAREERROR_NEON
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
 #endif
-#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
-    defined(__x86_64__) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+    !(defined(__native_client__) && defined(__x86_64__)) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 #define HAS_SUMSQUAREERROR_SSE2
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
 #endif
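For reference, SumSquareError accumulates the squared byte differences of two buffers (the basis of libyuv's MSE/PSNR helpers). A sketch of the _C version that the SSE2 path mirrors:

    uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b,
                            int count) {
      uint32 sse = 0u;
      for (int i = 0; i < count; ++i) {
        int diff = src_a[i] - src_b[i];
        sse += (uint32)(diff * diff);
      }
      return sse;
    }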
......
@@ -16,7 +16,9 @@ namespace libyuv {
 extern "C" {
 #endif
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+    !(defined(__native_client__) && defined(__x86_64__)) && \
+    (defined(__x86_64__) || defined(__i386__))
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
   uint32 sse;
@@ -65,6 +67,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
 #endif  // defined(__x86_64__) || defined(__i386__)
 #if !defined(LIBYUV_DISABLE_X86) && \
+    !(defined(__native_client__) && defined(__x86_64__)) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
 #define HAS_HASHDJB2_SSE41
 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
......
@@ -1904,7 +1904,7 @@ void I422ToUYVYRow_C(const uint8* src_y,
   }
 }
-#if !defined(LIBYUV_DISABLE_X86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
 // row_win.cc has asm version, but GCC uses 2 step wrapper. 5% slower.
 // TODO(fbarchard): Handle width > kMaxStride here instead of calling code.
 #if defined(__x86_64__) || defined(__i386__)
@@ -2001,7 +2001,6 @@ void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
   UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
   I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
 }
-
 #endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
 #endif  // !defined(LIBYUV_DISABLE_X86)
 #undef clamp0
......
@@ -3027,6 +3027,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_COPYROW_X86
+#ifdef HAS_COPYROW_ERMS
 // Unaligned Multiple of 1.
 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
   size_t width_tmp = static_cast<size_t>(width);
@@ -3039,6 +3040,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
     : "memory", "cc"
   );
 }
+#endif  // HAS_COPYROW_ERMS
 #ifdef HAS_SETROW_X86
 void SetRow_X86(uint8* dst, uint32 v32, int width) {
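Note: the newly guarded CopyRow_ERMS targets CPUs with Enhanced REP MOVSB, where a bare rep movsb is an efficient unaligned byte copy. The middle of the function is elided in this diff; a sketch of what the guarded body plausibly looks like, reconstructed from the fragments above (treat as illustrative, not verbatim):

    // Unaligned, any multiple of 1 byte.
    void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
      size_t width_tmp = static_cast<size_t>(width);
      asm volatile (
        "rep movsb                                 \n"
        : "+S"(src),       // source, advanced by movsb
          "+D"(dst),       // destination, advanced by movsb
          "+c"(width_tmp)  // byte count, decremented to 0
        :
        : "memory", "cc"
      );
    }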
@@ -4167,14 +4169,14 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                        uint32 value) {
   asm volatile (
     "movd %3,%%xmm2 \n"
-    "sub %0,%1 \n"
     "punpcklbw %%xmm2,%%xmm2 \n"
     "punpcklqdq %%xmm2,%%xmm2 \n"
     // 4 pixel loop.
     ".p2align 2 \n"
   "1: \n"
-    "movdqa (%0),%%xmm0 \n"
+    "movdqa "MEMACCESS(0)",%%xmm0 \n"
+    "lea "MEMLEA(0x10,0)",%0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
     "punpcklbw %%xmm0,%%xmm0 \n"
     "punpckhbw %%xmm1,%%xmm1 \n"
@@ -4184,8 +4186,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
     "psrlw $0x8,%%xmm1 \n"
     "packuswb %%xmm1,%%xmm0 \n"
     "sub $0x4,%2 \n"
-    "movdqa %%xmm0,(%0,%1,1) \n"
-    "lea 0x10(%0),%0 \n"
+    "movdqa %%xmm0,"MEMACCESS(1)" \n"
+    "lea "MEMLEA(0x10,1)",%1 \n"
     "jg 1b \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
@@ -4205,14 +4207,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   asm volatile (
     "pxor %%xmm5,%%xmm5 \n"
-    "sub %0,%1 \n"
-    "sub %0,%2 \n"
     // 4 pixel loop.
     ".p2align 4 \n"
   "1: \n"
-    "movdqu (%0),%%xmm0 \n"
-    "movdqu (%0,%1),%%xmm2 \n"
+    "movdqu "MEMACCESS(0)",%%xmm0 \n"
+    "lea "MEMLEA(0x10,0)",%0 \n"
+    "movdqu "MEMACCESS(1)",%%xmm2 \n"
+    "lea "MEMLEA(0x10,1)",%1 \n"
     "movdqu %%xmm0,%%xmm1 \n"
     "movdqu %%xmm2,%%xmm3 \n"
     "punpcklbw %%xmm0,%%xmm0 \n"
@@ -4223,8 +4225,8 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     "pmulhuw %%xmm3,%%xmm1 \n"
     "packuswb %%xmm1,%%xmm0 \n"
     "sub $0x4,%3 \n"
-    "movdqu %%xmm0,(%0,%2,1) \n"
-    "lea 0x10(%0),%0 \n"
+    "movdqu %%xmm0,"MEMACCESS(2)" \n"
+    "lea "MEMLEA(0x10,2)",%2 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
@@ -4244,18 +4246,17 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                      uint8* dst_argb, int width) {
   asm volatile (
-    "sub %0,%1 \n"
-    "sub %0,%2 \n"
     // 4 pixel loop.
     ".p2align 4 \n"
   "1: \n"
-    "movdqu (%0),%%xmm0 \n"
-    "movdqu (%0,%1),%%xmm1 \n"
+    "movdqu "MEMACCESS(0)",%%xmm0 \n"
+    "lea "MEMLEA(0x10,0)",%0 \n"
+    "movdqu "MEMACCESS(1)",%%xmm1 \n"
+    "lea "MEMLEA(0x10,1)",%1 \n"
     "paddusb %%xmm1,%%xmm0 \n"
     "sub $0x4,%3 \n"
-    "movdqu %%xmm0,(%0,%2,1) \n"
-    "lea 0x10(%0),%0 \n"
+    "movdqu %%xmm0,"MEMACCESS(2)" \n"
+    "lea "MEMLEA(0x10,2)",%2 \n"
     "jg 1b \n"
   : "+r"(src_argb0),  // %0
     "+r"(src_argb1),  // %1
@@ -4275,18 +4276,17 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   asm volatile (
-    "sub %0,%1 \n"
-    "sub %0,%2 \n"
     // 4 pixel loop.
     ".p2align 4 \n"
   "1: \n"
-    "movdqu (%0),%%xmm0 \n"
-    "movdqu (%0,%1),%%xmm1 \n"
+    "movdqu "MEMACCESS(0)",%%xmm0 \n"
+    "lea "MEMLEA(0x10,0)",%0 \n"
+    "movdqu "MEMACCESS(1)",%%xmm1 \n"
+    "lea "MEMLEA(0x10,1)",%1 \n"
     "psubusb %%xmm1,%%xmm0 \n"
     "sub $0x4,%3 \n"
-    "movdqu %%xmm0,(%0,%2,1) \n"
-    "lea 0x10(%0),%0 \n"
+    "movdqu %%xmm0,"MEMACCESS(2)" \n"
+    "lea "MEMLEA(0x10,2)",%2 \n"
     "jg 1b \n"
   : "+r"(src_argb0),  // %0
     "+r"(src_argb1),  // %1
@@ -4793,6 +4793,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
+#ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           ptrdiff_t src_stride, int dst_width,
@@ -4895,6 +4896,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 #endif
   );
 }
+#endif  // HAS_INTERPOLATEROW_SSSE3
 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
@@ -5009,6 +5011,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 }
 #endif  // HAS_INTERPOLATEROW_SSE2
+#ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
@@ -5111,6 +5114,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 #endif
   );
 }
+#endif  // HAS_INTERPOLATEROW_SSSE3
 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
@@ -5225,6 +5229,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 }
 #endif  // HAS_INTERPOLATEROW_SSE2
+#ifdef HAS_HALFROW_SSE2
 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                   uint8* dst_uv, int pix) {
   asm volatile (
@@ -5247,7 +5252,9 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
 #endif
   );
 }
+#endif  // HAS_HALFROW_SSE2
+#ifdef HAS_ARGBTOBAYERROW_SSSE3
 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
   asm volatile (
@@ -5275,7 +5282,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
 #endif
   );
 }
+#endif  // HAS_ARGBTOBAYERROW_SSSE3
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                           const uint8* shuffler, int pix) {
@@ -5330,7 +5339,9 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
 #endif
   );
 }
+#endif  // HAS_ARGBSHUFFLEROW_SSSE3
+#ifdef HAS_I422TOYUY2ROW_SSE2
 void I422ToYUY2Row_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -5365,7 +5376,9 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
 #endif
   );
 }
+#endif  // HAS_I422TOYUY2ROW_SSE2
+#ifdef HAS_I422TOUYVYROW_SSE2
 void I422ToUYVYRow_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -5400,9 +5413,11 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
 #endif
   );
 }
+#endif  // HAS_I422TOUYVYROW_SSE2
+#ifdef HAS_FIXEDDIV_X86
 // Divide num by div and return as 16.16 fixed point result.
-int FixedDiv(int num, int div) {
+int FixedDiv_X86(int num, int div) {
   asm volatile (
     "cdq \n"
     "shld $0x10,%%eax,%%edx \n"
@@ -5415,6 +5430,7 @@ int FixedDiv(int num, int div) {
   );
   return num;
 }
+#endif  // HAS_FIXEDDIV_X86
 #endif  // defined(__x86_64__) || defined(__i386__)
 #ifdef __cplusplus
......
@@ -5239,13 +5239,13 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
     mov edx, [esp + 8]  // dst_argb
     mov ecx, [esp + 12]  // width
     movd xmm2, [esp + 16]  // value
-    sub edx, eax
     punpcklbw xmm2, xmm2
     punpcklqdq xmm2, xmm2
     align 16
  convertloop:
     movdqa xmm0, [eax]  // read 4 pixels
+    lea eax, [eax + 16]
     movdqa xmm1, xmm0
     punpcklbw xmm0, xmm0  // first 2
     punpckhbw xmm1, xmm1  // next 2
@@ -5255,8 +5255,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
     psrlw xmm1, 8
     packuswb xmm0, xmm1
     sub ecx, 4
-    movdqa [eax + edx], xmm0
-    lea eax, [eax + 16]
+    movdqa [edx], xmm0
+    lea edx, [edx + 16]
     jg convertloop
     ret
@@ -5276,25 +5276,25 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     mov edx, [esp + 4 + 12]  // dst_argb
     mov ecx, [esp + 4 + 16]  // width
     pxor xmm5, xmm5  // constant 0
-    sub esi, eax
-    sub edx, eax
     align 16
  convertloop:
     movdqu xmm0, [eax]  // read 4 pixels from src_argb0
-    movdqu xmm2, [eax + esi]  // read 4 pixels from src_argb1
+    movdqu xmm2, [esi]  // read 4 pixels from src_argb1
     movdqu xmm1, xmm0
     movdqu xmm3, xmm2
     punpcklbw xmm0, xmm0  // first 2
     punpckhbw xmm1, xmm1  // next 2
     punpcklbw xmm2, xmm5  // first 2
     punpckhbw xmm3, xmm5  // next 2
     pmulhuw xmm0, xmm2  // src_argb0 * src_argb1 first 2
     pmulhuw xmm1, xmm3  // src_argb0 * src_argb1 next 2
+    lea eax, [eax + 16]
+    lea esi, [esi + 16]
     packuswb xmm0, xmm1
     sub ecx, 4
-    movdqu [eax + edx], xmm0
-    lea eax, [eax + 16]
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
     jg convertloop
     pop esi
@@ -5315,8 +5315,6 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     mov esi, [esp + 4 + 8]  // src_argb1
     mov edx, [esp + 4 + 12]  // dst_argb
     mov ecx, [esp + 4 + 16]  // width
-    sub esi, eax
-    sub edx, eax
     sub ecx, 4
     jl convertloop49
@@ -5324,11 +5322,13 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     align 16
  convertloop4:
     movdqu xmm0, [eax]  // read 4 pixels from src_argb0
-    movdqu xmm1, [eax + esi]  // read 4 pixels from src_argb1
+    lea eax, [eax + 16]
+    movdqu xmm1, [esi]  // read 4 pixels from src_argb1
+    lea esi, [esi + 16]
     paddusb xmm0, xmm1  // src_argb0 + src_argb1
     sub ecx, 4
-    movdqu [eax + edx], xmm0
-    lea eax, [eax + 16]
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
     jge convertloop4
  convertloop49:
@@ -5337,11 +5337,13 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  convertloop1:
     movd xmm0, [eax]  // read 1 pixels from src_argb0
-    movd xmm1, [eax + esi]  // read 1 pixels from src_argb1
+    lea eax, [eax + 4]
+    movd xmm1, [esi]  // read 1 pixels from src_argb1
+    lea esi, [esi + 4]
     paddusb xmm0, xmm1  // src_argb0 + src_argb1
     sub ecx, 1
-    movd [eax + edx], xmm0
-    lea eax, [eax + 4]
+    movd [edx], xmm0
+    lea edx, [edx + 4]
     jge convertloop1
  convertloop19:
@@ -5362,17 +5364,17 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     mov esi, [esp + 4 + 8]  // src_argb1
     mov edx, [esp + 4 + 12]  // dst_argb
     mov ecx, [esp + 4 + 16]  // width
-    sub esi, eax
-    sub edx, eax
     align 16
  convertloop:
     movdqu xmm0, [eax]  // read 4 pixels from src_argb0
-    movdqu xmm1, [eax + esi]  // read 4 pixels from src_argb1
+    lea eax, [eax + 16]
+    movdqu xmm1, [esi]  // read 4 pixels from src_argb1
+    lea esi, [esi + 16]
     psubusb xmm0, xmm1  // src_argb0 - src_argb1
     sub ecx, 4
-    movdqu [eax + edx], xmm0
-    lea eax, [eax + 16]
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
     jg convertloop
     pop esi
@@ -5392,14 +5394,14 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     mov esi, [esp + 4 + 8]  // src_argb1
     mov edx, [esp + 4 + 12]  // dst_argb
     mov ecx, [esp + 4 + 16]  // width
     vpxor ymm5, ymm5, ymm5  // constant 0
-    sub esi, eax
-    sub edx, eax
     align 16
  convertloop:
     vmovdqu ymm1, [eax]  // read 8 pixels from src_argb0
-    vmovdqu ymm3, [eax + esi]  // read 8 pixels from src_argb1
+    lea eax, [eax + 32]
+    vmovdqu ymm3, [esi]  // read 8 pixels from src_argb1
+    lea esi, [esi + 32]
     vpunpcklbw ymm0, ymm1, ymm1  // low 4
     vpunpckhbw ymm1, ymm1, ymm1  // high 4
     vpunpcklbw ymm2, ymm3, ymm5  // low 4
@@ -5407,8 +5409,8 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     vpmulhuw ymm0, ymm0, ymm2  // src_argb0 * src_argb1 low 4
     vpmulhuw ymm1, ymm1, ymm3  // src_argb0 * src_argb1 high 4
     vpackuswb ymm0, ymm0, ymm1
-    vmovdqu [eax + edx], ymm0
-    lea eax, [eax + 32]
+    vmovdqu [edx], ymm0
+    lea edx, [edx + 32]
     sub ecx, 8
     jg convertloop
@@ -5430,15 +5432,15 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     mov esi, [esp + 4 + 8]  // src_argb1
     mov edx, [esp + 4 + 12]  // dst_argb
     mov ecx, [esp + 4 + 16]  // width
-    sub esi, eax
-    sub edx, eax
     align 16
  convertloop:
     vmovdqu ymm0, [eax]  // read 8 pixels from src_argb0
-    vpaddusb ymm0, ymm0, [eax + esi]  // add 8 pixels from src_argb1
-    vmovdqu [eax + edx], ymm0
     lea eax, [eax + 32]
+    vpaddusb ymm0, ymm0, [esi]  // add 8 pixels from src_argb1
+    lea esi, [esi + 32]
+    vmovdqu [edx], ymm0
+    lea edx, [edx + 32]
     sub ecx, 8
     jg convertloop
@@ -5460,15 +5462,15 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     mov esi, [esp + 4 + 8]  // src_argb1
     mov edx, [esp + 4 + 12]  // dst_argb
     mov ecx, [esp + 4 + 16]  // width
-    sub esi, eax
-    sub edx, eax
     align 16
  convertloop:
     vmovdqu ymm0, [eax]  // read 8 pixels from src_argb0
-    vpsubusb ymm0, ymm0, [eax + esi]  // src_argb0 - src_argb1
-    vmovdqu [eax + edx], ymm0
     lea eax, [eax + 32]
+    vpsubusb ymm0, ymm0, [esi]  // src_argb0 - src_argb1
+    lea esi, [esi + 32]
+    vmovdqu [edx], ymm0
+    lea edx, [edx + 32]
     sub ecx, 8
     jg convertloop
@@ -6646,9 +6648,10 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
   }
 }
+#ifdef HAS_FIXEDDIV_X86
 // Divide num by div and return as 16.16 fixed point result.
 __declspec(naked) __declspec(align(16))
-int FixedDiv(int num, int div) {
+int FixedDiv_X86(int num, int div) {
   __asm {
     mov eax, [esp + 4]  // num
     cdq  // extend num to 64 bits
@@ -6658,6 +6661,7 @@ int FixedDiv(int num, int div) {
     ret
   }
 }
+#endif  // HAS_FIXEDDIV_X86
 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 #ifdef __cplusplus
......