Commit 9335518f authored by fbarchard@google.com

Port some of the conversion routines to NaCl

BUG=253
TEST=validator
R=nfullagar@google.com, ryanpetrie@google.com

Review URL: https://webrtc-codereview.appspot.com/1983004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@748 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent b8ffdc9e
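The port works by routing every load/store in the GCC inline assembly through a small set of memory-operand macros (MEMACCESS, MEMACCESS2, MEMLEA, MEMLEA4) that expand to NaCl-sandboxed operands when building for Native Client x86-64 and to ordinary AT&T operands everywhere else. The sketch below is illustrative only: CopyRow_Example is not a libyuv routine and the #if guard is paraphrased; it simply shows the macros from this change and how a typical row loop uses them.

/* Minimal sketch, not part of the commit: the NaCl-aware memory-operand
 * macros from this change plus a toy row loop that uses them.
 * CopyRow_Example is hypothetical; the guard condition is paraphrased. */
#include <stdint.h>

#if defined(__native_client__) && defined(__x86_64__)
/* NaCl x86-64: memory accesses must be %r15-relative with a 32-bit index,
 * so the operand is written as "%%nacl:offset(%%r15,%q<base>)". */
#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")"
#else
/* Everywhere else: plain AT&T operands, identical to the old code. */
#define MEMACCESS(base) "(%" #base ")"
#define MEMACCESS2(offset, base) #offset "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"
#endif

/* Copy 16 bytes per iteration; count must be a positive multiple of 16. */
static void CopyRow_Example(const uint8_t* src, uint8_t* dst, int count) {
  asm volatile (
    "1:                                            \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0             \n"  // load 16 bytes of src
    "lea       " MEMLEA(0x10,0) ",%0               \n"  // src += 16
    "movdqu    %%xmm0," MEMACCESS(1) "             \n"  // store 16 bytes to dst
    "lea       " MEMLEA(0x10,1) ",%1               \n"  // dst += 16
    "sub       $0x10,%2                            \n"
    "jg        1b                                  \n"
    : "+r"(src), "+r"(dst), "+r"(count)
    :
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0"
#endif
  );
}

Built with a normal x86 toolchain this expands to the familiar (%0)/0x10(%0) operands; built with the NaCl x86-64 toolchain the same source yields %nacl:-prefixed, %r15-based operands, which is the form the NaCl validator (the TEST above) accepts.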
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 747
Version: 748
License: BSD
License File: LICENSE
@@ -44,15 +44,21 @@ extern "C" {
#define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBCOLORMATRIXROW_SSSE3
#define HAS_ARGBGRAYROW_SSSE3
#define HAS_ARGBMIRRORROW_SSSE3
#define HAS_ARGBMULTIPLYROW_SSE2
#define HAS_ARGBQUANTIZEROW_SSE2
#define HAS_ARGBSEPIAROW_SSSE3
#define HAS_ARGBSHADEROW_SSE2
#define HAS_ARGBSUBTRACTROW_SSE2
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
// Conversions:
#define HAS_ARGBTOBAYERROW_SSSE3
#define HAS_ARGBSHUFFLEROW_SSSE3
#define HAS_FIXEDDIV_X86
#define HAS_ARGBTOYROW_SSSE3
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#endif
// The following are available on all x86 platforms except NaCL x64:
@@ -65,10 +71,8 @@ extern "C" {
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGB1555TOARGBROW_SSE2
#define HAS_ARGB4444TOARGBROW_SSE2
#define HAS_ARGBSHUFFLEROW_SSSE3
#define HAS_ARGBTOARGB1555ROW_SSE2
#define HAS_ARGBTOARGB4444ROW_SSE2
#define HAS_ARGBTOBAYERROW_SSSE3
#define HAS_ARGBTORAWROW_SSSE3
#define HAS_ARGBTORGB24ROW_SSSE3
#define HAS_ARGBTORGB565ROW_SSE2
@@ -76,15 +80,12 @@ extern "C" {
#define HAS_ARGBTOUV444ROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_ARGBTOUVJROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
#define HAS_COPYROW_ERMS
#define HAS_HALFROW_SSE2
#define HAS_I400TOARGBROW_SSE2
#define HAS_I411TOARGBROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3
@@ -126,9 +127,7 @@ extern "C" {
// Effects:
#define HAS_ARGBAFFINEROW_SSE2
#define HAS_ARGBMIRRORROW_SSSE3
#define HAS_ARGBUNATTENUATEROW_SSE2
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
#define HAS_INTERPOLATEROW_SSE2
#define HAS_INTERPOLATEROW_SSSE3
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 747
#define LIBYUV_VERSION 748
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -24,13 +24,17 @@ extern "C" {
#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")"
#define MEMLEA4(offset, base, index, scale) \
#offset "(%q" #base ",%q" #index "," #scale ")"
#else
#define MEMACCESS(base) "(%" #base ")"
#define MEMACCESS2(offset, base) #offset "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"
#define MEMLEA4(offset, base, index, scale) \
#offset "(%" #base ",%" #index "," #scale ")"
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB
static vec8 kARGBToY = {
@@ -41,6 +45,9 @@ static vec8 kARGBToY = {
static vec8 kARGBToYJ = {
15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
static vec8 kARGBToU = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
@@ -113,6 +120,9 @@ static uvec8 kAddUV128 = {
static uvec16 kAddUVJ128 = {
0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
#ifdef HAS_RGB24TOARGBROW_SSSE3
// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {
@@ -143,24 +153,26 @@ static uvec8 kShuffleMaskARGBToRGB24_0 = {
static uvec8 kShuffleMaskARGBToRAW_0 = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
#endif // HAS_RGB24TOARGBROW_SSSE3
#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
"movq "MEMACCESS(0)",%%xmm0 \n"
"lea "MEMLEA(0x8,0)",%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n"
"movdqa %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"movdqa %%xmm0,"MEMACCESS(1)" \n"
"movdqa %%xmm1,"MEMACCESS2(0x10,1)" \n"
"lea "MEMLEA(0x20,1)",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_y), // %0
@@ -181,17 +193,17 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
"pslld $0x18,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
"movq "MEMACCESS(0)",%%xmm0 \n"
"lea "MEMLEA(0x8,0)",%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"movdqu %%xmm0,"MEMACCESS(1)" \n"
"movdqu %%xmm1,"MEMACCESS2(0x10,1)" \n"
"lea "MEMLEA(0x20,1)",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_y), // %0
@@ -204,7 +216,9 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
#endif
);
}
#endif // HAS_I400TOARGBROW_SSE2
#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
@@ -627,22 +641,24 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
#endif
);
}
#endif // HAS_RGB24TOARGBROW_SSSE3
#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n"
"movdqa 0x30(%0),%%xmm3 \n"
"movdqa "MEMACCESS(0)",%%xmm0 \n"
"movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n"
"movdqa "MEMACCESS2(0x20,0)",%%xmm2 \n"
"movdqa "MEMACCESS2(0x30,0)",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"lea "MEMLEA(0x40,0)",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
@@ -650,8 +666,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"movdqa %%xmm0,"MEMACCESS(1)" \n"
"lea "MEMLEA(0x10,1)",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@@ -665,74 +681,76 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
);
}
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n"
"movdqa 0x30(%0),%%xmm3 \n"
"movdqu "MEMACCESS(0)",%%xmm0 \n"
"movdqu "MEMACCESS2(0x10,0)",%%xmm1 \n"
"movdqu "MEMACCESS2(0x20,0)",%%xmm2 \n"
"movdqu "MEMACCESS2(0x30,0)",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"lea "MEMLEA(0x40,0)",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"paddw %%xmm5,%%xmm0 \n"
"paddw %%xmm5,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"movdqu %%xmm0,"MEMACCESS(1)" \n"
"lea "MEMLEA(0x10,1)",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kARGBToYJ), // %3
"m"(kAddYJ64) // %4
: "m"(kARGBToY), // %3
"m"(kAddY16) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
#endif // HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
#ifdef HAS_ARGBTOYJROW_SSSE3
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"movdqa "MEMACCESS(0)",%%xmm0 \n"
"movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n"
"movdqa "MEMACCESS2(0x20,0)",%%xmm2 \n"
"movdqa "MEMACCESS2(0x30,0)",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"lea "MEMLEA(0x40,0)",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"paddw %%xmm5,%%xmm0 \n"
"paddw %%xmm5,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"movdqa %%xmm0,"MEMACCESS(1)" \n"
"lea "MEMLEA(0x10,1)",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kARGBToY), // %3
"m"(kAddY16) // %4
: "m"(kARGBToYJ), // %3
"m"(kAddYJ64) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -746,15 +764,15 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"movdqa %4,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"movdqu "MEMACCESS(0)",%%xmm0 \n"
"movdqu "MEMACCESS2(0x10,0)",%%xmm1 \n"
"movdqu "MEMACCESS2(0x20,0)",%%xmm2 \n"
"movdqu "MEMACCESS2(0x30,0)",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"lea "MEMLEA(0x40,0)",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"paddw %%xmm5,%%xmm0 \n"
@@ -763,8 +781,8 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"movdqu %%xmm0,"MEMACCESS(1)" \n"
"lea "MEMLEA(0x10,1)",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@@ -777,7 +795,9 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
#endif
);
}
#endif // HAS_ARGBTOYJROW_SSSE3
#ifdef HAS_ARGBTOUVROW_SSSE3
// TODO(fbarchard): pass xmm constants to single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
@@ -1873,7 +1893,7 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
#endif
);
}
#endif // HAS_ARGBTOYROW_SSSE3
#endif // HAS_ARGBTOUVROW_SSSE3
#ifdef HAS_I422TOARGBROW_SSSE3
#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
@@ -2834,15 +2854,16 @@ static uvec8 kARGBShuffleMirror = {
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"lea "MEMLEA4(-0x10,0,2,4)",%0 \n"
"movdqa %3,%%xmm5 \n"
"lea -0x10(%0),%0 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0,%2,4),%%xmm0 \n"
"movdqa "MEMACCESS(0)",%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"lea "MEMLEA(-0x10,0)",%0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"movdqa %%xmm0,"MEMACCESS(1)" \n"
"lea "MEMLEA(0x10,1)",%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -4053,9 +4074,9 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
int width) {
asm volatile (
"movd (%2),%%xmm2 \n"
"movd 0x4(%2),%%xmm3 \n"
"movd 0x8(%2),%%xmm4 \n"
"movd "MEMACCESS(2)",%%xmm2 \n"
"movd "MEMACCESS2(0x4,2)",%%xmm3 \n"
"movd "MEMACCESS2(0x8,2)",%%xmm4 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
"pshufd $0x0,%%xmm3,%%xmm3 \n"
"pshufd $0x0,%%xmm4,%%xmm4 \n"
@@ -4136,11 +4157,11 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
"movdqa "MEMACCESS(0)",%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"movdqa (%0),%%xmm1 \n"
"movdqa "MEMACCESS(0)",%%xmm1 \n"
"punpckhbw %%xmm5,%%xmm1 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"pmullw %%xmm3,%%xmm0 \n"
"movdqa (%0),%%xmm7 \n"
"movdqa "MEMACCESS(0)",%%xmm7 \n"
"pmullw %%xmm3,%%xmm1 \n"
"pand %%xmm6,%%xmm7 \n"
"paddw %%xmm4,%%xmm0 \n"
@@ -4520,7 +4541,6 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
const int32* previous_cumsum, int width) {
asm volatile (
"sub %1,%2 \n"
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm1,%%xmm1 \n"
"sub $0x4,%3 \n"
@@ -4531,8 +4551,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
// 4 pixel loop \n"
".p2align 2 \n"
"40: \n"
"movdqu (%0),%%xmm2 \n"
"lea 0x10(%0),%0 \n"
"movdqu "MEMACCESS(0)",%%xmm2 \n"
"lea "MEMLEA(0x10,0)",%0 \n"
"movdqa %%xmm2,%%xmm4 \n"
"punpcklbw %%xmm1,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
@@ -4543,22 +4563,23 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"punpcklwd %%xmm1,%%xmm4 \n"
"punpckhwd %%xmm1,%%xmm5 \n"
"paddd %%xmm2,%%xmm0 \n"
"movdqa (%1,%2,1),%%xmm2 \n"
"movdqa "MEMACCESS(2)",%%xmm2 \n"
"paddd %%xmm0,%%xmm2 \n"
"paddd %%xmm3,%%xmm0 \n"
"movdqa 0x10(%1,%2,1),%%xmm3 \n"
"movdqa "MEMACCESS2(0x10,2)",%%xmm3 \n"
"paddd %%xmm0,%%xmm3 \n"
"paddd %%xmm4,%%xmm0 \n"
"movdqa 0x20(%1,%2,1),%%xmm4 \n"
"movdqa "MEMACCESS2(0x20,2)",%%xmm4 \n"
"paddd %%xmm0,%%xmm4 \n"
"paddd %%xmm5,%%xmm0 \n"
"movdqa 0x30(%1,%2,1),%%xmm5 \n"
"movdqa "MEMACCESS2(0x30,2)",%%xmm5 \n"
"lea "MEMLEA(0x40,2)",%2 \n"
"paddd %%xmm0,%%xmm5 \n"
"movdqa %%xmm2,(%1) \n"
"movdqa %%xmm3,0x10(%1) \n"
"movdqa %%xmm4,0x20(%1) \n"
"movdqa %%xmm5,0x30(%1) \n"
"lea 0x40(%1),%1 \n"
"movdqa %%xmm2,"MEMACCESS(1)" \n"
"movdqa %%xmm3,"MEMACCESS2(0x10,1)" \n"
"movdqa %%xmm4,"MEMACCESS2(0x20,1)" \n"
"movdqa %%xmm5,"MEMACCESS2(0x30,1)" \n"
"lea "MEMLEA(0x40,1)",%1 \n"
"sub $0x4,%3 \n"
"jge 40b \n"
@@ -4569,15 +4590,15 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
// 1 pixel loop \n"
".p2align 2 \n"
"10: \n"
"movd (%0),%%xmm2 \n"
"lea 0x4(%0),%0 \n"
"movd "MEMACCESS(0)",%%xmm2 \n"
"lea "MEMLEA(0x4,0)",%0 \n"
"punpcklbw %%xmm1,%%xmm2 \n"
"punpcklwd %%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm0 \n"
"movdqu (%1,%2,1),%%xmm2 \n"
"movdqu "MEMACCESS(2)",%%xmm2 \n"
"paddd %%xmm0,%%xmm2 \n"
"movdqu %%xmm2,(%1) \n"
"lea 0x10(%1),%1 \n"
"movdqu %%xmm2,"MEMACCESS(1)" \n"
"lea "MEMLEA(0x10,1)",%1 \n"
"sub $0x1,%3 \n"
"jge 10b \n"
@@ -5260,19 +5281,20 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
asm volatile (
// NaCL caveat - assumes movd is from GPR
"movd %3,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"movdqa "MEMACCESS(0)",%%xmm0 \n"
"movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n"
"lea "MEMLEA(0x20,0)",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"punpckldq %%xmm1,%%xmm0 \n"
"sub $0x8,%2 \n"
"movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n"
"movq %%xmm0,"MEMACCESS(1)" \n"
"lea "MEMLEA(0x8,1)",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
@@ -5291,18 +5313,18 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
"movdqa (%3),%%xmm5 \n"
"movdqa "MEMACCESS(3)",%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"movdqa "MEMACCESS(0)",%%xmm0 \n"
"movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n"
"lea "MEMLEA(0x20,0)",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqa %%xmm0,(%1) \n"
"movdqa %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"movdqa %%xmm0,"MEMACCESS(1)" \n"
"movdqa %%xmm1,"MEMACCESS2(0x10,1)" \n"
"lea "MEMLEA(0x20,1)",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -5318,18 +5340,18 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
"movdqa (%3),%%xmm5 \n"
"movdqa "MEMACCESS(3)",%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"movdqu "MEMACCESS(0)",%%xmm0 \n"
"movdqu "MEMACCESS2(0x10,0)",%%xmm1 \n"
"lea "MEMLEA(0x20,0)",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"movdqu %%xmm0,"MEMACCESS(1)" \n"
"movdqu %%xmm1,"MEMACCESS2(0x10,1)" \n"
"lea "MEMLEA(0x20,1)",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -3322,12 +3322,13 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
movdqa xmm5, kARGBShuffleMirror
lea eax, [eax - 16]
align 16
convertloop:
movdqa xmm0, [eax + ecx * 4]
movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm5
sub ecx, 4
movdqa [edx], xmm0
@@ -5806,7 +5807,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
mov edx, cumsum
mov esi, previous_cumsum
mov ecx, width
sub esi, edx
pxor xmm0, xmm0
pxor xmm1, xmm1
@@ -5833,19 +5833,20 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
punpckhwd xmm5, xmm1
paddd xmm0, xmm2
movdqa xmm2, [edx + esi] // previous row above.
movdqa xmm2, [esi] // previous row above.
paddd xmm2, xmm0
paddd xmm0, xmm3
movdqa xmm3, [edx + esi + 16]
movdqa xmm3, [esi + 16]
paddd xmm3, xmm0
paddd xmm0, xmm4
movdqa xmm4, [edx + esi + 32]
movdqa xmm4, [esi + 32]
paddd xmm4, xmm0
paddd xmm0, xmm5
movdqa xmm5, [edx + esi + 48]
movdqa xmm5, [esi + 48]
lea esi, [esi + 64]
paddd xmm5, xmm0
movdqa [edx], xmm2
@@ -5869,7 +5870,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
punpcklbw xmm2, xmm1
punpcklwd xmm2, xmm1
paddd xmm0, xmm2
movdqu xmm2, [edx + esi]
movdqu xmm2, [esi]
lea esi, [esi + 16]
paddd xmm2, xmm0
movdqu [edx], xmm2
lea edx, [edx + 16]
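One pattern worth calling out from the ComputeCumulativeSumRow_SSE2 hunks above: MEMACCESS wraps a single base register, so the old trick of subtracting one pointer from the other (sub %1,%2 / sub esi, edx) and then addressing the previous row as a base+index operand ((%1,%2,1) / [edx + esi]) no longer fits the sandboxed operand form; the ported code keeps the pointers separate and advances each with its own lea. Below is a hypothetical minimal loop in the ported style, not libyuv code, assuming width is a positive multiple of 4.

/* Hypothetical sketch of the single-base-register pattern used by the port:
 * each buffer keeps its own pointer and its own lea, instead of addressing
 * the second buffer as base+index off the first.  Adds 'width' int32 values
 * from 'prev' into 'out'; width must be a positive multiple of 4. */
#include <stdint.h>

#define MEMACCESS(base) "(%" #base ")"           /* non-NaCl expansion shown */
#define MEMLEA(offset, base) #offset "(%" #base ")"

static void AddRows_Example(const int32_t* prev, int32_t* out, int width) {
  asm volatile (
    "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"  // 4 values from prev
    "lea       " MEMLEA(0x10,0) ",%0             \n"  // prev += 4 (own lea)
    "movdqu    " MEMACCESS(1) ",%%xmm1           \n"  // 4 values from out
    "paddd     %%xmm0,%%xmm1                     \n"  // out += prev, elementwise
    "movdqu    %%xmm1," MEMACCESS(1) "           \n"  // store back to out
    "lea       " MEMLEA(0x10,1) ",%1             \n"  // out += 4 (own lea)
    "sub       $0x4,%2                           \n"
    "jg        1b                                \n"
    : "+r"(prev), "+r"(out), "+r"(width)
    :
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}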