Commit 9335518f authored by fbarchard@google.com

Port some of the conversion routines to nacl

BUG=253
TEST=validator
R=nfullagar@google.com, ryanpetrie@google.com

Review URL: https://webrtc-codereview.appspot.com/1983004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@748 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent b8ffdc9e
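The substance of the port shows up in the source/row_posix.cc hunks below: instead of hard-coding memory operands such as "(%0)" or "lea 0x8(%0),%0" in the GCC inline assembly, every load, store, and address computation now goes through the MEMACCESS/MEMACCESS2/MEMLEA string macros, which expand to %nacl:-prefixed, %r15-relative operands when building for Native Client x86-64 and to the original plain operands everywhere else. A minimal sketch of the pattern in C; the preprocessor guard and the helper name are assumptions, since the hunk below only shows the macro bodies:

/* Assumed guard; the diff shows only the macro bodies. */
#if defined(__native_client__) && defined(__x86_64__)
#define MEMACCESS(base)      "%%nacl:(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")"
#else
#define MEMACCESS(base)      "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"
#endif

/* Hypothetical helper written in the ported style: the load and the store go
 * through MEMACCESS() instead of a hard-coded "(%0)" / "(%1)". */
static void CopyBlock16(const unsigned char* src, unsigned char* dst) {
  asm volatile (
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    "movdqu    %%xmm0," MEMACCESS(1) "           \n"
    :
    : "r"(src),  /* %0 */
      "r"(dst)   /* %1 */
    : "memory", "xmm0"
  );
}

On regular x86 the template still reads "movdqu (%0),%%xmm0"; under NaCl x64 it becomes "movdqu %%nacl:(%%r15,%q0),%%xmm0", which is presumably what the TEST=validator line above exercises.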
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 747 Version: 748
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -44,15 +44,21 @@ extern "C" { ...@@ -44,15 +44,21 @@ extern "C" {
#define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_ARGBCOLORMATRIXROW_SSSE3
#define HAS_ARGBGRAYROW_SSSE3 #define HAS_ARGBGRAYROW_SSSE3
#define HAS_ARGBMIRRORROW_SSSE3
#define HAS_ARGBMULTIPLYROW_SSE2 #define HAS_ARGBMULTIPLYROW_SSE2
#define HAS_ARGBQUANTIZEROW_SSE2 #define HAS_ARGBQUANTIZEROW_SSE2
#define HAS_ARGBSEPIAROW_SSSE3 #define HAS_ARGBSEPIAROW_SSSE3
#define HAS_ARGBSHADEROW_SSE2 #define HAS_ARGBSHADEROW_SSE2
#define HAS_ARGBSUBTRACTROW_SSE2 #define HAS_ARGBSUBTRACTROW_SSE2
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
// Conversions: // Conversions:
#define HAS_ARGBTOBAYERROW_SSSE3
#define HAS_ARGBSHUFFLEROW_SSSE3
#define HAS_FIXEDDIV_X86 #define HAS_FIXEDDIV_X86
#define HAS_ARGBTOYROW_SSSE3
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#endif #endif
// The following are available on all x86 platforms except NaCL x64: // The following are available on all x86 platforms except NaCL x64:
...@@ -65,10 +71,8 @@ extern "C" { ...@@ -65,10 +71,8 @@ extern "C" {
#define HAS_ABGRTOYROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB1555TOARGBROW_SSE2
#define HAS_ARGB4444TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2
#define HAS_ARGBSHUFFLEROW_SSSE3
#define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB1555ROW_SSE2
#define HAS_ARGBTOARGB4444ROW_SSE2 #define HAS_ARGBTOARGB4444ROW_SSE2
#define HAS_ARGBTOBAYERROW_SSSE3
#define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORAWROW_SSSE3
#define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3
#define HAS_ARGBTORGB565ROW_SSE2 #define HAS_ARGBTORGB565ROW_SSE2
...@@ -76,15 +80,12 @@ extern "C" { ...@@ -76,15 +80,12 @@ extern "C" {
#define HAS_ARGBTOUV444ROW_SSSE3 #define HAS_ARGBTOUV444ROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOUVROW_SSSE3
#define HAS_ARGBTOUVJROW_SSSE3 #define HAS_ARGBTOUVJROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3 #define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_SSE2 #define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86 #define HAS_COPYROW_X86
#define HAS_COPYROW_ERMS #define HAS_COPYROW_ERMS
#define HAS_HALFROW_SSE2 #define HAS_HALFROW_SSE2
#define HAS_I400TOARGBROW_SSE2
#define HAS_I411TOARGBROW_SSSE3 #define HAS_I411TOARGBROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3
...@@ -126,9 +127,7 @@ extern "C" { ...@@ -126,9 +127,7 @@ extern "C" {
// Effects: // Effects:
#define HAS_ARGBAFFINEROW_SSE2 #define HAS_ARGBAFFINEROW_SSE2
#define HAS_ARGBMIRRORROW_SSSE3
#define HAS_ARGBUNATTENUATEROW_SSE2 #define HAS_ARGBUNATTENUATEROW_SSE2
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
#define HAS_INTERPOLATEROW_SSE2 #define HAS_INTERPOLATEROW_SSE2
#define HAS_INTERPOLATEROW_SSSE3 #define HAS_INTERPOLATEROW_SSSE3
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 747 #define LIBYUV_VERSION 748
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -24,13 +24,17 @@ extern "C" { ...@@ -24,13 +24,17 @@ extern "C" {
#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" #define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" #define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")" #define MEMLEA(offset, base) #offset "(%q" #base ")"
#define MEMLEA4(offset, base, index, scale) \
#offset "(%q" #base ",%q" #index "," #scale ")"
#else #else
#define MEMACCESS(base) "(%" #base ")" #define MEMACCESS(base) "(%" #base ")"
#define MEMACCESS2(offset, base) #offset "(%" #base ")" #define MEMACCESS2(offset, base) #offset "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")" #define MEMLEA(offset, base) #offset "(%" #base ")"
#define MEMLEA4(offset, base, index, scale) \
#offset "(%" #base ",%" #index "," #scale ")"
#endif #endif
#ifdef HAS_ARGBTOYROW_SSSE3 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB // Constants for ARGB
static vec8 kARGBToY = { static vec8 kARGBToY = {
...@@ -41,6 +45,9 @@ static vec8 kARGBToY = { ...@@ -41,6 +45,9 @@ static vec8 kARGBToY = {
static vec8 kARGBToYJ = { static vec8 kARGBToYJ = {
15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
}; };
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
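For reference, a scalar sketch of the luma these constants produce (the helper name is illustrative, not part of libyuv). The kARGBToYJ weights {15, 75, 38} sum to 128, so ARGBToYJRow_SSSE3 further down computes full-range (JPEG) luma; kAddYJ64 is assumed to hold a rounding bias of 64 per lane, as its name suggests. The video-range kARGBToY path works the same way but adds 16 after the shift (kAddY16); its weights are elided from this hunk.

/* Scalar equivalent of the pmaddubsw/phaddw/paddw/psrlw sequence in
 * ARGBToYJRow_SSSE3. libyuv ARGB is stored B,G,R,A in memory. */
static unsigned char ARGBToYJ_Sketch(unsigned char b, unsigned char g,
                                     unsigned char r) {
  return (unsigned char)((15 * b + 75 * g + 38 * r + 64) >> 7);  /* <= 255 */
}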
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
static vec8 kARGBToU = { static vec8 kARGBToU = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
...@@ -113,6 +120,9 @@ static uvec8 kAddUV128 = { ...@@ -113,6 +120,9 @@ static uvec8 kAddUV128 = {
static uvec16 kAddUVJ128 = { static uvec16 kAddUVJ128 = {
0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
}; };
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
#ifdef HAS_RGB24TOARGBROW_SSSE3
// Shuffle table for converting RGB24 to ARGB. // Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = { static uvec8 kShuffleMaskRGB24ToARGB = {
...@@ -143,24 +153,26 @@ static uvec8 kShuffleMaskARGBToRGB24_0 = { ...@@ -143,24 +153,26 @@ static uvec8 kShuffleMaskARGBToRGB24_0 = {
static uvec8 kShuffleMaskARGBToRAW_0 = { static uvec8 kShuffleMaskARGBToRAW_0 = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
}; };
#endif // HAS_RGB24TOARGBROW_SSSE3
#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n" "pslld $0x18,%%xmm5 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movq (%0),%%xmm0 \n" "movq "MEMACCESS(0)",%%xmm0 \n"
"lea 0x8(%0),%0 \n" "lea "MEMLEA(0x8,0)",%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n" "punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n" "punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n" "punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n" "por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n" "por %%xmm5,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,"MEMACCESS(1)" \n"
"movdqa %%xmm1,0x10(%1) \n" "movdqa %%xmm1,"MEMACCESS2(0x10,1)" \n"
"lea 0x20(%1),%1 \n" "lea "MEMLEA(0x20,1)",%1 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -181,17 +193,17 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, ...@@ -181,17 +193,17 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
"pslld $0x18,%%xmm5 \n" "pslld $0x18,%%xmm5 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movq (%0),%%xmm0 \n" "movq "MEMACCESS(0)",%%xmm0 \n"
"lea 0x8(%0),%0 \n" "lea "MEMLEA(0x8,0)",%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n" "punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n" "punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n" "punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n" "por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n" "por %%xmm5,%%xmm1 \n"
"movdqu %%xmm0,(%1) \n" "movdqu %%xmm0,"MEMACCESS(1)" \n"
"movdqu %%xmm1,0x10(%1) \n" "movdqu %%xmm1,"MEMACCESS2(0x10,1)" \n"
"lea 0x20(%1),%1 \n" "lea "MEMLEA(0x20,1)",%1 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
...@@ -204,7 +216,9 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, ...@@ -204,7 +216,9 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
#endif #endif
); );
} }
#endif // HAS_I400TOARGBROW_SSE2
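As a reading aid, a scalar sketch of what the two I400ToARGB rows above compute (the helper name is illustrative): each Y byte is replicated into B, G, and R by the punpcklbw/punpcklwd pairs, and the 0xff000000 mask built with pcmpeqb/pslld is OR'd in as the alpha channel.

/* Scalar sketch of I400ToARGBRow; libyuv ARGB byte order is B, G, R, A. */
static void I400ToARGBRow_Sketch(const unsigned char* src_y,
                                 unsigned char* dst_argb, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    unsigned char y = src_y[x];
    dst_argb[0] = y;    /* B */
    dst_argb[1] = y;    /* G */
    dst_argb[2] = y;    /* R */
    dst_argb[3] = 255;  /* A, from the 0xff000000 mask */
    dst_argb += 4;
  }
}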
#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
...@@ -627,22 +641,24 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { ...@@ -627,22 +641,24 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
#endif #endif
); );
} }
#endif // HAS_RGB24TOARGBROW_SSSE3
#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
"movdqa %4,%%xmm5 \n" "movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n" "movdqa %3,%%xmm4 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa "MEMACCESS(0)",%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n" "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n" "movdqa "MEMACCESS2(0x20,0)",%%xmm2 \n"
"movdqa 0x30(%0),%%xmm3 \n" "movdqa "MEMACCESS2(0x30,0)",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n" "lea "MEMLEA(0x40,0)",%0 \n"
"phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n" "phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n"
...@@ -650,8 +666,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -650,8 +666,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,"MEMACCESS(1)" \n"
"lea 0x10(%1),%1 \n" "lea "MEMLEA(0x10,1)",%1 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
...@@ -665,74 +681,76 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -665,74 +681,76 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
); );
} }
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n" "movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqu "MEMACCESS(0)",%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n" "movdqu "MEMACCESS2(0x10,0)",%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n" "movdqu "MEMACCESS2(0x20,0)",%%xmm2 \n"
"movdqa 0x30(%0),%%xmm3 \n" "movdqu "MEMACCESS2(0x30,0)",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n" "lea "MEMLEA(0x40,0)",%0 \n"
"phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n" "phaddw %%xmm3,%%xmm2 \n"
"paddw %%xmm5,%%xmm0 \n"
"paddw %%xmm5,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n" "movdqu %%xmm0,"MEMACCESS(1)" \n"
"lea 0x10(%1),%1 \n" "lea "MEMLEA(0x10,1)",%1 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: "m"(kARGBToYJ), // %3 : "m"(kARGBToY), // %3
"m"(kAddYJ64) // %4 "m"(kAddY16) // %4
: "memory", "cc" : "memory", "cc"
#if defined(__SSE2__) #if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif #endif
); );
} }
#endif // HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { #ifdef HAS_ARGBTOYJROW_SSSE3
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n" "movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqa "MEMACCESS(0)",%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n" "movdqa "MEMACCESS2(0x20,0)",%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n" "movdqa "MEMACCESS2(0x30,0)",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n" "lea "MEMLEA(0x40,0)",%0 \n"
"phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n" "phaddw %%xmm3,%%xmm2 \n"
"paddw %%xmm5,%%xmm0 \n"
"paddw %%xmm5,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"movdqu %%xmm0,(%1) \n" "movdqa %%xmm0,"MEMACCESS(1)" \n"
"lea 0x10(%1),%1 \n" "lea "MEMLEA(0x10,1)",%1 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: "m"(kARGBToY), // %3 : "m"(kARGBToYJ), // %3
"m"(kAddY16) // %4 "m"(kAddYJ64) // %4
: "memory", "cc" : "memory", "cc"
#if defined(__SSE2__) #if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
...@@ -746,15 +764,15 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -746,15 +764,15 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"movdqa %4,%%xmm5 \n" "movdqa %4,%%xmm5 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu "MEMACCESS(0)",%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu "MEMACCESS2(0x10,0)",%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n" "movdqu "MEMACCESS2(0x20,0)",%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n" "movdqu "MEMACCESS2(0x30,0)",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n" "lea "MEMLEA(0x40,0)",%0 \n"
"phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n" "phaddw %%xmm3,%%xmm2 \n"
"paddw %%xmm5,%%xmm0 \n" "paddw %%xmm5,%%xmm0 \n"
...@@ -763,8 +781,8 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -763,8 +781,8 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"movdqu %%xmm0,(%1) \n" "movdqu %%xmm0,"MEMACCESS(1)" \n"
"lea 0x10(%1),%1 \n" "lea "MEMLEA(0x10,1)",%1 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
...@@ -777,7 +795,9 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -777,7 +795,9 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
#endif #endif
); );
} }
#endif // HAS_ARGBTOYJROW_SSSE3
#ifdef HAS_ARGBTOUVROW_SSSE3
// TODO(fbarchard): pass xmm constants to single block of assembly. // TODO(fbarchard): pass xmm constants to single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes // fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers, // 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
...@@ -1873,7 +1893,7 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, ...@@ -1873,7 +1893,7 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
#endif #endif
); );
} }
#endif // HAS_ARGBTOYROW_SSSE3 #endif // HAS_ARGBTOUVROW_SSSE3
#ifdef HAS_I422TOARGBROW_SSSE3 #ifdef HAS_I422TOARGBROW_SSSE3
#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */ #define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
...@@ -2834,15 +2854,16 @@ static uvec8 kARGBShuffleMirror = { ...@@ -2834,15 +2854,16 @@ static uvec8 kARGBShuffleMirror = {
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width); intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile ( asm volatile (
"lea "MEMLEA4(-0x10,0,2,4)",%0 \n"
"movdqa %3,%%xmm5 \n" "movdqa %3,%%xmm5 \n"
"lea -0x10(%0),%0 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0,%2,4),%%xmm0 \n" "movdqa "MEMACCESS(0)",%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n"
"lea "MEMLEA(-0x10,0)",%0 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,"MEMACCESS(1)" \n"
"lea 0x10(%1),%1 \n" "lea "MEMLEA(0x10,1)",%1 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
...@@ -4053,9 +4074,9 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { ...@@ -4053,9 +4074,9 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
int width) { int width) {
asm volatile ( asm volatile (
"movd (%2),%%xmm2 \n" "movd "MEMACCESS(2)",%%xmm2 \n"
"movd 0x4(%2),%%xmm3 \n" "movd "MEMACCESS2(0x4,2)",%%xmm3 \n"
"movd 0x8(%2),%%xmm4 \n" "movd "MEMACCESS2(0x8,2)",%%xmm4 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n"
"pshufd $0x0,%%xmm3,%%xmm3 \n" "pshufd $0x0,%%xmm3,%%xmm3 \n"
"pshufd $0x0,%%xmm4,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n"
...@@ -4136,11 +4157,11 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, ...@@ -4136,11 +4157,11 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
"movdqa "MEMACCESS(0)",%%xmm0 \n" "movdqa "MEMACCESS(0)",%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n"
"movdqa (%0),%%xmm1 \n" "movdqa "MEMACCESS(0)",%%xmm1 \n"
"punpckhbw %%xmm5,%%xmm1 \n" "punpckhbw %%xmm5,%%xmm1 \n"
"pmulhuw %%xmm2,%%xmm1 \n" "pmulhuw %%xmm2,%%xmm1 \n"
"pmullw %%xmm3,%%xmm0 \n" "pmullw %%xmm3,%%xmm0 \n"
"movdqa (%0),%%xmm7 \n" "movdqa "MEMACCESS(0)",%%xmm7 \n"
"pmullw %%xmm3,%%xmm1 \n" "pmullw %%xmm3,%%xmm1 \n"
"pand %%xmm6,%%xmm7 \n" "pand %%xmm6,%%xmm7 \n"
"paddw %%xmm4,%%xmm0 \n" "paddw %%xmm4,%%xmm0 \n"
...@@ -4520,7 +4541,6 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -4520,7 +4541,6 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
const int32* previous_cumsum, int width) { const int32* previous_cumsum, int width) {
asm volatile ( asm volatile (
"sub %1,%2 \n"
"pxor %%xmm0,%%xmm0 \n" "pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm1,%%xmm1 \n" "pxor %%xmm1,%%xmm1 \n"
"sub $0x4,%3 \n" "sub $0x4,%3 \n"
...@@ -4531,8 +4551,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, ...@@ -4531,8 +4551,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
// 4 pixel loop \n" // 4 pixel loop \n"
".p2align 2 \n" ".p2align 2 \n"
"40: \n" "40: \n"
"movdqu (%0),%%xmm2 \n" "movdqu "MEMACCESS(0)",%%xmm2 \n"
"lea 0x10(%0),%0 \n" "lea "MEMLEA(0x10,0)",%0 \n"
"movdqa %%xmm2,%%xmm4 \n" "movdqa %%xmm2,%%xmm4 \n"
"punpcklbw %%xmm1,%%xmm2 \n" "punpcklbw %%xmm1,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n" "movdqa %%xmm2,%%xmm3 \n"
...@@ -4543,22 +4563,23 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, ...@@ -4543,22 +4563,23 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"punpcklwd %%xmm1,%%xmm4 \n" "punpcklwd %%xmm1,%%xmm4 \n"
"punpckhwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm5 \n"
"paddd %%xmm2,%%xmm0 \n" "paddd %%xmm2,%%xmm0 \n"
"movdqa (%1,%2,1),%%xmm2 \n" "movdqa "MEMACCESS(2)",%%xmm2 \n"
"paddd %%xmm0,%%xmm2 \n" "paddd %%xmm0,%%xmm2 \n"
"paddd %%xmm3,%%xmm0 \n" "paddd %%xmm3,%%xmm0 \n"
"movdqa 0x10(%1,%2,1),%%xmm3 \n" "movdqa "MEMACCESS2(0x10,2)",%%xmm3 \n"
"paddd %%xmm0,%%xmm3 \n" "paddd %%xmm0,%%xmm3 \n"
"paddd %%xmm4,%%xmm0 \n" "paddd %%xmm4,%%xmm0 \n"
"movdqa 0x20(%1,%2,1),%%xmm4 \n" "movdqa "MEMACCESS2(0x20,2)",%%xmm4 \n"
"paddd %%xmm0,%%xmm4 \n" "paddd %%xmm0,%%xmm4 \n"
"paddd %%xmm5,%%xmm0 \n" "paddd %%xmm5,%%xmm0 \n"
"movdqa 0x30(%1,%2,1),%%xmm5 \n" "movdqa "MEMACCESS2(0x30,2)",%%xmm5 \n"
"lea "MEMLEA(0x40,2)",%2 \n"
"paddd %%xmm0,%%xmm5 \n" "paddd %%xmm0,%%xmm5 \n"
"movdqa %%xmm2,(%1) \n" "movdqa %%xmm2,"MEMACCESS(1)" \n"
"movdqa %%xmm3,0x10(%1) \n" "movdqa %%xmm3,"MEMACCESS2(0x10,1)" \n"
"movdqa %%xmm4,0x20(%1) \n" "movdqa %%xmm4,"MEMACCESS2(0x20,1)" \n"
"movdqa %%xmm5,0x30(%1) \n" "movdqa %%xmm5,"MEMACCESS2(0x30,1)" \n"
"lea 0x40(%1),%1 \n" "lea "MEMLEA(0x40,1)",%1 \n"
"sub $0x4,%3 \n" "sub $0x4,%3 \n"
"jge 40b \n" "jge 40b \n"
...@@ -4569,15 +4590,15 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, ...@@ -4569,15 +4590,15 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
// 1 pixel loop \n" // 1 pixel loop \n"
".p2align 2 \n" ".p2align 2 \n"
"10: \n" "10: \n"
"movd (%0),%%xmm2 \n" "movd "MEMACCESS(0)",%%xmm2 \n"
"lea 0x4(%0),%0 \n" "lea "MEMLEA(0x4,0)",%0 \n"
"punpcklbw %%xmm1,%%xmm2 \n" "punpcklbw %%xmm1,%%xmm2 \n"
"punpcklwd %%xmm1,%%xmm2 \n" "punpcklwd %%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm0 \n" "paddd %%xmm2,%%xmm0 \n"
"movdqu (%1,%2,1),%%xmm2 \n" "movdqu "MEMACCESS(2)",%%xmm2 \n"
"paddd %%xmm0,%%xmm2 \n" "paddd %%xmm0,%%xmm2 \n"
"movdqu %%xmm2,(%1) \n" "movdqu %%xmm2,"MEMACCESS(1)" \n"
"lea 0x10(%1),%1 \n" "lea "MEMLEA(0x10,1)",%1 \n"
"sub $0x1,%3 \n" "sub $0x1,%3 \n"
"jge 10b \n" "jge 10b \n"
...@@ -5260,19 +5281,20 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -5260,19 +5281,20 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) { uint32 selector, int pix) {
asm volatile ( asm volatile (
// NaCL caveat - assumes movd is from GPR
"movd %3,%%xmm5 \n" "movd %3,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa "MEMACCESS(0)",%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n" "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n"
"lea 0x20(%0),%0 \n" "lea "MEMLEA(0x20,0)",%0 \n"
"pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n" "pshufb %%xmm5,%%xmm1 \n"
"punpckldq %%xmm1,%%xmm0 \n" "punpckldq %%xmm1,%%xmm0 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"movq %%xmm0,(%1) \n" "movq %%xmm0,"MEMACCESS(1)" \n"
"lea 0x8(%1),%1 \n" "lea "MEMLEA(0x8,1)",%1 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1 "+r"(dst_bayer), // %1
...@@ -5291,18 +5313,18 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, ...@@ -5291,18 +5313,18 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
asm volatile ( asm volatile (
"movdqa (%3),%%xmm5 \n" "movdqa "MEMACCESS(3)",%%xmm5 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa "MEMACCESS(0)",%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n" "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n"
"lea 0x20(%0),%0 \n" "lea "MEMLEA(0x20,0)",%0 \n"
"pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n" "pshufb %%xmm5,%%xmm1 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,"MEMACCESS(1)" \n"
"movdqa %%xmm1,0x10(%1) \n" "movdqa %%xmm1,"MEMACCESS2(0x10,1)" \n"
"lea 0x20(%1),%1 \n" "lea "MEMLEA(0x20,1)",%1 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
...@@ -5318,18 +5340,18 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, ...@@ -5318,18 +5340,18 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
asm volatile ( asm volatile (
"movdqa (%3),%%xmm5 \n" "movdqa "MEMACCESS(3)",%%xmm5 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu "MEMACCESS(0)",%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu "MEMACCESS2(0x10,0)",%%xmm1 \n"
"lea 0x20(%0),%0 \n" "lea "MEMLEA(0x20,0)",%0 \n"
"pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n" "pshufb %%xmm5,%%xmm1 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"movdqu %%xmm0,(%1) \n" "movdqu %%xmm0,"MEMACCESS(1)" \n"
"movdqu %%xmm1,0x10(%1) \n" "movdqu %%xmm1,"MEMACCESS2(0x10,1)" \n"
"lea 0x20(%1),%1 \n" "lea "MEMLEA(0x20,1)",%1 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
......
...@@ -3322,12 +3322,13 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { ...@@ -3322,12 +3322,13 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
movdqa xmm5, kARGBShuffleMirror movdqa xmm5, kARGBShuffleMirror
lea eax, [eax - 16]
align 16 align 16
convertloop: convertloop:
movdqa xmm0, [eax + ecx * 4] movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm5 pshufb xmm0, xmm5
sub ecx, 4 sub ecx, 4
movdqa [edx], xmm0 movdqa [edx], xmm0
...@@ -5806,7 +5807,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, ...@@ -5806,7 +5807,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
mov edx, cumsum mov edx, cumsum
mov esi, previous_cumsum mov esi, previous_cumsum
mov ecx, width mov ecx, width
sub esi, edx
pxor xmm0, xmm0 pxor xmm0, xmm0
pxor xmm1, xmm1 pxor xmm1, xmm1
...@@ -5833,19 +5833,20 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, ...@@ -5833,19 +5833,20 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
punpckhwd xmm5, xmm1 punpckhwd xmm5, xmm1
paddd xmm0, xmm2 paddd xmm0, xmm2
movdqa xmm2, [edx + esi] // previous row above. movdqa xmm2, [esi] // previous row above.
paddd xmm2, xmm0 paddd xmm2, xmm0
paddd xmm0, xmm3 paddd xmm0, xmm3
movdqa xmm3, [edx + esi + 16] movdqa xmm3, [esi + 16]
paddd xmm3, xmm0 paddd xmm3, xmm0
paddd xmm0, xmm4 paddd xmm0, xmm4
movdqa xmm4, [edx + esi + 32] movdqa xmm4, [esi + 32]
paddd xmm4, xmm0 paddd xmm4, xmm0
paddd xmm0, xmm5 paddd xmm0, xmm5
movdqa xmm5, [edx + esi + 48] movdqa xmm5, [esi + 48]
lea esi, [esi + 64]
paddd xmm5, xmm0 paddd xmm5, xmm0
movdqa [edx], xmm2 movdqa [edx], xmm2
...@@ -5869,7 +5870,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, ...@@ -5869,7 +5870,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
punpcklbw xmm2, xmm1 punpcklbw xmm2, xmm1
punpcklwd xmm2, xmm1 punpcklwd xmm2, xmm1
paddd xmm0, xmm2 paddd xmm0, xmm2
movdqu xmm2, [edx + esi] movdqu xmm2, [esi]
lea esi, [esi + 16]
paddd xmm2, xmm0 paddd xmm2, xmm0
movdqu [edx], xmm2 movdqu [edx], xmm2
lea edx, [edx + 16] lea edx, [edx + 16]
......
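The ComputeCumulativeSumRow restructuring, which appears in both row_posix.cc and row_win.cc above, follows the same rule: the old code subtracted the cumsum pointer from previous_cumsum up front ("sub %1,%2" / "sub esi, edx") so the previous row could be read with base+index addressing, which the NaCl MEMACCESS form cannot express; the new code keeps previous_cumsum in its own register, reads it through MEMACCESS(2) (or plain [esi]) and advances it with its own lea. A scalar sketch of what the row computes (helper name illustrative, types simplified from uint8/int32):

/* Running per-channel sums of the current ARGB row, added to the row above
 * (previous_cumsum), building one row of a summed-area table. */
static void ComputeCumulativeSumRow_Sketch(const unsigned char* row,
                                           int* cumsum,
                                           const int* previous_cumsum,
                                           int width) {
  int sum[4] = {0, 0, 0, 0};  /* B, G, R, A accumulators */
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = previous_cumsum[x * 4 + c] + sum[c];
    }
  }
}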