Commit f7a5048f authored by fbarchard@google.com

align asm new line to column 48

BUG=none
TEST=builds
Review URL: http://webrtc-codereview.appspot.com/268008

git-svn-id: http://libyuv.googlecode.com/svn/trunk@73 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 2cb934c6
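
For illustration, the convention this change applies looks like the following: every instruction string inside an "asm volatile ( ... )" block is padded with spaces so that its trailing "\n" lands in a fixed column (48 in this commit), keeping the opcodes and operands visually aligned. The add_one() function below is a hypothetical example written for this note, not code from the commit; it assumes x86 GCC-style inline assembly.

static int add_one(int x) {
  asm volatile (
    "add $0x1,%0                               \n"  // padded so the \n sits in a fixed column
    : "+r"(x)   // %0
    :
    : "cc"
  );
  return x;
}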
@@ -23,35 +23,30 @@ namespace libyuv {
static uint32 SumSquareError_NEON(const uint8* src_a,
const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile
(
"vmov.u8 q7, #0\n"
"vmov.u8 q9, #0\n"
"vmov.u8 q8, #0\n"
"vmov.u8 q10, #0\n"
"1:\n"
"vld1.u8 {q0}, [%0]!\n"
"vld1.u8 {q1}, [%1]!\n"
"vsubl.u8 q2, d0, d2\n"
"vsubl.u8 q3, d1, d3\n"
"vmlal.s16 q7, d4, d4\n"
"vmlal.s16 q8, d6, d6\n"
"vmlal.s16 q8, d5, d5\n"
"vmlal.s16 q10, d7, d7\n"
"subs %2, %2, #16\n"
"bhi 1b\n"
"vadd.u32 q7, q7, q8\n"
"vadd.u32 q9, q9, q10\n"
"vadd.u32 q10, q7, q9\n"
"vpaddl.u32 q1, q10\n"
"vadd.u64 d0, d2, d3\n"
"vmov.32 %3, d0[0]\n"
asm volatile (
"vmov.u8 q7, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q7, d4, d4 \n"
"vmlal.s16 q8, d6, d6 \n"
"vmlal.s16 q8, d5, d5 \n"
"vmlal.s16 q10, d7, d7 \n"
"subs %2, %2, #16 \n"
"bhi 1b \n"
"vadd.u32 q7, q7, q8 \n"
"vadd.u32 q9, q9, q10 \n"
"vadd.u32 q10, q7, q9 \n"
"vpaddl.u32 q1, q10 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
@@ -59,7 +54,6 @@ static uint32 SumSquareError_NEON(const uint8* src_a,
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10"
);
return sse;
}
@@ -102,7 +96,6 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
pshufd xmm1, xmm0, 01h
paddd xmm0, xmm1
movd eax, xmm0
ret
}
}
@@ -112,11 +105,12 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
// DISABLE
//#define HAS_SUMSQUAREERROR_SSE2
// DISABLE
#if HAS_SUMSQUAREERROR_SSE2
static uint32 SumSquareError_SSE2(const uint8* src_a,
const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile(
"\n"
asm volatile (
" \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -131,6 +125,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
}
#endif
#endif
#endif
static uint32 SumSquareError_C(const uint8* src_a,
const uint8* src_b, int count) {
@@ -148,7 +143,6 @@ uint64 ComputeSumSquareError(const uint8* src_a,
const uint8* src_b, int count) {
uint32 (*SumSquareError)(const uint8* src_a,
const uint8* src_b, int count);
#if defined(HAS_SUMSQUAREERROR_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SumSquareError = SumSquareError_NEON;
@@ -162,10 +156,8 @@ uint64 ComputeSumSquareError(const uint8* src_a,
{
SumSquareError = SumSquareError_C;
}
const int kBlockSize = 4096;
uint64 diff = 0;
while (count >= kBlockSize) {
diff += SumSquareError(src_a, src_b, kBlockSize);
src_a += kBlockSize;
@@ -179,7 +171,6 @@ uint64 ComputeSumSquareError(const uint8* src_a,
diff += static_cast<uint64>(SumSquareError_C(src_a, src_b, count));
}
}
return diff;
}
@@ -188,7 +179,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
int width, int height) {
uint32 (*SumSquareError)(const uint8* src_a,
const uint8* src_b, int count);
#if defined(HAS_SUMSQUAREERROR_NEON)
if (TestCpuFlag(kCpuHasNEON) &&
(width % 16 == 0)) {
@@ -200,7 +190,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
}
uint64 sse = 0;
for (int h = 0; h < height; ++h) {
sse += static_cast<uint64>(SumSquareError(src_a, src_b, width));
src_a += stride_a;
@@ -210,11 +199,10 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
return sse;
}
double Sse2Psnr(double Samples, double Sse) {
double Sse2Psnr(double samples, double sse) {
double psnr;
if (Sse > 0.0)
psnr = 10.0 * log10(255.0 * 255.0 * Samples / Sse);
if (sse > 0.0)
psnr = 10.0 * log10(255.0 * 255.0 * samples / sse);
else
psnr = kMaxPsnr; // Limit to prevent divide by 0
@@ -224,6 +212,21 @@ double Sse2Psnr(double Samples, double Sse) {
return psnr;
}
double Sse2Psnr(uint64 samples, uint64 sse) {
double psnr;
if (sse > 0) {
double mse = static_cast<double>(samples) / static_cast<double>(sse);
psnr = 10.0 * log10(255.0 * 255.0 * mse);
} else {
psnr = kMaxPsnr; // Limit to prevent divide by 0
}
if (psnr > kMaxPsnr)
psnr = kMaxPsnr;
return psnr;
}
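
For reference, both Sse2Psnr overloads above implement the usual PSNR definition for 8-bit samples, where N is the number of samples compared and SSE is the sum of squared errors (an editorial note; the formula is what the code computes):

PSNR = 10 \log_{10}\!\left(\frac{255^2 \cdot N}{SSE}\right)

The result is limited to kMaxPsnr, which is also returned when SSE is zero to avoid a division by zero.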
double CalcFramePsnr(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height) {
@@ -233,7 +236,7 @@ double CalcFramePsnr(const uint8* src_a, int stride_a,
src_b, stride_b,
width, height);
return Sse2Psnr (samples, sse);
return Sse2Psnr(samples, sse);
}
double I420Psnr(const uint8* src_y_a, int stride_y_a,
......
@@ -22,9 +22,9 @@
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile (
"mov %%ebx, %%edi\n"
"cpuid\n"
"xchg %%edi, %%ebx\n"
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type)
);
@@ -32,7 +32,7 @@ static inline void __cpuid(int cpu_info[4], int info_type) {
#elif defined(__i386__) || defined(__x86_64__)
static inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile (
"cpuid\n"
"cpuid \n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type)
);
......
@@ -50,17 +50,17 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
#define HAS_ARGBTOBAYERROW_SSSE3
static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
asm volatile(
"movd %3,%%xmm5\n"
"pshufd $0x0,%%xmm5,%%xmm5\n"
"1:\n"
"movdqa (%0),%%xmm0\n"
"lea 0x10(%0),%0\n"
"pshufb %%xmm5,%%xmm0\n"
"movd %%xmm0,(%1)\n"
"lea 0x4(%1),%1\n"
"sub $0x4,%2\n"
"ja 1b\n"
asm volatile (
"movd %3,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movd %%xmm0,(%1) \n"
"lea 0x4(%1),%1 \n"
"sub $0x4,%2 \n"
"ja 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
......
@@ -23,14 +23,13 @@ namespace libyuv {
// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
static void SplitUV_NEON(const uint8* src_uv,
uint8* dst_u, uint8* dst_v, int pix) {
__asm__ volatile
(
"1:\n"
"vld2.u8 {q0,q1}, [%0]!\n" // load 16 pairs of UV
"vst1.u8 {q0}, [%1]!\n" // store U
"vst1.u8 {q1}, [%2]!\n" // Store V
"subs %3, %3, #16\n" // 16 processed per loop
"bhi 1b\n"
asm volatile (
"1: \n"
"vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV
"vst1.u8 {q0}, [%1]! \n" // store U
"vst1.u8 {q1}, [%2]! \n" // Store V
"subs %3, %3, #16 \n" // 16 processed per loop
"bhi 1b \n"
: "+r"(src_uv),
"+r"(dst_u),
"+r"(dst_v),
@@ -57,7 +56,7 @@ static void SplitUV_SSE2(const uint8* src_uv,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
wloop:
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
@@ -74,7 +73,7 @@ static void SplitUV_SSE2(const uint8* src_uv,
movdqa [edi], xmm2
lea edi, [edi + 16]
sub ecx, 16
ja wloop
ja convertloop
pop edi
ret
}
@@ -85,27 +84,27 @@ static void SplitUV_SSE2(const uint8* src_uv,
#define HAS_SPLITUV_SSE2
static void SplitUV_SSE2(const uint8* src_uv,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"lea 0x20(%0),%0\n"
"movdqa %%xmm0,%%xmm2\n"
"movdqa %%xmm1,%%xmm3\n"
"pand %%xmm5,%%xmm0\n"
"pand %%xmm5,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"psrlw $0x8,%%xmm2\n"
"psrlw $0x8,%%xmm3\n"
"packuswb %%xmm3,%%xmm2\n"
"movdqa %%xmm2,(%2)\n"
"lea 0x10(%2),%2\n"
"sub $0x10,%3\n"
"ja 1b\n"
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"psrlw $0x8,%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"packuswb %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,(%2) \n"
"lea 0x10(%2),%2 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -239,13 +238,12 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
#define HAS_SETROW_NEON
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
__asm__ volatile
(
"vdup.u32 q0, %2\n" // duplicate 4 ints
"1:\n"
"vst1.u32 {q0}, [%0]!\n" // store
"subs %1, %1, #16\n" // 16 processed per loop
"bhi 1b\n"
asm volatile (
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"vst1.u32 {q0}, [%0]! \n" // store
"subs %1, %1, #16 \n" // 16 processed per loop
"bhi 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
@@ -263,11 +261,11 @@ static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
mov ecx, [esp + 12] // count
pshufd xmm5, xmm5, 0
wloop:
convertloop:
movdqa [eax], xmm5
lea eax, [eax + 16]
sub ecx, 16
ja wloop
ja convertloop
ret
}
}
@@ -277,14 +275,14 @@ static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
#define HAS_SETROW_SSE2
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
asm volatile(
"movd %2, %%xmm5\n"
"pshufd $0x0,%%xmm5,%%xmm5\n"
"1:"
"movdqa %%xmm5,(%0)\n"
"lea 0x10(%0),%0\n"
"sub $0x10,%1\n"
"ja 1b\n"
asm volatile (
"movd %2, %%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
"1: \n"
"movdqa %%xmm5,(%0) \n"
"lea 0x10(%0),%0 \n"
"sub $0x10,%1 \n"
"ja 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
@@ -561,7 +559,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
wloop:
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
@@ -585,7 +583,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
movq qword ptr [edi], xmm1
lea edi, [edi + 8]
sub ecx, 16
ja wloop
ja convertloop
pop edi
pop esi
@@ -598,34 +596,34 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
#define HAS_SPLITYUY2_SSE2
static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"lea 0x20(%0),%0\n"
"movdqa %%xmm0,%%xmm2\n"
"movdqa %%xmm1,%%xmm3\n"
"pand %%xmm5,%%xmm2\n"
"pand %%xmm5,%%xmm3\n"
"packuswb %%xmm3,%%xmm2\n"
"movdqa %%xmm2,(%1)\n"
"lea 0x10(%1),%1\n"
"psrlw $0x8,%%xmm0\n"
"psrlw $0x8,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,%%xmm1\n"
"pand %%xmm5,%%xmm0\n"
"packuswb %%xmm0,%%xmm0\n"
"movq %%xmm0,(%2)\n"
"lea 0x8(%2),%2\n"
"psrlw $0x8,%%xmm1\n"
"packuswb %%xmm1,%%xmm1\n"
"movq %%xmm1,(%3)\n"
"lea 0x8(%3),%3\n"
"sub $0x10,%4\n"
"ja 1b\n"
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"pand %%xmm5,%%xmm2 \n"
"pand %%xmm5,%%xmm3 \n"
"packuswb %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,(%1) \n"
"lea 0x10(%1),%1 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movq %%xmm0,(%2) \n"
"lea 0x8(%2),%2 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm1,(%3) \n"
"lea 0x8(%3),%3 \n"
"sub $0x10,%4 \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(dst_u), // %2
@@ -716,7 +714,7 @@ void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
wloop:
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
@@ -726,7 +724,7 @@ void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja wloop
ja convertloop
ret
}
}
@@ -745,7 +743,7 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
wloop:
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
@@ -766,7 +764,7 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
movq qword ptr [edi], xmm1
lea edi, [edi + 8]
sub ecx, 16
ja wloop
ja convertloop
pop edi
pop esi
@@ -783,7 +781,7 @@ void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
wloop:
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
@@ -793,7 +791,7 @@ void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja wloop
ja convertloop
ret
}
}
@@ -812,7 +810,7 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
wloop:
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
@@ -833,7 +831,7 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
movq qword ptr [edi], xmm1
lea edi, [edi + 8]
sub ecx, 16
ja wloop
ja convertloop
pop edi
pop esi
@@ -847,20 +845,20 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
#define HAS_YUY2TOI420ROW_SSE2
static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"lea 0x20(%0),%0\n"
"pand %%xmm5,%%xmm0\n"
"pand %%xmm5,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
@@ -874,31 +872,31 @@ static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa (%0,%4,1),%%xmm2\n"
"movdqa 0x10(%0,%4,1),%%xmm3\n"
"lea 0x20(%0),%0\n"
"pavgb %%xmm2,%%xmm0\n"
"pavgb %%xmm3,%%xmm1\n"
"psrlw $0x8,%%xmm0\n"
"psrlw $0x8,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,%%xmm1\n"
"pand %%xmm5,%%xmm0\n"
"packuswb %%xmm0,%%xmm0\n"
"movq %%xmm0,(%1)\n"
"lea 0x8(%1),%1\n"
"psrlw $0x8,%%xmm1\n"
"packuswb %%xmm1,%%xmm1\n"
"movq %%xmm1,(%2)\n"
"lea 0x8(%2),%2\n"
"sub $0x10,%3\n"
"ja 1b\n"
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa (%0,%4,1),%%xmm2 \n"
"movdqa 0x10(%0,%4,1),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm1,(%2) \n"
"lea 0x8(%2),%2 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
@@ -913,18 +911,18 @@ static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
#define HAS_UYVYTOI420ROW_SSE2
static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
asm volatile(
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"lea 0x20(%0),%0\n"
"psrlw $0x8,%%xmm0\n"
"psrlw $0x8,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
asm volatile (
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
@@ -938,31 +936,31 @@ static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa (%0,%4,1),%%xmm2\n"
"movdqa 0x10(%0,%4,1),%%xmm3\n"
"lea 0x20(%0),%0\n"
"pavgb %%xmm2,%%xmm0\n"
"pavgb %%xmm3,%%xmm1\n"
"pand %%xmm5,%%xmm0\n"
"pand %%xmm5,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,%%xmm1\n"
"pand %%xmm5,%%xmm0\n"
"packuswb %%xmm0,%%xmm0\n"
"movq %%xmm0,(%1)\n"
"lea 0x8(%1),%1\n"
"psrlw $0x8,%%xmm1\n"
"packuswb %%xmm1,%%xmm1\n"
"movq %%xmm1,(%2)\n"
"lea 0x8(%2),%2\n"
"sub $0x10,%3\n"
"ja 1b\n"
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa (%0,%4,1),%%xmm2 \n"
"movdqa 0x10(%0,%4,1),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm1,(%2) \n"
"lea 0x8(%2),%2 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_y), // %2
......
@@ -282,78 +282,78 @@ __asm {
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile(
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
"1:\n"
"movq (%0),%%xmm0\n"
"movq (%0,%3),%%xmm1\n"
"lea (%0,%3,2),%0\n"
"punpcklbw %%xmm1,%%xmm0\n"
"movq (%0),%%xmm2\n"
"movdqa %%xmm0,%%xmm1\n"
"palignr $0x8,%%xmm1,%%xmm1\n"
"movq (%0,%3),%%xmm3\n"
"lea (%0,%3,2),%0\n"
"punpcklbw %%xmm3,%%xmm2\n"
"movdqa %%xmm2,%%xmm3\n"
"movq (%0),%%xmm4\n"
"palignr $0x8,%%xmm3,%%xmm3\n"
"movq (%0,%3),%%xmm5\n"
"lea (%0,%3,2),%0\n"
"punpcklbw %%xmm5,%%xmm4\n"
"movdqa %%xmm4,%%xmm5\n"
"movq (%0),%%xmm6\n"
"palignr $0x8,%%xmm5,%%xmm5\n"
"movq (%0,%3),%%xmm7\n"
"lea (%0,%3,2),%0\n"
"punpcklbw %%xmm7,%%xmm6\n"
"neg %3\n"
"movdqa %%xmm6,%%xmm7\n"
"lea 0x8(%0,%3,8),%0\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"neg %3\n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0\n"
"punpcklwd %%xmm3,%%xmm1\n"
"movdqa %%xmm0,%%xmm2\n"
"movdqa %%xmm1,%%xmm3\n"
"palignr $0x8,%%xmm2,%%xmm2\n"
"palignr $0x8,%%xmm3,%%xmm3\n"
"punpcklwd %%xmm6,%%xmm4\n"
"punpcklwd %%xmm7,%%xmm5\n"
"movdqa %%xmm4,%%xmm6\n"
"movdqa %%xmm5,%%xmm7\n"
"palignr $0x8,%%xmm6,%%xmm6\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0\n"
"movq %%xmm0,(%1)\n"
"movdqa %%xmm0,%%xmm4\n"
"palignr $0x8,%%xmm4,%%xmm4\n"
"movq %%xmm4,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm6,%%xmm2\n"
"movdqa %%xmm2,%%xmm6\n"
"movq %%xmm2,(%1)\n"
"palignr $0x8,%%xmm6,%%xmm6\n"
"punpckldq %%xmm5,%%xmm1\n"
"movq %%xmm6,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"movdqa %%xmm1,%%xmm5\n"
"movq %%xmm1,(%1)\n"
"palignr $0x8,%%xmm5,%%xmm5\n"
"movq %%xmm5,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm7,%%xmm3\n"
"movq %%xmm3,(%1)\n"
"movdqa %%xmm3,%%xmm7\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"movq %%xmm7,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"sub $0x8,%2\n"
"ja 1b\n"
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"sub $0x8,%2 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -372,258 +372,258 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w);
asm(
".text\n"
asm volatile (
".text \n"
#if defined(OSX)
".globl _TransposeUVWx8_SSE2\n"
"_TransposeUVWx8_SSE2:\n"
".globl _TransposeUVWx8_SSE2 \n"
"_TransposeUVWx8_SSE2: \n"
#else
".global TransposeUVWx8_SSE2\n"
"TransposeUVWx8_SSE2:\n"
".global TransposeUVWx8_SSE2 \n"
"TransposeUVWx8_SSE2: \n"
#endif
"push %ebx\n"
"push %esi\n"
"push %edi\n"
"push %ebp\n"
"mov 0x14(%esp),%eax\n"
"mov 0x18(%esp),%edi\n"
"mov 0x1c(%esp),%edx\n"
"mov 0x20(%esp),%esi\n"
"mov 0x24(%esp),%ebx\n"
"mov 0x28(%esp),%ebp\n"
"mov %esp,%ecx\n"
"sub $0x14,%esp\n"
"and $0xfffffff0,%esp\n"
"mov %ecx,0x10(%esp)\n"
"mov 0x2c(%ecx),%ecx\n"
"push %ebx \n"
"push %esi \n"
"push %edi \n"
"push %ebp \n"
"mov 0x14(%esp),%eax \n"
"mov 0x18(%esp),%edi \n"
"mov 0x1c(%esp),%edx \n"
"mov 0x20(%esp),%esi \n"
"mov 0x24(%esp),%ebx \n"
"mov 0x28(%esp),%ebp \n"
"mov %esp,%ecx \n"
"sub $0x14,%esp \n"
"and $0xfffffff0,%esp \n"
"mov %ecx,0x10(%esp) \n"
"mov 0x2c(%ecx),%ecx \n"
"1:\n"
"movdqa (%eax),%xmm0\n"
"movdqa (%eax,%edi,1),%xmm1\n"
"lea (%eax,%edi,2),%eax\n"
"movdqa %xmm0,%xmm7\n"
"punpcklbw %xmm1,%xmm0\n"
"punpckhbw %xmm1,%xmm7\n"
"movdqa %xmm7,%xmm1\n"
"movdqa (%eax),%xmm2\n"
"movdqa (%eax,%edi,1),%xmm3\n"
"lea (%eax,%edi,2),%eax\n"
"movdqa %xmm2,%xmm7\n"
"punpcklbw %xmm3,%xmm2\n"
"punpckhbw %xmm3,%xmm7\n"
"movdqa %xmm7,%xmm3\n"
"movdqa (%eax),%xmm4\n"
"movdqa (%eax,%edi,1),%xmm5\n"
"lea (%eax,%edi,2),%eax\n"
"movdqa %xmm4,%xmm7\n"
"punpcklbw %xmm5,%xmm4\n"
"punpckhbw %xmm5,%xmm7\n"
"movdqa %xmm7,%xmm5\n"
"movdqa (%eax),%xmm6\n"
"movdqa (%eax,%edi,1),%xmm7\n"
"lea (%eax,%edi,2),%eax\n"
"movdqa %xmm5,(%esp)\n"
"neg %edi\n"
"movdqa %xmm6,%xmm5\n"
"punpcklbw %xmm7,%xmm6\n"
"punpckhbw %xmm7,%xmm5\n"
"movdqa %xmm5,%xmm7\n"
"lea 0x10(%eax,%edi,8),%eax\n"
"neg %edi\n"
"movdqa %xmm0,%xmm5\n"
"punpcklwd %xmm2,%xmm0\n"
"punpckhwd %xmm2,%xmm5\n"
"movdqa %xmm5,%xmm2\n"
"movdqa %xmm1,%xmm5\n"
"punpcklwd %xmm3,%xmm1\n"
"punpckhwd %xmm3,%xmm5\n"
"movdqa %xmm5,%xmm3\n"
"movdqa %xmm4,%xmm5\n"
"punpcklwd %xmm6,%xmm4\n"
"punpckhwd %xmm6,%xmm5\n"
"movdqa %xmm5,%xmm6\n"
"movdqa (%esp),%xmm5\n"
"movdqa %xmm6,(%esp)\n"
"movdqa %xmm5,%xmm6\n"
"punpcklwd %xmm7,%xmm5\n"
"punpckhwd %xmm7,%xmm6\n"
"movdqa %xmm6,%xmm7\n"
"movdqa %xmm0,%xmm6\n"
"punpckldq %xmm4,%xmm0\n"
"punpckhdq %xmm4,%xmm6\n"
"movdqa %xmm6,%xmm4\n"
"movdqa (%esp),%xmm6\n"
"movlpd %xmm0,(%edx)\n"
"movhpd %xmm0,(%ebx)\n"
"movlpd %xmm4,(%edx,%esi,1)\n"
"lea (%edx,%esi,2),%edx\n"
"movhpd %xmm4,(%ebx,%ebp,1)\n"
"lea (%ebx,%ebp,2),%ebx\n"
"movdqa %xmm2,%xmm0\n"
"punpckldq %xmm6,%xmm2\n"
"movlpd %xmm2,(%edx)\n"
"movhpd %xmm2,(%ebx)\n"
"punpckhdq %xmm6,%xmm0\n"
"movlpd %xmm0,(%edx,%esi,1)\n"
"lea (%edx,%esi,2),%edx\n"
"movhpd %xmm0,(%ebx,%ebp,1)\n"
"lea (%ebx,%ebp,2),%ebx\n"
"movdqa %xmm1,%xmm0\n"
"punpckldq %xmm5,%xmm1\n"
"movlpd %xmm1,(%edx)\n"
"movhpd %xmm1,(%ebx)\n"
"punpckhdq %xmm5,%xmm0\n"
"movlpd %xmm0,(%edx,%esi,1)\n"
"lea (%edx,%esi,2),%edx\n"
"movhpd %xmm0,(%ebx,%ebp,1)\n"
"lea (%ebx,%ebp,2),%ebx\n"
"movdqa %xmm3,%xmm0\n"
"punpckldq %xmm7,%xmm3\n"
"movlpd %xmm3,(%edx)\n"
"movhpd %xmm3,(%ebx)\n"
"punpckhdq %xmm7,%xmm0\n"
"movlpd %xmm0,(%edx,%esi,1)\n"
"lea (%edx,%esi,2),%edx\n"
"movhpd %xmm0,(%ebx,%ebp,1)\n"
"lea (%ebx,%ebp,2),%ebx\n"
"sub $0x8,%ecx\n"
"ja 1b\n"
"mov 0x10(%esp),%esp\n"
"pop %ebp\n"
"pop %edi\n"
"pop %esi\n"
"pop %ebx\n"
"ret\n"
"1: \n"
"movdqa (%eax),%xmm0 \n"
"movdqa (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
"movdqa (%eax),%xmm2 \n"
"movdqa (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
"movdqa (%eax),%xmm4 \n"
"movdqa (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
"movdqa (%eax),%xmm6 \n"
"movdqa (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
"punpckhbw %xmm7,%xmm5 \n"
"movdqa %xmm5,%xmm7 \n"
"lea 0x10(%eax,%edi,8),%eax \n"
"neg %edi \n"
"movdqa %xmm0,%xmm5 \n"
"punpcklwd %xmm2,%xmm0 \n"
"punpckhwd %xmm2,%xmm5 \n"
"movdqa %xmm5,%xmm2 \n"
"movdqa %xmm1,%xmm5 \n"
"punpcklwd %xmm3,%xmm1 \n"
"punpckhwd %xmm3,%xmm5 \n"
"movdqa %xmm5,%xmm3 \n"
"movdqa %xmm4,%xmm5 \n"
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
"movdqa (%esp),%xmm5 \n"
"movdqa %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
"movdqa %xmm6,%xmm7 \n"
"movdqa %xmm0,%xmm6 \n"
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
"movdqa (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm4,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm2,%xmm0 \n"
"punpckldq %xmm6,%xmm2 \n"
"movlpd %xmm2,(%edx) \n"
"movhpd %xmm2,(%ebx) \n"
"punpckhdq %xmm6,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm1,%xmm0 \n"
"punpckldq %xmm5,%xmm1 \n"
"movlpd %xmm1,(%edx) \n"
"movhpd %xmm1,(%ebx) \n"
"punpckhdq %xmm5,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm3,%xmm0 \n"
"punpckldq %xmm7,%xmm3 \n"
"movlpd %xmm3,(%edx) \n"
"movhpd %xmm3,(%ebx) \n"
"punpckhdq %xmm7,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"sub $0x8,%ecx \n"
"ja 1b \n"
"mov 0x10(%esp),%esp \n"
"pop %ebp \n"
"pop %edi \n"
"pop %esi \n"
"pop %ebx \n"
"ret \n"
);
#elif defined (__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile(
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
"1:\n"
"movdqa (%0),%%xmm0\n"
"movdqa (%0,%3),%%xmm1\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm0,%%xmm8\n"
"punpcklbw %%xmm1,%%xmm0\n"
"punpckhbw %%xmm1,%%xmm8\n"
"movdqa (%0),%%xmm2\n"
"movdqa %%xmm0,%%xmm1\n"
"movdqa %%xmm8,%%xmm9\n"
"palignr $0x8,%%xmm1,%%xmm1\n"
"palignr $0x8,%%xmm9,%%xmm9\n"
"movdqa (%0,%3),%%xmm3\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm2,%%xmm10\n"
"punpcklbw %%xmm3,%%xmm2\n"
"punpckhbw %%xmm3,%%xmm10\n"
"movdqa %%xmm2,%%xmm3\n"
"movdqa %%xmm10,%%xmm11\n"
"movdqa (%0),%%xmm4\n"
"palignr $0x8,%%xmm3,%%xmm3\n"
"palignr $0x8,%%xmm11,%%xmm11\n"
"movdqa (%0,%3),%%xmm5\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm4,%%xmm12\n"
"punpcklbw %%xmm5,%%xmm4\n"
"punpckhbw %%xmm5,%%xmm12\n"
"movdqa %%xmm4,%%xmm5\n"
"movdqa %%xmm12,%%xmm13\n"
"movdqa (%0),%%xmm6\n"
"palignr $0x8,%%xmm5,%%xmm5\n"
"palignr $0x8,%%xmm13,%%xmm13\n"
"movdqa (%0,%3),%%xmm7\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm6,%%xmm14\n"
"punpcklbw %%xmm7,%%xmm6\n"
"punpckhbw %%xmm7,%%xmm14\n"
"neg %3\n"
"movdqa %%xmm6,%%xmm7\n"
"movdqa %%xmm14,%%xmm15\n"
"lea 0x10(%0,%3,8),%0\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"palignr $0x8,%%xmm15,%%xmm15\n"
"neg %3\n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqa (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqa (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqa (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqa (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqa (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0\n"
"punpcklwd %%xmm3,%%xmm1\n"
"movdqa %%xmm0,%%xmm2\n"
"movdqa %%xmm1,%%xmm3\n"
"palignr $0x8,%%xmm2,%%xmm2\n"
"palignr $0x8,%%xmm3,%%xmm3\n"
"punpcklwd %%xmm6,%%xmm4\n"
"punpcklwd %%xmm7,%%xmm5\n"
"movdqa %%xmm4,%%xmm6\n"
"movdqa %%xmm5,%%xmm7\n"
"palignr $0x8,%%xmm6,%%xmm6\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"punpcklwd %%xmm10,%%xmm8\n"
"punpcklwd %%xmm11,%%xmm9\n"
"movdqa %%xmm8,%%xmm10\n"
"movdqa %%xmm9,%%xmm11\n"
"palignr $0x8,%%xmm10,%%xmm10\n"
"palignr $0x8,%%xmm11,%%xmm11\n"
"punpcklwd %%xmm14,%%xmm12\n"
"punpcklwd %%xmm15,%%xmm13\n"
"movdqa %%xmm12,%%xmm14\n"
"movdqa %%xmm13,%%xmm15\n"
"palignr $0x8,%%xmm14,%%xmm14\n"
"palignr $0x8,%%xmm15,%%xmm15\n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0\n"
"movq %%xmm0,(%1)\n"
"movdqa %%xmm0,%%xmm4\n"
"palignr $0x8,%%xmm4,%%xmm4\n"
"movq %%xmm4,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm6,%%xmm2\n"
"movdqa %%xmm2,%%xmm6\n"
"movq %%xmm2,(%1)\n"
"palignr $0x8,%%xmm6,%%xmm6\n"
"punpckldq %%xmm5,%%xmm1\n"
"movq %%xmm6,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"movdqa %%xmm1,%%xmm5\n"
"movq %%xmm1,(%1)\n"
"palignr $0x8,%%xmm5,%%xmm5\n"
"movq %%xmm5,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm7,%%xmm3\n"
"movq %%xmm3,(%1)\n"
"movdqa %%xmm3,%%xmm7\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"movq %%xmm7,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm12,%%xmm8\n"
"movq %%xmm8,(%1)\n"
"movdqa %%xmm8,%%xmm12\n"
"palignr $0x8,%%xmm12,%%xmm12\n"
"movq %%xmm12,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm14,%%xmm10\n"
"movdqa %%xmm10,%%xmm14\n"
"movq %%xmm10,(%1)\n"
"palignr $0x8,%%xmm14,%%xmm14\n"
"punpckldq %%xmm13,%%xmm9\n"
"movq %%xmm14,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"movdqa %%xmm9,%%xmm13\n"
"movq %%xmm9,(%1)\n"
"palignr $0x8,%%xmm13,%%xmm13\n"
"movq %%xmm13,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm15,%%xmm11\n"
"movq %%xmm11,(%1)\n"
"movdqa %%xmm11,%%xmm15\n"
"palignr $0x8,%%xmm15,%%xmm15\n"
"movq %%xmm15,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -640,98 +640,98 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
asm volatile(
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
"1:\n"
"movdqa (%0),%%xmm0\n"
"movdqa (%0,%4),%%xmm1\n"
"lea (%0,%4,2),%0\n"
"movdqa %%xmm0,%%xmm8\n"
"punpcklbw %%xmm1,%%xmm0\n"
"punpckhbw %%xmm1,%%xmm8\n"
"movdqa %%xmm8,%%xmm1\n"
"movdqa (%0),%%xmm2\n"
"movdqa (%0,%4),%%xmm3\n"
"lea (%0,%4,2),%0\n"
"movdqa %%xmm2,%%xmm8\n"
"punpcklbw %%xmm3,%%xmm2\n"
"punpckhbw %%xmm3,%%xmm8\n"
"movdqa %%xmm8,%%xmm3\n"
"movdqa (%0),%%xmm4\n"
"movdqa (%0,%4),%%xmm5\n"
"lea (%0,%4,2),%0\n"
"movdqa %%xmm4,%%xmm8\n"
"punpcklbw %%xmm5,%%xmm4\n"
"punpckhbw %%xmm5,%%xmm8\n"
"movdqa %%xmm8,%%xmm5\n"
"movdqa (%0),%%xmm6\n"
"movdqa (%0,%4),%%xmm7\n"
"lea (%0,%4,2),%0\n"
"movdqa %%xmm6,%%xmm8\n"
"punpcklbw %%xmm7,%%xmm6\n"
"neg %4\n"
"lea 0x10(%0,%4,8),%0\n"
"punpckhbw %%xmm7,%%xmm8\n"
"movdqa %%xmm8,%%xmm7\n"
"neg %4\n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqa (%0),%%xmm2 \n"
"movdqa (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqa (%0),%%xmm4 \n"
"movdqa (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqa (%0),%%xmm6 \n"
"movdqa (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8\n"
"movdqa %%xmm1,%%xmm9\n"
"punpckhwd %%xmm2,%%xmm8\n"
"punpckhwd %%xmm3,%%xmm9\n"
"punpcklwd %%xmm2,%%xmm0\n"
"punpcklwd %%xmm3,%%xmm1\n"
"movdqa %%xmm8,%%xmm2\n"
"movdqa %%xmm9,%%xmm3\n"
"movdqa %%xmm4,%%xmm8\n"
"movdqa %%xmm5,%%xmm9\n"
"punpckhwd %%xmm6,%%xmm8\n"
"punpckhwd %%xmm7,%%xmm9\n"
"punpcklwd %%xmm6,%%xmm4\n"
"punpcklwd %%xmm7,%%xmm5\n"
"movdqa %%xmm8,%%xmm6\n"
"movdqa %%xmm9,%%xmm7\n"
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8\n"
"punpckldq %%xmm4,%%xmm0\n"
"movlpd %%xmm0,(%1)\n" // Write back U channel
"movhpd %%xmm0,(%2)\n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8\n"
"movlpd %%xmm8,(%1,%5)\n"
"lea (%1,%5,2),%1\n"
"movhpd %%xmm8,(%2,%6)\n"
"lea (%2,%6,2),%2\n"
"movdqa %%xmm2,%%xmm8\n"
"punpckldq %%xmm6,%%xmm2\n"
"movlpd %%xmm2,(%1)\n"
"movhpd %%xmm2,(%2)\n"
"punpckhdq %%xmm6,%%xmm8\n"
"movlpd %%xmm8,(%1,%5)\n"
"lea (%1,%5,2),%1\n"
"movhpd %%xmm8,(%2,%6)\n"
"lea (%2,%6,2),%2\n"
"movdqa %%xmm1,%%xmm8\n"
"punpckldq %%xmm5,%%xmm1\n"
"movlpd %%xmm1,(%1)\n"
"movhpd %%xmm1,(%2)\n"
"punpckhdq %%xmm5,%%xmm8\n"
"movlpd %%xmm8,(%1,%5)\n"
"lea (%1,%5,2),%1\n"
"movhpd %%xmm8,(%2,%6)\n"
"lea (%2,%6,2),%2\n"
"movdqa %%xmm3,%%xmm8\n"
"punpckldq %%xmm7,%%xmm3\n"
"movlpd %%xmm3,(%1)\n"
"movhpd %%xmm3,(%2)\n"
"punpckhdq %%xmm7,%%xmm8\n"
"movlpd %%xmm8,(%1,%5)\n"
"lea (%1,%5,2),%1\n"
"movhpd %%xmm8,(%2,%6)\n"
"lea (%2,%6,2),%2\n"
"sub $0x8,%3\n"
"ja 1b\n"
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"sub $0x8,%3 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
@@ -882,17 +882,17 @@ __asm {
#define HAS_REVERSE_LINE_SSSE3
static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile(
"movdqa (%3),%%xmm5\n"
"lea -0x10(%0,%2,1),%0\n"
"1:\n"
"movdqa (%0),%%xmm0\n"
"lea -0x10(%0),%0\n"
"pshufb %%xmm5,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
asm volatile (
"movdqa (%3),%%xmm5 \n"
"lea -0x10(%0,%2,1),%0 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -1091,19 +1091,19 @@ void ReverseLineUV_SSSE3(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile(
"movdqa (%4),%%xmm5\n"
"lea -0x10(%0,%3,2),%0\n"
"1:\n"
"movdqa (%0),%%xmm0\n"
"lea -0x10(%0),%0\n"
"pshufb %%xmm5,%%xmm0\n"
"movlpd %%xmm0,(%1)\n"
"lea 0x8(%1),%1\n"
"movhpd %%xmm0,(%2)\n"
"lea 0x8(%2),%2\n"
"sub $0x8,%3\n"
"ja 1b\n"
asm volatile (
"movdqa (%4),%%xmm5 \n"
"lea -0x10(%0,%3,2),%0 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n"
"movhpd %%xmm0,(%2) \n"
"lea 0x8(%2),%2 \n"
"sub $0x8,%3 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
......
@@ -15,12 +15,12 @@ namespace libyuv {
#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
void ReverseLine_NEON(const uint8* src, uint8* dst, int width) {
asm volatile(
asm volatile (
// compute where to start writing destination
"add %1, %2\n"
"add %1, %2 \n"
// work on segments that are multiples of 16
"lsrs r3, %2, #4\n"
"lsrs r3, %2, #4 \n"
// the output is written in two block. 8 bytes followed
// by another 8. reading is done sequentially, from left to
@@ -28,72 +28,72 @@ void ReverseLine_NEON(const uint8* src, uint8* dst, int width) {
// %1, the destination pointer is incremented after writing
// the first of the two blocks. need to subtract that 8 off
// along with 16 to get the next location.
"mov r3, #-24\n"
"mov r3, #-24 \n"
"beq 2f\n"
"beq 2f \n"
// back of destination by the size of the register that is
// going to be reversed
"sub %1, #16\n"
"sub %1, #16 \n"
// the loop needs to run on blocks of 16. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %2, #16\n"
"sub %2, #16 \n"
"1:\n"
"vld1.8 {q0}, [%0]!\n" // src += 16
"1: \n"
"vld1.8 {q0}, [%0]! \n" // src += 16
// reverse the bytes in the 64 bit segments. unable to reverse
// the bytes in the entire 128 bits in one go.
"vrev64.8 q0, q0\n"
"vrev64.8 q0, q0 \n"
// because of the inability to reverse the entire 128 bits
// reverse the writing out of the two 64 bit segments.
"vst1.8 {d1}, [%1]!\n"
"vst1.8 {d0}, [%1], r3\n" // dst -= 16
"vst1.8 {d1}, [%1]! \n"
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16
"subs %2, #16\n"
"bge 1b\n"
"subs %2, #16 \n"
"bge 1b \n"
// add 16 back to the counter. if the result is 0 there is no
// residuals so jump past
"adds %2, #16\n"
"beq 5f\n"
"adds %2, #16 \n"
"beq 5f \n"
"add %1, #16\n"
"add %1, #16 \n"
"2:\n"
"2: \n"
"mov r3, #-3\n"
"mov r3, #-3 \n"
"sub %1, #2\n"
"subs %2, #2\n"
"sub %1, #2 \n"
"subs %2, #2 \n"
// check for 16*n+1 scenarios where segments_of_2 should not
// be run, but there is something left over.
"blt 4f\n"
"blt 4f \n"
// do this in neon registers as per
// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
"3:\n"
"vld2.8 {d0[0], d1[0]}, [%0]!\n" // src += 2
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
"vst1.8 {d1[0]}, [%1]!\n"
"vst1.8 {d0[0]}, [%1], r3\n" // dst -= 2
"vst1.8 {d1[0]}, [%1]! \n"
"vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2
"subs %2, #2\n"
"bge 3b\n"
"subs %2, #2 \n"
"bge 3b \n"
"adds %2, #2\n"
"beq 5f\n"
"adds %2, #2 \n"
"beq 5f \n"
"4:\n"
"add %1, #1\n"
"vld1.8 {d0[0]}, [%0]\n"
"vst1.8 {d0[0]}, [%1]\n"
"4: \n"
"add %1, #1 \n"
"vld1.8 {d0[0]}, [%0] \n"
"vst1.8 {d0[0]}, [%1] \n"
"5:\n"
"5: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -108,154 +108,154 @@ static const uint8 vtbl_4x4_transpose[16] __attribute__((vector_size(16))) =
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
asm volatile(
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %4, #8\n"
"sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1:\n"
"mov r9, %0\n"
"vld1.8 {d0}, [r9], %1\n"
"vld1.8 {d1}, [r9], %1\n"
"vld1.8 {d2}, [r9], %1\n"
"vld1.8 {d3}, [r9], %1\n"
"vld1.8 {d4}, [r9], %1\n"
"vld1.8 {d5}, [r9], %1\n"
"vld1.8 {d6}, [r9], %1\n"
"vld1.8 {d7}, [r9]\n"
"vtrn.8 d1, d0\n"
"vtrn.8 d3, d2\n"
"vtrn.8 d5, d4\n"
"vtrn.8 d7, d6\n"
"vtrn.16 d1, d3\n"
"vtrn.16 d0, d2\n"
"vtrn.16 d5, d7\n"
"vtrn.16 d4, d6\n"
"vtrn.32 d1, d5\n"
"vtrn.32 d0, d4\n"
"vtrn.32 d3, d7\n"
"vtrn.32 d2, d6\n"
"vrev16.8 q0, q0\n"
"vrev16.8 q1, q1\n"
"vrev16.8 q2, q2\n"
"vrev16.8 q3, q3\n"
"mov r9, %2\n"
"vst1.8 {d1}, [r9], %3\n"
"vst1.8 {d0}, [r9], %3\n"
"vst1.8 {d3}, [r9], %3\n"
"vst1.8 {d2}, [r9], %3\n"
"vst1.8 {d5}, [r9], %3\n"
"vst1.8 {d4}, [r9], %3\n"
"vst1.8 {d7}, [r9], %3\n"
"vst1.8 {d6}, [r9]\n"
"add %0, #8\n" // src += 8
"add %2, %2, %3, lsl #3\n" // dst += 8 * dst_stride
"subs %4, #8\n" // w -= 8
"bge 1b\n"
"1: \n"
"mov r9, %0 \n"
"vld1.8 {d0}, [r9], %1 \n"
"vld1.8 {d1}, [r9], %1 \n"
"vld1.8 {d2}, [r9], %1 \n"
"vld1.8 {d3}, [r9], %1 \n"
"vld1.8 {d4}, [r9], %1 \n"
"vld1.8 {d5}, [r9], %1 \n"
"vld1.8 {d6}, [r9], %1 \n"
"vld1.8 {d7}, [r9] \n"
"vtrn.8 d1, d0 \n"
"vtrn.8 d3, d2 \n"
"vtrn.8 d5, d4 \n"
"vtrn.8 d7, d6 \n"
"vtrn.16 d1, d3 \n"
"vtrn.16 d0, d2 \n"
"vtrn.16 d5, d7 \n"
"vtrn.16 d4, d6 \n"
"vtrn.32 d1, d5 \n"
"vtrn.32 d0, d4 \n"
"vtrn.32 d3, d7 \n"
"vtrn.32 d2, d6 \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"mov r9, %2 \n"
"vst1.8 {d1}, [r9], %3 \n"
"vst1.8 {d0}, [r9], %3 \n"
"vst1.8 {d3}, [r9], %3 \n"
"vst1.8 {d2}, [r9], %3 \n"
"vst1.8 {d5}, [r9], %3 \n"
"vst1.8 {d4}, [r9], %3 \n"
"vst1.8 {d7}, [r9], %3 \n"
"vst1.8 {d6}, [r9] \n"
"add %0, #8 \n" // src += 8
"add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
"subs %4, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %4, #8\n"
"beq 4f\n"
"adds %4, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %4, #2\n"
"blt 3f\n"
"cmp %4, #2 \n"
"blt 3f \n"
"cmp %4, #4\n"
"blt 2f\n"
"cmp %4, #4 \n"
"blt 2f \n"
// 4x8 block
"mov r9, %0\n"
"vld1.32 {d0[0]}, [r9], %1\n"
"vld1.32 {d0[1]}, [r9], %1\n"
"vld1.32 {d1[0]}, [r9], %1\n"
"vld1.32 {d1[1]}, [r9], %1\n"
"vld1.32 {d2[0]}, [r9], %1\n"
"vld1.32 {d2[1]}, [r9], %1\n"
"vld1.32 {d3[0]}, [r9], %1\n"
"vld1.32 {d3[1]}, [r9]\n"
"mov r9, %0 \n"
"vld1.32 {d0[0]}, [r9], %1 \n"
"vld1.32 {d0[1]}, [r9], %1 \n"
"vld1.32 {d1[0]}, [r9], %1 \n"
"vld1.32 {d1[1]}, [r9], %1 \n"
"vld1.32 {d2[0]}, [r9], %1 \n"
"vld1.32 {d2[1]}, [r9], %1 \n"
"vld1.32 {d3[0]}, [r9], %1 \n"
"vld1.32 {d3[1]}, [r9] \n"
"mov r9, %2\n"
"mov r9, %2 \n"
"vld1.8 {q3}, [%5]\n"
"vld1.8 {q3}, [%5] \n"
"vtbl.8 d4, {d0, d1}, d6\n"
"vtbl.8 d5, {d0, d1}, d7\n"
"vtbl.8 d0, {d2, d3}, d6\n"
"vtbl.8 d1, {d2, d3}, d7\n"
"vtbl.8 d4, {d0, d1}, d6 \n"
"vtbl.8 d5, {d0, d1}, d7 \n"
"vtbl.8 d0, {d2, d3}, d6 \n"
"vtbl.8 d1, {d2, d3}, d7 \n"
// TODO: rework shuffle above to write
// out with 4 instead of 8 writes
"vst1.32 {d4[0]}, [r9], %3\n"
"vst1.32 {d4[1]}, [r9], %3\n"
"vst1.32 {d5[0]}, [r9], %3\n"
"vst1.32 {d5[1]}, [r9]\n"
"add r9, %2, #4\n"
"vst1.32 {d0[0]}, [r9], %3\n"
"vst1.32 {d0[1]}, [r9], %3\n"
"vst1.32 {d1[0]}, [r9], %3\n"
"vst1.32 {d1[1]}, [r9]\n"
"add %0, #4\n" // src += 4
"add %2, %2, %3, lsl #2\n" // dst += 4 * dst_stride
"subs %4, #4\n" // w -= 4
"beq 4f\n"
"vst1.32 {d4[0]}, [r9], %3 \n"
"vst1.32 {d4[1]}, [r9], %3 \n"
"vst1.32 {d5[0]}, [r9], %3 \n"
"vst1.32 {d5[1]}, [r9] \n"
"add r9, %2, #4 \n"
"vst1.32 {d0[0]}, [r9], %3 \n"
"vst1.32 {d0[1]}, [r9], %3 \n"
"vst1.32 {d1[0]}, [r9], %3 \n"
"vst1.32 {d1[1]}, [r9] \n"
"add %0, #4 \n" // src += 4
"add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
"subs %4, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %4, #2\n"
"blt 3f\n"
"cmp %4, #2 \n"
"blt 3f \n"
// 2x8 block
"2:\n"
"mov r9, %0\n"
"vld1.16 {d0[0]}, [r9], %1\n"
"vld1.16 {d1[0]}, [r9], %1\n"
"vld1.16 {d0[1]}, [r9], %1\n"
"vld1.16 {d1[1]}, [r9], %1\n"
"vld1.16 {d0[2]}, [r9], %1\n"
"vld1.16 {d1[2]}, [r9], %1\n"
"vld1.16 {d0[3]}, [r9], %1\n"
"vld1.16 {d1[3]}, [r9]\n"
"2: \n"
"mov r9, %0 \n"
"vld1.16 {d0[0]}, [r9], %1 \n"
"vld1.16 {d1[0]}, [r9], %1 \n"
"vld1.16 {d0[1]}, [r9], %1 \n"
"vld1.16 {d1[1]}, [r9], %1 \n"
"vld1.16 {d0[2]}, [r9], %1 \n"
"vld1.16 {d1[2]}, [r9], %1 \n"
"vld1.16 {d0[3]}, [r9], %1 \n"
"vld1.16 {d1[3]}, [r9] \n"
"vtrn.8 d0, d1\n"
"vtrn.8 d0, d1 \n"
"mov r9, %2\n"
"mov r9, %2 \n"
"vst1.64 {d0}, [r9], %3\n"
"vst1.64 {d1}, [r9]\n"
"vst1.64 {d0}, [r9], %3 \n"
"vst1.64 {d1}, [r9] \n"
"add %0, #2\n" // src += 2
"add %2, %2, %3, lsl #1\n" // dst += 2 * dst_stride
"subs %4, #2\n" // w -= 2
"beq 4f\n"
"add %0, #2 \n" // src += 2
"add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
"subs %4, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3:\n"
"vld1.8 {d0[0]}, [%0], %1\n"
"vld1.8 {d0[1]}, [%0], %1\n"
"vld1.8 {d0[2]}, [%0], %1\n"
"vld1.8 {d0[3]}, [%0], %1\n"
"vld1.8 {d0[4]}, [%0], %1\n"
"vld1.8 {d0[5]}, [%0], %1\n"
"vld1.8 {d0[6]}, [%0], %1\n"
"vld1.8 {d0[7]}, [%0]\n"
"3: \n"
"vld1.8 {d0[0]}, [%0], %1 \n"
"vld1.8 {d0[1]}, [%0], %1 \n"
"vld1.8 {d0[2]}, [%0], %1 \n"
"vld1.8 {d0[3]}, [%0], %1 \n"
"vld1.8 {d0[4]}, [%0], %1 \n"
"vld1.8 {d0[5]}, [%0], %1 \n"
"vld1.8 {d0[6]}, [%0], %1 \n"
"vld1.8 {d0[7]}, [%0] \n"
"vst1.64 {d0}, [%2]\n"
"vst1.64 {d0}, [%2] \n"
"4:\n"
"4: \n"
: "+r"(src), // %0
"+r"(src_stride), // %1
@@ -270,68 +270,68 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
void ReverseLineUV_NEON(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
asm volatile(
asm volatile (
// compute where to start writing destination
"add %1, %3\n" // dst_a + width
"add %2, %3\n" // dst_b + width
"add %1, %3 \n" // dst_a + width
"add %2, %3 \n" // dst_b + width
// work on input segments that are multiples of 16, but
// width that has been passed is output segments, half
// the size of input.
"lsrs r12, %3, #3\n"
"lsrs r12, %3, #3 \n"
"beq 2f\n"
"beq 2f \n"
// the output is written in to two blocks.
"mov r12, #-8\n"
"mov r12, #-8 \n"
// back of destination by the size of the register that is
// going to be reversed
"sub %1, #8\n"
"sub %2, #8\n"
"sub %1, #8 \n"
"sub %2, #8 \n"
// the loop needs to run on blocks of 8. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %3, #8\n"
"sub %3, #8 \n"
"1:\n"
"vld2.8 {d0, d1}, [%0]!\n" // src += 16
"1: \n"
"vld2.8 {d0, d1}, [%0]! \n" // src += 16
// reverse the bytes in the 64 bit segments
"vrev64.8 q0, q0\n"
"vrev64.8 q0, q0 \n"
"vst1.8 {d0}, [%1], r12\n" // dst_a -= 8
"vst1.8 {d1}, [%2], r12\n" // dst_b -= 8
"vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
"vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
"subs %3, #8\n"
"bge 1b\n"
"subs %3, #8 \n"
"bge 1b \n"
// add 8 back to the counter. if the result is 0 there is no
// residuals so return
"adds %3, #8\n"
"beq 4f\n"
"adds %3, #8 \n"
"beq 4f \n"
"add %1, #8\n"
"add %2, #8\n"
"add %1, #8 \n"
"add %2, #8 \n"
"2:\n"
"2: \n"
"mov r12, #-1\n"
"mov r12, #-1 \n"
"sub %1, #1\n"
"sub %2, #1\n"
"sub %1, #1 \n"
"sub %2, #1 \n"
"3:\n"
"vld2.8 {d0[0], d1[0]}, [%0]!\n" // src += 2
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
"vst1.8 {d0[0]}, [%1], r12\n" // dst_a -= 1
"vst1.8 {d1[0]}, [%2], r12\n" // dst_b -= 1
"vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
"vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1
"subs %3, %3, #1\n"
"bgt 3b\n"
"4:\n"
"subs %3, %3, #1 \n"
"bgt 3b \n"
"4: \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
@@ -348,198 +348,198 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
asm volatile(
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %6, #8\n"
"sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1:\n"
"mov r9, %0\n"
"vld2.8 {d0, d1}, [r9], %1\n"
"vld2.8 {d2, d3}, [r9], %1\n"
"vld2.8 {d4, d5}, [r9], %1\n"
"vld2.8 {d6, d7}, [r9], %1\n"
"vld2.8 {d16, d17}, [r9], %1\n"
"vld2.8 {d18, d19}, [r9], %1\n"
"vld2.8 {d20, d21}, [r9], %1\n"
"vld2.8 {d22, d23}, [r9]\n"
"vtrn.8 q1, q0\n"
"vtrn.8 q3, q2\n"
"vtrn.8 q9, q8\n"
"vtrn.8 q11, q10\n"
"vtrn.16 q1, q3\n"
"vtrn.16 q0, q2\n"
"vtrn.16 q9, q11\n"
"vtrn.16 q8, q10\n"
"vtrn.32 q1, q9\n"
"vtrn.32 q0, q8\n"
"vtrn.32 q3, q11\n"
"vtrn.32 q2, q10\n"
"vrev16.8 q0, q0\n"
"vrev16.8 q1, q1\n"
"vrev16.8 q2, q2\n"
"vrev16.8 q3, q3\n"
"vrev16.8 q8, q8\n"
"vrev16.8 q9, q9\n"
"vrev16.8 q10, q10\n"
"vrev16.8 q11, q11\n"
"mov r9, %2\n"
"vst1.8 {d2}, [r9], %3\n"
"vst1.8 {d0}, [r9], %3\n"
"vst1.8 {d6}, [r9], %3\n"
"vst1.8 {d4}, [r9], %3\n"
"vst1.8 {d18}, [r9], %3\n"
"vst1.8 {d16}, [r9], %3\n"
"vst1.8 {d22}, [r9], %3\n"
"vst1.8 {d20}, [r9]\n"
"mov r9, %4\n"
"vst1.8 {d3}, [r9], %5\n"
"vst1.8 {d1}, [r9], %5\n"
"vst1.8 {d7}, [r9], %5\n"
"vst1.8 {d5}, [r9], %5\n"
"vst1.8 {d19}, [r9], %5\n"
"vst1.8 {d17}, [r9], %5\n"
"vst1.8 {d23}, [r9], %5\n"
"vst1.8 {d21}, [r9]\n"
"add %0, #8*2\n" // src += 8*2
"add %2, %2, %3, lsl #3\n" // dst_a += 8 * dst_stride_a
"add %4, %4, %5, lsl #3\n" // dst_b += 8 * dst_stride_b
"subs %6, #8\n" // w -= 8
"bge 1b\n"
"1: \n"
"mov r9, %0 \n"
"vld2.8 {d0, d1}, [r9], %1 \n"
"vld2.8 {d2, d3}, [r9], %1 \n"
"vld2.8 {d4, d5}, [r9], %1 \n"
"vld2.8 {d6, d7}, [r9], %1 \n"
"vld2.8 {d16, d17}, [r9], %1 \n"
"vld2.8 {d18, d19}, [r9], %1 \n"
"vld2.8 {d20, d21}, [r9], %1 \n"
"vld2.8 {d22, d23}, [r9] \n"
"vtrn.8 q1, q0 \n"
"vtrn.8 q3, q2 \n"
"vtrn.8 q9, q8 \n"
"vtrn.8 q11, q10 \n"
"vtrn.16 q1, q3 \n"
"vtrn.16 q0, q2 \n"
"vtrn.16 q9, q11 \n"
"vtrn.16 q8, q10 \n"
"vtrn.32 q1, q9 \n"
"vtrn.32 q0, q8 \n"
"vtrn.32 q3, q11 \n"
"vtrn.32 q2, q10 \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"vrev16.8 q8, q8 \n"
"vrev16.8 q9, q9 \n"
"vrev16.8 q10, q10 \n"
"vrev16.8 q11, q11 \n"
"mov r9, %2 \n"
"vst1.8 {d2}, [r9], %3 \n"
"vst1.8 {d0}, [r9], %3 \n"
"vst1.8 {d6}, [r9], %3 \n"
"vst1.8 {d4}, [r9], %3 \n"
"vst1.8 {d18}, [r9], %3 \n"
"vst1.8 {d16}, [r9], %3 \n"
"vst1.8 {d22}, [r9], %3 \n"
"vst1.8 {d20}, [r9] \n"
"mov r9, %4 \n"
"vst1.8 {d3}, [r9], %5 \n"
"vst1.8 {d1}, [r9], %5 \n"
"vst1.8 {d7}, [r9], %5 \n"
"vst1.8 {d5}, [r9], %5 \n"
"vst1.8 {d19}, [r9], %5 \n"
"vst1.8 {d17}, [r9], %5 \n"
"vst1.8 {d23}, [r9], %5 \n"
"vst1.8 {d21}, [r9] \n"
"add %0, #8*2 \n" // src += 8*2
"add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %6, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %6, #8\n"
"beq 4f\n"
"adds %6, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %6, #2\n"
"blt 3f\n"
"cmp %6, #2 \n"
"blt 3f \n"
"cmp %6, #4\n"
"blt 2f\n"
"cmp %6, #4 \n"
"blt 2f \n"
//TODO(frkoenig) : clean this up
// 4x8 block
"mov r9, %0\n"
"vld1.64 {d0}, [r9], %1\n"
"vld1.64 {d1}, [r9], %1\n"
"vld1.64 {d2}, [r9], %1\n"
"vld1.64 {d3}, [r9], %1\n"
"vld1.64 {d4}, [r9], %1\n"
"vld1.64 {d5}, [r9], %1\n"
"vld1.64 {d6}, [r9], %1\n"
"vld1.64 {d7}, [r9]\n"
"vld1.8 {q15}, [%7]\n"
"vtrn.8 q0, q1\n"
"vtrn.8 q2, q3\n"
"vtbl.8 d16, {d0, d1}, d30\n"
"vtbl.8 d17, {d0, d1}, d31\n"
"vtbl.8 d18, {d2, d3}, d30\n"
"vtbl.8 d19, {d2, d3}, d31\n"
"vtbl.8 d20, {d4, d5}, d30\n"
"vtbl.8 d21, {d4, d5}, d31\n"
"vtbl.8 d22, {d6, d7}, d30\n"
"vtbl.8 d23, {d6, d7}, d31\n"
"mov r9, %2\n"
"vst1.32 {d16[0]}, [r9], %3\n"
"vst1.32 {d16[1]}, [r9], %3\n"
"vst1.32 {d17[0]}, [r9], %3\n"
"vst1.32 {d17[1]}, [r9], %3\n"
"add r9, %2, #4\n"
"vst1.32 {d20[0]}, [r9], %3\n"
"vst1.32 {d20[1]}, [r9], %3\n"
"vst1.32 {d21[0]}, [r9], %3\n"
"vst1.32 {d21[1]}, [r9]\n"
"mov r9, %4\n"
"vst1.32 {d18[0]}, [r9], %5\n"
"vst1.32 {d18[1]}, [r9], %5\n"
"vst1.32 {d19[0]}, [r9], %5\n"
"vst1.32 {d19[1]}, [r9], %5\n"
"add r9, %4, #4\n"
"vst1.32 {d22[0]}, [r9], %5\n"
"vst1.32 {d22[1]}, [r9], %5\n"
"vst1.32 {d23[0]}, [r9], %5\n"
"vst1.32 {d23[1]}, [r9]\n"
"add %0, #4*2\n" // src += 4 * 2
"add %2, %2, %3, lsl #2\n" // dst_a += 4 * dst_stride_a
"add %4, %4, %5, lsl #2\n" // dst_b += 4 * dst_stride_b
"subs %6, #4\n" // w -= 4
"beq 4f\n"
"mov r9, %0 \n"
"vld1.64 {d0}, [r9], %1 \n"
"vld1.64 {d1}, [r9], %1 \n"
"vld1.64 {d2}, [r9], %1 \n"
"vld1.64 {d3}, [r9], %1 \n"
"vld1.64 {d4}, [r9], %1 \n"
"vld1.64 {d5}, [r9], %1 \n"
"vld1.64 {d6}, [r9], %1 \n"
"vld1.64 {d7}, [r9] \n"
"vld1.8 {q15}, [%7] \n"
"vtrn.8 q0, q1 \n"
"vtrn.8 q2, q3 \n"
"vtbl.8 d16, {d0, d1}, d30 \n"
"vtbl.8 d17, {d0, d1}, d31 \n"
"vtbl.8 d18, {d2, d3}, d30 \n"
"vtbl.8 d19, {d2, d3}, d31 \n"
"vtbl.8 d20, {d4, d5}, d30 \n"
"vtbl.8 d21, {d4, d5}, d31 \n"
"vtbl.8 d22, {d6, d7}, d30 \n"
"vtbl.8 d23, {d6, d7}, d31 \n"
"mov r9, %2 \n"
"vst1.32 {d16[0]}, [r9], %3 \n"
"vst1.32 {d16[1]}, [r9], %3 \n"
"vst1.32 {d17[0]}, [r9], %3 \n"
"vst1.32 {d17[1]}, [r9], %3 \n"
"add r9, %2, #4 \n"
"vst1.32 {d20[0]}, [r9], %3 \n"
"vst1.32 {d20[1]}, [r9], %3 \n"
"vst1.32 {d21[0]}, [r9], %3 \n"
"vst1.32 {d21[1]}, [r9] \n"
"mov r9, %4 \n"
"vst1.32 {d18[0]}, [r9], %5 \n"
"vst1.32 {d18[1]}, [r9], %5 \n"
"vst1.32 {d19[0]}, [r9], %5 \n"
"vst1.32 {d19[1]}, [r9], %5 \n"
"add r9, %4, #4 \n"
"vst1.32 {d22[0]}, [r9], %5 \n"
"vst1.32 {d22[1]}, [r9], %5 \n"
"vst1.32 {d23[0]}, [r9], %5 \n"
"vst1.32 {d23[1]}, [r9] \n"
"add %0, #4*2 \n" // src += 4 * 2
"add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %6, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %6, #2\n"
"blt 3f\n"
"cmp %6, #2 \n"
"blt 3f \n"
// 2x8 block
"2:\n"
"mov r9, %0\n"
"vld2.16 {d0[0], d2[0]}, [r9], %1\n"
"vld2.16 {d1[0], d3[0]}, [r9], %1\n"
"vld2.16 {d0[1], d2[1]}, [r9], %1\n"
"vld2.16 {d1[1], d3[1]}, [r9], %1\n"
"vld2.16 {d0[2], d2[2]}, [r9], %1\n"
"vld2.16 {d1[2], d3[2]}, [r9], %1\n"
"vld2.16 {d0[3], d2[3]}, [r9], %1\n"
"vld2.16 {d1[3], d3[3]}, [r9]\n"
"2: \n"
"mov r9, %0 \n"
"vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
"vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
"vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
"vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
"vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
"vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
"vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
"vld2.16 {d1[3], d3[3]}, [r9] \n"
"vtrn.8 d0, d1\n"
"vtrn.8 d2, d3\n"
"vtrn.8 d0, d1 \n"
"vtrn.8 d2, d3 \n"
"mov r9, %2\n"
"mov r9, %2 \n"
"vst1.64 {d0}, [r9], %3\n"
"vst1.64 {d2}, [r9]\n"
"vst1.64 {d0}, [r9], %3 \n"
"vst1.64 {d2}, [r9] \n"
"mov r9, %4\n"
"mov r9, %4 \n"
"vst1.64 {d1}, [r9], %5\n"
"vst1.64 {d3}, [r9]\n"
"vst1.64 {d1}, [r9], %5 \n"
"vst1.64 {d3}, [r9] \n"
"add %0, #2*2\n" // src += 2 * 2
"add %2, %2, %3, lsl #1\n" // dst_a += 2 * dst_stride_a
"add %4, %4, %5, lsl #1\n" // dst_b += 2 * dst_stride_b
"subs %6, #2\n" // w -= 2
"beq 4f\n"
"add %0, #2*2 \n" // src += 2 * 2
"add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %6, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3:\n"
"vld2.8 {d0[0], d1[0]}, [%0], %1\n"
"vld2.8 {d0[1], d1[1]}, [%0], %1\n"
"vld2.8 {d0[2], d1[2]}, [%0], %1\n"
"vld2.8 {d0[3], d1[3]}, [%0], %1\n"
"vld2.8 {d0[4], d1[4]}, [%0], %1\n"
"vld2.8 {d0[5], d1[5]}, [%0], %1\n"
"vld2.8 {d0[6], d1[6]}, [%0], %1\n"
"vld2.8 {d0[7], d1[7]}, [%0]\n"
"vst1.64 {d0}, [%2]\n"
"vst1.64 {d1}, [%4]\n"
"4:\n"
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
"vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
"vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
"vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
"vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
"vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
"vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
"vld2.8 {d0[7], d1[7]}, [%0] \n"
"vst1.64 {d0}, [%2] \n"
"vst1.64 {d1}, [%4] \n"
"4: \n"
: "+r"(src), // %0
"+r"(src_stride), // %1
......
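For readers following the residual handling above: the 8x8, 4x8, 2x8 and 1x8 paths all perform the same interleaved-UV transpose, only on progressively narrower column counts. A minimal scalar sketch of that operation follows; the NEON routine's real name is not shown in this hunk, so the name and signature below are illustrative, and uint8 is libyuv's byte typedef.

// Hypothetical scalar reference for the interleaved-UV transpose above: each
// source column of U/V byte pairs becomes one 8-byte row in dst_a (U) and one
// in dst_b (V).
static void TransposeUVWx8_Ref(const uint8* src, int src_stride,
                               uint8* dst_a, int dst_stride_a,
                               uint8* dst_b, int dst_stride_b, int w) {
  for (int i = 0; i < w; ++i) {      // each source column
    for (int j = 0; j < 8; ++j) {    // the 8 source rows
      dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2 + 0];  // U byte
      dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];  // V byte
    }
  }
}

The SIMD version does the same shuffle for 8, 4, 2 or 1 columns at a time, which is why only the loads, stores and pointer advances differ between the labelled blocks.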
......@@ -59,23 +59,23 @@ static const uvec8 kShuffleMaskBGRAToARGB = {
};
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5\n"
"pslld $0x18,%%xmm5\n"
"1:"
"movq (%0),%%xmm0\n"
"lea 0x8(%0),%0\n"
"punpcklbw %%xmm0,%%xmm0\n"
"movdqa %%xmm0,%%xmm1\n"
"punpcklwd %%xmm0,%%xmm0\n"
"punpckhwd %%xmm1,%%xmm1\n"
"por %%xmm5,%%xmm0\n"
"por %%xmm5,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"movdqa %%xmm1,0x10(%1)\n"
"lea 0x20(%1),%1\n"
"sub $0x8,%2\n"
"ja 1b\n"
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n"
"movdqa %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n"
"ja 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
......@@ -88,16 +88,16 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
}
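I400ToARGBRow_SSE2 above widens each Y byte to a full ARGB pixel: punpcklbw/punpcklwd replicate Y four times and the 0xff000000 mask built in xmm5 forces the alpha byte. A hedged scalar equivalent, using libyuv's uint8 typedef and assuming B,G,R,A byte order for ARGB:

// Scalar sketch of grey (I400) to ARGB: copy Y into B, G and R, set A to 0xFF.
static void I400ToARGBRow_Ref(const uint8* src_y, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
    uint8 y = src_y[x];
    dst_argb[0] = y;     // B
    dst_argb[1] = y;     // G
    dst_argb[2] = y;     // R
    dst_argb[3] = 0xFF;  // A, the effect of or-ing the 0xff000000 mask
    dst_argb += 4;
  }
}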
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
asm volatile(
"movdqa %3,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"lea 0x10(%0),%0\n"
"pshufb %%xmm5,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x4,%2\n"
"ja 1b\n"
asm volatile (
"movdqa %3,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"ja 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
......@@ -111,16 +111,16 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
}
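ABGRToARGBRow_SSSE3 above (and BGRAToARGBRow_SSSE3 below) are pure byte permutations: pshufb reorders each group of four bytes according to the kShuffleMask* constant loaded into xmm5. The masks themselves are outside this hunk, so the byte placement in this scalar sketch is an assumption based on libyuv's little-endian channel naming (ARGB stored as B,G,R,A and ABGR as R,G,B,A):

// Hypothetical scalar version of the pshufb reorder: swap the R and B bytes,
// keep G and A in place. BGRAToARGB is the same loop with a different mapping.
static void ABGRToARGBRow_Ref(const uint8* src_abgr, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_argb[0] = src_abgr[2];  // B
    dst_argb[1] = src_abgr[1];  // G
    dst_argb[2] = src_abgr[0];  // R
    dst_argb[3] = src_abgr[3];  // A
    src_abgr += 4;
    dst_argb += 4;
  }
}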
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
asm volatile(
"movdqa %3,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"lea 0x10(%0),%0\n"
"pshufb %%xmm5,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x4,%2\n"
"ja 1b\n"
asm volatile (
"movdqa %3,%%xmm5 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"ja 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
......@@ -133,34 +133,34 @@ void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
}
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000
"pslld $0x18,%%xmm5\n"
"movdqa %3,%%xmm4\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm3\n"
"lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm4,%%xmm2\n"
"por %%xmm5,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm4,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n"
"por %%xmm5,%%xmm0\n"
"pshufb %%xmm4,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"por %%xmm5,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm4,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n"
"por %%xmm5,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm3 \n"
"lea 0x30(%0),%0 \n"
"movdqa %%xmm3,%%xmm2 \n"
"palignr $0x8,%%xmm1,%%xmm2 \n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm4,%%xmm0 \n"
"movdqa %%xmm2,0x20(%1) \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm4,%%xmm3 \n"
"movdqa %%xmm1,0x10(%1) \n"
"por %%xmm5,%%xmm3 \n"
"movdqa %%xmm3,0x30(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_bg24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
......@@ -173,34 +173,34 @@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
}
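BG24ToARGBRow_SSSE3 above, like RAWToARGBRow_SSSE3 below it, expands packed 24-bit pixels to 32-bit ARGB: three 16-byte loads cover 16 pixels, palignr restitches the triples that straddle register boundaries, pshufb spreads them into 4-byte slots, and the 0xff000000 mask supplies alpha. A scalar sketch, assuming B,G,R source order for BG24 (RAW would read the same bytes in the opposite colour order):

// Scalar sketch of 24-bit to 32-bit expansion: copy three colour bytes and
// append an opaque alpha byte, which is what the por with xmm5 achieves above.
static void BG24ToARGBRow_Ref(const uint8* src_bg24, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_argb[0] = src_bg24[0];  // B
    dst_argb[1] = src_bg24[1];  // G
    dst_argb[2] = src_bg24[2];  // R
    dst_argb[3] = 0xFF;         // A
    src_bg24 += 3;
    dst_argb += 4;
  }
}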
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000
"pslld $0x18,%%xmm5\n"
"movdqa %3,%%xmm4\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm3\n"
"lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm4,%%xmm2\n"
"por %%xmm5,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm4,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n"
"por %%xmm5,%%xmm0\n"
"pshufb %%xmm4,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"por %%xmm5,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm4,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n"
"por %%xmm5,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm3 \n"
"lea 0x30(%0),%0 \n"
"movdqa %%xmm3,%%xmm2 \n"
"palignr $0x8,%%xmm1,%%xmm2 \n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm4,%%xmm0 \n"
"movdqa %%xmm2,0x20(%1) \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"movdqa %%xmm0,(%1) \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm4,%%xmm3 \n"
"movdqa %%xmm1,0x10(%1) \n"
"por %%xmm5,%%xmm3 \n"
"movdqa %%xmm3,0x30(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
......@@ -213,29 +213,29 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
}
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile(
"movdqa %4,%%xmm5\n"
"movdqa %3,%%xmm4\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm2\n"
"movdqa 0x30(%0),%%xmm3\n"
"pmaddubsw %%xmm4,%%xmm0\n"
"pmaddubsw %%xmm4,%%xmm1\n"
"pmaddubsw %%xmm4,%%xmm2\n"
"pmaddubsw %%xmm4,%%xmm3\n"
"lea 0x40(%0),%0\n"
"phaddw %%xmm1,%%xmm0\n"
"phaddw %%xmm3,%%xmm2\n"
"psrlw $0x7,%%xmm0\n"
"psrlw $0x7,%%xmm2\n"
"packuswb %%xmm2,%%xmm0\n"
"paddb %%xmm5,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n"
"movdqa 0x30(%0),%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
......@@ -253,10 +253,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
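ARGBToYRow_SSSE3 above weights the B, G and R bytes with pmaddubsw against the constant in xmm4 (operand %3, not visible in this hunk), sums pairs with phaddw, shifts right by 7 and adds the bias byte held in xmm5 (%4). The weights and bias in the sketch below are assumed BT.601-style values, not constants taken from this diff:

// Hypothetical scalar luma extraction following the >>7-plus-bias structure
// above. 13/65/33 and +16 are assumptions; the real values live in the tables
// bound to %3 and %4, which this hunk does not show.
static void ARGBToYRow_Ref(const uint8* src_argb, uint8* dst_y, int pix) {
  for (int x = 0; x < pix; ++x) {
    int b = src_argb[0], g = src_argb[1], r = src_argb[2];  // B,G,R,A order
    dst_y[x] = (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
    src_argb += 4;
  }
}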
#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile(
"movdqa %0,%%xmm4\n"
"movdqa %1,%%xmm3\n"
"movdqa %2,%%xmm5\n"
asm volatile (
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm5 \n"
:
: "m"(kARGBToU), // %0
"m"(kARGBToV), // %1
......@@ -266,43 +266,43 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"xmm3", "xmm4", "xmm5"
#endif
);
asm volatile(
"sub %1,%2\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm2\n"
"movdqa 0x30(%0),%%xmm6\n"
"pavgb (%0,%4,1),%%xmm0\n"
"pavgb 0x10(%0,%4,1),%%xmm1\n"
"pavgb 0x20(%0,%4,1),%%xmm2\n"
"pavgb 0x30(%0,%4,1),%%xmm6\n"
"lea 0x40(%0),%0\n"
"movdqa %%xmm0,%%xmm7\n"
"shufps $0x88,%%xmm1,%%xmm0\n"
"shufps $0xdd,%%xmm1,%%xmm7\n"
"pavgb %%xmm7,%%xmm0\n"
"movdqa %%xmm2,%%xmm7\n"
"shufps $0x88,%%xmm6,%%xmm2\n"
"shufps $0xdd,%%xmm6,%%xmm7\n"
"pavgb %%xmm7,%%xmm2\n"
"movdqa %%xmm0,%%xmm1\n"
"movdqa %%xmm2,%%xmm6\n"
"pmaddubsw %%xmm4,%%xmm0\n"
"pmaddubsw %%xmm4,%%xmm2\n"
"pmaddubsw %%xmm3,%%xmm1\n"
"pmaddubsw %%xmm3,%%xmm6\n"
"phaddw %%xmm2,%%xmm0\n"
"phaddw %%xmm6,%%xmm1\n"
"psraw $0x8,%%xmm0\n"
"psraw $0x8,%%xmm1\n"
"packsswb %%xmm1,%%xmm0\n"
"paddb %%xmm5,%%xmm0\n"
"movlps %%xmm0,(%1)\n"
"movhps %%xmm0,(%1,%2,1)\n"
"lea 0x8(%1),%1\n"
"sub $0x10,%3\n"
"ja 1b\n"
asm volatile (
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n"
"movdqa 0x30(%0),%%xmm6 \n"
"pavgb (%0,%4,1),%%xmm0 \n"
"pavgb 0x10(%0,%4,1),%%xmm1 \n"
"pavgb 0x20(%0,%4,1),%%xmm2 \n"
"pavgb 0x30(%0,%4,1),%%xmm6 \n"
"lea 0x40(%0),%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqa %%xmm2,%%xmm7 \n"
"shufps $0x88,%%xmm6,%%xmm2 \n"
"shufps $0xdd,%%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"ja 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
......@@ -332,98 +332,65 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
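ARGBToUVRow_SSSE3 above produces one U and one V per 2x2 block: pavgb against the row at src_stride_argb averages vertically, the shufps/pavgb pair averages horizontally, pmaddubsw applies the kARGBToU/kARGBToV weights, and psraw 8 plus the paddb bias recentres the result around 128. The weights below are assumptions (typical BT.601 values); only the table names kARGBToU and kARGBToV are confirmed by this hunk:

// Hypothetical scalar sketch of subsampled chroma extraction. Assumes an even
// width and libyuv's B,G,R,A byte order; 0x8000 folds the +128 bias into the
// >>8 so the intermediate stays non-negative.
static void ARGBToUVRow_Ref(const uint8* src_argb, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    const uint8* p0 = src_argb + x * 4;         // top-left pixel of the block
    const uint8* p1 = p0 + src_stride_argb;     // pixel one row down
    int b = (p0[0] + p0[4] + p1[0] + p1[4] + 2) >> 2;  // 2x2 averages
    int g = (p0[1] + p0[5] + p1[1] + p1[5] + 2) >> 2;
    int r = (p0[2] + p0[6] + p1[2] + p1[6] + 2) >> 2;
    dst_u[x / 2] = (uint8)((112 * b - 74 * g - 38 * r + 0x8000) >> 8);
    dst_v[x / 2] = (uint8)((112 * r - 94 * g - 18 * b + 0x8000) >> 8);
  }
}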
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif
#if defined(__APPLE__)
// REG6 version uses 1 less register but is slower
#define REG6
#endif
#ifdef REG6
// 6 register version only has REG_a for temporary
#define CLOBBER "%"REG_a
#define YUVTORGB \
"1:" \
"movzb (%1),%%"REG_a"\n" \
"lea 1(%1),%1\n" \
"movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
"movzb (%2),%%"REG_a"\n" \
"lea 1(%2),%2\n" \
"movq 4096(%5,%%"REG_a",8),%%xmm1\n" \
"paddsw %%xmm1,%%xmm0\n" \
"movzb (%0),%%"REG_a"\n" \
"movq 0(%5,%%"REG_a",8),%%xmm2\n" \
"movzb 0x1(%0),%%"REG_a"\n" \
"movq 0(%5,%%"REG_a",8),%%xmm3\n" \
"lea 2(%0),%0\n" \
"paddsw %%xmm0,%%xmm2\n" \
"paddsw %%xmm0,%%xmm3\n" \
"shufps $0x44,%%xmm3,%%xmm2\n" \
"psraw $0x6,%%xmm2\n" \
"packuswb %%xmm2,%%xmm2\n" \
"movq %%xmm2,0x0(%3)\n" \
"lea 8(%3),%3\n" \
"sub $0x2,%4\n" \
"ja 1b\n"
#else
#define CLOBBER "%"REG_a, "%"REG_d
// This version produces 2 pixels
#define YUVTORGB \
"1:" \
"movzb (%1),%%"REG_a"\n" \
"lea 1(%1),%1\n" \
"movzb (%2),%%"REG_d"\n" \
"lea 1(%2),%2\n" \
"movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
"movzb 0(%0),%%"REG_a"\n" \
"movq 4096(%5,%%"REG_d",8),%%xmm1\n" \
"paddsw %%xmm1,%%xmm0\n" \
"movzb 1(%0),%%"REG_d"\n" \
"punpcklqdq %%xmm0,%%xmm0\n" \
"lea 2(%0),%0\n" \
"movq 0(%5,%%"REG_a",8),%%xmm1\n" \
"movhps 0(%5,%%"REG_d",8),%%xmm1\n" \
"paddsw %%xmm0,%%xmm1\n" \
"psraw $6,%%xmm1\n" \
"packuswb %%xmm1,%%xmm1\n" \
"movq %%xmm1,0(%3)\n" \
"lea 8(%3),%3\n" \
"sub $0x2,%4\n" \
"ja 1b\n"
"1: \n" \
"movzb (%1),%%"REG_a" \n" \
"lea 1(%1),%1 \n" \
"movzb (%2),%%"REG_d" \n" \
"lea 1(%2),%2 \n" \
"movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \
"movzb 0(%0),%%"REG_a" \n" \
"movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \
"paddsw %%xmm1,%%xmm0 \n" \
"movzb 1(%0),%%"REG_d" \n" \
"punpcklqdq %%xmm0,%%xmm0 \n" \
"lea 2(%0),%0 \n" \
"movq 0(%5,%%"REG_a",8),%%xmm1 \n" \
"movhps 0(%5,%%"REG_d",8),%%xmm1 \n" \
"paddsw %%xmm0,%%xmm1 \n" \
"psraw $6,%%xmm1 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"movq %%xmm1,0(%3) \n" \
"lea 8(%3),%3 \n" \
"sub $0x2,%4 \n" \
"ja 1b \n"
// This version produces 4 pixels
#define YUVTORGB4 \
"1:" \
"movzb 0(%1),%%"REG_a"\n" \
"movzb 0(%2),%%"REG_d"\n" \
"movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
"movzb 0(%0),%%"REG_a"\n" \
"movq 4096(%5,%%"REG_d",8),%%xmm1\n" \
"paddsw %%xmm1,%%xmm0\n" \
"movzb 1(%0),%%"REG_d"\n" \
"punpcklqdq %%xmm0,%%xmm0\n" \
"movq 0(%5,%%"REG_a",8),%%xmm2\n" \
"movhps 0(%5,%%"REG_d",8),%%xmm2\n" \
"paddsw %%xmm0,%%xmm2\n" \
"psraw $6,%%xmm2\n" \
"movzb 1(%1),%%"REG_a"\n" \
"movzb 1(%2),%%"REG_d"\n" \
"movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
"movzb 2(%0),%%"REG_a"\n" \
"movq 4096(%5,%%"REG_d",8),%%xmm1\n" \
"paddsw %%xmm1,%%xmm0\n" \
"movzb 3(%0),%%"REG_d"\n" \
"punpcklqdq %%xmm0,%%xmm0\n" \
"movq 0(%5,%%"REG_a",8),%%xmm3\n" \
"movhps 0(%5,%%"REG_d",8),%%xmm3\n" \
"paddsw %%xmm0,%%xmm3\n" \
"psraw $6,%%xmm3\n" \
"lea 2(%1),%1\n" \
"lea 2(%2),%2\n" \
"lea 4(%0),%0\n" \
"packuswb %%xmm3,%%xmm2\n" \
"movdqa %%xmm2,0(%3)\n" \
"lea 16(%3),%3\n" \
"sub $0x4,%4\n" \
"ja 1b\n"
#endif
"1: \n" \
"movzb 0(%1),%%"REG_a" \n" \
"movzb 0(%2),%%"REG_d" \n" \
"movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \
"movzb 0(%0),%%"REG_a" \n" \
"movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \
"paddsw %%xmm1,%%xmm0 \n" \
"movzb 1(%0),%%"REG_d" \n" \
"punpcklqdq %%xmm0,%%xmm0 \n" \
"movq 0(%5,%%"REG_a",8),%%xmm2 \n" \
"movhps 0(%5,%%"REG_d",8),%%xmm2 \n" \
"paddsw %%xmm0,%%xmm2 \n" \
"psraw $6,%%xmm2 \n" \
"movzb 1(%1),%%"REG_a" \n" \
"movzb 1(%2),%%"REG_d" \n" \
"movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \
"movzb 2(%0),%%"REG_a" \n" \
"movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \
"paddsw %%xmm1,%%xmm0 \n" \
"movzb 3(%0),%%"REG_d" \n" \
"punpcklqdq %%xmm0,%%xmm0 \n" \
"movq 0(%5,%%"REG_a",8),%%xmm3 \n" \
"movhps 0(%5,%%"REG_d",8),%%xmm3 \n" \
"paddsw %%xmm0,%%xmm3 \n" \
"psraw $6,%%xmm3 \n" \
"lea 2(%1),%1 \n" \
"lea 2(%2),%2 \n" \
"lea 4(%0),%0 \n" \
"packuswb %%xmm3,%%xmm2 \n" \
"movdqa %%xmm2,0(%3) \n" \
"lea 16(%3),%3 \n" \
"sub $0x4,%4 \n" \
"ja 1b \n" \
// 6 or 7 registers
void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, // rdi
......@@ -431,7 +398,7 @@ void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, // rdi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile(
asm volatile (
YUVTORGB
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
......@@ -452,7 +419,7 @@ void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, // rdi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile(
asm volatile (
YUVTORGB4
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
......@@ -472,7 +439,7 @@ void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, // rdi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile(
asm volatile (
YUVTORGB
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
......@@ -492,7 +459,7 @@ void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, // rdi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile(
asm volatile (
YUVTORGB
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
......@@ -513,26 +480,26 @@ void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, // rdi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile(
"1:"
"movzb (%1),%%"REG_a"\n"
"lea 1(%1),%1\n"
"movq 2048(%5,%%"REG_a",8),%%xmm0\n"
"movzb (%2),%%"REG_a"\n"
"lea 1(%2),%2\n"
"movq 4096(%5,%%"REG_a",8),%%xmm1\n"
"paddsw %%xmm1,%%xmm0\n"
"movzb (%0),%%"REG_a"\n"
"lea 1(%0),%0\n"
"movq 0(%5,%%"REG_a",8),%%xmm2\n"
"paddsw %%xmm0,%%xmm2\n"
"shufps $0x44,%%xmm2,%%xmm2\n"
"psraw $0x6,%%xmm2\n"
"packuswb %%xmm2,%%xmm2\n"
"movd %%xmm2,0x0(%3)\n"
"lea 4(%3),%3\n"
"sub $0x1,%4\n"
"ja 1b\n"
asm volatile (
"1: \n"
"movzb (%1),%%"REG_a" \n"
"lea 1(%1),%1 \n"
"movq 2048(%5,%%"REG_a",8),%%xmm0 \n"
"movzb (%2),%%"REG_a" \n"
"lea 1(%2),%2 \n"
"movq 4096(%5,%%"REG_a",8),%%xmm1 \n"
"paddsw %%xmm1,%%xmm0 \n"
"movzb (%0),%%"REG_a" \n"
"lea 1(%0),%0 \n"
"movq 0(%5,%%"REG_a",8),%%xmm2 \n"
"paddsw %%xmm0,%%xmm2 \n"
"shufps $0x44,%%xmm2,%%xmm2 \n"
"psraw $0x6,%%xmm2 \n"
"packuswb %%xmm2,%%xmm2 \n"
"movd %%xmm2,0x0(%3) \n"
"lea 4(%3),%3 \n"
"sub $0x1,%4 \n"
"ja 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
......@@ -550,19 +517,19 @@ void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, // rdi
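The 4:4:4 row just shown, like the YUVTORGB and YUVTORGB4 macros used by the 4:2:0 rows, performs one table-driven step per pixel: fetch four int16 contributions from the Y entries (offset 0), the U entries (+2048) and the V entries (+4096) of the coefficient table bound to %5, add them, arithmetic-shift right by 6 and pack with saturation. The exact table (the MMX code further down names its variant kCoefficientsRgbY) is not shown here, so its layout is an assumption in this sketch:

// Hypothetical per-pixel version of the table-driven conversion step.
// 'table' stands for the coefficient block passed as %5: 256 Y entries at
// byte offset 0, 256 U entries at +2048, 256 V entries at +4096, each entry
// being four int16 values (one per output byte, scaled by 64).
static void YuvPixelToARGB_Ref(const uint8* table, uint8 y, uint8 u, uint8 v,
                               uint8* dst) {
  const short* yc = (const short*)(table + 0)    + y * 4;
  const short* uc = (const short*)(table + 2048) + u * 4;
  const short* vc = (const short*)(table + 4096) + v * 4;
  for (int i = 0; i < 4; ++i) {
    int value = yc[i] + uc[i] + vc[i];
    value = value < 0 ? 0 : value >> 6;   // psraw $6, then packuswb's low clamp
    if (value > 255) value = 255;         // packuswb's high clamp
    dst[i] = (uint8)value;
  }
}

// Hypothetical driver for the 4:4:4 case above: one U and one V sample per
// output pixel, one pixel written per iteration (the movd in the asm).
static void FastConvertYUV444ToARGBRow_Ref(const uint8* table,
                                           const uint8* y_buf,
                                           const uint8* u_buf,
                                           const uint8* v_buf,
                                           uint8* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    YuvPixelToARGB_Ref(table, y_buf[x], u_buf[x], v_buf[x], rgb_buf + x * 4);
  }
}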
void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile(
"1:"
"movzb (%0),%%"REG_a"\n"
"movzb 0x1(%0),%%"REG_d"\n"
"movq (%3,%%"REG_a",8),%%xmm2\n"
"lea 2(%0),%0\n"
"movhps (%3,%%"REG_d",8),%%xmm2\n"
"psraw $0x6,%%xmm2\n"
"packuswb %%xmm2,%%xmm2\n"
"movq %%xmm2,0x0(%1)\n"
"lea 8(%1),%1\n"
"sub $0x2,%2\n"
"ja 1b\n"
asm volatile (
"1: \n"
"movzb (%0),%%"REG_a" \n"
"movzb 0x1(%0),%%"REG_d" \n"
"movq (%3,%%"REG_a",8),%%xmm2 \n"
"lea 2(%0),%0 \n"
"movhps (%3,%%"REG_d",8),%%xmm2 \n"
"psraw $0x6,%%xmm2 \n"
"packuswb %%xmm2,%%xmm2 \n"
"movq %%xmm2,0x0(%1) \n"
"lea 8(%1),%1 \n"
"sub $0x2,%2 \n"
"ja 1b \n"
: "+r"(y_buf), // %0
"+r"(rgb_buf), // %1
"+rm"(width) // %2
......@@ -591,44 +558,44 @@ void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
uint8* rgb_buf,
int width);
asm(
".text\n"
".text \n"
#if defined(OSX) || defined(IOS)
".globl _FastConvertYUVToARGBRow_MMX\n"
"_FastConvertYUVToARGBRow_MMX:\n"
".globl _FastConvertYUVToARGBRow_MMX \n"
"_FastConvertYUVToARGBRow_MMX: \n"
#else
".global FastConvertYUVToARGBRow_MMX\n"
"FastConvertYUVToARGBRow_MMX:\n"
".global FastConvertYUVToARGBRow_MMX \n"
"FastConvertYUVToARGBRow_MMX: \n"
#endif
"pusha\n"
"mov 0x24(%esp),%edx\n"
"mov 0x28(%esp),%edi\n"
"mov 0x2c(%esp),%esi\n"
"mov 0x30(%esp),%ebp\n"
"mov 0x34(%esp),%ecx\n"
"1:"
"movzbl (%edi),%eax\n"
"lea 1(%edi),%edi\n"
"movzbl (%esi),%ebx\n"
"lea 1(%esi),%esi\n"
"pusha \n"
"mov 0x24(%esp),%edx \n"
"mov 0x28(%esp),%edi \n"
"mov 0x2c(%esp),%esi \n"
"mov 0x30(%esp),%ebp \n"
"mov 0x34(%esp),%ecx \n"
"1: \n"
"movzbl (%edi),%eax \n"
"lea 1(%edi),%edi \n"
"movzbl (%esi),%ebx \n"
"lea 1(%esi),%esi \n"
"movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax\n"
"movzbl (%edx),%eax \n"
"paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
"movzbl 0x1(%edx),%ebx\n"
"movzbl 0x1(%edx),%ebx \n"
"movq " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n"
"lea 2(%edx),%edx\n"
"lea 2(%edx),%edx \n"
"movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n"
"paddsw %mm0,%mm1\n"
"paddsw %mm0,%mm2\n"
"psraw $0x6,%mm1\n"
"psraw $0x6,%mm2\n"
"packuswb %mm2,%mm1\n"
"movq %mm1,0x0(%ebp)\n"
"lea 8(%ebp),%ebp\n"
"sub $0x2,%ecx\n"
"ja 1b\n"
"popa\n"
"ret\n"
"paddsw %mm0,%mm1 \n"
"paddsw %mm0,%mm2 \n"
"psraw $0x6,%mm1 \n"
"psraw $0x6,%mm2 \n"
"packuswb %mm2,%mm1 \n"
"movq %mm1,0x0(%ebp) \n"
"lea 8(%ebp),%ebp \n"
"sub $0x2,%ecx \n"
"ja 1b \n"
"popa \n"
"ret \n"
);
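FastConvertYUVToARGBRow_MMX above mirrors the SSE2 rows: two Y samples share one U and one V sample per iteration, the 4:2:0 horizontal subsampling visible in the pointer advances (Y steps by 2, U and V by 1). Reusing the per-pixel sketch from earlier:

// Hypothetical scalar driver with the same 2-pixels-per-iteration structure:
// one U/V pair drives a pair of adjacent Y samples. Assumes an even width.
static void FastConvertYUVToARGBRow_Ref(const uint8* table, const uint8* y_buf,
                                        const uint8* u_buf, const uint8* v_buf,
                                        uint8* rgb_buf, int width) {
  for (int x = 0; x < width; x += 2) {
    uint8 u = u_buf[x >> 1];
    uint8 v = v_buf[x >> 1];
    YuvPixelToARGB_Ref(table, y_buf[x + 0], u, v, rgb_buf + (x + 0) * 4);
    YuvPixelToARGB_Ref(table, y_buf[x + 1], u, v, rgb_buf + (x + 1) * 4);
  }
}

The BGRA and ABGR variants that follow differ only in which coefficient table they reference (kCoefficientsBgraY, kCoefficientsAbgrY), not in the loop structure.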
void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
......@@ -637,44 +604,44 @@ void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
uint8* rgb_buf,
int width);
asm(
".text\n"
".text \n"
#if defined(OSX) || defined(IOS)
".globl _FastConvertYUVToBGRARow_MMX\n"
"_FastConvertYUVToBGRARow_MMX:\n"
".globl _FastConvertYUVToBGRARow_MMX \n"
"_FastConvertYUVToBGRARow_MMX: \n"
#else
".global FastConvertYUVToBGRARow_MMX\n"
"FastConvertYUVToBGRARow_MMX:\n"
".global FastConvertYUVToBGRARow_MMX \n"
"FastConvertYUVToBGRARow_MMX: \n"
#endif
"pusha\n"
"mov 0x24(%esp),%edx\n"
"mov 0x28(%esp),%edi\n"
"mov 0x2c(%esp),%esi\n"
"mov 0x30(%esp),%ebp\n"
"mov 0x34(%esp),%ecx\n"
"1:"
"movzbl (%edi),%eax\n"
"lea 1(%edi),%edi\n"
"movzbl (%esi),%ebx\n"
"lea 1(%esi),%esi\n"
"pusha \n"
"mov 0x24(%esp),%edx \n"
"mov 0x28(%esp),%edi \n"
"mov 0x2c(%esp),%esi \n"
"mov 0x30(%esp),%ebp \n"
"mov 0x34(%esp),%ecx \n"
"1: \n"
"movzbl (%edi),%eax \n"
"lea 1(%edi),%edi \n"
"movzbl (%esi),%ebx \n"
"lea 1(%esi),%esi \n"
"movq " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax\n"
"movzbl (%edx),%eax \n"
"paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
"movzbl 0x1(%edx),%ebx\n"
"movzbl 0x1(%edx),%ebx \n"
"movq " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n"
"lea 2(%edx),%edx\n"
"lea 2(%edx),%edx \n"
"movq " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n"
"paddsw %mm0,%mm1\n"
"paddsw %mm0,%mm2\n"
"psraw $0x6,%mm1\n"
"psraw $0x6,%mm2\n"
"packuswb %mm2,%mm1\n"
"movq %mm1,0x0(%ebp)\n"
"lea 8(%ebp),%ebp\n"
"sub $0x2,%ecx\n"
"ja 1b\n"
"popa\n"
"ret\n"
"paddsw %mm0,%mm1 \n"
"paddsw %mm0,%mm2 \n"
"psraw $0x6,%mm1 \n"
"psraw $0x6,%mm2 \n"
"packuswb %mm2,%mm1 \n"
"movq %mm1,0x0(%ebp) \n"
"lea 8(%ebp),%ebp \n"
"sub $0x2,%ecx \n"
"ja 1b \n"
"popa \n"
"ret \n"
);
void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
......@@ -683,44 +650,44 @@ void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
uint8* rgb_buf,
int width);
asm(
".text\n"
".text \n"
#if defined(OSX) || defined(IOS)
".globl _FastConvertYUVToABGRRow_MMX\n"
"_FastConvertYUVToABGRRow_MMX:\n"
".globl _FastConvertYUVToABGRRow_MMX \n"
"_FastConvertYUVToABGRRow_MMX: \n"
#else
".global FastConvertYUVToABGRRow_MMX\n"
"FastConvertYUVToABGRRow_MMX:\n"
".global FastConvertYUVToABGRRow_MMX \n"
"FastConvertYUVToABGRRow_MMX: \n"
#endif
"pusha\n"
"mov 0x24(%esp),%edx\n"
"mov 0x28(%esp),%edi\n"
"mov 0x2c(%esp),%esi\n"
"mov 0x30(%esp),%ebp\n"
"mov 0x34(%esp),%ecx\n"
"1:"
"movzbl (%edi),%eax\n"
"lea 1(%edi),%edi\n"
"movzbl (%esi),%ebx\n"
"lea 1(%esi),%esi\n"
"pusha \n"
"mov 0x24(%esp),%edx \n"
"mov 0x28(%esp),%edi \n"
"mov 0x2c(%esp),%esi \n"
"mov 0x30(%esp),%ebp \n"
"mov 0x34(%esp),%ecx \n"
"1: \n"
"movzbl (%edi),%eax \n"
"lea 1(%edi),%edi \n"
"movzbl (%esi),%ebx \n"
"lea 1(%esi),%esi \n"
"movq " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax\n"
"movzbl (%edx),%eax \n"
"paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
"movzbl 0x1(%edx),%ebx\n"
"movzbl 0x1(%edx),%ebx \n"
"movq " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n"
"lea 2(%edx),%edx\n"
"lea 2(%edx),%edx \n"
"movq " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n"
"paddsw %mm0,%mm1\n"
"paddsw %mm0,%mm2\n"
"psraw $0x6,%mm1\n"
"psraw $0x6,%mm2\n"
"packuswb %mm2,%mm1\n"
"movq %mm1,0x0(%ebp)\n"
"lea 8(%ebp),%ebp\n"
"sub $0x2,%ecx\n"
"ja 1b\n"
"popa\n"
"ret\n"
"paddsw %mm0,%mm1 \n"
"paddsw %mm0,%mm2 \n"
"psraw $0x6,%mm1 \n"
"psraw $0x6,%mm2 \n"
"packuswb %mm2,%mm1 \n"
"movq %mm1,0x0(%ebp) \n"
"lea 8(%ebp),%ebp \n"
"sub $0x2,%ecx \n"
"ja 1b \n"
"popa \n"
"ret \n"
);
void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
......@@ -729,73 +696,73 @@ void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
uint8* rgb_buf,
int width);
asm(
".text\n"
".text \n"
#if defined(OSX) || defined(IOS)
".globl _FastConvertYUV444ToARGBRow_MMX\n"
"_FastConvertYUV444ToARGBRow_MMX:\n"
".globl _FastConvertYUV444ToARGBRow_MMX \n"
"_FastConvertYUV444ToARGBRow_MMX: \n"
#else
".global FastConvertYUV444ToARGBRow_MMX\n"
"FastConvertYUV444ToARGBRow_MMX:\n"
".global FastConvertYUV444ToARGBRow_MMX \n"
"FastConvertYUV444ToARGBRow_MMX: \n"
#endif
"pusha\n"
"mov 0x24(%esp),%edx\n"
"mov 0x28(%esp),%edi\n"
"mov 0x2c(%esp),%esi\n"
"mov 0x30(%esp),%ebp\n"
"mov 0x34(%esp),%ecx\n"
"1:"
"movzbl (%edi),%eax\n"
"lea 1(%edi),%edi\n"
"movzbl (%esi),%ebx\n"
"lea 1(%esi),%esi\n"
"pusha \n"
"mov 0x24(%esp),%edx \n"
"mov 0x28(%esp),%edi \n"
"mov 0x2c(%esp),%esi \n"
"mov 0x30(%esp),%ebp \n"
"mov 0x34(%esp),%ecx \n"
"1: \n"
"movzbl (%edi),%eax \n"
"lea 1(%edi),%edi \n"
"movzbl (%esi),%ebx \n"
"lea 1(%esi),%esi \n"
"movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax\n"
"movzbl (%edx),%eax \n"
"paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
"lea 1(%edx),%edx\n"
"lea 1(%edx),%edx \n"
"paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n"
"psraw $0x6,%mm0\n"
"packuswb %mm0,%mm0\n"
"movd %mm0,0x0(%ebp)\n"
"lea 4(%ebp),%ebp\n"
"sub $0x1,%ecx\n"
"ja 1b\n"
"popa\n"
"ret\n"
"psraw $0x6,%mm0 \n"
"packuswb %mm0,%mm0 \n"
"movd %mm0,0x0(%ebp) \n"
"lea 4(%ebp),%ebp \n"
"sub $0x1,%ecx \n"
"ja 1b \n"
"popa \n"
"ret \n"
);
void FastConvertYToARGBRow_MMX(const uint8* y_buf,
uint8* rgb_buf,
int width);
asm(
".text\n"
".text \n"
#if defined(OSX) || defined(IOS)
".globl _FastConvertYToARGBRow_MMX\n"
"_FastConvertYToARGBRow_MMX:\n"
".globl _FastConvertYToARGBRow_MMX \n"
"_FastConvertYToARGBRow_MMX: \n"
#else
".global FastConvertYToARGBRow_MMX\n"
"FastConvertYToARGBRow_MMX:\n"
".global FastConvertYToARGBRow_MMX \n"
"FastConvertYToARGBRow_MMX: \n"
#endif
"push %ebx\n"
"mov 0x8(%esp),%eax\n"
"mov 0xc(%esp),%edx\n"
"mov 0x10(%esp),%ecx\n"
"push %ebx \n"
"mov 0x8(%esp),%eax \n"
"mov 0xc(%esp),%edx \n"
"mov 0x10(%esp),%ecx \n"
"1:"
"movzbl (%eax),%ebx\n"
"1: \n"
"movzbl (%eax),%ebx \n"
"movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n"
"psraw $0x6,%mm0\n"
"movzbl 0x1(%eax),%ebx\n"
"psraw $0x6,%mm0 \n"
"movzbl 0x1(%eax),%ebx \n"
"movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n"
"psraw $0x6,%mm1\n"
"packuswb %mm1,%mm0\n"
"lea 0x2(%eax),%eax\n"
"movq %mm0,(%edx)\n"
"lea 0x8(%edx),%edx\n"
"sub $0x2,%ecx\n"
"ja 1b\n"
"pop %ebx\n"
"ret\n"
"psraw $0x6,%mm1 \n"
"packuswb %mm1,%mm0 \n"
"lea 0x2(%eax),%eax \n"
"movq %mm0,(%edx) \n"
"lea 0x8(%edx),%edx \n"
"sub $0x2,%ecx \n"
"ja 1b \n"
"pop %ebx \n"
"ret \n"
);
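The Y-only path above (both the SSE2 and the MMX version) drops the U and V lookups entirely: each output pixel is built from one Y table entry, shifted right by 6 and packed. A scalar sketch; the table parameter stands for the kCoefficientsRgbY-style table referenced above, whose contents are not part of this diff:

// Hypothetical scalar Y-only conversion: one table entry of four int16 values
// per pixel, no chroma terms. The table is assumed to bake in the grey ramp
// and an opaque alpha channel.
static void FastConvertYToARGBRow_Ref(const uint8* table, const uint8* y_buf,
                                      uint8* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    const short* yc = (const short*)table + y_buf[x] * 4;
    for (int i = 0; i < 4; ++i) {
      int value = yc[i];
      value = value < 0 ? 0 : value >> 6;   // psraw $0x6 plus packuswb clamp
      if (value > 255) value = 255;
      rgb_buf[x * 4 + i] = (uint8)value;
    }
  }
}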
#endif
......
......@@ -92,7 +92,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
wloop:
convertloop:
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
punpcklbw xmm0, xmm0
......@@ -105,7 +105,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
ja wloop
ja convertloop
ret
}
}
......@@ -753,18 +753,18 @@ SIMD_ALIGNED(const int16 kUVBiasR[8]) = {
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
__asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
__asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
__asm psubw xmm1, kUVBiasG \
__asm psubw xmm2, kUVBiasR \
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
__asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
__asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
__asm psubw xmm1, kUVBiasG \
__asm psubw xmm2, kUVBiasR \
/* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] \
__asm lea eax, [eax + 8] \
__asm punpcklbw xmm3, xmm4 \
__asm psubsw xmm3, kYSub16 \
__asm pmullw xmm3, kYToRgb \
__asm psubsw xmm3, kYSub16 \
__asm pmullw xmm3, kYToRgb \
__asm paddw xmm0, xmm3 /* B += Y */ \
__asm paddw xmm1, xmm3 /* G += Y */ \
__asm paddw xmm2, xmm3 /* R += Y */ \
......
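The MSVC macro fragment above splits the conversion into two fixed-point steps: pmaddubsw against kUVToB/kUVToG/kUVToR scales the upsampled UV pair per channel and psubw removes the kUVBias* offsets, then the Y term (Y minus kYSub16, times kYToRgb) is added per channel. The final shift and pack are outside this hunk. A per-channel outline with every constant treated as a placeholder:

// Hypothetical outline of the two-step math above; uv_coef_u, uv_coef_v,
// uv_bias, y_sub and y_scale stand in for the kUVTo*, kUVBias*, kYSub16 and
// kYToRgb entries, none of whose values appear in this hunk. The final >>6
// and saturation are assumptions about the code that follows the fragment.
static uint8 YuvChannel_Ref(int y, int u, int v,
                            int uv_coef_u, int uv_coef_v, int uv_bias,
                            int y_sub, int y_scale) {
  int uv = u * uv_coef_u + v * uv_coef_v - uv_bias;  // Step 1: UV contribution
  int yy = (y - y_sub) * y_scale;                    // Step 2: Y contribution
  int value = uv + yy;
  value = value < 0 ? 0 : value >> 6;                // assumed final shift
  return (uint8)(value > 255 ? 255 : value);         // assumed saturation
}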