Commit d14936a0 authored by fbarchard@google.com

scale avoid xmm6 and xmm7 usage

BUG=none
TEST=scale test in talk
Review URL: http://webrtc-codereview.appspot.com/256009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@56 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 4562b62c
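The change below renames the x86 SIMD temporaries so the row functions no longer touch xmm6 or xmm7 (masks and constants now live in xmm5 and below), and the GCC inline-asm clobber lists are trimmed to match: "cc" is added, the xmm0-xmm5 entries are dropped, and explicit "xmm6"/"xmm7" (and "xmm8") clobbers remain only in the functions that still write them. As a reference for how such a clobber list works in GCC extended asm, here is a minimal sketch; the helper name and instruction sequence are illustrative only and not taken from the library:

  #include <stdint.h>

  // Build the 0x00ff00ff word mask in xmm5 and AND it into 16 bytes at dst.
  // The clobber list tells the compiler everything the statement touches:
  // memory (the store), condition codes, and the xmm registers written.
  static inline void MaskEvenBytes16(uint8_t* dst) {
    asm volatile(
      "pcmpeqb %%xmm5,%%xmm5\n"   // xmm5 = all 0xff
      "psrlw   $0x8,%%xmm5\n"     // each 16-bit lane becomes 0x00ff
      "movdqu  (%0),%%xmm0\n"
      "pand    %%xmm5,%%xmm0\n"
      "movdqu  %%xmm0,(%0)\n"
      :
      : "r"(dst)
      : "memory", "cc", "xmm0", "xmm5");
  }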
@@ -54,10 +54,10 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
  __asm__ volatile
  (
  "1:\n"
- "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1
+ "vld2.u8 {q0,q1}, [%0]!\n" // load even pixels into q0, odd into q1
- "vst1.u8 {q0}, [%1]! \n" // store even pixels
+ "vst1.u8 {q0}, [%1]!\n" // store even pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
+ "subs %2, %2, #16\n" // 16 processed per loop
- "bhi 1b \n"
+ "bhi 1b\n"
  : "+r"(src_ptr), // %0
  "+r"(dst), // %1
  "+r"(dst_width) // %2
@@ -70,23 +70,23 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
  uint8* dst, int dst_width) {
  __asm__ volatile
  (
- "mov r4, #2 \n" // rounding constant
+ "mov r4, #2\n" // rounding constant
- "add %1, %0 \n" // change the stride to row 2 pointer
+ "add %1, %0\n" // change the stride to row 2 pointer
- "vdup.16 q4, r4 \n"
+ "vdup.16 q4, r4\n"
  "1:\n"
- "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment
+ "vld1.u8 {q0,q1}, [%0]!\n" // load row 1 and post increment
- "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment
+ "vld1.u8 {q2,q3}, [%1]!\n" // load row 2 and post increment
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q0, q0\n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
+ "vpaddl.u8 q1, q1\n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2
+ "vpadal.u8 q0, q2\n" // row 2 add adjacent, add row 1 to row 2
- "vpadal.u8 q1, q3 \n"
+ "vpadal.u8 q1, q3\n"
- "vadd.u16 q0, q4 \n" // rounding
+ "vadd.u16 q0, q4\n" // rounding
- "vadd.u16 q1, q4 \n"
+ "vadd.u16 q1, q4\n"
- "vshrn.u16 d0, q0, #2 \n" // downshift and pack
+ "vshrn.u16 d0, q0, #2\n" // downshift and pack
- "vshrn.u16 d1, q1, #2 \n"
+ "vshrn.u16 d1, q1, #2\n"
- "vst1.u8 {q0}, [%2]! \n"
+ "vst1.u8 {q0}, [%2]!\n"
- "subs %3, %3, #16 \n" // 16 processed per loop
+ "subs %3, %3, #16\n" // 16 processed per loop
- "bhi 1b \n"
+ "bhi 1b\n"
  : "+r"(src_ptr), // %0
  "+r"(src_stride), // %1
  "+r"(dst), // %2
@@ -104,15 +104,15 @@ static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
  uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
- "mov r4, #4 \n"
+ "mov r4, #4\n"
- "1: \n"
+ "1:\n"
- "vld1.u8 {d0[0]}, [%0],r4 \n" // load up only 2 pixels of data to
+ "vld1.u8 {d0[0]}, [%0],r4\n" // load up only 2 pixels of data to
- "vld1.u8 {d0[1]}, [%0],r4 \n" // represent the entire 8x4 block
+ "vld1.u8 {d0[1]}, [%0],r4\n" // represent the entire 8x4 block
- "vst1.u16 {d0[0]}, [%1]! \n"
+ "vst1.u16 {d0[0]}, [%1]!\n"
- "subs %2, #2 \n" // dst_width -= 2
+ "subs %2, #2\n" // dst_width -= 2
- "bhi 1b \n"
+ "bhi 1b\n"
  : "+r"(src_ptr), // %0
  "+r"(dst_ptr), // %1
  "+r"(dst_width) // %2
@@ -125,40 +125,40 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
  uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
- "1: \n"
+ "1:\n"
- "mov r4, %0 \n"
+ "mov r4, %0\n"
- "vld1.u8 {d0}, [r4],%3 \n" // load up 8x4 block of input data
+ "vld1.u8 {d0}, [r4],%3\n" // load up 8x4 block of input data
- "vld1.u8 {d1}, [r4],%3 \n"
+ "vld1.u8 {d1}, [r4],%3\n"
- "vld1.u8 {d2}, [r4],%3 \n"
+ "vld1.u8 {d2}, [r4],%3\n"
- "vld1.u8 {d3}, [r4] \n"
+ "vld1.u8 {d3}, [r4]\n"
  // data is loaded up int q0 and q1
  // q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13
  // q1 = a20 a21 a22 a23 b20 b21 b22 b23 a20 a21 a22 a23 b20 b21 b22 b23
  // q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13
- "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q0, q0\n"
  // d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23
  // d1 = a10+a11+a20+a21 a12+a13+a22+a23 b10+b11+b20+b21 b12+b13+b22+b23
- "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q1\n"
  // d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23
  // d1 = a10+a11+a20+a21+a12+a13+a22+a23 b10+b11+b20+b21+b12+b13+b22+b23
- "vpaddl.u16 q0, q0 \n"
+ "vpaddl.u16 q0, q0\n"
  // d0 = a00+a01+a20+a21+a02+a03+a22+a23+a10+a11+a20+a21+a12+a13+a22+a23
  // b00+b01+b20+b21+b02+b03+b22+b23+b10+b11+b20+b21+b12+b13+b22+b23
- "vadd.u32 d0, d1 \n"
+ "vadd.u32 d0, d1\n"
- "vrshr.u32 d0, d0, #4 \n" // divide by 16 w/rounding
+ "vrshr.u32 d0, d0, #4\n" // divide by 16 w/rounding
- "vst1.u8 {d0[0]}, [%1]! \n"
+ "vst1.u8 {d0[0]}, [%1]!\n"
- "vst1.u8 {d0[4]}, [%1]! \n"
+ "vst1.u8 {d0[4]}, [%1]!\n"
- "add %0, #8 \n" // move src pointer to next 8 pixels
+ "add %0, #8\n" // move src pointer to next 8 pixels
- "subs %2, #2 \n" // dst_width -= 2
+ "subs %2, #2\n" // dst_width -= 2
- "bhi 1b \n"
+ "bhi 1b\n"
  : "+r"(src_ptr), // %0
  "+r"(dst_ptr), // %1
@@ -176,12 +176,12 @@ static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */,
  uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
- "1: \n"
+ "1:\n"
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.u8 {d0, d1, d2, d3}, [%0]!\n" // src line 0
- "vmov d2, d3 \n" // order needs to be d0, d1, d2
+ "vmov d2, d3\n" // order needs to be d0, d1, d2
- "vst3.u8 {d0, d1, d2}, [%1]! \n"
+ "vst3.u8 {d0, d1, d2}, [%1]!\n"
- "subs %2, #24 \n"
+ "subs %2, #24\n"
- "bhi 1b \n"
+ "bhi 1b\n"
  : "+r"(src_ptr), // %0
  "+r"(dst_ptr), // %1
  "+r"(dst_width) // %2
@@ -194,49 +194,49 @@ static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
  uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
- "vmov.u8 d16, #3 \n"
+ "vmov.u8 d16, #3\n"
- "add %3, %0 \n"
+ "add %3, %0\n"
- "1: \n"
+ "1:\n"
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.u8 {d0, d1, d2, d3}, [%0]!\n" // src line 0
- "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "vld4.u8 {d4, d5, d6, d7}, [%3]!\n" // src line 1
  // filter src line 0 with src line 1
  // expand chars to shorts to allow for room
  // when adding lines together
- "vmovl.u8 q4, d4 \n"
+ "vmovl.u8 q4, d4\n"
- "vmovl.u8 q5, d5 \n"
+ "vmovl.u8 q5, d5\n"
- "vmovl.u8 q6, d6 \n"
+ "vmovl.u8 q6, d6\n"
- "vmovl.u8 q7, d7 \n"
+ "vmovl.u8 q7, d7\n"
  // 3 * line_0 + line_1
- "vmlal.u8 q4, d0, d16 \n"
+ "vmlal.u8 q4, d0, d16\n"
- "vmlal.u8 q5, d1, d16 \n"
+ "vmlal.u8 q5, d1, d16\n"
- "vmlal.u8 q6, d2, d16 \n"
+ "vmlal.u8 q6, d2, d16\n"
- "vmlal.u8 q7, d3, d16 \n"
+ "vmlal.u8 q7, d3, d16\n"
  // (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q4, #2 \n"
+ "vqrshrn.u16 d0, q4, #2\n"
- "vqrshrn.u16 d1, q5, #2 \n"
+ "vqrshrn.u16 d1, q5, #2\n"
- "vqrshrn.u16 d2, q6, #2 \n"
+ "vqrshrn.u16 d2, q6, #2\n"
- "vqrshrn.u16 d3, q7, #2 \n"
+ "vqrshrn.u16 d3, q7, #2\n"
  // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q4, d1 \n"
+ "vmovl.u8 q4, d1\n"
- "vmlal.u8 q4, d0, d16 \n"
+ "vmlal.u8 q4, d0, d16\n"
- "vqrshrn.u16 d0, q4, #2 \n"
+ "vqrshrn.u16 d0, q4, #2\n"
  // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "vrhadd.u8 d1, d1, d2\n"
  // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q4, d2 \n"
+ "vmovl.u8 q4, d2\n"
- "vmlal.u8 q4, d3, d16 \n"
+ "vmlal.u8 q4, d3, d16\n"
- "vqrshrn.u16 d2, q4, #2 \n"
+ "vqrshrn.u16 d2, q4, #2\n"
- "vst3.u8 {d0, d1, d2}, [%1]! \n"
+ "vst3.u8 {d0, d1, d2}, [%1]!\n"
- "subs %2, #24 \n"
+ "subs %2, #24\n"
- "bhi 1b \n"
+ "bhi 1b\n"
  : "+r"(src_ptr), // %0
  "+r"(dst_ptr), // %1
  "+r"(dst_width), // %2
@@ -250,33 +250,33 @@ static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
  uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
- "vmov.u8 d10, #3 \n"
+ "vmov.u8 d10, #3\n"
- "add %3, %0 \n"
+ "add %3, %0\n"
- "1: \n"
+ "1:\n"
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.u8 {d0, d1, d2, d3}, [%0]!\n" // src line 0
- "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "vld4.u8 {d4, d5, d6, d7}, [%3]!\n" // src line 1
  // average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q0, q0, q2\n"
- "vrhadd.u8 q1, q1, q3 \n"
+ "vrhadd.u8 q1, q1, q3\n"
  // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
+ "vmovl.u8 q3, d1\n"
- "vmlal.u8 q3, d0, d10 \n"
+ "vmlal.u8 q3, d0, d10\n"
- "vqrshrn.u16 d0, q3, #2 \n"
+ "vqrshrn.u16 d0, q3, #2\n"
  // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "vrhadd.u8 d1, d1, d2\n"
  // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
+ "vmovl.u8 q3, d2\n"
- "vmlal.u8 q3, d3, d10 \n"
+ "vmlal.u8 q3, d3, d10\n"
- "vqrshrn.u16 d2, q3, #2 \n"
+ "vqrshrn.u16 d2, q3, #2\n"
- "vst3.u8 {d0, d1, d2}, [%1]! \n"
+ "vst3.u8 {d0, d1, d2}, [%1]!\n"
- "subs %2, #24 \n"
+ "subs %2, #24\n"
- "bhi 1b \n"
+ "bhi 1b\n"
  : "+r"(src_ptr), // %0
  "+r"(dst_ptr), // %1
  "+r"(dst_width), // %2
@@ -303,15 +303,15 @@ static void ScaleRowDown38_NEON(const uint8* src_ptr, int,
  uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
- "vld1.u8 {q3}, [%3] \n"
+ "vld1.u8 {q3}, [%3]\n"
- "1: \n"
+ "1:\n"
- "vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld1.u8 {d0, d1, d2, d3}, [%0]!\n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6\n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7\n"
- "vst1.u8 {d4}, [%1]! \n"
+ "vst1.u8 {d4}, [%1]!\n"
- "vst1.u32 {d5[0]}, [%1]! \n"
+ "vst1.u32 {d5[0]}, [%1]!\n"
- "subs %2, #12 \n"
+ "subs %2, #12\n"
- "bhi 1b \n"
+ "bhi 1b\n"
  : "+r"(src_ptr), // %0
  "+r"(dst_ptr), // %1
  "+r"(dst_width) // %2
@@ -325,57 +325,57 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
  uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
- "vld1.u16 {q4}, [%4] \n"
+ "vld1.u16 {q4}, [%4]\n"
- "vld1.u8 {q5}, [%5] \n"
+ "vld1.u8 {q5}, [%5]\n"
- "vld1.u8 {q8}, [%6] \n"
+ "vld1.u8 {q8}, [%6]\n"
- "add r4, %0, %3, lsl #1 \n"
+ "add r4, %0, %3, lsl #1\n"
- "add %3, %0 \n"
+ "add %3, %0\n"
- "1: \n"
+ "1:\n"
  // d0 = 00 40 01 41 02 42 03 43
  // d1 = 10 50 11 51 12 52 13 53
  // d2 = 20 60 21 61 22 62 23 63
  // d3 = 30 70 31 71 32 72 33 73
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.u8 {d0, d1, d2, d3}, [%0]!\n"
- "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.u8 {d4, d5, d6, d7}, [%3]!\n"
- "vld4.u8 {d12, d13, d14, d15}, [r4]! \n"
+ "vld4.u8 {d12, d13, d14, d15}, [r4]!\n"
  // Shuffle the input data around to get align the data
  // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
  // d0 = 00 10 01 11 02 12 03 13
  // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d0, d1\n"
- "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d4, d5\n"
- "vtrn.u8 d12, d13 \n"
+ "vtrn.u8 d12, d13\n"
  // d2 = 20 30 21 31 22 32 23 33
  // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d2, d3\n"
- "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d6, d7\n"
- "vtrn.u8 d14, d15 \n"
+ "vtrn.u8 d14, d15\n"
  // d0 = 00+10 01+11 02+12 03+13
  // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q0, q0\n"
- "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q2, q2\n"
- "vpaddl.u8 q6, q6 \n"
+ "vpaddl.u8 q6, q6\n"
  // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d3, d3\n"
- "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d7, d7\n"
- "vpaddl.u8 d15, d15 \n"
+ "vpaddl.u8 d15, d15\n"
  // combine source lines
- "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q2\n"
- "vadd.u16 q0, q6 \n"
+ "vadd.u16 q0, q6\n"
- "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d3, d7\n"
- "vadd.u16 d4, d15 \n"
+ "vadd.u16 d4, d15\n"
  // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
  // + s[6 + st * 1] + s[7 + st * 1]
  // + s[6 + st * 2] + s[7 + st * 2]) / 6
- "vqrdmulh.s16 q2, q4 \n"
+ "vqrdmulh.s16 q2, q4\n"
- "vmovn.u16 d4, q2 \n"
+ "vmovn.u16 d4, q2\n"
  // Shuffle 2,3 reg around so that 2 can be added to the
  // 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -383,41 +383,41 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
  // registers are already expanded. Then do transposes
  // to get aligned.
  // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q1, d2\n"
- "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q3, d6\n"
- "vmovl.u8 q7, d14 \n"
+ "vmovl.u8 q7, d14\n"
  // combine source lines
- "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q3\n"
- "vadd.u16 q1, q7 \n"
+ "vadd.u16 q1, q7\n"
  // d4 = xx 20 xx 30 xx 22 xx 32
  // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "vtrn.u32 d2, d3\n"
  // d4 = xx 20 xx 21 xx 22 xx 23
  // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ "vtrn.u16 d2, d3\n"
  // 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "vadd.u16 q0, q1\n"
  // Need to divide, but can't downshift as the the value
  // isn't a power of 2. So multiply by 65536 / n
  // and take the upper 16 bits.
- "vqrdmulh.s16 q0, q8 \n"
+ "vqrdmulh.s16 q0, q8\n"
  // Align for table lookup, vtbl requires registers to
  // be adjacent
- "vmov.u8 d2, d4 \n"
+ "vmov.u8 d2, d4\n"
- "vtbl.u8 d3, {d0, d1, d2}, d10 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d10\n"
- "vtbl.u8 d4, {d0, d1, d2}, d11 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d11\n"
- "vst1.u8 {d3}, [%1]! \n"
+ "vst1.u8 {d3}, [%1]!\n"
- "vst1.u32 {d4[0]}, [%1]! \n"
+ "vst1.u32 {d4[0]}, [%1]!\n"
- "subs %2, #12 \n"
+ "subs %2, #12\n"
- "bhi 1b \n"
+ "bhi 1b\n"
  : "+r"(src_ptr), // %0
  "+r"(dst_ptr), // %1
  "+r"(dst_width), // %2
@@ -435,45 +435,45 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
  uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
- "vld1.u16 {q4}, [%4] \n"
+ "vld1.u16 {q4}, [%4]\n"
- "vld1.u8 {q5}, [%5] \n"
+ "vld1.u8 {q5}, [%5]\n"
- "add %3, %0 \n"
+ "add %3, %0\n"
- "1: \n"
+ "1:\n"
  // d0 = 00 40 01 41 02 42 03 43
  // d1 = 10 50 11 51 12 52 13 53
  // d2 = 20 60 21 61 22 62 23 63
  // d3 = 30 70 31 71 32 72 33 73
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.u8 {d0, d1, d2, d3}, [%0]!\n"
- "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.u8 {d4, d5, d6, d7}, [%3]!\n"
  // Shuffle the input data around to get align the data
  // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
  // d0 = 00 10 01 11 02 12 03 13
  // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d0, d1\n"
- "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d4, d5\n"
  // d2 = 20 30 21 31 22 32 23 33
  // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d2, d3\n"
- "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d6, d7\n"
  // d0 = 00+10 01+11 02+12 03+13
  // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q0, q0\n"
- "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q2, q2\n"
  // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d3, d3\n"
- "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d7, d7\n"
  // combine source lines
- "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q2\n"
- "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d3, d7\n"
  // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
+ "vqrshrn.u16 d4, q2, #2\n"
  // Shuffle 2,3 reg around so that 2 can be added to the
  // 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -481,39 +481,39 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
  // registers are already expanded. Then do transposes
  // to get aligned.
  // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q1, d2\n"
- "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q3, d6\n"
  // combine source lines
- "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q3\n"
  // d4 = xx 20 xx 30 xx 22 xx 32
  // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "vtrn.u32 d2, d3\n"
  // d4 = xx 20 xx 21 xx 22 xx 23
  // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ "vtrn.u16 d2, d3\n"
  // 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "vadd.u16 q0, q1\n"
  // Need to divide, but can't downshift as the the value
  // isn't a power of 2. So multiply by 65536 / n
  // and take the upper 16 bits.
- "vqrdmulh.s16 q0, q4 \n"
+ "vqrdmulh.s16 q0, q4\n"
  // Align for table lookup, vtbl requires registers to
  // be adjacent
- "vmov.u8 d2, d4 \n"
+ "vmov.u8 d2, d4\n"
- "vtbl.u8 d3, {d0, d1, d2}, d10 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d10\n"
- "vtbl.u8 d4, {d0, d1, d2}, d11 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d11\n"
- "vst1.u8 {d3}, [%1]! \n"
+ "vst1.u8 {d3}, [%1]!\n"
- "vst1.u32 {d4[0]}, [%1]! \n"
+ "vst1.u32 {d4[0]}, [%1]!\n"
- "subs %2, #12 \n"
+ "subs %2, #12\n"
- "bhi 1b \n"
+ "bhi 1b\n"
  : "+r"(src_ptr), // %0
  "+r"(dst_ptr), // %1
  "+r"(dst_width), // %2
@@ -630,15 +630,15 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
  // src_stride ignored
  mov edx, [esp + 12] // dst_ptr
  mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm7, 8
+ psrlw xmm5, 8
  wloop:
  movdqa xmm0, [eax]
  movdqa xmm1, [eax + 16]
  lea eax, [eax + 32]
- pand xmm0, xmm7
+ pand xmm0, xmm5
- pand xmm1, xmm7
+ pand xmm1, xmm5
  packuswb xmm0, xmm1
  movdqa [edx], xmm0
  lea edx, [edx + 16]
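In the SSE2 2x kernel above, the only change is the register holding the 0x00ff00ff word mask (xmm5 instead of xmm7); the pand + packuswb pair keeps every even-indexed byte, i.e. plain point sampling of every second pixel. Scalar equivalent, with an illustrative name:

  #include <stdint.h>

  // Point-sample every second pixel: what pand 0x00ff00ff + packuswb achieves.
  static void ScaleRowDown2_C(const uint8_t* src_ptr, uint8_t* dst,
                              int dst_width) {
    for (int x = 0; x < dst_width; ++x) {
      dst[x] = src_ptr[2 * x];
    }
  }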
@@ -659,8 +659,8 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
  mov esi, [esp + 4 + 8] // src_stride
  mov edx, [esp + 4 + 12] // dst_ptr
  mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm7, 8
+ psrlw xmm5, 8
  wloop:
  movdqa xmm0, [eax]
@@ -675,8 +675,8 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
  psrlw xmm0, 8
  movdqa xmm3, xmm1
  psrlw xmm1, 8
- pand xmm2, xmm7
+ pand xmm2, xmm5
- pand xmm3, xmm7
+ pand xmm3, xmm5
  pavgw xmm0, xmm2
  pavgw xmm1, xmm3
  packuswb xmm0, xmm1
@@ -703,15 +703,15 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
  // src_stride ignored
  mov edi, [esp + 32 + 12] // dst_ptr
  mov ecx, [esp + 32 + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask 0x000000ff
+ pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
- psrld xmm7, 24
+ psrld xmm5, 24
  wloop:
  movdqa xmm0, [esi]
  movdqa xmm1, [esi + 16]
  lea esi, [esi + 32]
- pand xmm0, xmm7
+ pand xmm0, xmm5
- pand xmm1, xmm7
+ pand xmm1, xmm5
  packuswb xmm0, xmm1
  packuswb xmm0, xmm0
  movq qword ptr [edi], xmm0
@@ -794,15 +794,15 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
  // src_stride ignored
  mov edi, [esp + 32 + 12] // dst_ptr
  mov ecx, [esp + 32 + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask isolating 1 src 8 bytes
+ pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes
- psrlq xmm7, 56
+ psrlq xmm5, 56
  wloop:
  movdqa xmm0, [esi]
  movdqa xmm1, [esi + 16]
  lea esi, [esi + 32]
- pand xmm0, xmm7
+ pand xmm0, xmm5
- pand xmm1, xmm7
+ pand xmm1, xmm5
  packuswb xmm0, xmm1 // 32->16
  packuswb xmm0, xmm0 // 16->8
  packuswb xmm0, xmm0 // 8->4
@@ -906,9 +906,9 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
  wloop:
  movdqa xmm0, [esi]
- movdqa xmm2, [esi + 16]
+ movdqa xmm1, [esi + 16]
  lea esi, [esi + 32]
- movdqa xmm1, xmm2
+ movdqa xmm2, xmm1
  palignr xmm1, xmm0, 8
  pshufb xmm0, xmm3
  pshufb xmm1, xmm4
@@ -1069,16 +1069,15 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
  mov edx, [esp + 32 + 8] // src_stride
  mov edi, [esp + 32 + 12] // dst_ptr
  mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm5, _shuf38a
+ movdqa xmm4, _shuf38a
- movdqa xmm6, _shuf38b
+ movdqa xmm5, _shuf38b
- pxor xmm7, xmm7
  xloop:
  movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5
  movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11
  lea esi, [esi + 32]
- pshufb xmm0, xmm5
+ pshufb xmm0, xmm4
- pshufb xmm1, xmm6
+ pshufb xmm1, xmm5
  paddusb xmm0, xmm1
  movq qword ptr [edi], xmm0 // write 12 pixels
@@ -1214,7 +1213,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
  mov edi, [esp + 32 + 12] // dst_ptr
  mov ecx, [esp + 32 + 16] // dst_width
  mov ebx, [esp + 32 + 20] // height
- pxor xmm7, xmm7
+ pxor xmm5, xmm5
  dec ebx
  xloop:
@@ -1223,16 +1222,16 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
  lea eax, [esi + edx]
  movhlps xmm3, xmm2
  mov ebp, ebx
- punpcklbw xmm2, xmm7
+ punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm7
+ punpcklbw xmm3, xmm5
  // sum remaining rows
  yloop:
  movdqa xmm0, [eax] // read 16 pixels
  lea eax, [eax + edx] // advance to next row
  movhlps xmm1, xmm0
- punpcklbw xmm0, xmm7
+ punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm7
+ punpcklbw xmm1, xmm5
  paddusw xmm2, xmm0 // sum 16 words
  paddusw xmm3, xmm1
  sub ebp, 1
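ScaleAddRows accumulates a column of rows into 16-bit sums: punpcklbw against the zeroed register (now xmm5) widens bytes to words, and paddusw adds them with saturation. A scalar sketch of the same reduction; the name is illustrative, and the saturation only matters for very tall sums:

  #include <stdint.h>

  // Sum src_height rows of bytes into 16-bit per-column totals.
  static void ScaleAddRows_C(const uint8_t* src_ptr, int src_stride,
                             uint16_t* dst, int src_width, int src_height) {
    for (int x = 0; x < src_width; ++x) {
      const uint8_t* s = src_ptr + x;
      uint32_t sum = 0;
      for (int y = 0; y < src_height; ++y) {
        sum += *s;
        s += src_stride;
      }
      dst[x] = (uint16_t)(sum > 0xffff ? 0xffff : sum);  // mimic paddusw saturation
    }
  }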
@@ -1365,9 +1364,9 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  mov ah,al
  neg al
  add al, 128
- movd xmm7, eax
+ movd xmm5, eax
- punpcklwd xmm7, xmm7
+ punpcklwd xmm5, xmm5
- pshufd xmm7, xmm7, 0
+ pshufd xmm5, xmm5, 0
  xloop:
  movdqa xmm0, [esi]
@@ -1376,8 +1375,8 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  movdqa xmm1, xmm0
  punpcklbw xmm0, xmm2
  punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm7
+ pmaddubsw xmm0, xmm5
- pmaddubsw xmm1, xmm7
+ pmaddubsw xmm1, xmm5
  psrlw xmm0, 7
  psrlw xmm1, 7
  packuswb xmm0, xmm1
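ScaleFilterRows does a vertical bilinear blend: the broadcast word (now built in xmm5) packs the byte pair (128 - f, f), pmaddubsw forms src0*(128 - f) + src1*f per pixel, and the shift by 7 normalizes. A scalar sketch, assuming the fraction has already been reduced to the 0..128 range the asm builds in al/ah; the name is illustrative:

  #include <stdint.h>

  // Blend two rows: dst = (row0 * (128 - f) + row1 * f) >> 7, f in [0, 128].
  static void ScaleFilterRows_C(uint8_t* dst, const uint8_t* src_ptr,
                                int src_stride, int dst_width, int f) {
    const uint8_t* row1 = src_ptr + src_stride;
    int f0 = 128 - f;
    for (int x = 0; x < dst_width; ++x) {
      dst[x] = (uint8_t)((src_ptr[x] * f0 + row1[x] * f) >> 7);
    }
  }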
@@ -1482,14 +1481,14 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
  uint8* dst_ptr, int dst_width) {
  asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
+ "pcmpeqb %%xmm5,%%xmm5\n"
- "psrlw $0x8,%%xmm7\n"
+ "psrlw $0x8,%%xmm5\n"
  "1:"
  "movdqa (%0),%%xmm0\n"
  "movdqa 0x10(%0),%%xmm1\n"
  "lea 0x20(%0),%0\n"
- "pand %%xmm7,%%xmm0\n"
+ "pand %%xmm5,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
+ "pand %%xmm5,%%xmm1\n"
  "packuswb %%xmm1,%%xmm0\n"
  "movdqa %%xmm0,(%1)\n"
  "lea 0x10(%1),%1\n"
@@ -1499,15 +1498,15 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
  "+r"(dst_ptr), // %1
  "+r"(dst_width) // %2
  :
- : "memory"
+ : "memory", "cc"
  );
  }
  static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
  uint8* dst_ptr, int dst_width) {
  asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
+ "pcmpeqb %%xmm5,%%xmm5\n"
- "psrlw $0x8,%%xmm7\n"
+ "psrlw $0x8,%%xmm5\n"
  "1:"
  "movdqa (%0),%%xmm0\n"
  "movdqa 0x10(%0),%%xmm1\n"
@@ -1520,8 +1519,8 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
  "psrlw $0x8,%%xmm0\n"
  "movdqa %%xmm1,%%xmm3\n"
  "psrlw $0x8,%%xmm1\n"
- "pand %%xmm7,%%xmm2\n"
+ "pand %%xmm5,%%xmm2\n"
- "pand %%xmm7,%%xmm3\n"
+ "pand %%xmm5,%%xmm3\n"
  "pavgw %%xmm2,%%xmm0\n"
  "pavgw %%xmm3,%%xmm1\n"
  "packuswb %%xmm1,%%xmm0\n"
@@ -1533,7 +1532,7 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
  "+r"(dst_ptr), // %1
  "+r"(dst_width) // %2
  : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory"
+ : "memory", "cc"
  );
  }
@@ -1541,14 +1540,14 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
  static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
  uint8* dst_ptr, int dst_width) {
  asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
+ "pcmpeqb %%xmm5,%%xmm5\n"
- "psrld $0x18,%%xmm7\n"
+ "psrld $0x18,%%xmm5\n"
  "1:"
  "movdqa (%0),%%xmm0\n"
  "movdqa 0x10(%0),%%xmm1\n"
  "lea 0x20(%0),%0\n"
- "pand %%xmm7,%%xmm0\n"
+ "pand %%xmm5,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
+ "pand %%xmm5,%%xmm1\n"
  "packuswb %%xmm1,%%xmm0\n"
  "packuswb %%xmm0,%%xmm0\n"
  "movq %%xmm0,(%1)\n"
@@ -1559,7 +1558,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
  "+r"(dst_ptr), // %1
  "+r"(dst_width) // %2
  :
- : "memory"
+ : "memory", "cc"
  );
  }
@@ -1609,7 +1608,10 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
  "+r"(dst_width), // %2
  "+r"(temp) // %3
  : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory"
+ : "memory", "cc"
+ #if defined(__x86_64__)
+ , "xmm6", "xmm7"
+ #endif
  );
  }
@@ -1617,14 +1619,14 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
  static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
  uint8* dst_ptr, int dst_width) {
  asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
+ "pcmpeqb %%xmm5,%%xmm5\n"
- "psrlq $0x38,%%xmm7\n"
+ "psrlq $0x38,%%xmm5\n"
  "1:"
  "movdqa (%0),%%xmm0\n"
  "movdqa 0x10(%0),%%xmm1\n"
  "lea 0x20(%0),%0\n"
- "pand %%xmm7,%%xmm0\n"
+ "pand %%xmm5,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
+ "pand %%xmm5,%%xmm1\n"
  "packuswb %%xmm1,%%xmm0\n"
  "packuswb %%xmm0,%%xmm0\n"
  "packuswb %%xmm0,%%xmm0\n"
@@ -1636,7 +1638,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
  "+r"(dst_ptr), // %1
  "+r"(dst_width) // %2
  :
- : "memory"
+ : "memory", "cc"
  );
  }
@@ -1891,16 +1893,15 @@ extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
  "mov 0x28(%esp),%edx\n"
  "mov 0x2c(%esp),%edi\n"
  "mov 0x30(%esp),%ecx\n"
- "movdqa _shuf38a ,%xmm5\n"
+ "movdqa _shuf38a ,%xmm4\n"
- "movdqa _shuf38b ,%xmm6\n"
+ "movdqa _shuf38b ,%xmm5\n"
- "pxor %xmm7,%xmm7\n"
  "1:"
  "movdqa (%esi),%xmm0\n"
  "movdqa 0x10(%esi),%xmm1\n"
  "lea 0x20(%esi),%esi\n"
- "pshufb %xmm5,%xmm0\n"
+ "pshufb %xmm4,%xmm0\n"
- "pshufb %xmm6,%xmm1\n"
+ "pshufb %xmm5,%xmm1\n"
  "paddusb %xmm1,%xmm0\n"
  "movq %xmm0,(%edi)\n"
  "movhlps %xmm0,%xmm1\n"
@@ -2040,22 +2041,22 @@ extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
  "mov 0x2c(%esp),%edi\n"
  "mov 0x30(%esp),%ecx\n"
  "mov 0x34(%esp),%ebx\n"
- "pxor %xmm7,%xmm7\n"
+ "pxor %xmm5,%xmm5\n"
  "1:"
  "movdqa (%esi),%xmm2\n"
  "lea (%esi,%edx,1),%eax\n"
  "movhlps %xmm2,%xmm3\n"
  "lea -0x1(%ebx),%ebp\n"
- "punpcklbw %xmm7,%xmm2\n"
+ "punpcklbw %xmm5,%xmm2\n"
- "punpcklbw %xmm7,%xmm3\n"
+ "punpcklbw %xmm5,%xmm3\n"
  "2:"
  "movdqa (%eax),%xmm0\n"
  "lea (%eax,%edx,1),%eax\n"
  "movhlps %xmm0,%xmm1\n"
- "punpcklbw %xmm7,%xmm0\n"
+ "punpcklbw %xmm5,%xmm0\n"
- "punpcklbw %xmm7,%xmm1\n"
+ "punpcklbw %xmm5,%xmm1\n"
  "paddusw %xmm0,%xmm2\n"
  "paddusw %xmm1,%xmm3\n"
  "sub $0x1,%ebp\n"
@@ -2195,9 +2196,9 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
  "mov %al,%ah\n"
  "neg %al\n"
  "add $0x80,%al\n"
- "movd %eax,%xmm7\n"
+ "movd %eax,%xmm5\n"
- "punpcklwd %xmm7,%xmm7\n"
+ "punpcklwd %xmm5,%xmm5\n"
- "pshufd $0x0,%xmm7,%xmm7\n"
+ "pshufd $0x0,%xmm5,%xmm5\n"
  "1:"
  "movdqa (%esi),%xmm0\n"
@@ -2206,8 +2207,8 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
  "movdqa %xmm0,%xmm1\n"
  "punpcklbw %xmm2,%xmm0\n"
  "punpckhbw %xmm2,%xmm1\n"
- "pmaddubsw %xmm7,%xmm0\n"
+ "pmaddubsw %xmm5,%xmm0\n"
- "pmaddubsw %xmm7,%xmm1\n"
+ "pmaddubsw %xmm5,%xmm1\n"
  "psrlw $0x7,%xmm0\n"
  "psrlw $0x7,%xmm1\n"
  "packuswb %xmm1,%xmm0\n"
@@ -2305,8 +2306,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
  "+r"(dst_ptr), // %1
  "+r"(dst_width) // %2
  : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5", "xmm6", "xmm7"
+ : "memory", "cc", "r10", "r11", "xmm6", "xmm7"
  );
  }
@@ -2338,7 +2338,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
  : "r"(_shuf0), // %3
  "r"(_shuf1), // %4
  "r"(_shuf2) // %5
- : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : "memory", "cc"
  );
  }
@@ -2395,8 +2395,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
  "r"(_madd11), // %8
  "r"(_round34), // %9
  "r"(_madd21) // %10
- : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
+ : "memory", "cc", "xmm6", "xmm7", "xmm8"
  );
  }
@@ -2456,8 +2455,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
  "r"(_madd11), // %8
  "r"(_round34), // %9
  "r"(_madd21) // %10
- : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
+ : "memory", "cc", "xmm6", "xmm7", "xmm8"
  );
  }
@@ -2465,15 +2463,14 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
  static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
  uint8* dst_ptr, int dst_width) {
  asm volatile(
- "movdqa (%3),%%xmm5\n"
+ "movdqa (%3),%%xmm4\n"
- "movdqa (%4),%%xmm6\n"
+ "movdqa (%4),%%xmm5\n"
- "pxor %%xmm7,%%xmm7\n"
  "1:"
  "movdqa (%0),%%xmm0\n"
  "movdqa 0x10(%0),%%xmm1\n"
  "lea 0x20(%0),%0\n"
- "pshufb %%xmm5,%%xmm0\n"
+ "pshufb %%xmm4,%%xmm0\n"
- "pshufb %%xmm6,%%xmm1\n"
+ "pshufb %%xmm5,%%xmm1\n"
  "paddusb %%xmm1,%%xmm0\n"
  "movq %%xmm0,(%1)\n"
  "movhlps %%xmm0,%%xmm1\n"
@@ -2486,7 +2483,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
  "+r"(dst_width) // %2
  : "r"(_shuf38a), // %3
  "r"(_shuf38b) // %4
- : "memory", "xmm0", "xmm1", "xmm5", "xmm6", "xmm7"
+ : "memory", "cc"
  );
  }
@@ -2543,8 +2540,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
  "r"(_shufac0), // %4
  "r"(_shufac3), // %5
  "r"(_scaleac3) // %6
- : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5", "xmm6", "xmm7"
+ : "memory", "cc", "rax", "xmm6", "xmm7"
  );
  }
@@ -2582,8 +2578,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
  "r"(_shufab1), // %5
  "r"(_shufab2), // %6
  "r"(_scaleab2) // %7
- : "memory", "rax", "xmm0", "xmm1", "xmm2",
- "xmm4", "xmm5", "xmm6", "xmm7"
+ : "memory", "cc", "rax", "xmm6", "xmm7"
  );
  }
@@ -2592,21 +2587,21 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
  uint16* dst_ptr, int src_width,
  int src_height) {
  asm volatile(
- "pxor %%xmm7,%%xmm7\n"
+ "pxor %%xmm5,%%xmm5\n"
  "1:"
  "movdqa (%0),%%xmm2\n"
  "lea (%0,%4,1),%%r10\n"
  "movhlps %%xmm2,%%xmm3\n"
  "lea -0x1(%3),%%r11\n"
- "punpcklbw %%xmm7,%%xmm2\n"
+ "punpcklbw %%xmm5,%%xmm2\n"
- "punpcklbw %%xmm7,%%xmm3\n"
+ "punpcklbw %%xmm5,%%xmm3\n"
  "2:"
  "movdqa (%%r10),%%xmm0\n"
  "lea (%%r10,%4,1),%%r10\n"
  "movhlps %%xmm0,%%xmm1\n"
- "punpcklbw %%xmm7,%%xmm0\n"
+ "punpcklbw %%xmm5,%%xmm0\n"
- "punpcklbw %%xmm7,%%xmm1\n"
+ "punpcklbw %%xmm5,%%xmm1\n"
  "paddusw %%xmm0,%%xmm2\n"
  "paddusw %%xmm1,%%xmm3\n"
  "sub $0x1,%%r11\n"
@@ -2623,7 +2618,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
  "+r"(src_width), // %2
  "+r"(src_height) // %3
  : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
+ : "memory", "cc", "r10", "r11"
  );
  }
@@ -2647,7 +2642,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
  "+r"(src_ptr), // %1
  "+r"(dst_width) // %2
  :
- : "memory", "rax", "xmm0"
+ : "memory", "cc", "rax"
  );
  return;
  } else if (source_y_fraction == 128) {
@@ -2667,7 +2662,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
  "+r"(src_ptr), // %1
  "+r"(dst_width) // %2
  : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "rax", "xmm0", "xmm2"
+ : "memory", "cc", "rax"
  );
  return;
  } else {
@@ -2712,8 +2707,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
  "+r"(dst_width), // %2
  "+r"(source_y_fraction) // %3
  : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm5", "xmm6", "xmm7"
+ : "memory", "cc", "rax", "xmm6", "xmm7"
  );
  }
  return;
@@ -2739,7 +2733,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
  "+r"(src_ptr), // %1
  "+r"(dst_width) // %2
  :
- : "memory", "rax", "xmm0"
+ : "memory", "cc", "rax"
  );
  return;
  } else if (source_y_fraction == 128) {
@@ -2759,7 +2753,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
  "+r"(src_ptr), // %1
  "+r"(dst_width) // %2
  : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "rax", "xmm0", "xmm2"
+ : "memory", "cc", "rax"
  );
  return;
  } else {
@@ -2769,9 +2763,9 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
  "mov %%al,%%ah\n"
  "neg %%al\n"
  "add $0x80,%%al\n"
- "movd %%eax,%%xmm7\n"
+ "movd %%eax,%%xmm5\n"
- "punpcklwd %%xmm7,%%xmm7\n"
+ "punpcklwd %%xmm5,%%xmm5\n"
- "pshufd $0x0,%%xmm7,%%xmm7\n"
+ "pshufd $0x0,%%xmm5,%%xmm5\n"
  "1:"
  "movdqa (%1),%%xmm0\n"
  "movdqa (%1,%4,1),%%xmm2\n"
@@ -2779,8 +2773,8 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
  "movdqa %%xmm0,%%xmm1\n"
  "punpcklbw %%xmm2,%%xmm0\n"
  "punpckhbw %%xmm2,%%xmm1\n"
- "pmaddubsw %%xmm7,%%xmm0\n"
+ "pmaddubsw %%xmm5,%%xmm0\n"
- "pmaddubsw %%xmm7,%%xmm1\n"
+ "pmaddubsw %%xmm5,%%xmm1\n"
  "psrlw $0x7,%%xmm0\n"
  "psrlw $0x7,%%xmm1\n"
  "packuswb %%xmm1,%%xmm0\n"
@@ -2795,7 +2789,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
  "+r"(dst_width), // %2
  "+r"(source_y_fraction) // %3
  : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm7"
+ : "memory", "cc", "rax"
  );
  }
  return;