Commit bd4a849b authored by fbarchard@google.com

Fix NV21 U,V order, align all loops, and make ScaleAddRows support 1 row

BUG=17
TEST=none
Review URL: https://webrtc-codereview.appspot.com/435004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@208 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent ba3aeed3
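
Note on the NV21 fix (editor's sketch, not part of the commit): NV12 and NV21 both store a full Y plane followed by one interleaved chroma plane, but NV12 interleaves the pairs as U,V while NV21 interleaves them as V,U. ConvertToI420 reuses the NV12 rotate path for NV21, so the destination plane arguments must be passed V first, which is exactly the swap in the ConvertToI420 hunk below. A minimal C sketch of the idea, using a hypothetical SplitUVRow_C helper (the name is illustrative, not from libyuv):

    #include <stdint.h>

    // Hypothetical helper: deinterleaves one chroma row, writing byte 0 of
    // each pair to dst_first and byte 1 to dst_second.
    static void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_first,
                             uint8_t* dst_second, int pairs) {
      for (int i = 0; i < pairs; ++i) {
        dst_first[i]  = src_uv[2 * i + 0];
        dst_second[i] = src_uv[2 * i + 1];
      }
    }

    // NV12 rows are U,V pairs; NV21 rows are V,U pairs, so the outputs swap:
    //   SplitUVRow_C(src_uv, u, v, halfwidth);  // NV12
    //   SplitUVRow_C(src_uv, v, u, halfwidth);  // NV21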
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 207
+Version: 208
 License: BSD
 License File: LICENSE
......
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 207
+#define LIBYUV_VERSION 208
 #endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -89,6 +89,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
     pxor       xmm5, xmm5
     sub        edx, eax
+    align      16
   wloop:
     movdqa     xmm1, [eax]
     movdqa     xmm2, [eax + edx]
......
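
Note on the `align 16` additions (editor's sketch, not part of the commit): in MSVC inline assembly, `align 16` emits padding so the label that follows starts on a 16-byte boundary; since x86 cores fetch instructions in aligned blocks, aligning each hot-loop entry this way keeps the loop head inside a single fetch block. The same directive is added ahead of every loop label in the hunks below. A minimal MSVC x86-only illustration:

    #include <stdint.h>

    void FillZero(uint8_t* dst, int count) {  // assumes count > 0
      __asm {
        mov    edi, dst      // MSVC preserves GPRs referenced in __asm blocks
        mov    ecx, count
        align  16            // pad so the loop entry below is 16-byte aligned
      zloop:
        mov    byte ptr [edi], 0
        inc    edi
        sub    ecx, 1
        ja     zloop
      }
    }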
@@ -1746,8 +1746,8 @@ int ConvertToI420(const uint8* sample, size_t sample_size,
       r = NV12ToI420Rotate(src, src_width,
                            src_uv, aligned_src_width,
                            y, y_stride,
-                           u, u_stride,
                            v, v_stride,
+                           u, u_stride,
                            dst_width, inv_dst_height, rotation);
       break;
     case FOURCC_M420:
......
@@ -222,6 +222,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
     mov        ecx, [esp + 8 + 20]         // width
     sub        edx, esi
+    align      16
   convertloop:
     movq       xmm2, qword ptr [esi]        // U
     movq       xmm3, qword ptr [esi + edx]  // V
@@ -260,6 +261,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
     mov        ecx, [esp + 8 + 20]         // width
     sub        edx, esi
+    align      16
   convertloop:
     movq       xmm2, qword ptr [esi]        // U
     movq       xmm3, qword ptr [esi + edx]  // V
......
@@ -722,6 +722,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
     lea        ecx, [ebp * 4]
     sub        edx, ecx                    // stride - width * 4
+    align      16
   convertloop:
     mov        ecx, ebp
     rep stosd
......
@@ -83,9 +83,11 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
     mov        edx, [esp + 12 + 12]        // dst
     mov        esi, [esp + 12 + 16]        // dst_stride
     mov        ecx, [esp + 12 + 20]        // width
-  convertloop:
     // Read in the data from the source pointer.
     // First round of bit swap.
+    align      16
+  convertloop:
     movq       xmm0, qword ptr [eax]
     lea        ebp, [eax + 8]
     movq       xmm1, qword ptr [eax + edi]
@@ -182,6 +184,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     and        esp, ~15
     mov        [esp + 16], ecx
     mov        ecx, [ecx + 16 + 28]        // w
+    align      16
   convertloop:
     // Read in the data from the source pointer.
     // First round of bit swap.
......
@@ -685,6 +685,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
     pcmpeqb    xmm5, xmm5                  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
+    align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
@@ -714,6 +715,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
     pcmpeqb    xmm5, xmm5                  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
+    align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
@@ -757,6 +759,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
     pcmpeqb    xmm5, xmm5                  // generate mask 0x000000ff
     psrld      xmm5, 24
+    align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
@@ -790,6 +793,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
     pcmpeqb    xmm7, xmm7                  // generate mask 0x00ff00ff
     psrlw      xmm7, 8
+    align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
@@ -848,6 +852,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
     pcmpeqb    xmm5, xmm5                  // generate mask isolating 1 src 8 bytes
     psrlq      xmm5, 56
+    align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
@@ -882,6 +887,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
     lea        edi, [esi + esi * 2]        // src_stride * 3
     pxor       xmm7, xmm7
+    align      16
   wloop:
     movdqa     xmm0, [eax]                 // average 8 rows to 1
     movdqa     xmm1, [eax + 16]
@@ -957,6 +963,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa     xmm4, _shuf1
     movdqa     xmm5, _shuf2
+    align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
@@ -1009,6 +1016,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa     xmm6, _madd11
     movdqa     xmm7, _round34
+    align      16
   wloop:
     movdqa     xmm0, [eax]                 // pixels 0..7
     movdqa     xmm1, [eax + esi]
@@ -1066,6 +1074,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa     xmm6, _madd11
     movdqa     xmm7, _round34
+    align      16
   wloop:
     movdqa     xmm0, [eax]                 // pixels 0..7
     movdqa     xmm1, [eax + esi]
@@ -1123,6 +1132,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa     xmm4, _shuf38a
     movdqa     xmm5, _shuf38b
+    align      16
   xloop:
     movdqa     xmm0, [eax]                 // 16 pixels -> 0,1,2,3,4,5
     movdqa     xmm1, [eax + 16]            // 16 pixels -> 6,7,8,9,10,11
@@ -1158,6 +1168,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa     xmm6, _scaleac3
     pxor       xmm7, xmm7
+    align      16
   xloop:
     movdqa     xmm0, [eax]                 // sum up 3 rows into xmm0/1
     movdqa     xmm2, [eax + esi]
@@ -1224,6 +1235,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa     xmm6, _shufab2
     movdqa     xmm7, _scaleab2
+    align      16
   xloop:
     movdqa     xmm2, [eax]                 // average 2 rows into xmm2
     pavgb      xmm2, [eax + esi]
@@ -1256,8 +1268,6 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
 #define HAS_SCALEADDROWS_SSE2
 // Reads 16xN bytes and produces 16 shorts at a time.
-// TODO(fbarchard): support 1 rows
-// TODO(fbarchard): align loops
 __declspec(naked)
 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                               uint16* dst_ptr, int src_width,
@@ -1275,6 +1285,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     pxor       xmm4, xmm4
     dec        ebx
+    align      16
   xloop:
     // first row
     movdqa     xmm0, [esi]
@@ -1284,8 +1295,11 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     punpckhbw  xmm1, xmm4
     lea        esi, [esi + 16]
     mov        ebp, ebx
+    test       ebp, ebp
+    je         ydone
     // sum remaining rows
+    align      16
   yloop:
     movdqa     xmm2, [eax]                 // read 16 pixels
     lea        eax, [eax + edx]            // advance to next row
@@ -1296,7 +1310,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     paddusw    xmm1, xmm3
     sub        ebp, 1
     ja         yloop
+  ydone:
     movdqa     [edi], xmm0
     movdqa     [edi + 16], xmm1
     lea        edi, [edi + 32]
@@ -1342,6 +1356,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     pshufd     xmm5, xmm5, 0
     pxor       xmm7, xmm7
+    align      16
   xloop:
     movdqa     xmm0, [esi]
     movdqa     xmm2, [esi + edx]
@@ -1371,6 +1386,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     pop        esi
     ret
+    align      16
   xloop1:
     movdqa     xmm0, [esi]
     sub        ecx, 16
@@ -1384,6 +1400,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     pop        esi
     ret
+    align      16
   xloop2:
     movdqa     xmm0, [esi]
     pavgb      xmm0, [esi + edx]
@@ -1428,6 +1445,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
+    align      16
   xloop:
     movdqa     xmm0, [esi]
     movdqa     xmm2, [esi + edx]
@@ -1450,6 +1468,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     pop        esi
     ret
+    align      16
   xloop1:
     movdqa     xmm0, [esi]
     sub        ecx, 16
@@ -1463,6 +1482,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     pop        esi
     ret
+    align      16
   xloop2:
     movdqa     xmm0, [esi]
     pavgb      xmm0, [esi + edx]
@@ -1496,6 +1516,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqa     xmm6, _madd11
     movdqa     xmm7, _madd21
+    align      16
   wloop:
     movdqa     xmm0, [eax]                 // pixels 0..7
     pshufb     xmm0, xmm2
@@ -1712,6 +1733,8 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
   "punpcklbw  %%xmm4,%%xmm0        \n"
   "punpckhbw  %%xmm4,%%xmm1        \n"
   "mov        %5,%2                \n"
+  "test       %2,%2                \n"
+  "je         3f                   \n"
 "2:                                \n"
   "movdqa     (%0),%%xmm2          \n"
   "add        %6,%0                \n"
@@ -1722,6 +1745,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
   "paddusw    %%xmm3,%%xmm1        \n"
   "sub        $0x1,%2              \n"
   "ja         2b                   \n"
+"3:                                \n"
   "movdqa     %%xmm0,(%1)          \n"
   "movdqa     %%xmm1,0x10(%1)      \n"
   "lea        0x10(%3),%0          \n"
......
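
Note on the single-row fix (editor's restatement, not part of the commit): ScaleAddRows_SSE2 loads the first row, then a bottom-tested loop sums the remaining src_height - 1 rows. With src_height == 1 that loop still executed once and read a row past the source; the added test/je ydone (and je 3f in the GCC version) skips it. A plain-C restatement of the corrected control flow, using ordinary addition where the SSE2 code uses saturating paddusw:

    #include <stdint.h>

    static void ScaleAddRows_C(const uint8_t* src_ptr, int src_stride,
                               uint16_t* dst_ptr, int src_width,
                               int src_height) {
      for (int x = 0; x < src_width; ++x) {
        dst_ptr[x] = src_ptr[x];                 // first row
      }
      for (int y = 1; y < src_height; ++y) {     // runs 0 times when height == 1
        const uint8_t* row = src_ptr + y * src_stride;
        for (int x = 0; x < src_width; ++x) {
          dst_ptr[x] = (uint16_t)(dst_ptr[x] + row[x]);  // sum remaining rows
        }
      }
    }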