Commit 18184fd1 authored by fbarchard@google.com

Switch loop branches from ja (unsigned) to jg (signed) so that a count that is not a multiple of 16 can underflow to a negative value

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/453001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@214 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 1ff03571
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 213 Version: 214
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 213 #define LIBYUV_VERSION 214
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -58,7 +58,7 @@ static uint32 SumSquareError_NEON(const uint8* src_a, ...@@ -58,7 +58,7 @@ static uint32 SumSquareError_NEON(const uint8* src_a,
"vmlal.s16 q8, d5, d5 \n" "vmlal.s16 q8, d5, d5 \n"
"vmlal.s16 q10, d7, d7 \n" "vmlal.s16 q10, d7, d7 \n"
"subs %2, %2, #16 \n" "subs %2, %2, #16 \n"
"bhi 1b \n" "bgt 1b \n"
"vadd.u32 q7, q7, q8 \n" "vadd.u32 q7, q7, q8 \n"
"vadd.u32 q9, q9, q10 \n" "vadd.u32 q9, q9, q10 \n"
...@@ -94,6 +94,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, ...@@ -94,6 +94,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
movdqa xmm1, [eax] movdqa xmm1, [eax]
movdqa xmm2, [eax + edx] movdqa xmm2, [eax + edx]
lea eax, [eax + 16] lea eax, [eax + 16]
sub ecx, 16
movdqa xmm3, xmm1 movdqa xmm3, xmm1
psubusb xmm1, xmm2 psubusb xmm1, xmm2
psubusb xmm2, xmm3 psubusb xmm2, xmm3
...@@ -105,8 +106,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, ...@@ -105,8 +106,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
pmaddwd xmm2, xmm2 pmaddwd xmm2, xmm2
paddd xmm0, xmm1 paddd xmm0, xmm1
paddd xmm0, xmm2 paddd xmm0, xmm2
sub ecx, 16 jg wloop
ja wloop
pshufd xmm1, xmm0, 0EEh pshufd xmm1, xmm0, 0EEh
paddd xmm0, xmm1 paddd xmm0, xmm1
...@@ -131,6 +131,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, ...@@ -131,6 +131,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
"movdqa (%0),%%xmm1 \n" "movdqa (%0),%%xmm1 \n"
"movdqa (%0,%1,1),%%xmm2 \n" "movdqa (%0,%1,1),%%xmm2 \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm1,%%xmm3 \n" "movdqa %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n" "psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n" "psubusb %%xmm3,%%xmm2 \n"
...@@ -142,8 +143,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, ...@@ -142,8 +143,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
"pmaddwd %%xmm2,%%xmm2 \n" "pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n" "paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n" "paddd %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n" "pshufd $0xee,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n" "paddd %%xmm1,%%xmm0 \n"
......
...@@ -77,10 +77,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -77,10 +77,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
convertloop: convertloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
pavgb xmm0, [eax + edx] pavgb xmm0, [eax + edx]
sub ecx, 16
movdqa [eax + edi], xmm0 movdqa [eax + edi], xmm0
lea eax, [eax + 16] lea eax, [eax + 16]
sub ecx, 16 jg convertloop
ja convertloop
pop edi pop edi
ret ret
} }
...@@ -95,10 +95,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -95,10 +95,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"pavgb (%0,%3),%%xmm0 \n" "pavgb (%0,%3),%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%0,%1) \n" "movdqa %%xmm0,(%0,%1) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_uv), // %1 "+r"(dst_uv), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -495,10 +495,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, ...@@ -495,10 +495,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
lea esi, [esi + 8] lea esi, [esi + 8]
psrlw xmm1, 8 // V psrlw xmm1, 8 // V
packuswb xmm1, xmm1 packuswb xmm1, xmm1
sub ecx, 16
movq qword ptr [edi], xmm1 movq qword ptr [edi], xmm1
lea edi, [edi + 8] lea edi, [edi + 8]
sub ecx, 16 jg convertloop
ja convertloop
pop edi pop edi
pop esi pop esi
...@@ -534,10 +534,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, ...@@ -534,10 +534,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
"lea 0x8(%2),%2 \n" "lea 0x8(%2),%2 \n"
"psrlw $0x8,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n"
"sub $0x10,%4 \n"
"movq %%xmm1,(%3) \n" "movq %%xmm1,(%3) \n"
"lea 0x8(%3),%3 \n" "lea 0x8(%3),%3 \n"
"sub $0x10,%4 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
......
...@@ -237,7 +237,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y, ...@@ -237,7 +237,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
movdqa [edi + 16], xmm1 movdqa [edi + 16], xmm1
lea edi, [edi + 32] lea edi, [edi + 32]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
pop edi pop edi
pop esi pop esi
...@@ -276,7 +276,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y, ...@@ -276,7 +276,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
movdqa [edi + 16], xmm2 movdqa [edi + 16], xmm2
lea edi, [edi + 32] lea edi, [edi + 32]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
pop edi pop edi
pop esi pop esi
...@@ -305,7 +305,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y, ...@@ -305,7 +305,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
"movdqa %%xmm1,0x10(%3) \n" "movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n" "lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n" "sub $0x10,%4 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -340,7 +340,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y, ...@@ -340,7 +340,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
"movdqa %%xmm2,0x10(%3) \n" "movdqa %%xmm2,0x10(%3) \n"
"lea 0x20(%3),%3 \n" "lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n" "sub $0x10,%4 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -1084,10 +1084,11 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1084,10 +1084,11 @@ int ConvertFromI420(const uint8* y, int y_stride,
if (y == NULL || u == NULL || v == NULL || dst_sample == NULL) { if (y == NULL || u == NULL || v == NULL || dst_sample == NULL) {
return -1; return -1;
} }
int r = 0;
switch (format) { switch (format) {
// Single plane formats // Single plane formats
case FOURCC_YUY2: case FOURCC_YUY2:
I420ToYUY2(y, y_stride, r = I420ToYUY2(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1095,7 +1096,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1095,7 +1096,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_UYVY: case FOURCC_UYVY:
I420ToUYVY(y, y_stride, r = I420ToUYVY(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1103,7 +1104,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1103,7 +1104,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_V210: case FOURCC_V210:
I420ToV210(y, y_stride, r = I420ToV210(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1112,7 +1113,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1112,7 +1113,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_RGBP: case FOURCC_RGBP:
I420ToRGB565(y, y_stride, r = I420ToRGB565(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1120,7 +1121,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1120,7 +1121,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_RGBO: case FOURCC_RGBO:
I420ToARGB1555(y, y_stride, r = I420ToARGB1555(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1128,7 +1129,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1128,7 +1129,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_R444: case FOURCC_R444:
I420ToARGB4444(y, y_stride, r = I420ToARGB4444(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1136,7 +1137,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1136,7 +1137,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_24BG: case FOURCC_24BG:
I420ToRGB24(y, y_stride, r = I420ToRGB24(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1144,7 +1145,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1144,7 +1145,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_RAW: case FOURCC_RAW:
I420ToRAW(y, y_stride, r = I420ToRAW(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1152,7 +1153,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1152,7 +1153,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_ARGB: case FOURCC_ARGB:
I420ToARGB(y, y_stride, r = I420ToARGB(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1160,7 +1161,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1160,7 +1161,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_BGRA: case FOURCC_BGRA:
I420ToBGRA(y, y_stride, r = I420ToBGRA(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1168,7 +1169,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1168,7 +1169,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_ABGR: case FOURCC_ABGR:
I420ToABGR(y, y_stride, r = I420ToABGR(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1176,7 +1177,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1176,7 +1177,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_BGGR: case FOURCC_BGGR:
I420ToBayerBGGR(y, y_stride, r = I420ToBayerBGGR(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1184,7 +1185,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1184,7 +1185,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_GBRG: case FOURCC_GBRG:
I420ToBayerGBRG(y, y_stride, r = I420ToBayerGBRG(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1192,7 +1193,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1192,7 +1193,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_GRBG: case FOURCC_GRBG:
I420ToBayerGRBG(y, y_stride, r = I420ToBayerGRBG(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1200,7 +1201,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1200,7 +1201,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_RGGB: case FOURCC_RGGB:
I420ToBayerRGGB(y, y_stride, r = I420ToBayerRGGB(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1208,7 +1209,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1208,7 +1209,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_I400: case FOURCC_I400:
I400Copy(y, y_stride, r = I400Copy(y, y_stride,
dst_sample, dst_sample,
dst_sample_stride ? dst_sample_stride : width, dst_sample_stride ? dst_sample_stride : width,
width, height); width, height);
...@@ -1228,7 +1229,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1228,7 +1229,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_v = dst_sample + width * height; dst_v = dst_sample + width * height;
dst_u = dst_v + halfwidth * halfheight; dst_u = dst_v + halfwidth * halfheight;
} }
I420Copy(y, y_stride, r = I420Copy(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, width, dst_sample, width,
...@@ -1249,7 +1250,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1249,7 +1250,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_v = dst_sample + width * height; dst_v = dst_sample + width * height;
dst_u = dst_v + halfwidth * height; dst_u = dst_v + halfwidth * height;
} }
I420ToI422(y, y_stride, r = I420ToI422(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, width, dst_sample, width,
...@@ -1269,7 +1270,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1269,7 +1270,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_v = dst_sample + width * height; dst_v = dst_sample + width * height;
dst_u = dst_v + width * height; dst_u = dst_v + width * height;
} }
I420ToI444(y, y_stride, r = I420ToI444(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, width, dst_sample, width,
...@@ -1282,7 +1283,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1282,7 +1283,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
int quarterwidth = (width + 3) / 4; int quarterwidth = (width + 3) / 4;
uint8* dst_u = dst_sample + width * height; uint8* dst_u = dst_sample + width * height;
uint8* dst_v = dst_u + quarterwidth * height; uint8* dst_v = dst_u + quarterwidth * height;
I420ToI411(y, y_stride, r = I420ToI411(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, width, dst_sample, width,
...@@ -1296,7 +1297,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1296,7 +1297,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
default: default:
return -1; // unknown fourcc - return failure code. return -1; // unknown fourcc - return failure code.
} }
return 0; return r;
} }
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -40,10 +40,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, ...@@ -40,10 +40,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
movdqa xmm0, [eax] movdqa xmm0, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
pshufb xmm0, xmm5 pshufb xmm0, xmm5
sub ecx, 4
movd [edx], xmm0 movd [edx], xmm0
lea edx, [edx + 4] lea edx, [edx + 4]
sub ecx, 4 jg wloop
ja wloop
ret ret
} }
} }
...@@ -60,10 +60,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, ...@@ -60,10 +60,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n"
"sub $0x4,%2 \n"
"movd %%xmm0,(%1) \n" "movd %%xmm0,(%1) \n"
"lea 0x4(%1),%1 \n" "lea 0x4(%1),%1 \n"
"sub $0x4,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1 "+r"(dst_bayer), // %1
"+r"(pix) // %2 "+r"(pix) // %2
......
...@@ -685,7 +685,7 @@ static void SetRow8_NEON(uint8* dst, uint32 v32, int count) { ...@@ -685,7 +685,7 @@ static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
"1: \n" "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop "subs %1, %1, #16 \n" // 16 bytes per loop
"vst1.u32 {q0}, [%0]! \n" // store "vst1.u32 {q0}, [%0]! \n" // store
"bhi 1b \n" "bgt 1b \n"
: "+r"(dst), // %0 : "+r"(dst), // %0
"+r"(count) // %1 "+r"(count) // %1
: "r"(v32) // %2 : "r"(v32) // %2
...@@ -738,7 +738,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width, ...@@ -738,7 +738,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
rep stosd rep stosd
add edi, edx add edi, edx
sub ebx, 1 sub ebx, 1
ja convertloop jg convertloop
pop ebp pop ebp
pop edi pop edi
......
...@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ...@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
sub ecx, 8 sub ecx, 8
movq qword ptr [edx + esi], xmm7 movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi] lea edx, [edx + 2 * esi]
ja convertloop jg convertloop
pop ebp pop ebp
pop esi pop esi
...@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea edx, [edx + 2 * esi] lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0 movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp] lea ebx, [ebx + 2 * ebp]
ja convertloop jg convertloop
mov esp, [esp + 16] mov esp, [esp + 16]
pop ebp pop ebp
...@@ -366,7 +366,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ...@@ -366,7 +366,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n" "movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n" "lea (%1,%4,2),%1 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -493,7 +493,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -493,7 +493,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"lea (%edx,%esi,2),%edx \n" "lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n" "movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n" "lea (%ebx,%ebp,2),%ebx \n"
"ja 1b \n" "jg 1b \n"
"mov 0x10(%esp),%esp \n" "mov 0x10(%esp),%esp \n"
"pop %ebp \n" "pop %ebp \n"
"pop %edi \n" "pop %edi \n"
...@@ -629,7 +629,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, ...@@ -629,7 +629,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n" "movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n" "lea (%1,%4,2),%1 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -737,7 +737,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -737,7 +737,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"lea (%1,%5,2),%1 \n" "lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n" "movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n" "lea (%2,%6,2),%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst_a), // %1 "+r"(dst_a), // %1
"+r"(dst_b), // %2 "+r"(dst_b), // %2
...@@ -755,8 +755,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -755,8 +755,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
static void TransposeWx8_C(const uint8* src, int src_stride, static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride,
int w) { int width) {
for (int i = 0; i < w; ++i) { for (int i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride]; dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride]; dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride]; dst[2] = src[2 * src_stride];
...@@ -888,9 +888,9 @@ void RotatePlane180(const uint8* src, int src_stride, ...@@ -888,9 +888,9 @@ void RotatePlane180(const uint8* src, int src_stride,
static void TransposeUVWx8_C(const uint8* src, int src_stride, static void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int w) { int width) {
int i; int i;
for (i = 0; i < w; ++i) { for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0]; dst_a[0] = src[0 * src_stride + 0];
dst_b[0] = src[0 * src_stride + 1]; dst_b[0] = src[0 * src_stride + 1];
dst_a[1] = src[1 * src_stride + 0]; dst_a[1] = src[1 * src_stride + 0];
...@@ -916,10 +916,10 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride, ...@@ -916,10 +916,10 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride,
static void TransposeUVWxH_C(const uint8* src, int src_stride, static void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int w, int h) { int width, int height) {
int i, j; int i, j;
for (i = 0; i < w * 2; i += 2) for (i = 0; i < width * 2; i += 2)
for (j = 0; j < h; ++j) { for (j = 0; j < height; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
} }
......
...@@ -73,7 +73,7 @@ YUVTORGB ...@@ -73,7 +73,7 @@ YUVTORGB
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"vst4.u8 {d20, d21, d22, d23}, [%3]! \n" "vst4.u8 {d20, d21, d22, d23}, [%3]! \n"
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
...@@ -106,7 +106,7 @@ YUVTORGB ...@@ -106,7 +106,7 @@ YUVTORGB
"vmov.u8 d19, #255 \n" "vmov.u8 d19, #255 \n"
"vst4.u8 {d19, d20, d21, d22}, [%3]! \n" "vst4.u8 {d19, d20, d21, d22}, [%3]! \n"
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
...@@ -139,7 +139,7 @@ YUVTORGB ...@@ -139,7 +139,7 @@ YUVTORGB
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"vst4.u8 {d20, d21, d22, d23}, [%3]! \n" "vst4.u8 {d20, d21, d22, d23}, [%3]! \n"
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
...@@ -163,7 +163,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { ...@@ -163,7 +163,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store U "vst1.u8 {q0}, [%1]! \n" // store U
"vst1.u8 {q1}, [%2]! \n" // Store V "vst1.u8 {q1}, [%2]! \n" // Store V
"bhi 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -183,7 +183,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ...@@ -183,7 +183,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
"vldm %0!,{q0,q1,q2,q3} \n" // load 64 "vldm %0!,{q0,q1,q2,q3} \n" // load 64
"subs %2, %2, #64 \n" // 64 processed per loop "subs %2, %2, #64 \n" // 64 processed per loop
"vstm %1!,{q0,q1,q2,q3} \n" // store 64 "vstm %1!,{q0,q1,q2,q3} \n" // store 64
"bhi 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(count) // %2 // Output registers "+r"(count) // %2 // Output registers
......
...@@ -125,7 +125,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { ...@@ -125,7 +125,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"movdqa %%xmm1,0x10(%1) \n" "movdqa %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n" "lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -140,14 +140,15 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { ...@@ -140,14 +140,15 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"movdqa %3,%%xmm5 \n" "movdqa %3,%%xmm5 \n"
"sub %0,%1 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"ja 1b \n" "movdqa %%xmm0,(%0,%1,1) \n"
"lea 0x10(%0),%0 \n"
"jg 1b \n"
: "+r"(src_abgr), // %0 : "+r"(src_abgr), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -162,14 +163,14 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { ...@@ -162,14 +163,14 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"movdqa %3,%%xmm5 \n" "movdqa %3,%%xmm5 \n"
"sub %0,%1 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"ja 1b \n" "movdqa %%xmm0,(%0,%1,1) \n"
"lea 0x10(%0),%0 \n"
"jg 1b \n"
: "+r"(src_bgra), // %0 : "+r"(src_bgra), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -206,10 +207,10 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { ...@@ -206,10 +207,10 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"pshufb %%xmm4,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n"
"movdqa %%xmm1,0x10(%1) \n" "movdqa %%xmm1,0x10(%1) \n"
"por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm3,0x30(%1) \n" "movdqa %%xmm3,0x30(%1) \n"
"lea 0x40(%1),%1 \n" "lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -246,10 +247,10 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { ...@@ -246,10 +247,10 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
"pshufb %%xmm4,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n"
"movdqa %%xmm1,0x10(%1) \n" "movdqa %%xmm1,0x10(%1) \n"
"por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm3,0x30(%1) \n" "movdqa %%xmm3,0x30(%1) \n"
"lea 0x40(%1),%1 \n" "lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -298,7 +299,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { ...@@ -298,7 +299,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm2,0x10(%1,%0,2) \n" "movdqa %%xmm2,0x10(%1,%0,2) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -350,7 +351,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { ...@@ -350,7 +351,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm2,0x10(%1,%0,2) \n" "movdqa %%xmm2,0x10(%1,%0,2) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -389,7 +390,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { ...@@ -389,7 +390,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm1,0x10(%1,%0,2) \n" "movdqa %%xmm1,0x10(%1,%0,2) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -429,7 +430,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { ...@@ -429,7 +430,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm2,0x20(%1) \n" "movdqa %%xmm2,0x20(%1) \n"
"lea 0x30(%1),%1 \n" "lea 0x30(%1),%1 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -469,7 +470,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { ...@@ -469,7 +470,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm2,0x20(%1) \n" "movdqa %%xmm2,0x20(%1) \n"
"lea 0x30(%1),%1 \n" "lea 0x30(%1),%1 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -508,7 +509,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { ...@@ -508,7 +509,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
"movq %%xmm0,(%1) \n" "movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -551,7 +552,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { ...@@ -551,7 +552,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
"movq %%xmm0,(%1) \n" "movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -582,7 +583,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { ...@@ -582,7 +583,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
"movq %%xmm0,(%1) \n" "movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -614,10 +615,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -614,10 +615,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -650,10 +651,10 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -650,10 +651,10 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1) \n" "movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -718,11 +719,11 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -718,11 +719,11 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"psraw $0x8,%%xmm1 \n" "psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n" "packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0,(%1) \n" "movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n" "movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -786,11 +787,11 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -786,11 +787,11 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
"psraw $0x8,%%xmm1 \n" "psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n" "packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0,(%1) \n" "movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n" "movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -823,10 +824,10 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { ...@@ -823,10 +824,10 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_bgra), // %0 : "+r"(src_bgra), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -859,10 +860,10 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { ...@@ -859,10 +860,10 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1) \n" "movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_bgra), // %0 : "+r"(src_bgra), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -922,11 +923,11 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, ...@@ -922,11 +923,11 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
"psraw $0x8,%%xmm1 \n" "psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n" "packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0,(%1) \n" "movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n" "movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_bgra0), // %0 : "+r"(src_bgra0), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -990,11 +991,11 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, ...@@ -990,11 +991,11 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
"psraw $0x8,%%xmm1 \n" "psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n" "packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0,(%1) \n" "movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n" "movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_bgra0), // %0 : "+r"(src_bgra0), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -1027,10 +1028,10 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { ...@@ -1027,10 +1028,10 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_abgr), // %0 : "+r"(src_abgr), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1063,10 +1064,10 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { ...@@ -1063,10 +1064,10 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1) \n" "movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_abgr), // %0 : "+r"(src_abgr), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1126,11 +1127,11 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, ...@@ -1126,11 +1127,11 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
"psraw $0x8,%%xmm1 \n" "psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n" "packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0,(%1) \n" "movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n" "movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_abgr0), // %0 : "+r"(src_abgr0), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -1194,11 +1195,11 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, ...@@ -1194,11 +1195,11 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
"psraw $0x8,%%xmm1 \n" "psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n" "packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0,(%1) \n" "movlps %%xmm0,(%1) \n"
"movhps %%xmm0,(%1,%2,1) \n" "movhps %%xmm0,(%1,%2,1) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_abgr0), // %0 : "+r"(src_abgr0), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -1305,7 +1306,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1305,7 +1306,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm1,0x10(%3) \n" "movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n" "lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n" "sub $0x8,%4 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
...@@ -1340,7 +1341,7 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -1340,7 +1341,7 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,0x10(%3) \n" "movdqa %%xmm0,0x10(%3) \n"
"lea 0x20(%3),%3 \n" "lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n" "sub $0x8,%4 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
...@@ -1374,7 +1375,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -1374,7 +1375,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm1,0x10(%3) \n" "movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n" "lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n" "sub $0x8,%4 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
...@@ -1427,10 +1428,10 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1427,10 +1428,10 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
"punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm2 \n"
"punpcklwd %%xmm2,%%xmm0 \n" "punpcklwd %%xmm2,%%xmm0 \n"
"sub $0x4,%4 \n"
"movdqa %%xmm0,(%3) \n" "movdqa %%xmm0,(%3) \n"
"lea 0x10(%3),%3 \n" "lea 0x10(%3),%3 \n"
"sub $0x4,%4 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
...@@ -1479,7 +1480,7 @@ void YToARGBRow_SSE2(const uint8* y_buf, ...@@ -1479,7 +1480,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
"lea 32(%1),%1 \n" "lea 32(%1),%1 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(rgb_buf), // %1 "+r"(rgb_buf), // %1
"+rm"(width) // %2 "+rm"(width) // %2
...@@ -1509,7 +1510,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { ...@@ -1509,7 +1510,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(temp_width) // %2 "+r"(temp_width) // %2
...@@ -1539,7 +1540,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { ...@@ -1539,7 +1540,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"movdqu %%xmm0,(%1) \n" "movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(temp_width) // %2 "+r"(temp_width) // %2
...@@ -1572,7 +1573,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, ...@@ -1572,7 +1573,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
"movlpd %%xmm0,(%1) \n" "movlpd %%xmm0,(%1) \n"
"movhpd %%xmm0,(%1,%2) \n" "movhpd %%xmm0,(%1,%2) \n"
"lea 8(%1),%1 \n" "lea 8(%1),%1 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -1608,7 +1609,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { ...@@ -1608,7 +1609,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
"movdqa %%xmm2,(%1,%2) \n" "movdqa %%xmm2,(%1,%2) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%3 \n" "sub $0x10,%3 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -1633,7 +1634,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { ...@@ -1633,7 +1634,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
"movdqa %%xmm1,0x10(%0,%1) \n" "movdqa %%xmm1,0x10(%0,%1) \n"
"lea 0x20(%0),%0 \n" "lea 0x20(%0),%0 \n"
"sub $0x20,%2 \n" "sub $0x20,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(count) // %2 "+r"(count) // %2
...@@ -1676,7 +1677,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { ...@@ -1676,7 +1677,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1714,7 +1715,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, ...@@ -1714,7 +1715,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"movq %%xmm1,(%1,%2) \n" "movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n" "sub $0x10,%3 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_y), // %2 "+r"(dst_y), // %2
...@@ -1739,10 +1740,10 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, ...@@ -1739,10 +1740,10 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
"pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1) \n" "movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1782,7 +1783,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, ...@@ -1782,7 +1783,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
"movq %%xmm1,(%1,%2) \n" "movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n" "sub $0x10,%3 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_y), // %2 "+r"(dst_y), // %2
...@@ -1804,10 +1805,10 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { ...@@ -1804,10 +1805,10 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
"psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1845,7 +1846,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -1845,7 +1846,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
"movq %%xmm1,(%1,%2) \n" "movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n" "sub $0x10,%3 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_y), // %2 "+r"(dst_y), // %2
...@@ -1868,10 +1869,10 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, ...@@ -1868,10 +1869,10 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
"psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1) \n" "movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -1909,7 +1910,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -1909,7 +1910,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
"movq %%xmm1,(%1,%2) \n" "movq %%xmm1,(%1,%2) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n" "sub $0x10,%3 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_y), // %2 "+r"(dst_y), // %2
......
...@@ -122,7 +122,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { ...@@ -122,7 +122,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
movdqa [edx + 16], xmm1 movdqa [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8 sub ecx, 8
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -134,16 +134,16 @@ __asm { ...@@ -134,16 +134,16 @@ __asm {
mov edx, [esp + 8] // dst_argb mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
movdqa xmm5, kShuffleMaskABGRToARGB movdqa xmm5, kShuffleMaskABGRToARGB
sub edx, eax
align 16 align 16
convertloop: convertloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
lea eax, [eax + 16]
pshufb xmm0, xmm5 pshufb xmm0, xmm5
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4 sub ecx, 4
ja convertloop movdqa [eax + edx], xmm0
lea eax, [eax + 16]
jg convertloop
ret ret
} }
} }
...@@ -155,16 +155,16 @@ __asm { ...@@ -155,16 +155,16 @@ __asm {
mov edx, [esp + 8] // dst_argb mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
movdqa xmm5, kShuffleMaskBGRAToARGB movdqa xmm5, kShuffleMaskBGRAToARGB
sub edx, eax
align 16 align 16
convertloop: convertloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
lea eax, [eax + 16]
pshufb xmm0, xmm5 pshufb xmm0, xmm5
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4 sub ecx, 4
ja convertloop movdqa [eax + edx], xmm0
lea eax, [eax + 16]
jg convertloop
ret ret
} }
} }
...@@ -200,10 +200,10 @@ __asm { ...@@ -200,10 +200,10 @@ __asm {
pshufb xmm3, xmm4 pshufb xmm3, xmm4
movdqa [edx + 16], xmm1 movdqa [edx + 16], xmm1
por xmm3, xmm5 por xmm3, xmm5
sub ecx, 16
movdqa [edx + 48], xmm3 movdqa [edx + 48], xmm3
lea edx, [edx + 64] lea edx, [edx + 64]
sub ecx, 16 jg convertloop
ja convertloop
ret ret
} }
} }
...@@ -240,10 +240,10 @@ __asm { ...@@ -240,10 +240,10 @@ __asm {
pshufb xmm3, xmm4 pshufb xmm3, xmm4
movdqa [edx + 16], xmm1 movdqa [edx + 16], xmm1
por xmm3, xmm5 por xmm3, xmm5
sub ecx, 16
movdqa [edx + 48], xmm3 movdqa [edx + 48], xmm3
lea edx, [edx + 64] lea edx, [edx + 64]
sub ecx, 16 jg convertloop
ja convertloop
ret ret
} }
} }
...@@ -300,7 +300,7 @@ __asm { ...@@ -300,7 +300,7 @@ __asm {
movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
lea eax, [eax + 16] lea eax, [eax + 16]
sub ecx, 8 sub ecx, 8
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -354,7 +354,7 @@ __asm { ...@@ -354,7 +354,7 @@ __asm {
movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
lea eax, [eax + 16] lea eax, [eax + 16]
sub ecx, 8 sub ecx, 8
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -394,7 +394,7 @@ __asm { ...@@ -394,7 +394,7 @@ __asm {
movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
lea eax, [eax + 16] lea eax, [eax + 16]
sub ecx, 8 sub ecx, 8
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -433,7 +433,7 @@ __asm { ...@@ -433,7 +433,7 @@ __asm {
movdqa [edx + 32], xmm2 // store 2 movdqa [edx + 32], xmm2 // store 2
lea edx, [edx + 48] lea edx, [edx + 48]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -472,7 +472,7 @@ __asm { ...@@ -472,7 +472,7 @@ __asm {
movdqa [edx + 32], xmm2 // store 2 movdqa [edx + 32], xmm2 // store 2
lea edx, [edx + 48] lea edx, [edx + 48]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -510,7 +510,7 @@ __asm { ...@@ -510,7 +510,7 @@ __asm {
movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 4 sub ecx, 4
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -553,7 +553,7 @@ __asm { ...@@ -553,7 +553,7 @@ __asm {
movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 4 sub ecx, 4
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -583,7 +583,7 @@ __asm { ...@@ -583,7 +583,7 @@ __asm {
movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 4 sub ecx, 4
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -618,7 +618,7 @@ __asm { ...@@ -618,7 +618,7 @@ __asm {
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -652,7 +652,7 @@ __asm { ...@@ -652,7 +652,7 @@ __asm {
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -686,7 +686,7 @@ __asm { ...@@ -686,7 +686,7 @@ __asm {
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -720,7 +720,7 @@ __asm { ...@@ -720,7 +720,7 @@ __asm {
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -754,7 +754,7 @@ __asm { ...@@ -754,7 +754,7 @@ __asm {
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -785,10 +785,10 @@ __asm { ...@@ -785,10 +785,10 @@ __asm {
psrlw xmm2, 7 psrlw xmm2, 7
packuswb xmm0, xmm2 packuswb xmm0, xmm2
paddb xmm0, xmm5 paddb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 jg convertloop
ja convertloop
ret ret
} }
} }
...@@ -847,11 +847,12 @@ __asm { ...@@ -847,11 +847,12 @@ __asm {
paddb xmm0, xmm5 // -> unsigned paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values // step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16 jg convertloop
ja convertloop
pop edi pop edi
pop esi pop esi
ret ret
...@@ -916,11 +917,12 @@ __asm { ...@@ -916,11 +917,12 @@ __asm {
paddb xmm0, xmm5 // -> unsigned paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values // step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16 jg convertloop
ja convertloop
pop edi pop edi
pop esi pop esi
ret ret
...@@ -981,11 +983,12 @@ __asm { ...@@ -981,11 +983,12 @@ __asm {
paddb xmm0, xmm5 // -> unsigned paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values // step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16 jg convertloop
ja convertloop
pop edi pop edi
pop esi pop esi
ret ret
...@@ -1050,11 +1053,12 @@ __asm { ...@@ -1050,11 +1053,12 @@ __asm {
paddb xmm0, xmm5 // -> unsigned paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values // step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16 jg convertloop
ja convertloop
pop edi pop edi
pop esi pop esi
ret ret
...@@ -1115,11 +1119,12 @@ __asm { ...@@ -1115,11 +1119,12 @@ __asm {
paddb xmm0, xmm5 // -> unsigned paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values // step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16 jg convertloop
ja convertloop
pop edi pop edi
pop esi pop esi
ret ret
...@@ -1184,11 +1189,12 @@ __asm { ...@@ -1184,11 +1189,12 @@ __asm {
paddb xmm0, xmm5 // -> unsigned paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values // step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16 jg convertloop
ja convertloop
pop edi pop edi
pop esi pop esi
ret ret
...@@ -1293,9 +1299,8 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1293,9 +1299,8 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf,
movdqa [edx], xmm0 movdqa [edx], xmm0
movdqa [edx + 16], xmm1 movdqa [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8 sub ecx, 8
ja convertloop jg convertloop
pop edi pop edi
pop esi pop esi
...@@ -1334,9 +1339,8 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -1334,9 +1339,8 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf,
movdqa [edx], xmm5 movdqa [edx], xmm5
movdqa [edx + 16], xmm0 movdqa [edx + 16], xmm0
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8 sub ecx, 8
ja convertloop jg convertloop
pop edi pop edi
pop esi pop esi
...@@ -1375,9 +1379,8 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -1375,9 +1379,8 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
movdqa [edx], xmm2 movdqa [edx], xmm2
movdqa [edx + 16], xmm1 movdqa [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8 sub ecx, 8
ja convertloop jg convertloop
pop edi pop edi
pop esi pop esi
...@@ -1441,9 +1444,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1441,9 +1444,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
punpcklwd xmm0, xmm2 // BGRA 4 pixels punpcklwd xmm0, xmm2 // BGRA 4 pixels
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4 sub ecx, 4
ja convertloop jg convertloop
pop edi pop edi
pop esi pop esi
...@@ -1490,9 +1492,8 @@ void YToARGBRow_SSE2(const uint8* y_buf, ...@@ -1490,9 +1492,8 @@ void YToARGBRow_SSE2(const uint8* y_buf,
movdqa [edx], xmm0 movdqa [edx], xmm0
movdqa [edx + 16], xmm1 movdqa [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8 sub ecx, 8
ja convertloop jg convertloop
ret ret
} }
...@@ -1523,7 +1524,7 @@ __asm { ...@@ -1523,7 +1524,7 @@ __asm {
sub ecx, 16 sub ecx, 16
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -1553,7 +1554,7 @@ __asm { ...@@ -1553,7 +1554,7 @@ __asm {
sub ecx, 16 sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -1587,7 +1588,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, ...@@ -1587,7 +1588,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
movlpd qword ptr [edx], xmm0 movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0 movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
ja convertloop jg convertloop
pop edi pop edi
ret ret
...@@ -1625,7 +1626,8 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { ...@@ -1625,7 +1626,8 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
movdqa [edx + edi], xmm2 movdqa [edx + edi], xmm2
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
pop edi pop edi
ret ret
} }
...@@ -1650,7 +1652,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { ...@@ -1650,7 +1652,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
movdqa [eax + edx + 16], xmm1 movdqa [eax + edx + 16], xmm1
lea eax, [eax + 32] lea eax, [eax + 32]
sub ecx, 32 sub ecx, 32
ja convertloop jg convertloop
ret ret
} }
} }
...@@ -1693,10 +1695,10 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, ...@@ -1693,10 +1695,10 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
pand xmm0, xmm5 // even bytes are Y pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5 pand xmm1, xmm5
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 jg convertloop
ja convertloop
ret ret
} }
} }
...@@ -1737,7 +1739,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, ...@@ -1737,7 +1739,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
movq qword ptr [edx + edi], xmm1 movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
pop edi pop edi
pop esi pop esi
...@@ -1763,10 +1765,10 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, ...@@ -1763,10 +1765,10 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
pand xmm0, xmm5 // even bytes are Y pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5 pand xmm1, xmm5
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 jg convertloop
ja convertloop
ret ret
} }
} }
...@@ -1807,7 +1809,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, ...@@ -1807,7 +1809,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
movq qword ptr [edx + edi], xmm1 movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
pop edi pop edi
pop esi pop esi
...@@ -1831,10 +1833,10 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, ...@@ -1831,10 +1833,10 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
psrlw xmm0, 8 // odd bytes are Y psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 jg convertloop
ja convertloop
ret ret
} }
} }
...@@ -1875,7 +1877,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -1875,7 +1877,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
movq qword ptr [edx + edi], xmm1 movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
pop edi pop edi
pop esi pop esi
...@@ -1899,10 +1901,10 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, ...@@ -1899,10 +1901,10 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
psrlw xmm0, 8 // odd bytes are Y psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 jg convertloop
ja convertloop
ret ret
} }
} }
...@@ -1943,7 +1945,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -1943,7 +1945,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
movq qword ptr [edx + edi], xmm1 movq qword ptr [edx + edi], xmm1
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
pop edi pop edi
pop esi pop esi
......
...@@ -64,7 +64,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, ...@@ -64,7 +64,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
"vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1 "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1
"vst1.u8 {q0}, [%1]! \n" // store even pixels "vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop "subs %2, %2, #16 \n" // 16 processed per loop
"bhi 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -88,7 +88,7 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, ...@@ -88,7 +88,7 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
"vrshrn.u16 d1, q1, #2 \n" "vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n" "vst1.u8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
"bhi 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(src_stride), // %1 "+r"(src_stride), // %1
"+r"(dst), // %2 "+r"(dst), // %2
...@@ -109,7 +109,7 @@ static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */, ...@@ -109,7 +109,7 @@ static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
"vst1.u32 {d0[1]}, [%1]! \n" "vst1.u32 {d0[1]}, [%1]! \n"
"subs %2, #4 \n" "subs %2, #4 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -143,7 +143,7 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, ...@@ -143,7 +143,7 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
"vst1.u32 {d0[0]}, [%1]! \n" "vst1.u32 {d0[0]}, [%1]! \n"
"subs %2, #4 \n" "subs %2, #4 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
...@@ -165,7 +165,7 @@ static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */, ...@@ -165,7 +165,7 @@ static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */,
"vmov d2, d3 \n" // order needs to be d0, d1, d2 "vmov d2, d3 \n" // order needs to be d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n" "vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n" "subs %2, #24 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -219,7 +219,7 @@ static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride, ...@@ -219,7 +219,7 @@ static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
"vst3.u8 {d0, d1, d2}, [%1]! \n" "vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n" "subs %2, #24 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -258,7 +258,7 @@ static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride, ...@@ -258,7 +258,7 @@ static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
"vst3.u8 {d0, d1, d2}, [%1]! \n" "vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n" "subs %2, #24 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -292,7 +292,7 @@ static void ScaleRowDown38_NEON(const uint8* src_ptr, int, ...@@ -292,7 +292,7 @@ static void ScaleRowDown38_NEON(const uint8* src_ptr, int,
"vst1.u8 {d4}, [%1]! \n" "vst1.u8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n" "vst1.u32 {d5[0]}, [%1]! \n"
"subs %2, #12 \n" "subs %2, #12 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -397,7 +397,7 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride, ...@@ -397,7 +397,7 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
"vst1.u8 {d3}, [%1]! \n" "vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n" "vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n" "subs %2, #12 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -492,7 +492,7 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride, ...@@ -492,7 +492,7 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
"vst1.u8 {d3}, [%1]! \n" "vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n" "vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n" "subs %2, #12 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -529,14 +529,14 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -529,14 +529,14 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr,
"vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n" "vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.u8 {q0}, [%0]! \n"
"bhi 1b \n" "bgt 1b \n"
"b 4f \n" "b 4f \n"
"2: \n" "2: \n"
"vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n" "subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.u8 {q0}, [%0]! \n"
"bhi 2b \n" "bgt 2b \n"
"b 4f \n" "b 4f \n"
"3: \n" "3: \n"
...@@ -545,7 +545,7 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -545,7 +545,7 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr,
"subs %3, #16 \n" "subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n" "vst1.u8 {q0}, [%0]! \n"
"bhi 3b \n" "bgt 3b \n"
"4: \n" "4: \n"
"vst1.u8 {d1[7]}, [%0] \n" "vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
...@@ -697,7 +697,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, ...@@ -697,7 +697,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
sub ecx, 16 sub ecx, 16
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
ja wloop jg wloop
ret ret
} }
...@@ -739,7 +739,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, ...@@ -739,7 +739,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
sub ecx, 16 sub ecx, 16
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
ja wloop jg wloop
pop esi pop esi
ret ret
...@@ -772,7 +772,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, ...@@ -772,7 +772,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
sub ecx, 8 sub ecx, 8
movq qword ptr [edx], xmm0 movq qword ptr [edx], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
ja wloop jg wloop
ret ret
} }
...@@ -831,7 +831,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, ...@@ -831,7 +831,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
sub ecx, 8 sub ecx, 8
movq qword ptr [edx], xmm0 movq qword ptr [edx], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
ja wloop jg wloop
pop edi pop edi
pop esi pop esi
...@@ -866,7 +866,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, ...@@ -866,7 +866,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
sub ecx, 4 sub ecx, 4
movd dword ptr [edx], xmm0 movd dword ptr [edx], xmm0
lea edx, [edx + 4] lea edx, [edx + 4]
ja wloop jg wloop
ret ret
} }
...@@ -936,7 +936,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, ...@@ -936,7 +936,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
sub ecx, 4 sub ecx, 4
movd dword ptr [edx], xmm0 movd dword ptr [edx], xmm0
lea edx, [edx + 4] lea edx, [edx + 4]
ja wloop jg wloop
pop ebp pop ebp
pop edi pop edi
...@@ -979,7 +979,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -979,7 +979,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
movq qword ptr [edx + 16], xmm2 movq qword ptr [edx + 16], xmm2
lea edx, [edx + 24] lea edx, [edx + 24]
sub ecx, 24 sub ecx, 24
ja wloop jg wloop
ret ret
} }
...@@ -1050,7 +1050,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1050,7 +1050,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
sub ecx, 24 sub ecx, 24
movq qword ptr [edx + 16], xmm0 movq qword ptr [edx + 16], xmm0
lea edx, [edx + 24] lea edx, [edx + 24]
ja wloop jg wloop
pop esi pop esi
ret ret
...@@ -1111,7 +1111,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1111,7 +1111,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
sub ecx, 24 sub ecx, 24
movq qword ptr [edx + 16], xmm0 movq qword ptr [edx + 16], xmm0
lea edx, [edx+24] lea edx, [edx+24]
ja wloop jg wloop
pop esi pop esi
ret ret
...@@ -1147,7 +1147,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1147,7 +1147,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
movhlps xmm1, xmm0 movhlps xmm1, xmm0
movd [edx + 8], xmm1 movd [edx + 8], xmm1
lea edx, [edx + 12] lea edx, [edx + 12]
ja xloop jg xloop
ret ret
} }
...@@ -1212,7 +1212,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1212,7 +1212,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
pextrw ebx, xmm2, 2 pextrw ebx, xmm2, 2
mov [edx + 4], bx mov [edx + 4], bx
lea edx, [edx + 6] lea edx, [edx + 6]
ja xloop jg xloop
pop ebx pop ebx
pop esi pop esi
...@@ -1258,7 +1258,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1258,7 +1258,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
pextrw ebx, xmm0, 2 pextrw ebx, xmm0, 2
mov [edx + 4], bx mov [edx + 4], bx
lea edx, [edx + 6] lea edx, [edx + 6]
ja xloop jg xloop
pop ebx pop ebx
pop esi pop esi
...@@ -1310,14 +1310,14 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, ...@@ -1310,14 +1310,14 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
paddusw xmm0, xmm2 // sum 16 words paddusw xmm0, xmm2 // sum 16 words
paddusw xmm1, xmm3 paddusw xmm1, xmm3
sub ebp, 1 sub ebp, 1
ja yloop jg yloop
ydone: ydone:
movdqa [edi], xmm0 movdqa [edi], xmm0
movdqa [edi + 16], xmm1 movdqa [edi + 16], xmm1
lea edi, [edi + 32] lea edi, [edi + 32]
sub ecx, 16 sub ecx, 16
ja xloop jg xloop
pop ebp pop ebp
pop ebx pop ebx
...@@ -1379,7 +1379,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1379,7 +1379,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
sub ecx, 16 sub ecx, 16
movdqa [esi + edi], xmm0 movdqa [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
ja xloop jg xloop
mov al, [esi + edi - 1] mov al, [esi + edi - 1]
mov [esi + edi], al mov [esi + edi], al
...@@ -1393,7 +1393,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1393,7 +1393,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
sub ecx, 16 sub ecx, 16
movdqa [esi + edi], xmm0 movdqa [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
ja xloop1 jg xloop1
mov al, [esi + edi - 1] mov al, [esi + edi - 1]
mov [esi + edi], al mov [esi + edi], al
...@@ -1408,7 +1408,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1408,7 +1408,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
sub ecx, 16 sub ecx, 16
movdqa [esi + edi], xmm0 movdqa [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
ja xloop2 jg xloop2
mov al, [esi + edi - 1] mov al, [esi + edi - 1]
mov [esi + edi], al mov [esi + edi], al
...@@ -1460,7 +1460,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1460,7 +1460,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
sub ecx, 16 sub ecx, 16
movdqa [esi + edi], xmm0 movdqa [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
ja xloop jg xloop
mov al, [esi + edi - 1] mov al, [esi + edi - 1]
mov [esi + edi], al mov [esi + edi], al
...@@ -1474,7 +1474,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1474,7 +1474,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
sub ecx, 16 sub ecx, 16
movdqa [esi + edi], xmm0 movdqa [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
ja xloop1 jg xloop1
mov al, [esi + edi - 1] mov al, [esi + edi - 1]
mov [esi + edi], al mov [esi + edi], al
...@@ -1489,7 +1489,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1489,7 +1489,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
sub ecx, 16 sub ecx, 16
movdqa [esi + edi], xmm0 movdqa [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
ja xloop2 jg xloop2
mov al, [esi + edi - 1] mov al, [esi + edi - 1]
mov [esi + edi], al mov [esi + edi], al
...@@ -1542,7 +1542,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1542,7 +1542,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
sub ecx, 24 sub ecx, 24
movq qword ptr [edx+16], xmm0 movq qword ptr [edx+16], xmm0
lea edx, [edx+24] lea edx, [edx+24]
ja wloop jg wloop
ret ret
} }
} }
...@@ -1568,7 +1568,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, ...@@ -1568,7 +1568,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -1602,7 +1602,7 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, ...@@ -1602,7 +1602,7 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -1628,7 +1628,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, ...@@ -1628,7 +1628,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
"movq %%xmm0,(%1) \n" "movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -1677,7 +1677,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, ...@@ -1677,7 +1677,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
"movq %%xmm0,(%1) \n" "movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n" "lea 0x8(%1),%1 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
...@@ -1708,7 +1708,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, ...@@ -1708,7 +1708,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
"movd %%xmm0,(%1) \n" "movd %%xmm0,(%1) \n"
"lea 0x4(%1),%1 \n" "lea 0x4(%1),%1 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -1744,14 +1744,14 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, ...@@ -1744,14 +1744,14 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
"paddusw %%xmm2,%%xmm0 \n" "paddusw %%xmm2,%%xmm0 \n"
"paddusw %%xmm3,%%xmm1 \n" "paddusw %%xmm3,%%xmm1 \n"
"sub $0x1,%2 \n" "sub $0x1,%2 \n"
"ja 2b \n" "jg 2b \n"
"3: \n" "3: \n"
"movdqa %%xmm0,(%1) \n" "movdqa %%xmm0,(%1) \n"
"movdqa %%xmm1,0x10(%1) \n" "movdqa %%xmm1,0x10(%1) \n"
"lea 0x10(%3),%0 \n" "lea 0x10(%3),%0 \n"
"lea 0x20(%1),%1 \n" "lea 0x20(%1),%1 \n"
"sub $0x10,%4 \n" "sub $0x10,%4 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(tmp_height), // %2 "+r"(tmp_height), // %2
...@@ -1823,7 +1823,7 @@ extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, ...@@ -1823,7 +1823,7 @@ extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
"sub $0x4,%ecx \n" "sub $0x4,%ecx \n"
"movd %xmm0,(%edi) \n" "movd %xmm0,(%edi) \n"
"lea 0x4(%edi),%edi \n" "lea 0x4(%edi),%edi \n"
"ja 1b \n" "jg 1b \n"
"popa \n" "popa \n"
"ret \n" "ret \n"
); );
...@@ -1857,7 +1857,7 @@ extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1857,7 +1857,7 @@ extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
"movq %xmm2,0x10(%edi) \n" "movq %xmm2,0x10(%edi) \n"
"lea 0x18(%edi),%edi \n" "lea 0x18(%edi),%edi \n"
"sub $0x18,%ecx \n" "sub $0x18,%ecx \n"
"ja 1b \n" "jg 1b \n"
"popa \n" "popa \n"
"ret \n" "ret \n"
); );
...@@ -1910,7 +1910,7 @@ extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1910,7 +1910,7 @@ extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
"sub $0x18,%ecx \n" "sub $0x18,%ecx \n"
"movq %xmm0,0x10(%edi) \n" "movq %xmm0,0x10(%edi) \n"
"lea 0x18(%edi),%edi \n" "lea 0x18(%edi),%edi \n"
"ja 1b \n" "jg 1b \n"
"popa \n" "popa \n"
"ret \n" "ret \n"
...@@ -1967,7 +1967,7 @@ extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1967,7 +1967,7 @@ extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
"sub $0x18,%ecx \n" "sub $0x18,%ecx \n"
"movq %xmm0,0x10(%edi) \n" "movq %xmm0,0x10(%edi) \n"
"lea 0x18(%edi),%edi \n" "lea 0x18(%edi),%edi \n"
"ja 1b \n" "jg 1b \n"
"popa \n" "popa \n"
"ret \n" "ret \n"
); );
...@@ -1997,7 +1997,7 @@ extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1997,7 +1997,7 @@ extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
"sub $0xc,%ecx \n" "sub $0xc,%ecx \n"
"movd %xmm1,0x8(%edi) \n" "movd %xmm1,0x8(%edi) \n"
"lea 0xc(%edi),%edi \n" "lea 0xc(%edi),%edi \n"
"ja 1b \n" "jg 1b \n"
"popa \n" "popa \n"
"ret \n" "ret \n"
); );
...@@ -2054,7 +2054,7 @@ extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -2054,7 +2054,7 @@ extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
"mov %ax,0x4(%edi) \n" "mov %ax,0x4(%edi) \n"
"lea 0x6(%edi),%edi \n" "lea 0x6(%edi),%edi \n"
"sub $0x6,%ecx \n" "sub $0x6,%ecx \n"
"ja 1b \n" "jg 1b \n"
"popa \n" "popa \n"
"ret \n" "ret \n"
); );
...@@ -2091,7 +2091,7 @@ extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -2091,7 +2091,7 @@ extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
"mov %ax,0x4(%edi) \n" "mov %ax,0x4(%edi) \n"
"lea 0x6(%edi),%edi \n" "lea 0x6(%edi),%edi \n"
"sub $0x6,%ecx \n" "sub $0x6,%ecx \n"
"ja 1b \n" "jg 1b \n"
"popa \n" "popa \n"
"ret \n" "ret \n"
); );
...@@ -2147,7 +2147,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, ...@@ -2147,7 +2147,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
"sub $0x10,%ecx \n" "sub $0x10,%ecx \n"
"movdqa %xmm0,(%esi,%edi,1) \n" "movdqa %xmm0,(%esi,%edi,1) \n"
"lea 0x10(%esi),%esi \n" "lea 0x10(%esi),%esi \n"
"ja 1b \n" "jg 1b \n"
"mov -0x1(%esi,%edi,1),%al \n" "mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n" "mov %al,(%esi,%edi,1) \n"
...@@ -2160,7 +2160,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, ...@@ -2160,7 +2160,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
"sub $0x10,%ecx \n" "sub $0x10,%ecx \n"
"movdqa %xmm0,(%esi,%edi,1) \n" "movdqa %xmm0,(%esi,%edi,1) \n"
"lea 0x10(%esi),%esi \n" "lea 0x10(%esi),%esi \n"
"ja 2b \n" "jg 2b \n"
"mov -0x1(%esi,%edi,1),%al \n" "mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n" "mov %al,(%esi,%edi,1) \n"
...@@ -2174,7 +2174,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, ...@@ -2174,7 +2174,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
"sub $0x10,%ecx \n" "sub $0x10,%ecx \n"
"movdqa %xmm0,(%esi,%edi,1) \n" "movdqa %xmm0,(%esi,%edi,1) \n"
"lea 0x10(%esi),%esi \n" "lea 0x10(%esi),%esi \n"
"ja 3b \n" "jg 3b \n"
"mov -0x1(%esi,%edi,1),%al \n" "mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n" "mov %al,(%esi,%edi,1) \n"
...@@ -2224,7 +2224,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, ...@@ -2224,7 +2224,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"sub $0x10,%ecx \n" "sub $0x10,%ecx \n"
"movdqa %xmm0,(%esi,%edi,1) \n" "movdqa %xmm0,(%esi,%edi,1) \n"
"lea 0x10(%esi),%esi \n" "lea 0x10(%esi),%esi \n"
"ja 1b \n" "jg 1b \n"
"mov -0x1(%esi,%edi,1),%al \n" "mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n" "mov %al,(%esi,%edi,1) \n"
...@@ -2237,7 +2237,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, ...@@ -2237,7 +2237,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"sub $0x10,%ecx \n" "sub $0x10,%ecx \n"
"movdqa %xmm0,(%esi,%edi,1) \n" "movdqa %xmm0,(%esi,%edi,1) \n"
"lea 0x10(%esi),%esi \n" "lea 0x10(%esi),%esi \n"
"ja 2b \n" "jg 2b \n"
"mov -0x1(%esi,%edi,1),%al \n" "mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n" "mov %al,(%esi,%edi,1) \n"
...@@ -2251,7 +2251,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, ...@@ -2251,7 +2251,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"sub $0x10,%ecx \n" "sub $0x10,%ecx \n"
"movdqa %xmm0,(%esi,%edi,1) \n" "movdqa %xmm0,(%esi,%edi,1) \n"
"lea 0x10(%esi),%esi \n" "lea 0x10(%esi),%esi \n"
"ja 3b \n" "jg 3b \n"
"mov -0x1(%esi,%edi,1),%al \n" "mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n" "mov %al,(%esi,%edi,1) \n"
...@@ -2310,7 +2310,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, ...@@ -2310,7 +2310,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
"movd %%xmm0,(%1) \n" "movd %%xmm0,(%1) \n"
"lea 0x4(%1),%1 \n" "lea 0x4(%1),%1 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -2340,7 +2340,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -2340,7 +2340,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
"movq %%xmm2,0x10(%1) \n" "movq %%xmm2,0x10(%1) \n"
"lea 0x18(%1),%1 \n" "lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n" "sub $0x18,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -2392,7 +2392,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -2392,7 +2392,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
"movq %%xmm0,0x10(%1) \n" "movq %%xmm0,0x10(%1) \n"
"lea 0x18(%1),%1 \n" "lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n" "sub $0x18,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -2452,7 +2452,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -2452,7 +2452,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
"movq %%xmm0,0x10(%1) \n" "movq %%xmm0,0x10(%1) \n"
"lea 0x18(%1),%1 \n" "lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n" "sub $0x18,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -2486,7 +2486,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -2486,7 +2486,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
"movd %%xmm1,0x8(%1) \n" "movd %%xmm1,0x8(%1) \n"
"lea 0xc(%1),%1 \n" "lea 0xc(%1),%1 \n"
"sub $0xc,%2 \n" "sub $0xc,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -2541,7 +2541,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -2541,7 +2541,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
"mov %%ax,0x4(%1) \n" "mov %%ax,0x4(%1) \n"
"lea 0x6(%1),%1 \n" "lea 0x6(%1),%1 \n"
"sub $0x6,%2 \n" "sub $0x6,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -2578,7 +2578,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -2578,7 +2578,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
"mov %%ax,0x4(%1) \n" "mov %%ax,0x4(%1) \n"
"lea 0x6(%1),%1 \n" "lea 0x6(%1),%1 \n"
"sub $0x6,%2 \n" "sub $0x6,%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
...@@ -2604,7 +2604,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, ...@@ -2604,7 +2604,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
"movdqa %%xmm0,(%0) \n" "movdqa %%xmm0,(%0) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"ja 1b \n" "jg 1b \n"
"mov -0x1(%0),%%al \n" "mov -0x1(%0),%%al \n"
"mov %%al,(%0) \n" "mov %%al,(%0) \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
...@@ -2624,7 +2624,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, ...@@ -2624,7 +2624,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
"movdqa %%xmm0,(%0) \n" "movdqa %%xmm0,(%0) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"ja 1b \n" "jg 1b \n"
"mov -0x1(%0),%%al \n" "mov -0x1(%0),%%al \n"
"mov %%al,(%0) \n" "mov %%al,(%0) \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
...@@ -2668,7 +2668,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, ...@@ -2668,7 +2668,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
"movdqa %%xmm0,(%0) \n" "movdqa %%xmm0,(%0) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"ja 1b \n" "jg 1b \n"
"mov -0x1(%0),%%al \n" "mov -0x1(%0),%%al \n"
"mov %%al,(%0) \n" "mov %%al,(%0) \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
...@@ -2695,7 +2695,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, ...@@ -2695,7 +2695,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"movdqa %%xmm0,(%0) \n" "movdqa %%xmm0,(%0) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"ja 1b \n" "jg 1b \n"
"mov -0x1(%0),%%al \n" "mov -0x1(%0),%%al \n"
"mov %%al,(%0) \n" "mov %%al,(%0) \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
...@@ -2715,7 +2715,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, ...@@ -2715,7 +2715,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"movdqa %%xmm0,(%0) \n" "movdqa %%xmm0,(%0) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"ja 1b \n" "jg 1b \n"
"mov -0x1(%0),%%al \n" "mov -0x1(%0),%%al \n"
"mov %%al,(%0) \n" "mov %%al,(%0) \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
...@@ -2750,7 +2750,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, ...@@ -2750,7 +2750,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"movdqa %%xmm0,(%0) \n" "movdqa %%xmm0,(%0) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"ja 1b \n" "jg 1b \n"
"mov -0x1(%0),%%al \n" "mov -0x1(%0),%%al \n"
"mov %%al,(%0) \n" "mov %%al,(%0) \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment