Commit 18184fd1 authored by fbarchard@google.com

Switch loop branches from ja to jg (and from bhi to bgt on NEON) to allow a count that is not a multiple of the loop step (e.g. 16) to underflow to a negative value.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/453001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@214 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 1ff03571
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 213 Version: 214
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 213 #define LIBYUV_VERSION 214
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -58,7 +58,7 @@ static uint32 SumSquareError_NEON(const uint8* src_a, ...@@ -58,7 +58,7 @@ static uint32 SumSquareError_NEON(const uint8* src_a,
"vmlal.s16 q8, d5, d5 \n" "vmlal.s16 q8, d5, d5 \n"
"vmlal.s16 q10, d7, d7 \n" "vmlal.s16 q10, d7, d7 \n"
"subs %2, %2, #16 \n" "subs %2, %2, #16 \n"
"bhi 1b \n" "bgt 1b \n"
"vadd.u32 q7, q7, q8 \n" "vadd.u32 q7, q7, q8 \n"
"vadd.u32 q9, q9, q10 \n" "vadd.u32 q9, q9, q10 \n"
...@@ -94,6 +94,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, ...@@ -94,6 +94,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
movdqa xmm1, [eax] movdqa xmm1, [eax]
movdqa xmm2, [eax + edx] movdqa xmm2, [eax + edx]
lea eax, [eax + 16] lea eax, [eax + 16]
sub ecx, 16
movdqa xmm3, xmm1 movdqa xmm3, xmm1
psubusb xmm1, xmm2 psubusb xmm1, xmm2
psubusb xmm2, xmm3 psubusb xmm2, xmm3
...@@ -105,8 +106,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, ...@@ -105,8 +106,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
pmaddwd xmm2, xmm2 pmaddwd xmm2, xmm2
paddd xmm0, xmm1 paddd xmm0, xmm1
paddd xmm0, xmm2 paddd xmm0, xmm2
sub ecx, 16 jg wloop
ja wloop
pshufd xmm1, xmm0, 0EEh pshufd xmm1, xmm0, 0EEh
paddd xmm0, xmm1 paddd xmm0, xmm1
...@@ -131,6 +131,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, ...@@ -131,6 +131,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
"movdqa (%0),%%xmm1 \n" "movdqa (%0),%%xmm1 \n"
"movdqa (%0,%1,1),%%xmm2 \n" "movdqa (%0,%1,1),%%xmm2 \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm1,%%xmm3 \n" "movdqa %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n" "psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n" "psubusb %%xmm3,%%xmm2 \n"
...@@ -142,8 +143,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, ...@@ -142,8 +143,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
"pmaddwd %%xmm2,%%xmm2 \n" "pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n" "paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n" "paddd %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n" "pshufd $0xee,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n" "paddd %%xmm1,%%xmm0 \n"
......
...@@ -77,10 +77,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -77,10 +77,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
convertloop: convertloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
pavgb xmm0, [eax + edx] pavgb xmm0, [eax + edx]
sub ecx, 16
movdqa [eax + edi], xmm0 movdqa [eax + edi], xmm0
lea eax, [eax + 16] lea eax, [eax + 16]
sub ecx, 16 jg convertloop
ja convertloop
pop edi pop edi
ret ret
} }
...@@ -95,10 +95,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -95,10 +95,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"pavgb (%0,%3),%%xmm0 \n" "pavgb (%0,%3),%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%0,%1) \n" "movdqa %%xmm0,(%0,%1) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"sub $0x10,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_uv), // %1 "+r"(dst_uv), // %1
"+r"(pix) // %2 "+r"(pix) // %2
...@@ -495,10 +495,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, ...@@ -495,10 +495,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
lea esi, [esi + 8] lea esi, [esi + 8]
psrlw xmm1, 8 // V psrlw xmm1, 8 // V
packuswb xmm1, xmm1 packuswb xmm1, xmm1
sub ecx, 16
movq qword ptr [edi], xmm1 movq qword ptr [edi], xmm1
lea edi, [edi + 8] lea edi, [edi + 8]
sub ecx, 16 jg convertloop
ja convertloop
pop edi pop edi
pop esi pop esi
...@@ -534,10 +534,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, ...@@ -534,10 +534,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
"lea 0x8(%2),%2 \n" "lea 0x8(%2),%2 \n"
"psrlw $0x8,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n"
"sub $0x10,%4 \n"
"movq %%xmm1,(%3) \n" "movq %%xmm1,(%3) \n"
"lea 0x8(%3),%3 \n" "lea 0x8(%3),%3 \n"
"sub $0x10,%4 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(dst_u), // %2 "+r"(dst_u), // %2
......
...@@ -237,7 +237,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y, ...@@ -237,7 +237,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
movdqa [edi + 16], xmm1 movdqa [edi + 16], xmm1
lea edi, [edi + 32] lea edi, [edi + 32]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
pop edi pop edi
pop esi pop esi
...@@ -276,7 +276,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y, ...@@ -276,7 +276,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
movdqa [edi + 16], xmm2 movdqa [edi + 16], xmm2
lea edi, [edi + 32] lea edi, [edi + 32]
sub ecx, 16 sub ecx, 16
ja convertloop jg convertloop
pop edi pop edi
pop esi pop esi
...@@ -305,7 +305,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y, ...@@ -305,7 +305,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
"movdqa %%xmm1,0x10(%3) \n" "movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n" "lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n" "sub $0x10,%4 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -340,7 +340,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y, ...@@ -340,7 +340,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
"movdqa %%xmm2,0x10(%3) \n" "movdqa %%xmm2,0x10(%3) \n"
"lea 0x20(%3),%3 \n" "lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n" "sub $0x10,%4 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
"+r"(src_v), // %2 "+r"(src_v), // %2
...@@ -1084,10 +1084,11 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1084,10 +1084,11 @@ int ConvertFromI420(const uint8* y, int y_stride,
if (y == NULL || u == NULL || v == NULL || dst_sample == NULL) { if (y == NULL || u == NULL || v == NULL || dst_sample == NULL) {
return -1; return -1;
} }
int r = 0;
switch (format) { switch (format) {
// Single plane formats // Single plane formats
case FOURCC_YUY2: case FOURCC_YUY2:
I420ToYUY2(y, y_stride, r = I420ToYUY2(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1095,7 +1096,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1095,7 +1096,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_UYVY: case FOURCC_UYVY:
I420ToUYVY(y, y_stride, r = I420ToUYVY(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1103,7 +1104,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1103,7 +1104,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_V210: case FOURCC_V210:
I420ToV210(y, y_stride, r = I420ToV210(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1112,7 +1113,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1112,7 +1113,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_RGBP: case FOURCC_RGBP:
I420ToRGB565(y, y_stride, r = I420ToRGB565(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1120,7 +1121,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1120,7 +1121,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_RGBO: case FOURCC_RGBO:
I420ToARGB1555(y, y_stride, r = I420ToARGB1555(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1128,7 +1129,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1128,7 +1129,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_R444: case FOURCC_R444:
I420ToARGB4444(y, y_stride, r = I420ToARGB4444(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1136,7 +1137,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1136,7 +1137,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_24BG: case FOURCC_24BG:
I420ToRGB24(y, y_stride, r = I420ToRGB24(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1144,7 +1145,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1144,7 +1145,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_RAW: case FOURCC_RAW:
I420ToRAW(y, y_stride, r = I420ToRAW(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1152,7 +1153,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1152,7 +1153,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_ARGB: case FOURCC_ARGB:
I420ToARGB(y, y_stride, r = I420ToARGB(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1160,7 +1161,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1160,7 +1161,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_BGRA: case FOURCC_BGRA:
I420ToBGRA(y, y_stride, r = I420ToBGRA(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1168,7 +1169,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1168,7 +1169,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_ABGR: case FOURCC_ABGR:
I420ToABGR(y, y_stride, r = I420ToABGR(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1176,7 +1177,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1176,7 +1177,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_BGGR: case FOURCC_BGGR:
I420ToBayerBGGR(y, y_stride, r = I420ToBayerBGGR(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1184,7 +1185,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1184,7 +1185,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_GBRG: case FOURCC_GBRG:
I420ToBayerGBRG(y, y_stride, r = I420ToBayerGBRG(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1192,7 +1193,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1192,7 +1193,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_GRBG: case FOURCC_GRBG:
I420ToBayerGRBG(y, y_stride, r = I420ToBayerGRBG(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1200,7 +1201,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1200,7 +1201,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_RGGB: case FOURCC_RGGB:
I420ToBayerRGGB(y, y_stride, r = I420ToBayerRGGB(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, dst_sample,
...@@ -1208,7 +1209,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1208,7 +1209,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height); width, height);
break; break;
case FOURCC_I400: case FOURCC_I400:
I400Copy(y, y_stride, r = I400Copy(y, y_stride,
dst_sample, dst_sample,
dst_sample_stride ? dst_sample_stride : width, dst_sample_stride ? dst_sample_stride : width,
width, height); width, height);
...@@ -1228,7 +1229,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1228,7 +1229,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_v = dst_sample + width * height; dst_v = dst_sample + width * height;
dst_u = dst_v + halfwidth * halfheight; dst_u = dst_v + halfwidth * halfheight;
} }
I420Copy(y, y_stride, r = I420Copy(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, width, dst_sample, width,
...@@ -1249,7 +1250,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1249,7 +1250,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_v = dst_sample + width * height; dst_v = dst_sample + width * height;
dst_u = dst_v + halfwidth * height; dst_u = dst_v + halfwidth * height;
} }
I420ToI422(y, y_stride, r = I420ToI422(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, width, dst_sample, width,
...@@ -1269,7 +1270,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1269,7 +1270,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_v = dst_sample + width * height; dst_v = dst_sample + width * height;
dst_u = dst_v + width * height; dst_u = dst_v + width * height;
} }
I420ToI444(y, y_stride, r = I420ToI444(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, width, dst_sample, width,
...@@ -1282,7 +1283,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1282,7 +1283,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
int quarterwidth = (width + 3) / 4; int quarterwidth = (width + 3) / 4;
uint8* dst_u = dst_sample + width * height; uint8* dst_u = dst_sample + width * height;
uint8* dst_v = dst_u + quarterwidth * height; uint8* dst_v = dst_u + quarterwidth * height;
I420ToI411(y, y_stride, r = I420ToI411(y, y_stride,
u, u_stride, u, u_stride,
v, v_stride, v, v_stride,
dst_sample, width, dst_sample, width,
...@@ -1296,7 +1297,7 @@ int ConvertFromI420(const uint8* y, int y_stride, ...@@ -1296,7 +1297,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
default: default:
return -1; // unknown fourcc - return failure code. return -1; // unknown fourcc - return failure code.
} }
return 0; return r;
} }
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -40,10 +40,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, ...@@ -40,10 +40,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
movdqa xmm0, [eax] movdqa xmm0, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
pshufb xmm0, xmm5 pshufb xmm0, xmm5
sub ecx, 4
movd [edx], xmm0 movd [edx], xmm0
lea edx, [edx + 4] lea edx, [edx + 4]
sub ecx, 4 jg wloop
ja wloop
ret ret
} }
} }
...@@ -60,10 +60,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, ...@@ -60,10 +60,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n"
"sub $0x4,%2 \n"
"movd %%xmm0,(%1) \n" "movd %%xmm0,(%1) \n"
"lea 0x4(%1),%1 \n" "lea 0x4(%1),%1 \n"
"sub $0x4,%2 \n" "jg 1b \n"
"ja 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1 "+r"(dst_bayer), // %1
"+r"(pix) // %2 "+r"(pix) // %2
......
...@@ -685,7 +685,7 @@ static void SetRow8_NEON(uint8* dst, uint32 v32, int count) { ...@@ -685,7 +685,7 @@ static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
"1: \n" "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop "subs %1, %1, #16 \n" // 16 bytes per loop
"vst1.u32 {q0}, [%0]! \n" // store "vst1.u32 {q0}, [%0]! \n" // store
"bhi 1b \n" "bgt 1b \n"
: "+r"(dst), // %0 : "+r"(dst), // %0
"+r"(count) // %1 "+r"(count) // %1
: "r"(v32) // %2 : "r"(v32) // %2
...@@ -738,7 +738,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width, ...@@ -738,7 +738,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
rep stosd rep stosd
add edi, edx add edi, edx
sub ebx, 1 sub ebx, 1
ja convertloop jg convertloop
pop ebp pop ebp
pop edi pop edi
......
...@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ...@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
sub ecx, 8 sub ecx, 8
movq qword ptr [edx + esi], xmm7 movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi] lea edx, [edx + 2 * esi]
ja convertloop jg convertloop
pop ebp pop ebp
pop esi pop esi
...@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea edx, [edx + 2 * esi] lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0 movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp] lea ebx, [ebx + 2 * ebp]
ja convertloop jg convertloop
mov esp, [esp + 16] mov esp, [esp + 16]
pop ebp pop ebp
...@@ -366,7 +366,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ...@@ -366,7 +366,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n" "movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n" "lea (%1,%4,2),%1 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -493,7 +493,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -493,7 +493,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"lea (%edx,%esi,2),%edx \n" "lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n" "movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n" "lea (%ebx,%ebp,2),%ebx \n"
"ja 1b \n" "jg 1b \n"
"mov 0x10(%esp),%esp \n" "mov 0x10(%esp),%esp \n"
"pop %ebp \n" "pop %ebp \n"
"pop %edi \n" "pop %edi \n"
...@@ -629,7 +629,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, ...@@ -629,7 +629,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n" "movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n" "lea (%1,%4,2),%1 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
...@@ -737,7 +737,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -737,7 +737,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"lea (%1,%5,2),%1 \n" "lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n" "movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n" "lea (%2,%6,2),%2 \n"
"ja 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst_a), // %1 "+r"(dst_a), // %1
"+r"(dst_b), // %2 "+r"(dst_b), // %2
...@@ -755,8 +755,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -755,8 +755,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
static void TransposeWx8_C(const uint8* src, int src_stride, static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride,
int w) { int width) {
for (int i = 0; i < w; ++i) { for (int i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride]; dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride]; dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride]; dst[2] = src[2 * src_stride];
...@@ -888,9 +888,9 @@ void RotatePlane180(const uint8* src, int src_stride, ...@@ -888,9 +888,9 @@ void RotatePlane180(const uint8* src, int src_stride,
static void TransposeUVWx8_C(const uint8* src, int src_stride, static void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int w) { int width) {
int i; int i;
for (i = 0; i < w; ++i) { for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0]; dst_a[0] = src[0 * src_stride + 0];
dst_b[0] = src[0 * src_stride + 1]; dst_b[0] = src[0 * src_stride + 1];
dst_a[1] = src[1 * src_stride + 0]; dst_a[1] = src[1 * src_stride + 0];
...@@ -916,10 +916,10 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride, ...@@ -916,10 +916,10 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride,
static void TransposeUVWxH_C(const uint8* src, int src_stride, static void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int w, int h) { int width, int height) {
int i, j; int i, j;
for (i = 0; i < w * 2; i += 2) for (i = 0; i < width * 2; i += 2)
for (j = 0; j < h; ++j) { for (j = 0; j < height; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
} }
......
...@@ -73,7 +73,7 @@ YUVTORGB ...@@ -73,7 +73,7 @@ YUVTORGB
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"vst4.u8 {d20, d21, d22, d23}, [%3]! \n" "vst4.u8 {d20, d21, d22, d23}, [%3]! \n"
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
...@@ -106,7 +106,7 @@ YUVTORGB ...@@ -106,7 +106,7 @@ YUVTORGB
"vmov.u8 d19, #255 \n" "vmov.u8 d19, #255 \n"
"vst4.u8 {d19, d20, d21, d22}, [%3]! \n" "vst4.u8 {d19, d20, d21, d22}, [%3]! \n"
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
...@@ -139,7 +139,7 @@ YUVTORGB ...@@ -139,7 +139,7 @@ YUVTORGB
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"vst4.u8 {d20, d21, d22, d23}, [%3]! \n" "vst4.u8 {d20, d21, d22, d23}, [%3]! \n"
"subs %4, %4, #8 \n" "subs %4, %4, #8 \n"
"bhi 1b \n" "bgt 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
...@@ -163,7 +163,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { ...@@ -163,7 +163,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store U "vst1.u8 {q0}, [%1]! \n" // store U
"vst1.u8 {q1}, [%2]! \n" // Store V "vst1.u8 {q1}, [%2]! \n" // Store V
"bhi 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
...@@ -183,7 +183,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ...@@ -183,7 +183,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
"vldm %0!,{q0,q1,q2,q3} \n" // load 64 "vldm %0!,{q0,q1,q2,q3} \n" // load 64
"subs %2, %2, #64 \n" // 64 processed per loop "subs %2, %2, #64 \n" // 64 processed per loop
"vstm %1!,{q0,q1,q2,q3} \n" // store 64 "vstm %1!,{q0,q1,q2,q3} \n" // store 64
"bhi 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(count) // %2 // Output registers "+r"(count) // %2 // Output registers
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment