Commit 18184fd1 authored by fbarchard@google.com

Switch loop branches from ja to jg (and from bhi to bgt on NEON) to allow a count that is not a multiple of 16 to underflow to a negative value

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/453001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@214 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 1ff03571
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 213
Version: 214
License: BSD
License File: LICENSE
......
......@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 213
#define LIBYUV_VERSION 214
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -58,7 +58,7 @@ static uint32 SumSquareError_NEON(const uint8* src_a,
"vmlal.s16 q8, d5, d5 \n"
"vmlal.s16 q10, d7, d7 \n"
"subs %2, %2, #16 \n"
"bhi 1b \n"
"bgt 1b \n"
"vadd.u32 q7, q7, q8 \n"
"vadd.u32 q9, q9, q10 \n"
......@@ -94,6 +94,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
movdqa xmm1, [eax]
movdqa xmm2, [eax + edx]
lea eax, [eax + 16]
sub ecx, 16
movdqa xmm3, xmm1
psubusb xmm1, xmm2
psubusb xmm2, xmm3
......@@ -105,8 +106,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
pmaddwd xmm2, xmm2
paddd xmm0, xmm1
paddd xmm0, xmm2
sub ecx, 16
ja wloop
jg wloop
pshufd xmm1, xmm0, 0EEh
paddd xmm0, xmm1
......@@ -131,6 +131,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
"movdqa (%0),%%xmm1 \n"
"movdqa (%0,%1,1),%%xmm2 \n"
"lea 0x10(%0),%0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n"
......@@ -142,8 +143,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
"pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
"jg 1b \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
......
......@@ -77,10 +77,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
convertloop:
movdqa xmm0, [eax]
pavgb xmm0, [eax + edx]
sub ecx, 16
movdqa [eax + edi], xmm0
lea eax, [eax + 16]
sub ecx, 16
ja convertloop
jg convertloop
pop edi
ret
}
......@@ -95,10 +95,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pavgb (%0,%3),%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%0,%1) \n"
"lea 0x10(%0),%0 \n"
"sub $0x10,%2 \n"
"ja 1b \n"
"jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(pix) // %2
......@@ -495,10 +495,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
lea esi, [esi + 8]
psrlw xmm1, 8 // V
packuswb xmm1, xmm1
sub ecx, 16
movq qword ptr [edi], xmm1
lea edi, [edi + 8]
sub ecx, 16
ja convertloop
jg convertloop
pop edi
pop esi
......@@ -534,10 +534,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
"lea 0x8(%2),%2 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"sub $0x10,%4 \n"
"movq %%xmm1,(%3) \n"
"lea 0x8(%3),%3 \n"
"sub $0x10,%4 \n"
"ja 1b \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(dst_u), // %2
......
......@@ -237,7 +237,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
movdqa [edi + 16], xmm1
lea edi, [edi + 32]
sub ecx, 16
ja convertloop
jg convertloop
pop edi
pop esi
......@@ -276,7 +276,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
movdqa [edi + 16], xmm2
lea edi, [edi + 32]
sub ecx, 16
ja convertloop
jg convertloop
pop edi
pop esi
......@@ -305,7 +305,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
"movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n"
"ja 1b \n"
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
......@@ -340,7 +340,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
"movdqa %%xmm2,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n"
"ja 1b \n"
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
......@@ -1084,10 +1084,11 @@ int ConvertFromI420(const uint8* y, int y_stride,
if (y == NULL || u == NULL || v == NULL || dst_sample == NULL) {
return -1;
}
int r = 0;
switch (format) {
// Single plane formats
case FOURCC_YUY2:
I420ToYUY2(y, y_stride,
r = I420ToYUY2(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1095,7 +1096,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_UYVY:
I420ToUYVY(y, y_stride,
r = I420ToUYVY(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1103,7 +1104,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_V210:
I420ToV210(y, y_stride,
r = I420ToV210(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1112,7 +1113,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_RGBP:
I420ToRGB565(y, y_stride,
r = I420ToRGB565(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1120,7 +1121,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_RGBO:
I420ToARGB1555(y, y_stride,
r = I420ToARGB1555(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1128,7 +1129,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_R444:
I420ToARGB4444(y, y_stride,
r = I420ToARGB4444(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1136,7 +1137,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_24BG:
I420ToRGB24(y, y_stride,
r = I420ToRGB24(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1144,7 +1145,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_RAW:
I420ToRAW(y, y_stride,
r = I420ToRAW(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1152,7 +1153,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_ARGB:
I420ToARGB(y, y_stride,
r = I420ToARGB(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1160,7 +1161,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_BGRA:
I420ToBGRA(y, y_stride,
r = I420ToBGRA(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1168,7 +1169,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_ABGR:
I420ToABGR(y, y_stride,
r = I420ToABGR(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1176,7 +1177,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_BGGR:
I420ToBayerBGGR(y, y_stride,
r = I420ToBayerBGGR(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1184,7 +1185,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_GBRG:
I420ToBayerGBRG(y, y_stride,
r = I420ToBayerGBRG(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1192,7 +1193,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_GRBG:
I420ToBayerGRBG(y, y_stride,
r = I420ToBayerGRBG(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1200,7 +1201,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_RGGB:
I420ToBayerRGGB(y, y_stride,
r = I420ToBayerRGGB(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
......@@ -1208,7 +1209,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
case FOURCC_I400:
I400Copy(y, y_stride,
r = I400Copy(y, y_stride,
dst_sample,
dst_sample_stride ? dst_sample_stride : width,
width, height);
......@@ -1228,7 +1229,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_v = dst_sample + width * height;
dst_u = dst_v + halfwidth * halfheight;
}
I420Copy(y, y_stride,
r = I420Copy(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample, width,
......@@ -1249,7 +1250,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_v = dst_sample + width * height;
dst_u = dst_v + halfwidth * height;
}
I420ToI422(y, y_stride,
r = I420ToI422(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample, width,
......@@ -1269,7 +1270,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_v = dst_sample + width * height;
dst_u = dst_v + width * height;
}
I420ToI444(y, y_stride,
r = I420ToI444(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample, width,
......@@ -1282,7 +1283,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
int quarterwidth = (width + 3) / 4;
uint8* dst_u = dst_sample + width * height;
uint8* dst_v = dst_u + quarterwidth * height;
I420ToI411(y, y_stride,
r = I420ToI411(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample, width,
......@@ -1296,7 +1297,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
default:
return -1; // unknown fourcc - return failure code.
}
return 0;
return r;
}
#ifdef __cplusplus
......
......@@ -40,10 +40,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
movdqa xmm0, [eax]
lea eax, [eax + 16]
pshufb xmm0, xmm5
sub ecx, 4
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 4
ja wloop
jg wloop
ret
}
}
......@@ -60,10 +60,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
"movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"sub $0x4,%2 \n"
"movd %%xmm0,(%1) \n"
"lea 0x4(%1),%1 \n"
"sub $0x4,%2 \n"
"ja 1b \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
......
......@@ -685,7 +685,7 @@ static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
"vst1.u32 {q0}, [%0]! \n" // store
"bhi 1b \n"
"bgt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
......@@ -738,7 +738,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
rep stosd
add edi, edx
sub ebx, 1
ja convertloop
jg convertloop
pop ebp
pop edi
......
......@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
sub ecx, 8
movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi]
ja convertloop
jg convertloop
pop ebp
pop esi
......@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
ja convertloop
jg convertloop
mov esp, [esp + 16]
pop ebp
......@@ -366,7 +366,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"ja 1b \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
......@@ -493,7 +493,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"ja 1b \n"
"jg 1b \n"
"mov 0x10(%esp),%esp \n"
"pop %ebp \n"
"pop %edi \n"
......@@ -629,7 +629,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"ja 1b \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
......@@ -737,7 +737,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"ja 1b \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
......@@ -755,8 +755,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int w) {
for (int i = 0; i < w; ++i) {
int width) {
for (int i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
......@@ -888,9 +888,9 @@ void RotatePlane180(const uint8* src, int src_stride,
static void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
int width) {
int i;
for (i = 0; i < w; ++i) {
for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
dst_b[0] = src[0 * src_stride + 1];
dst_a[1] = src[1 * src_stride + 0];
......@@ -916,10 +916,10 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride,
static void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w, int h) {
int width, int height) {
int i, j;
for (i = 0; i < w * 2; i += 2)
for (j = 0; j < h; ++j) {
for (i = 0; i < width * 2; i += 2)
for (j = 0; j < height; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
}
......
......@@ -73,7 +73,7 @@ YUVTORGB
"vmov.u8 d23, #255 \n"
"vst4.u8 {d20, d21, d22, d23}, [%3]! \n"
"subs %4, %4, #8 \n"
"bhi 1b \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
......@@ -106,7 +106,7 @@ YUVTORGB
"vmov.u8 d19, #255 \n"
"vst4.u8 {d19, d20, d21, d22}, [%3]! \n"
"subs %4, %4, #8 \n"
"bhi 1b \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
......@@ -139,7 +139,7 @@ YUVTORGB
"vmov.u8 d23, #255 \n"
"vst4.u8 {d20, d21, d22, d23}, [%3]! \n"
"subs %4, %4, #8 \n"
"bhi 1b \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
......@@ -163,7 +163,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
"subs %3, %3, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store U
"vst1.u8 {q1}, [%2]! \n" // Store V
"bhi 1b \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
......@@ -183,7 +183,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
"vldm %0!,{q0,q1,q2,q3} \n" // load 64
"subs %2, %2, #64 \n" // 64 processed per loop
"vstm %1!,{q0,q1,q2,q3} \n" // store 64
"bhi 1b \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2 // Output registers
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment