Switch loop branches from ja to jg (bhi to bgt on NEON) to allow a count that is not a multiple of 16 to underflow to a negative value

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/453001
git-svn-id: http://libyuv.googlecode.com/svn/trunk@214 16f28f9a-4ce2-e073-06de-1de4eb20be90

Switch loop branches from ja to jg (bhi to bgt on NEON) to allow a count that is not a multiple of 16 to underflow to a negative value
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/453001
git-svn-id: http://libyuv.googlecode.com/svn/trunk@214 16f28f9a-4ce2-e073-06de-1de4eb20be90
18184fd1 · fbarchard@google.com · 1ff03571 · 18184fd1 · 18184fd1 · 18184fd1
Commit 18184fd1 authored Mar 12, 2012 by fbarchard@google.com
12 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 213
+Version: 214
 License: BSD
 License File: LICENSE

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 213
+#define LIBYUV_VERSION 214
 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -58,7 +58,7 @@ static uint32 SumSquareError_NEON(const uint8* src_a,
    "vmlal.s16  q8, d5, d5                     \n"
    "vmlal.s16  q10, d7, d7                    \n"
    "subs       %2, %2, #16                    \n"
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
    "vadd.u32   q7, q7, q8                     \n"
    "vadd.u32   q9, q9, q10                    \n"
@@ -94,6 +94,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
    movdqa     xmm1, [eax]
    movdqa     xmm2, [eax + edx]
    lea        eax,  [eax + 16]
+    sub        ecx, 16
    movdqa     xmm3, xmm1
    psubusb    xmm1, xmm2
    psubusb    xmm2, xmm3
@@ -105,8 +106,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
-    sub        ecx, 16
+    jg         wloop
-    ja         wloop
    pshufd     xmm1, xmm0, 0EEh
    paddd      xmm0, xmm1
@@ -131,6 +131,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
    "movdqa    (%0),%%xmm1                     \n"
    "movdqa    (%0,%1,1),%%xmm2                \n"
    "lea       0x10(%0),%0                     \n"
+    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psubusb   %%xmm2,%%xmm1                   \n"
    "psubusb   %%xmm3,%%xmm2                   \n"
@@ -142,8 +143,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
    "pmaddwd   %%xmm2,%%xmm2                   \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
-    "ja        1b                              \n"
    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
    "paddd     %%xmm1,%%xmm0                   \n"

--- a/source/convert.cc
+++ b/source/convert.cc
@@ -77,10 +77,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
  convertloop:
    movdqa     xmm0, [eax]
    pavgb      xmm0, [eax + edx]
+    sub        ecx, 16
    movdqa     [eax + edi], xmm0
    lea        eax,  [eax + 16]
-    sub        ecx, 16
+    jg         convertloop
-    ja         convertloop
    pop        edi
    ret
  }
@@ -95,10 +95,10 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
 "1:                                            \n"
  "movdqa     (%0),%%xmm0                      \n"
  "pavgb      (%0,%3),%%xmm0                   \n"
+  "sub        $0x10,%2                         \n"
  "movdqa     %%xmm0,(%0,%1)                   \n"
  "lea        0x10(%0),%0                      \n"
-  "sub        $0x10,%2                         \n"
+  "jg         1b                               \n"
-  "ja         1b                               \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_uv),  // %1
    "+r"(pix)      // %2
@@ -495,10 +495,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
    lea        esi, [esi + 8]
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
+    sub        ecx, 16
    movq       qword ptr [edi], xmm1
    lea        edi, [edi + 8]
-    sub        ecx, 16
+    jg         convertloop
-    ja         convertloop
    pop        edi
    pop        esi
@@ -534,10 +534,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
  "lea        0x8(%2),%2                       \n"
  "psrlw      $0x8,%%xmm1                      \n"
  "packuswb   %%xmm1,%%xmm1                    \n"
+  "sub        $0x10,%4                         \n"
  "movq       %%xmm1,(%3)                      \n"
  "lea        0x8(%3),%3                       \n"
-  "sub        $0x10,%4                         \n"
+  "jg         1b                               \n"
-  "ja         1b                               \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_y),       // %1
    "+r"(dst_u),       // %2

--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -237,7 +237,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
    movdqa     [edi + 16], xmm1
    lea        edi, [edi + 32]
    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
    pop        edi
    pop        esi
@@ -276,7 +276,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
    movdqa     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
-    ja         convertloop
+    jg         convertloop
    pop        edi
    pop        esi
@@ -305,7 +305,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
    "movdqa    %%xmm1,0x10(%3)                   \n"
    "lea       0x20(%3),%3                       \n"
    "sub       $0x10,%4                          \n"
-    "ja         1b                               \n"
+    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
@@ -340,7 +340,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
    "movdqa    %%xmm2,0x10(%3)                   \n"
    "lea       0x20(%3),%3                       \n"
    "sub       $0x10,%4                          \n"
-    "ja         1b                               \n"
+    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
@@ -1084,10 +1084,11 @@ int ConvertFromI420(const uint8* y, int y_stride,
  if (y == NULL || u == NULL || v == NULL || dst_sample == NULL) {
    return -1;
  }
+  int r = 0;
  switch (format) {
    // Single plane formats
    case FOURCC_YUY2:
-      I420ToYUY2(y, y_stride,
+      r = I420ToYUY2(y, y_stride,
                     u, u_stride,
                     v, v_stride,
                     dst_sample,
@@ -1095,7 +1096,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                     width, height);
      break;
    case FOURCC_UYVY:
-      I420ToUYVY(y, y_stride,
+      r = I420ToUYVY(y, y_stride,
                     u, u_stride,
                     v, v_stride,
                     dst_sample,
@@ -1103,7 +1104,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                     width, height);
      break;
    case FOURCC_V210:
-      I420ToV210(y, y_stride,
+      r = I420ToV210(y, y_stride,
                     u, u_stride,
                     v, v_stride,
                     dst_sample,
@@ -1112,7 +1113,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                     width, height);
      break;
    case FOURCC_RGBP:
-      I420ToRGB565(y, y_stride,
+      r = I420ToRGB565(y, y_stride,
                       u, u_stride,
                       v, v_stride,
                       dst_sample,
@@ -1120,7 +1121,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                       width, height);
      break;
    case FOURCC_RGBO:
-      I420ToARGB1555(y, y_stride,
+      r = I420ToARGB1555(y, y_stride,
                         u, u_stride,
                         v, v_stride,
                         dst_sample,
@@ -1128,7 +1129,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                         width, height);
      break;
    case FOURCC_R444:
-      I420ToARGB4444(y, y_stride,
+      r = I420ToARGB4444(y, y_stride,
                         u, u_stride,
                         v, v_stride,
                         dst_sample,
@@ -1136,7 +1137,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                         width, height);
      break;
    case FOURCC_24BG:
-      I420ToRGB24(y, y_stride,
+      r = I420ToRGB24(y, y_stride,
                      u, u_stride,
                      v, v_stride,
                      dst_sample,
@@ -1144,7 +1145,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                      width, height);
      break;
    case FOURCC_RAW:
-      I420ToRAW(y, y_stride,
+      r = I420ToRAW(y, y_stride,
                    u, u_stride,
                    v, v_stride,
                    dst_sample,
@@ -1152,7 +1153,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                    width, height);
      break;
    case FOURCC_ARGB:
-      I420ToARGB(y, y_stride,
+      r = I420ToARGB(y, y_stride,
                     u, u_stride,
                     v, v_stride,
                     dst_sample,
@@ -1160,7 +1161,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                     width, height);
      break;
    case FOURCC_BGRA:
-      I420ToBGRA(y, y_stride,
+      r = I420ToBGRA(y, y_stride,
                     u, u_stride,
                     v, v_stride,
                     dst_sample,
@@ -1168,7 +1169,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                     width, height);
      break;
    case FOURCC_ABGR:
-      I420ToABGR(y, y_stride,
+      r = I420ToABGR(y, y_stride,
                     u, u_stride,
                     v, v_stride,
                     dst_sample,
@@ -1176,7 +1177,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                     width, height);
      break;
    case FOURCC_BGGR:
-      I420ToBayerBGGR(y, y_stride,
+      r = I420ToBayerBGGR(y, y_stride,
                          u, u_stride,
                          v, v_stride,
                          dst_sample,
@@ -1184,7 +1185,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                          width, height);
      break;
    case FOURCC_GBRG:
-      I420ToBayerGBRG(y, y_stride,
+      r = I420ToBayerGBRG(y, y_stride,
                          u, u_stride,
                          v, v_stride,
                          dst_sample,
@@ -1192,7 +1193,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                          width, height);
      break;
    case FOURCC_GRBG:
-      I420ToBayerGRBG(y, y_stride,
+      r = I420ToBayerGRBG(y, y_stride,
                          u, u_stride,
                          v, v_stride,
                          dst_sample,
@@ -1200,7 +1201,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                          width, height);
      break;
    case FOURCC_RGGB:
-      I420ToBayerRGGB(y, y_stride,
+      r = I420ToBayerRGGB(y, y_stride,
                          u, u_stride,
                          v, v_stride,
                          dst_sample,
@@ -1208,7 +1209,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
                          width, height);
      break;
    case FOURCC_I400:
-      I400Copy(y, y_stride,
+      r = I400Copy(y, y_stride,
                   dst_sample,
                   dst_sample_stride ? dst_sample_stride : width,
                   width, height);
@@ -1228,7 +1229,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
        dst_v = dst_sample + width * height;
        dst_u = dst_v + halfwidth * halfheight;
      }
-      I420Copy(y, y_stride,
+      r = I420Copy(y, y_stride,
                   u, u_stride,
                   v, v_stride,
                   dst_sample, width,
@@ -1249,7 +1250,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
        dst_v = dst_sample + width * height;
        dst_u = dst_v + halfwidth * height;
      }
-      I420ToI422(y, y_stride,
+      r = I420ToI422(y, y_stride,
                     u, u_stride,
                     v, v_stride,
                     dst_sample, width,
@@ -1269,7 +1270,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
        dst_v = dst_sample + width * height;
        dst_u = dst_v + width * height;
      }
-      I420ToI444(y, y_stride,
+      r = I420ToI444(y, y_stride,
                     u, u_stride,
                     v, v_stride,
                     dst_sample, width,
@@ -1282,7 +1283,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
      int quarterwidth = (width + 3) / 4;
      uint8* dst_u = dst_sample + width * height;
      uint8* dst_v = dst_u + quarterwidth * height;
-      I420ToI411(y, y_stride,
+      r = I420ToI411(y, y_stride,
                     u, u_stride,
                     v, v_stride,
                     dst_sample, width,
@@ -1296,7 +1297,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
    default:
      return -1;  // unknown fourcc - return failure code.
  }
-  return 0;
+  return r;
 }
 #ifdef __cplusplus

--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -40,10 +40,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
    movdqa     xmm0, [eax]
    lea        eax, [eax + 16]
    pshufb     xmm0, xmm5
+    sub        ecx, 4
    movd       [edx], xmm0
    lea        edx, [edx + 4]
-    sub        ecx, 4
+    jg         wloop
-    ja         wloop
    ret
  }
 }
@@ -60,10 +60,10 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
    "movdqa (%0),%%xmm0                        \n"
    "lea    0x10(%0),%0                        \n"
    "pshufb %%xmm5,%%xmm0                      \n"
+    "sub    $0x4,%2                            \n"
    "movd   %%xmm0,(%1)                        \n"
    "lea    0x4(%1),%1                         \n"
-    "sub    $0x4,%2                            \n"
+    "jg     1b                                 \n"
-    "ja     1b                                 \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_bayer), // %1
    "+r"(pix)        // %2

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -685,7 +685,7 @@ static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
    "1:                                        \n"
    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
    "vst1.u32  {q0}, [%0]!                     \n"  // store
-    "bhi       1b                              \n"
+    "bgt       1b                              \n"
  : "+r"(dst),  // %0
    "+r"(count) // %1
  : "r"(v32)    // %2
@@ -738,7 +738,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
    rep stosd
    add        edi, edx
    sub        ebx, 1
-    ja         convertloop
+    jg         convertloop
    pop        ebp
    pop        edi

--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
    sub       ecx, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
-    ja        convertloop
+    jg        convertloop
    pop       ebp
    pop       esi
@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
-    ja        convertloop
+    jg        convertloop
    mov       esp, [esp + 16]
    pop       ebp
@@ -366,7 +366,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
    "sub        $0x8,%2                          \n"
    "movq       %%xmm7,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
-    "ja         1b                               \n"
+    "jg         1b                               \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
@@ -493,7 +493,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
    "lea    (%edx,%esi,2),%edx                 \n"
    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "ja     1b                                 \n"
+    "jg     1b                                 \n"
    "mov    0x10(%esp),%esp                    \n"
    "pop    %ebp                               \n"
    "pop    %edi                               \n"
@@ -629,7 +629,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
  "sub        $0x10,%2                         \n"
  "movq       %%xmm15,(%1,%4)                  \n"
  "lea        (%1,%4,2),%1                     \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
@@ -737,7 +737,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
-  "ja         1b                               \n"
+  "jg         1b                               \n"
  : "+r"(src),    // %0
    "+r"(dst_a),  // %1
    "+r"(dst_b),  // %2
@@ -755,8 +755,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
 static void TransposeWx8_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
-                           int w) {
+                           int width) {
-  for (int i = 0; i < w; ++i) {
+  for (int i = 0; i < width; ++i) {
    dst[0] = src[0 * src_stride];
    dst[1] = src[1 * src_stride];
    dst[2] = src[2 * src_stride];
@@ -888,9 +888,9 @@ void RotatePlane180(const uint8* src, int src_stride,
 static void TransposeUVWx8_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
-                             int w) {
+                             int width) {
  int i;
-  for (i = 0; i < w; ++i) {
+  for (i = 0; i < width; ++i) {
    dst_a[0] = src[0 * src_stride + 0];
    dst_b[0] = src[0 * src_stride + 1];
    dst_a[1] = src[1 * src_stride + 0];
@@ -916,10 +916,10 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride,
 static void TransposeUVWxH_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
-                             int w, int h) {
+                             int width, int height) {
  int i, j;
-  for (i = 0; i < w * 2; i += 2)
+  for (i = 0; i < width * 2; i += 2)
-    for (j = 0; j < h; ++j) {
+    for (j = 0; j < height; ++j) {
      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
    }

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -73,7 +73,7 @@ YUVTORGB
    "vmov.u8    d23, #255                      \n"
    "vst4.u8    {d20, d21, d22, d23}, [%3]!    \n"
    "subs       %4, %4, #8                     \n"
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
@@ -106,7 +106,7 @@ YUVTORGB
    "vmov.u8    d19, #255                      \n"
    "vst4.u8    {d19, d20, d21, d22}, [%3]!    \n"
    "subs       %4, %4, #8                     \n"
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
@@ -139,7 +139,7 @@ YUVTORGB
    "vmov.u8    d23, #255                      \n"
    "vst4.u8    {d20, d21, d22, d23}, [%3]!    \n"
    "subs       %4, %4, #8                     \n"
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
@@ -163,7 +163,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
    "vst1.u8    {q0}, [%1]!                    \n"  // store U
    "vst1.u8    {q1}, [%2]!                    \n"  // Store V
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
@@ -183,7 +183,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
    "vldm       %0!,{q0,q1,q2,q3}              \n"  // load 64
    "subs       %2, %2, #64                    \n"  // 64 processed per loop
    "vstm       %1!,{q0,q1,q2,q3}              \n"  // store 64
-    "bhi        1b                             \n"
+    "bgt        1b                             \n"
    : "+r"(src),   // %0
      "+r"(dst),   // %1
      "+r"(count)  // %2  // Output registers

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
--- a/source/row_win.cc
+++ b/source/row_win.cc
--- a/source/scale.cc
+++ b/source/scale.cc