Commit 8b54a8f9 authored by fbarchard@google.com

Adjust the specialized 1/2-size scale-down sampling to match the general-purpose code, which uses the odd pixel (rounded up, nearest neighbor). A short C++ sketch of the new sampling rule follows the commit header below.
BUG=223
TEST=out\Debug\convert.exe -f 0 faces_640x480_P420.yuv face2_320x240_P420.yuv
R=johannkoenig@google.com

Review URL: https://webrtc-codereview.appspot.com/1583005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@708 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 83408b85
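For reference, a minimal C++ sketch (not part of this patch; the function name is illustrative) of the sampling rule the patch adopts: each destination pixel x takes source pixel 2 * x + 1, the odd pixel, which is the round-up nearest neighbor of the source position 2 * x + 0.5. This mirrors the updated ScaleRowDown2_C behavior further down in the diff.

// Sketch only: half-width nearest-neighbor downscale that keeps the odd
// source pixel, mirroring the updated ScaleRowDown2_C behavior below.
#include <stdint.h>

static void HalfWidthNearestOdd(const uint8_t* src, uint8_t* dst,
                                int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[2 * x + 1];  // odd source pixel = rounded-up nearest neighbor
  }
}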
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 707
Version: 708
License: BSD
License File: LICENSE
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 707
#define LIBYUV_VERSION 708
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -196,16 +196,14 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5
pand xmm1, xmm5
psrlw xmm0, 8 // isolate odd pixels.
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 16
movdqa [edx], xmm0
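This hunk (and the matching GCC inline-asm hunks further down) drops the 0x00ff00ff mask setup (pcmpeqb + psrlw into xmm5, then pand), which kept the even byte of each 16-bit pair, and instead shifts each word right by 8 so the odd byte survives before packuswb repacks to bytes; xmm5 also disappears from the clobber lists. A rough SSE2 intrinsics sketch of the new path (illustrative helper name; dst_width assumed to be a multiple of 16):

#include <emmintrin.h>  // SSE2
#include <stdint.h>

// Sketch only: keep the odd byte of every 16-bit pair, then pack back to bytes.
static void HalfWidthOddBytes_SSE2(const uint8_t* src, uint8_t* dst,
                                   int dst_width) {
  for (int x = 0; x < dst_width; x += 16) {
    __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
    a = _mm_srli_epi16(a, 8);  // psrlw: odd byte -> low byte (old code used pand)
    b = _mm_srli_epi16(b, 8);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_packus_epi16(a, b));
    src += 32;
    dst += 16;
  }
}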
@@ -271,16 +269,14 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5
pand xmm1, xmm5
psrlw xmm0, 8 // isolate odd pixels.
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
@@ -1269,15 +1265,13 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
@@ -1289,7 +1283,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
, "xmm0", "xmm1"
#endif
);
}
@@ -1336,15 +1330,13 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
@@ -1356,7 +1348,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
, "xmm0", "xmm1"
#endif
);
}
@@ -2324,13 +2316,13 @@ static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
uint8* dend = dst + dst_width - 1;
do {
dst[0] = src_ptr[0];
dst[1] = src_ptr[2];
dst[0] = src_ptr[1];
dst[1] = src_ptr[3];
dst += 2;
src_ptr += 4;
} while (dst < dend);
if (dst_width & 1) {
dst[0] = src_ptr[0];
dst[0] = src_ptr[1];
}
}
@@ -2689,6 +2681,7 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
}
#endif
src_ptr += src_stride; // Point to odd rows.
// TODO(fbarchard): Loop through source height to allow odd height.
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
@@ -62,7 +62,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
shufps xmm0, xmm1, 0x88
shufps xmm0, xmm1, 0xdd
sub ecx, 4
movdqa [edx], xmm0
lea edx, [edx + 16]
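The shufps immediate selects two 32-bit lanes from each source: 0x88 (binary 10 00 10 00) picks lanes 0 and 2, the even ARGB pixels, while 0xdd (binary 11 01 11 01) picks lanes 1 and 3, the odd pixels. A hedged intrinsics sketch of the same selection (illustrative helper name; dst_width assumed to be a multiple of 4):

#include <emmintrin.h>  // SSE2 (_mm_shuffle_ps comes from SSE)
#include <stdint.h>

// Sketch only: pick the odd ARGB pixel from each pair of 32-bit pixels.
static void HalfWidthOddARGB_SSE2(const uint8_t* src_argb, uint8_t* dst_argb,
                                  int dst_width) {
  for (int x = 0; x < dst_width; x += 4) {
    __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_argb));
    __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_argb) + 1);
    __m128 odd = _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), 0xdd);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst_argb), _mm_castps_si128(odd));
    src_argb += 32;  // 8 source ARGB pixels consumed
    dst_argb += 16;  // 4 destination ARGB pixels produced
  }
}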
@@ -350,7 +350,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
@@ -634,13 +634,13 @@ static void ScaleARGBRowDown2_C(const uint8* src_argb,
uint32* dst = reinterpret_cast<uint32*>(dst_argb);
for (int x = 0; x < dst_width - 1; x += 2) {
dst[0] = src[0];
dst[1] = src[2];
dst[0] = src[1];
dst[1] = src[3];
src += 4;
dst += 2;
}
if (dst_width & 1) {
dst[0] = src[0];
dst[0] = src[1];
}
}
@@ -743,25 +743,26 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
FilterMode filtering) {
assert(dx == 65536 * 2); // Test scale factor of 2.
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
// Advance to odd row / even column.
src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
ScaleARGBRowDown2_SSE2;
}
#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_NEON :
ScaleARGBRowDown2_NEON;
}
#endif
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
int row_stride = src_stride * (dy >> 16);
// TODO(fbarchard): Loop through source height to allow odd height.
for (int y = 0; y < dst_height; ++y) {
@@ -782,6 +783,9 @@ static void ScaleARGBDownEven(int src_width, int src_height,
FilterMode filtering) {
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
int col_step = dx >> 16;
int row_stride = (dy >> 16) * src_stride;
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
int src_step, uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
@@ -798,9 +802,6 @@ static void ScaleARGBDownEven(int src_width, int src_height,
ScaleARGBRowDownEven_NEON;
}
#endif
int col_step = dx >> 16;
int row_stride = (dy >> 16) * src_stride;
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
for (int y = 0; y < dst_height; ++y) {
ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
@@ -27,8 +27,8 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
"vld2.u32 {q0, q1}, [%0]! \n"
"vld2.u32 {q2, q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"vst1.u8 {q2}, [%1]! \n"
"vst1.u8 {q1}, [%1]! \n" // store odd pixels
"vst1.u8 {q3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@@ -78,6 +78,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t,
int src_stepx,
uint8* dst_argb, int dst_width) {
asm volatile (
"add %0, #4 \n" // point to odd pixels.
"mov r12, %3, lsl #2 \n"
".p2align 2 \n"
"1: \n"
@@ -39,6 +39,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
// TODO(fbarchard): Use odd pixels instead of even.
"precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
"precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
"precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
@@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"vst1.u8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1