Scale up columns 2 pixels at a time

BUG=208 TEST=out\release\libyuv_unittest --gtest_filter=*Scale*640* Review URL: https://webrtc-codereview.appspot.com/1294004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@648 16f28f9a-4ce2-e073-06de-1de4eb20be90

Scale up columns 2 pixels at a time
BUG=208 TEST=out\release\libyuv_unittest --gtest_filter=*Scale*640* Review URL: https://webrtc-codereview.appspot.com/1294004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@648 16f28f9a-4ce2-e073-06de-1de4eb20be90
98a1fbf5 · fbarchard@google.com · a0070461 · 98a1fbf5 · 98a1fbf5 · 98a1fbf5
Commit 98a1fbf5 authored Apr 07, 2013 by fbarchard@google.com
5 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 646
+Version: 648
 License: BSD
 License File: LICENSE


--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 646
+#define LIBYUV_VERSION 648

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -3043,12 +3043,12 @@ void YToARGBRow_SSE2(const uint8* y_buf,
    pxor       xmm5, xmm5
    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
    pslld      xmm4, 24
-    mov        eax,0x00100010
-    movd       xmm3,eax
-    pshufd     xmm3,xmm3,0
-    mov        eax,0x004a004a       // 74
-    movd       xmm2,eax
-    pshufd     xmm2,xmm2,0
+    mov        eax, 0x00100010
+    movd       xmm3, eax
+    pshufd     xmm3, xmm3, 0
+    mov        eax, 0x004a004a       // 74
+    movd       xmm2, eax
+    pshufd     xmm2, xmm2,0
    mov        eax, [esp + 4]       // Y
    mov        edx, [esp + 8]       // rgb
    mov        ecx, [esp + 12]      // width
@@ -4267,8 +4267,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
    pxor       xmm3, xmm4       // ~alpha
    movd       xmm2, [esi]      // _r_b
    psrlw      xmm3, 8          // alpha
-    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
-    pshuflw    xmm3, xmm3,0F5h
+    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+    pshuflw    xmm3, xmm3, 0F5h
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
@@ -4298,8 +4298,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
    pxor       xmm3, xmm4       // ~alpha
    movdqu     xmm2, [esi]      // _r_b
    psrlw      xmm3, 8          // alpha
-    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
-    pshuflw    xmm3, xmm3,0F5h
+    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+    pshuflw    xmm3, xmm3, 0F5h
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
@@ -4329,8 +4329,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
    pxor       xmm3, xmm4       // ~alpha
    movd       xmm2, [esi]      // _r_b
    psrlw      xmm3, 8          // alpha
-    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
-    pshuflw    xmm3, xmm3,0F5h
+    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+    pshuflw    xmm3, xmm3, 0F5h
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
@@ -4363,8 +4363,8 @@ static const uvec8 kShuffleAlpha = {
 };
 // Same as SSE2, but replaces:
 //    psrlw      xmm3, 8          // alpha
-//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
-//    pshuflw    xmm3, xmm3,0F5h
+//    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+//    pshuflw    xmm3, xmm3, 0F5h
 // with..
 //    pshufb     xmm3, kShuffleAlpha // alpha
 // Blend 8 pixels at a time.
@@ -4533,13 +4533,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
 convertloop:
    movdqa     xmm0, [eax]      // read 4 pixels
    punpcklbw  xmm0, xmm0       // first 2
-    pshufhw    xmm2, xmm0,0FFh  // 8 alpha words
-    pshuflw    xmm2, xmm2,0FFh
+    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
+    pshuflw    xmm2, xmm2, 0FFh
    pmulhuw    xmm0, xmm2       // rgb * a
    movdqa     xmm1, [eax]      // read 4 pixels
    punpckhbw  xmm1, xmm1       // next 2 pixels
-    pshufhw    xmm2, xmm1,0FFh  // 8 alpha words
-    pshuflw    xmm2, xmm2,0FFh
+    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
+    pshuflw    xmm2, xmm2, 0FFh
    pmulhuw    xmm1, xmm2       // rgb * a
    movdqa     xmm2, [eax]      // alphas
    psrlw      xmm0, 8
@@ -4673,8 +4673,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
    punpcklbw  xmm0, xmm0       // first 2
    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
-    pshuflw    xmm2, xmm2,040h  // first 4 inv_alpha words.  1, a, a, a
-    pshuflw    xmm3, xmm3,040h  // next 4 inv_alpha words
+    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
+    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm0, xmm2       // rgb * a

@@ -4684,8 +4684,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
    punpckhbw  xmm1, xmm1       // next 2
    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
-    pshuflw    xmm2, xmm2,040h  // first 4 inv_alpha words
-    pshuflw    xmm3, xmm3,040h  // next 4 inv_alpha words
+    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
+    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm1, xmm2       // rgb * a


--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -424,46 +424,86 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_argb, const uint8* src_argb,
 // Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
 // TODO(fbarchard): Port to Neon
 // TODO(fbarchard): Port to Posix
-// TODO(fbarchard): Unroll for 2 pixels for better pairing and memory access.
+// TODO(fbarchard): Consider lea to get 2nd pixel without incrementing.
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static const uvec8 kShuffleColARGB = {
+  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
+  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static const uvec8 kShuffleFractions = {
+  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 2u, 2u, 2u, 2u, 2u, 2u, 2u, 2u,
+};
+
 #define HAS_SCALEARGBFILTERCOLS_SSSE3
 __declspec(naked) __declspec(align(16))
 static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                                      int dst_width, int x, int dx) {
  __asm {
    push       ebx
+    push       ebp
    push       esi
    push       edi
-    mov        edi, [esp + 12 + 4]   // dst_argb
-    mov        esi, [esp + 12 + 8]   // src_argb
-    mov        ecx, [esp + 12 + 12]  // dst_width
-    mov        edx, [esp + 12 + 16]  // x
-    mov        ebx, [esp + 12 + 20]  // dx
+    mov        edi, [esp + 16 + 4]   // dst_argb
+    mov        esi, [esp + 16 + 8]   // src_argb
+    mov        ecx, [esp + 16 + 12]  // dst_width
+    mov        edx, [esp + 16 + 16]  // x
+    mov        ebx, [esp + 16 + 20]  // dx
+    movdqa     xmm3, kShuffleFractions
+    movdqa     xmm4, kShuffleColARGB
    pcmpeqb    xmm5, xmm5            // generate 0x007f for inverting fraction.
    psrlw      xmm5, 9
+    sub        ecx, 2
+    jl         xloop29

    align      16
-  xloop:
-    mov        eax, edx             // get x integer offset
-    shr        eax, 16
-    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source pixels
-    pshufd     xmm1, xmm0, 1        // second pixel
-    punpcklbw  xmm0, xmm1           // aarrggbb
-    movd       xmm2, edx            // get x fraction
-    psrlw      xmm2, 9              // 7 bit fraction
-    punpcklbw  xmm2, xmm2
-    punpcklwd  xmm2, xmm2
-    pshufd     xmm2, xmm2, 0
-    pxor       xmm2, xmm5           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm2
+  xloop2:
+    mov        eax, edx             // get x0 integer
+    movd       xmm1, edx            // get x0 fraction
+    lea        ebp, [edx + ebx]     // get x1 integer (x + dx)
+    movd       xmm2, ebp            // get x1 fraction
+    shr        eax, 16              // x0
+    punpcklwd  xmm1, xmm2           // x0x1 fractions
+    lea        edx, [edx + ebx * 2] // x += dx * 2
+    shr        ebp, 16              // x1
+    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
+    movhps     xmm0, qword ptr [esi + ebp * 4]  // 2 source x1 pixels
+    psrlw      xmm1, 9              // 7 bit fractions.
+    pshufb     xmm1, xmm3           // 0000000011111111
+    sub        ecx, 2
+    pshufb     xmm0, xmm4           // arrange pixels into pairs
+    pxor       xmm1, xmm5           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
+    psrlw      xmm0, 7
+    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
+    movq       qword ptr [edi], xmm0
+    lea        edi, [edi + 8]
+    jge        xloop2
+ xloop29:
+
+    add        ecx, 2 - 1
+    jl         xloop99
+
+    // 1 pixel remainder
+    mov        eax, edx             // get x0 integer
+    movd       xmm1, edx            // get x0 fraction
+    shr        eax, 16              // x0
+    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
+    psrlw      xmm1, 9              // 7 bit fractions.
+    pshufb     xmm1, xmm3           // 00000000
+    pshufb     xmm0, xmm4           // arrange pixels into pairs
+    pxor       xmm1, xmm5           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1           // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
-    packuswb   xmm0, xmm0
-    add        edx, ebx             // x += dx
-    sub        ecx, 1
+    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
    movd       [edi], xmm0
-    lea        edi, [edi + 4]
-    jg         xloop
+ xloop99:
+
    pop        edi
    pop        esi
+    pop        ebp
    pop        ebx
    ret
  }
@@ -1104,8 +1144,6 @@ static void ScaleARGBBilinear(int src_width, int src_height,
    ScaleARGBFilterRows = ScaleARGBFilterRows_NEON;
  }
 #endif
-
-
  int dx = (src_width << 16) / dst_width;
  int dy = (src_height << 16) / dst_height;
  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);

--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -410,7 +410,7 @@ TEST_F(libyuvTest, ARGBScaleTo853x480_Bilinear) {
                                dst_width, dst_height,
                                kFilterBilinear,
                                benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 3);
 }

 TEST_F(libyuvTest, ARGBScaleFrom640x360_None) {
@@ -436,7 +436,7 @@ TEST_F(libyuvTest, ARGBScaleFrom640x360_Bilinear) {
                                dst_width, dst_height,
                                kFilterBilinear,
                                benchmark_iterations_);
-  EXPECT_LE(max_diff, 2);
+  EXPECT_LE(max_diff, 3);
 }

 }  // namespace libyuv