Commit 8af0ebf8 authored by Frank Barchard

planar blend use signed images

R=dhrosa@google.com, harryjin@google.com, jzern@chromium.org
BUG=libyuv:527

Review URL: https://codereview.chromium.org/1491533002 .
parent b6f37bd8
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1546
Version: 1547
License: BSD
License File: LICENSE
......
......@@ -252,6 +252,12 @@ extern "C" {
#define HAS_RGB565TOARGBROW_AVX2
#endif
// The following are available for 32 bit Visual C and clangcl 32 bit:
// TODO(fbarchard): Port to gcc.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#define HAS_BLENDPLANEROW_SSSE3
#endif
// The following are also available on x64 Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
(!defined(__clang__) || defined(__SSSE3__))
......@@ -1454,6 +1460,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
// Unattenuated planar alpha blend.
void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width);
void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width);
// ARGB multiply images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1546
#define LIBYUV_VERSION 1547
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -2016,6 +2016,18 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
}
}
#undef BLEND
// Reference planar alpha blend, one pixel at a time:
//   dst = (a * src0 + (255 - a) * src1 + 255) >> 8
// The +255 bias makes the >>8 reproduce src0 exactly when a == 255.
void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
                     const uint8* alpha, uint8* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint32 fg = src0[i];
    const uint32 bg = src1[i];
    const uint32 a = alpha[i];
    dst[i] = (a * fg + (255u - a) * bg + 255u) >> 8;
  }
}
#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
// Multiply source RGB by alpha and store to destination.
......
......@@ -4063,6 +4063,58 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
}
#endif // HAS_YUY2TOYROW_SSE2
#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time of two planes using a per-pixel alpha plane:
//   dst = (src0 * a + src1 * (255 - a) + 255) >> 8
// Images are biased to signed so pmaddubsw (u8 x s8) can form both products
// in one instruction; the bias is removed by the 0x807f rounding constant.
// =((G2*C2)+(H2*(D2))+32768+127)/256
__declspec(naked)
void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
  __asm {
    push       esi
    push       edi
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
    psllw      xmm5, 8
    mov        eax, 0x80808080  // 128 for biasing image to signed.
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0x00
    mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
    movd       xmm7, eax
    pshufd     xmm7, xmm7, 0x00
    mov        eax, [esp + 8 + 4]   // src0
    mov        edx, [esp + 8 + 8]   // src1
    mov        esi, [esp + 8 + 12]  // alpha
    mov        edi, [esp + 8 + 16]  // dst
    mov        ecx, [esp + 8 + 20]  // width
    // Address src0/src1/dst relative to the alpha pointer so a single
    // incrementing register (esi) walks all four rows.
    sub        eax, esi
    sub        edx, esi
    sub        edi, esi

    // 8 pixel loop.
  convertloop8:
    movq       xmm0, qword ptr [esi]        // alpha
    punpcklbw  xmm0, xmm0                   // duplicate: a,a per pixel
    pxor       xmm0, xmm5                   // a, 255-a
    movq       xmm1, qword ptr [eax + esi]  // src0
    movq       xmm2, qword ptr [edx + esi]  // src1
    punpcklbw  xmm1, xmm2                   // interleave src0,src1
    psubb      xmm1, xmm6                   // bias src0/1 - 128
    pmaddubsw  xmm0, xmm1                   // a*(f-128) + (255-a)*(b-128)
    paddw      xmm0, xmm7                   // unbias result - 32768 and round.
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edi + esi], xmm0
    lea        esi, [esi + 8]
    sub        ecx, 8
    // Fix: was jge, which ran one extra 8-pixel group after the count
    // reached 0 (e.g. width==8 processed 16 bytes), overrunning the
    // source reads and the dst write.
    jg         convertloop8

    pop        edi
    pop        esi
    ret
  }
}
#endif // HAS_BLENDPLANEROW_SSSE3
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {
......
......@@ -1163,6 +1163,87 @@ TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
EXPECT_LE(max_diff, 1);
}
#ifdef HAS_BLENDPLANEROW_SSSE3
// TODO(fbarchard): Switch to I420Blend.
// Verifies BlendPlaneRow_SSSE3 (when available) against BlendPlaneRow_C on
// random planes, and that alpha == 255 reproduces src0 exactly.
// 'invert' is currently unused; 'off' misaligns the source pointers.
// TODO(fbarchard): Switch to I420Blend.
static void TestBlendPlane(int width, int height, int benchmark_iterations,
                           int invert, int off) {
  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
  // Row functions are 1 dimensional: flatten the plane into a single row.
  width = width * height;
  height = 1;
  if (width < 256) {
    width = 256;  // Also covers width < 1.
  }
  const int kBpp = 1;
  const int kStride = width * kBpp;
  align_buffer_64(src_argb_a, kStride * height + off);
  align_buffer_64(src_argb_b, kStride * height + off);
  align_buffer_64(src_argb_alpha, kStride * height + off);
  align_buffer_64(dst_argb_c, kStride * height);
  align_buffer_64(dst_argb_opt, kStride * height);

  if (has_ssse3) {
    // With alpha == 255 the blend must copy src0 exactly.  Fill the whole
    // range (the original filled only the first 255 bytes, then compared the
    // full range against uninitialized data) and apply 'off' consistently
    // with the pointers passed to the row function.
    for (int i = 0; i < kStride * height; ++i) {
      src_argb_a[i + off] = i & 0xff;
      src_argb_b[i + off] = 255 - (i & 0xff);
      src_argb_alpha[i + off] = 255;
    }
    memset(dst_argb_opt, 0xfb, kStride * height);
    BlendPlaneRow_SSSE3(src_argb_a + off,
                        src_argb_b + off,
                        src_argb_alpha + off,
                        dst_argb_opt,
                        width * height);
    for (int i = 0; i < kStride * height; ++i) {
      EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i]);
    }
  }
  for (int i = 0; i < kStride * height; ++i) {
    src_argb_a[i + off] = (fastrand() & 0xff);
    src_argb_b[i + off] = (fastrand() & 0xff);
    src_argb_alpha[i + off] = (fastrand() & 0xff);
  }
  memset(dst_argb_c, 255, kStride * height);
  memset(dst_argb_opt, 255, kStride * height);
  // Reference result.
  BlendPlaneRow_C(src_argb_a + off,
                  src_argb_b + off,
                  src_argb_alpha + off,
                  dst_argb_c,
                  width * height);
  // Benchmark the optimized path (C fallback when SSSE3 is absent).
  for (int i = 0; i < benchmark_iterations; ++i) {
    if (has_ssse3) {
      BlendPlaneRow_SSSE3(src_argb_a + off,
                          src_argb_b + off,
                          src_argb_alpha + off,
                          dst_argb_opt,
                          width * height);
    } else {
      BlendPlaneRow_C(src_argb_a + off,
                      src_argb_b + off,
                      src_argb_alpha + off,
                      dst_argb_opt,
                      width * height);
    }
  }
  for (int i = 0; i < kStride * height; ++i) {
    EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);
  }
  free_aligned_buffer_64(src_argb_a);
  free_aligned_buffer_64(src_argb_b);
  free_aligned_buffer_64(src_argb_alpha);  // Was leaked in the original.
  free_aligned_buffer_64(dst_argb_c);
  free_aligned_buffer_64(dst_argb_opt);
}
// Runs TestBlendPlane at the benchmark dimensions with no source offset
// (invert=+1, off=0): C reference vs. optimized row must match exactly.
TEST_F(LibYUVPlanarTest, BlendPlane_Opt) {
TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
+1, 0);
}
#endif
TEST_F(LibYUVPlanarTest, TestAffine) {
SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
SIMD_ALIGNED(uint8 interpolate_pixels_C[1280][4]);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment