Commit 8fa76349 authored by fbarchard@google.com

Blend style multiply

BUG=175
TEST=Multiply unittest
Review URL: https://webrtc-codereview.appspot.com/1048004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@542 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8ec60334
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 541
Version: 542
License: BSD
License File: LICENSE
......
......@@ -209,6 +209,13 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Multiply ARGB image by ARGB image.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I422 to YUY2.
LIBYUV_API
int I422ToYUY2(const uint8* src_y, int src_stride_y,
......@@ -265,11 +272,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value);
// Multiply ARGB image by ARGB image.
int ARGBMultiply(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Interpolate between two ARGB images using specified amount of interpolation
// (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
......
......@@ -967,6 +967,15 @@ void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
// ARGB multiply images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
......@@ -1270,7 +1279,6 @@ void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
const int32* previous_cumsum, int width);
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
......@@ -1287,10 +1295,6 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride_argb, int dst_width,
int source_y_fraction);
void ARGBMultiplyRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
int width);
#ifdef __cplusplus
} // extern "C"
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 541
#define LIBYUV_VERSION 542
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -404,6 +404,50 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
return 0;
}
// Multiply 2 ARGB images together and store to destination.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
int width) = ARGBMultiplyRow_C;
#if defined(HAS_ARGBMULTIPLYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
}
}
#elif defined(HAS_ARGBMULTIPLYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_NEON;
}
#endif
// Multiply plane
for (int y = 0; y < height; ++y) {
ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
src_argb0 += src_stride_argb0;
src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
}
return 0;
}
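For orientation, here is a minimal caller sketch of the new two-source entry point. This block is not part of the diff; the dimensions, fill values, and the function name MultiplyExample are illustrative assumptions.
// Hypothetical caller of ARGBMultiply; uint8 is the libyuv typedef pulled in
// via the header.  ARGBMultiply returns 0 on success, -1 on bad arguments.
#include "libyuv/planar_functions.h"
#include <vector>
int MultiplyExample() {
  const int kWidth = 32;
  const int kHeight = 8;
  const int kStride = kWidth * 4;  // 4 bytes per ARGB pixel, no row padding.
  std::vector<uint8> src0(kStride * kHeight, 0x80);  // constant mid-grey
  std::vector<uint8> src1(kStride * kHeight, 0xff);  // constant white
  std::vector<uint8> dst(kStride * kHeight, 0);
  // A negative height would make ARGBMultiply write the output bottom-up,
  // as handled by the stride flip above.
  return libyuv::ARGBMultiply(&src0[0], kStride,
                              &src1[0], kStride,
                              &dst[0], kStride,
                              kWidth, kHeight);
}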
// Convert I422 to BGRA.
LIBYUV_API
int I422ToBGRA(const uint8* src_y, int src_stride_y,
......@@ -1170,47 +1214,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
return 0;
}
// ARGB multiply 2 images together.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
void (*ARGBMultiplyRow)(const uint8* src, uint8* dst, int width) =
ARGBMultiplyRow_C;
#if defined(HAS_ARGBMULTIPLYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
}
}
#elif defined(HAS_ARGBMULTIPLYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_NEON;
}
#endif
// Multiply plane
for (int y = 0; y < height; ++y) {
ARGBMultiplyRow(src_argb, dst_argb, width);
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
return 0;
}
// Interpolate 2 ARGB images by specified amount (0 to 255).
// TODO(fbarchard): Consider selecting a specialization for interpolation so
// row function doesn't need to check interpolation on each row.
......
......@@ -373,10 +373,12 @@ MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
#undef MergeUVRow_ANY
#define MultiplyRow_ANY(NAMEANY, ARGBMULT_SIMD, ARGBMULT_C, MASK) \
void NAMEANY(const uint8* src_argb, uint8* dst_argb, int width) { \
void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \
uint8* dst_argb, int width) { \
int n = width & ~MASK; \
ARGBMULT_SIMD(src_argb, dst_argb, n); \
ARGBMULT_C(src_argb + n * 4, \
ARGBMULT_SIMD(src_argb0, src_argb1, dst_argb, n); \
ARGBMULT_C(src_argb0 + n * 4, \
src_argb1 + n * 4, \
dst_argb + n * 4, \
width & MASK); \
}
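To make the remainder handling concrete, this is how the updated macro would expand for an instantiation with a MASK of 3 (matching the 4-pixel SSE2 kernel). The instantiation line itself is clipped from this hunk, so the expansion below is an illustration, not diff content.
// Hand expansion of MultiplyRow_ANY(ARGBMultiplyRow_Any_SSE2,
//                                   ARGBMultiplyRow_SSE2,
//                                   ARGBMultiplyRow_C, 3):
void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                              uint8* dst_argb, int width) {
  int n = width & ~3;  // Largest multiple of 4 pixels.
  ARGBMultiplyRow_SSE2(src_argb0, src_argb1, dst_argb, n);
  // Finish the 0-3 leftover pixels with the C row; 4 bytes per ARGB pixel.
  ARGBMultiplyRow_C(src_argb0 + n * 4,
                    src_argb1 + n * 4,
                    dst_argb + n * 4,
                    width & 3);
}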
......
......@@ -704,21 +704,23 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
#define REPEAT8(v) (v) | ((v) << 8)
#define SHADE(f, v) v * f >> 16
void ARGBMultiplyRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
for (int i = 0; i < width; ++i) {
const uint32 b = REPEAT8(src_argb[0]);
const uint32 g = REPEAT8(src_argb[1]);
const uint32 r = REPEAT8(src_argb[2]);
const uint32 a = REPEAT8(src_argb[3]);
const uint32 b_scale = dst_argb[0];
const uint32 g_scale = dst_argb[1];
const uint32 r_scale = dst_argb[2];
const uint32 a_scale = dst_argb[3];
const uint32 b = REPEAT8(src_argb0[0]);
const uint32 g = REPEAT8(src_argb0[1]);
const uint32 r = REPEAT8(src_argb0[2]);
const uint32 a = REPEAT8(src_argb0[3]);
const uint32 b_scale = src_argb1[0];
const uint32 g_scale = src_argb1[1];
const uint32 r_scale = src_argb1[2];
const uint32 a_scale = src_argb1[3];
dst_argb[0] = SHADE(b, b_scale);
dst_argb[1] = SHADE(g, g_scale);
dst_argb[2] = SHADE(r, r_scale);
dst_argb[3] = SHADE(a, a_scale);
src_argb += 4;
src_argb0 += 4;
src_argb1 += 4;
dst_argb += 4;
}
}
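REPEAT8(v) widens an 8-bit channel to v | (v << 8), which equals v * 257, and SHADE keeps the high 16 bits of the product with the other image's channel, so each output is approximately src0 * src1 / 255 and can land one low because 257/65536 slightly undershoots 1/255. A standalone one-channel walk-through, with values chosen for illustration rather than taken from the commit:
// One-channel walk-through of the REPEAT8/SHADE fixed-point multiply.
#include <stdio.h>
int main() {
  unsigned v = 200;                   // channel from src_argb0
  unsigned scale = 51;                // channel from src_argb1 (51/255 == 0.2)
  unsigned wide = v | (v << 8);       // REPEAT8: 200 * 257 = 51400
  unsigned out = wide * scale >> 16;  // SHADE: 2621400 >> 16 = 39
  // Exact 200 * 51 / 255 is 40; the fixed-point form rounds down, so it
  // can come out one low.
  printf("multiply(200, 51) = %u\n", out);  // prints 39
  return 0;
}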
......
......@@ -3963,10 +3963,12 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Aligned to 16 bytes.
void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
"pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n"
"sub %0,%2 \n"
// 4 pixel loop.
".p2align 4 \n"
......@@ -3982,13 +3984,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%0,%1,1) \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0,(%0,%2,1) \n"
"lea 0x10(%0),%0 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
: "memory", "cc"
#if defined(__SSE2__)
......
......@@ -4280,18 +4280,22 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
push esi
mov eax, [esp + 4 + 4] // src_argb0
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
sub esi, eax
sub edx, eax
align 16
convertloop:
movdqa xmm0, [eax] // read 4 pixels
movdqa xmm2, [eax + edx] // read 4 dest pixels
movdqa xmm0, [eax] // read 4 pixels from src_argb0
movdqa xmm2, [eax + esi] // read 4 pixels from src_argb1
movdqa xmm1, xmm0
movdqa xmm3, xmm2
punpcklbw xmm0, xmm0 // first 2
......@@ -4306,6 +4310,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
lea eax, [eax + 16]
jg convertloop
pop esi
ret
}
}
......
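Both the GCC inline-asm and Visual C++ versions implement the same kernel: punpcklbw/punpckhbw of src_argb0 with itself duplicates each byte into a 16-bit lane (the REPEAT8 trick), the zero constant in xmm5 suggests src_argb1 is zero-extended, and pmulhuw keeps the high 16 bits of the product. Below is a hedged intrinsics rendering of that kernel, assuming 16-byte aligned pointers and a width that is a multiple of 4; it is not code from the commit.
// Illustrative SSE2 intrinsics sketch matching the C row semantics.
#include <emmintrin.h>  // SSE2
#include <stdint.h>
void ARGBMultiplyRow_SSE2_Sketch(const uint8_t* src_argb0,
                                 const uint8_t* src_argb1,
                                 uint8_t* dst_argb, int width) {
  const __m128i zero = _mm_setzero_si128();
  for (int i = 0; i < width; i += 4) {  // 4 ARGB pixels == 16 bytes
    __m128i s0 = _mm_load_si128((const __m128i*)(src_argb0 + i * 4));
    __m128i s1 = _mm_load_si128((const __m128i*)(src_argb1 + i * 4));
    // Each src_argb0 byte v becomes the 16-bit lane v * 257.
    __m128i s0_lo = _mm_unpacklo_epi8(s0, s0);
    __m128i s0_hi = _mm_unpackhi_epi8(s0, s0);
    // Zero-extend src_argb1 to 16 bits per channel.
    __m128i s1_lo = _mm_unpacklo_epi8(s1, zero);
    __m128i s1_hi = _mm_unpackhi_epi8(s1, zero);
    // High 16 bits of the product: (v0 * 257 * v1) >> 16, as in the C row.
    __m128i lo = _mm_mulhi_epu16(s0_lo, s1_lo);
    __m128i hi = _mm_mulhi_epu16(s0_hi, s1_hi);
    _mm_store_si128((__m128i*)(dst_argb + i * 4),
                    _mm_packus_epi16(lo, hi));
  }
}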
......@@ -913,17 +913,21 @@ static int TestMultiply(int width, int height, int benchmark_iterations,
src_argb_a[i + off] = (random() & 0xff);
src_argb_b[i + off] = (random() & 0xff);
}
memcpy(dst_argb_c, src_argb_b + off, kStride * height);
memcpy(dst_argb_opt, src_argb_b + off, kStride * height);
memset(dst_argb_c, 0, kStride * height);
memset(dst_argb_opt, 0, kStride * height);
MaskCpuFlags(0);
ARGBMultiply(src_argb_a + off, kStride,
src_argb_b + off, kStride,
dst_argb_c, kStride,
width, invert * height);
MaskCpuFlags(-1);
for (int i = 0; i < benchmark_iterations; ++i) {
ARGBMultiply(src_argb_a + off, kStride,
src_argb_b + off, kStride,
dst_argb_opt, kStride,
width, invert * height);
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
int abs_diff =
......@@ -933,12 +937,6 @@ static int TestMultiply(int width, int height, int benchmark_iterations,
max_diff = abs_diff;
}
}
// Benchmark.
for (int i = 0; i < benchmark_iterations - 1; ++i) {
ARGBMultiply(src_argb_a + off, kStride,
dst_argb_opt, kStride,
width, invert * height);
}
free_aligned_buffer_64(src_argb_a)
free_aligned_buffer_64(src_argb_b)
free_aligned_buffer_64(dst_argb_c)
......
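The reworked harness clears both destinations, runs the C reference once with SIMD masked off, then runs the optimized path for benchmark_iterations before diffing the two outputs. A hedged sketch of how a gtest case might consume the return value follows; the fixture name, member names, the trailing invert/off parameters, and the tolerance are inferred, not shown in this diff.
// Hypothetical gtest wrapper around TestMultiply; assumes the usual libyuv
// unit-test fixture and members rather than anything visible in this hunk.
TEST_F(libyuvTest, ARGBMultiply_Any) {
  int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_,
                              benchmark_iterations_,
                              +1 /* invert */, 0 /* off */);
  // Tolerance of 1 allows for kernels that round differently; the bound
  // actually used by the suite is not visible here.
  EXPECT_LE(max_diff, 1);
}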