Commit 3c7bb050 authored by fbarchard@google.com's avatar fbarchard@google.com

Unattenuate AVX2

BUG=190
TEST=planar_test
Review URL: https://webrtc-codereview.appspot.com/1112004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@577 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent d5ee3dc9
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 576 Version: 577
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -140,6 +140,7 @@ extern "C" { ...@@ -140,6 +140,7 @@ extern "C" {
// Effects // Effects
#define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBATTENUATEROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
#endif #endif
#endif #endif
...@@ -1324,6 +1325,7 @@ void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, ...@@ -1324,6 +1325,7 @@ void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
extern uint32 fixed_invtbl8[256]; extern uint32 fixed_invtbl8[256];
void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
......
...@@ -145,11 +145,9 @@ LIBYUV_API ...@@ -145,11 +145,9 @@ LIBYUV_API
uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b, const uint8* src_b, int stride_b,
int width, int height) { int width, int height) {
if (stride_a == width && stride_b == width) { if (stride_a == width && stride_b == width) {
return ComputeSumSquareError(src_a, src_b, width * height); return ComputeSumSquareError(src_a, src_b, width * height);
} }
uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
SumSquareError_C; SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON) #if defined(HAS_SUMSQUAREERROR_NEON)
......
...@@ -1085,6 +1085,14 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, ...@@ -1085,6 +1085,14 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
} }
#endif #endif
#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
  // AVX2 row handler processes 8 pixels per iteration, so only select it
  // when width is a multiple of 8.  (Removed the dead `bool clear` locals:
  // the inner declaration shadowed the outer one and neither was ever read.)
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
    ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
  }
#endif
  // TODO(fbarchard): Neon version.
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
ARGBUnattenuateRow(src_argb, dst_argb, width); ARGBUnattenuateRow(src_argb, dst_argb, width);
......
...@@ -1528,7 +1528,7 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -1528,7 +1528,7 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
// 8.16 fixed point inverse table // 8.16 fixed point inverse table
#define T(a) 0x10000 / a #define T(a) 0x10000 / a
uint32 fixed_invtbl8[256] = { uint32 fixed_invtbl8[256] = {
0x0100, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), 0xffff, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
......
...@@ -4462,6 +4462,53 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, ...@@ -4462,6 +4462,53 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
} }
#endif // HAS_ARGBUNATTENUATEROW_SSE2 #endif // HAS_ARGBUNATTENUATEROW_SSE2
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Replicates each pixel's 16-bit inverse-alpha word across its B, G and R
// word lanes; 128 (high bit set) makes vpshufb write zero into the alpha
// word lane.
static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
  0u, 1u, 0u, 1u, 0u, 1u, 128u, 128u,
  8u, 9u, 8u, 9u, 8u, 9u, 128u, 128u,
  0u, 1u, 0u, 1u, 0u, 1u, 128u, 128u,
  8u, 9u, 8u, 9u, 8u, 9u, 128u, 128u,
};
// Un-premultiplies 8 ARGB pixels per loop iteration using the 8.16
// fixed-point reciprocal table fixed_invtbl8, fetched with vpgatherdd.
// The original alpha byte is preserved in the output (vpand/vpor below).
// Caller must pass width as a multiple of 8 — there is no remainder loop.
// Unaligned src/dst are fine (vmovdqu loads/stores).
// NOTE(review): no vzeroupper before ret — may incur AVX->SSE transition
// penalties in SSE-using callers; confirm against the project's policy.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb0
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax         // edx = dst - src; one incremented pointer serves both
    vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2
    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
    vpslld     ymm5, ymm5, 24
    align      16
 convertloop:
    vmovdqu    ymm6, [eax]      // read 8 pixels.
    // vpgatherdd consumes (zeroes) its mask register, so ymm7 must be
    // regenerated to all-ones on every iteration.
    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xffffffff for gather.
    vpsrld     ymm2, ymm6, 24   // alpha in low 8 bits.
    vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm7  // ymm7 cleared.
    vpunpcklwd ymm2, ymm3, ymm7 // low 4 inverted alphas. mutated. (ymm7 is zero here)
    vpunpckhwd ymm3, ymm3, ymm7 // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm4 // replicate low 4 alphas
    vpshufb    ymm3, ymm3, ymm4 // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2 // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3 // rgb * ia
    vpand      ymm6, ymm6, ymm5 // isolate alpha
    vpackuswb  ymm0, ymm0, ymm1 // unmutated.
    vpor       ymm0, ymm0, ymm6 // copy original alpha
    sub        ecx, 8
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBUNATTENUATEROW_AVX2
#ifdef HAS_ARGBGRAYROW_SSSE3 #ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R // Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
static const vec8 kARGBToGray = { static const vec8 kARGBToGray = {
......
...@@ -42,7 +42,7 @@ cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix ...@@ -42,7 +42,7 @@ cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%endif %endif
packuswb m0, m0, m1 packuswb m0, m0, m1
%if cpuflag(AVX2) %if cpuflag(AVX2)
vpermq m0, m0, 0xd8 vpermq m0, m0, 0xd8
%endif %endif
sub pixd, mmsize sub pixd, mmsize
mov%2 [dst_yq], m0 mov%2 [dst_yq], m0
...@@ -86,8 +86,8 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix ...@@ -86,8 +86,8 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
packuswb m0, m0, m1 packuswb m0, m0, m1
packuswb m2, m2, m3 packuswb m2, m2, m3
%if cpuflag(AVX2) %if cpuflag(AVX2)
vpermq m0, m0, 0xd8 vpermq m0, m0, 0xd8
vpermq m2, m2, 0xd8 vpermq m2, m2, 0xd8
%endif %endif
mov%1 [dst_uq], m0 mov%1 [dst_uq], m0
mov%1 [dst_uq + dst_vq], m2 mov%1 [dst_uq + dst_vq], m2
......
...@@ -42,13 +42,7 @@ void SetUseReferenceImpl(bool use) { ...@@ -42,13 +42,7 @@ void SetUseReferenceImpl(bool use) {
} }
// ScaleRowDown2Int also used by planar functions // ScaleRowDown2Int also used by planar functions
// NEON downscalers with interpolation.
/**
* NEON downscalers with interpolation.
*
* Provided by Fritz Koenig
*
*/
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEROWDOWN2_NEON #define HAS_SCALEROWDOWN2_NEON
...@@ -98,13 +92,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -98,13 +92,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride, const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction); int dst_width, int source_y_fraction);
/** // SSE2 downscalers with interpolation.
* SSE2 downscalers with interpolation.
*
* Provided by Frank Barchard (fbarchard@google.com)
*
*/
// Constants for SSSE3 code // Constants for SSSE3 code
#elif !defined(YUV_DISABLE_ASM) && \ #elif !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
...@@ -2630,13 +2618,10 @@ void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -2630,13 +2618,10 @@ void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
} }
} }
/** // Scale plane, 1/2
* Scale plane, 1/2 // This is an optimized version for scaling down a plane to 1/2 of
* // its original size.
* This is an optimized version for scaling down a plane to 1/2 of
* its original size.
*
*/
static void ScalePlaneDown2(int /* src_width */, int /* src_height */, static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
...@@ -2676,12 +2661,10 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */, ...@@ -2676,12 +2661,10 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
} }
} }
/** // Scale plane, 1/4
* Scale plane, 1/4 // This is an optimized version for scaling down a plane to 1/4 of
* // its original size.
* This is an optimized version for scaling down a plane to 1/4 of
* its original size.
*/
static void ScalePlaneDown4(int /* src_width */, int /* src_height */, static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
...@@ -2717,13 +2700,10 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */, ...@@ -2717,13 +2700,10 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
} }
} }
/** // Scale plane, 1/8
* Scale plane, 1/8 // This is an optimized version for scaling down a plane to 1/8
* // of its original size.
* This is an optimized version for scaling down a plane to 1/8
* of its original size.
*
*/
static void ScalePlaneDown8(int /* src_width */, int /* src_height */, static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
...@@ -2748,12 +2728,8 @@ static void ScalePlaneDown8(int /* src_width */, int /* src_height */, ...@@ -2748,12 +2728,8 @@ static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
} }
} }
/** // Scale plane down, 3/4
* Scale plane down, 3/4
*
* Provided by Frank Barchard (fbarchard@google.com)
*
*/
static void ScalePlaneDown34(int /* src_width */, int /* src_height */, static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
...@@ -2839,23 +2815,22 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, ...@@ -2839,23 +2815,22 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
} }
} }
/**
* Scale plane, 3/8 // Scale plane, 3/8
* // This is an optimized version for scaling down a plane to 3/8
* This is an optimized version for scaling down a plane to 3/8 // of its original size.
* of its original size. //
* // Uses box filter arranges like this
* Uses box filter arranges like this // aaabbbcc -> abc
* aaabbbcc -> abc // aaabbbcc def
* aaabbbcc def // aaabbbcc ghi
* aaabbbcc ghi // dddeeeff
* dddeeeff // dddeeeff
* dddeeeff // dddeeeff
* dddeeeff // ggghhhii
* ggghhhii // ggghhhii
* ggghhhii // Boxes are 3x3, 2x3, 3x2 and 2x2
* Boxes are 3x3, 2x3, 3x2 and 2x2
*/
static void ScalePlaneDown38(int /* src_width */, int /* src_height */, static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
...@@ -2991,15 +2966,14 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, ...@@ -2991,15 +2966,14 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
} }
} }
/** // Scale plane down to any dimensions, with interpolation.
* Scale plane down to any dimensions, with interpolation. // (boxfilter).
* (boxfilter). //
* // Same method as SimpleScale, which is fixed point, outputting
* Same method as SimpleScale, which is fixed point, outputting // one pixel of destination using fixed point (16.16) to step
* one pixel of destination using fixed point (16.16) to step // through source, sampling a box of pixel with simple
* through source, sampling a box of pixel with simple // averaging.
* averaging.
*/
static void ScalePlaneBox(int src_width, int src_height, static void ScalePlaneBox(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
...@@ -3008,8 +2982,6 @@ static void ScalePlaneBox(int src_width, int src_height, ...@@ -3008,8 +2982,6 @@ static void ScalePlaneBox(int src_width, int src_height,
assert(dst_height > 0); assert(dst_height > 0);
int dx = (src_width << 16) / dst_width; int dx = (src_width << 16) / dst_width;
int dy = (src_height << 16) / dst_height; int dy = (src_height << 16) / dst_height;
// int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
// int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
int x = 0; int x = 0;
int y = 0; int y = 0;
int maxy = (src_height << 16); int maxy = (src_height << 16);
...@@ -3063,9 +3035,8 @@ static void ScalePlaneBox(int src_width, int src_height, ...@@ -3063,9 +3035,8 @@ static void ScalePlaneBox(int src_width, int src_height,
} }
} }
/** // Scale plane to/from any dimensions, with interpolation.
* Scale plane to/from any dimensions, with interpolation.
*/
static void ScalePlaneBilinearSimple(int src_width, int src_height, static void ScalePlaneBilinearSimple(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
...@@ -3104,10 +3075,9 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height, ...@@ -3104,10 +3075,9 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height,
} }
} }
/**
* Scale plane to/from any dimensions, with bilinear // Scale plane to/from any dimensions, with bilinear interpolation.
* interpolation.
*/
void ScalePlaneBilinear(int src_width, int src_height, void ScalePlaneBilinear(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
...@@ -3170,12 +3140,11 @@ void ScalePlaneBilinear(int src_width, int src_height, ...@@ -3170,12 +3140,11 @@ void ScalePlaneBilinear(int src_width, int src_height,
} }
} }
/** // Scale plane to/from any dimensions, without interpolation.
* Scale plane to/from any dimensions, without interpolation. // Fixed point math is used for performance: The upper 16 bits
* Fixed point math is used for performance: The upper 16 bits // of x and dx is the integer part of the source position and
* of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part.
* the lower 16 bits are the fixed decimal part.
*/
static void ScalePlaneSimple(int src_width, int src_height, static void ScalePlaneSimple(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
...@@ -3197,9 +3166,8 @@ static void ScalePlaneSimple(int src_width, int src_height, ...@@ -3197,9 +3166,8 @@ static void ScalePlaneSimple(int src_width, int src_height,
} }
} }
/** // Scale plane to/from any dimensions.
* Scale plane to/from any dimensions.
*/
static void ScalePlaneAnySize(int src_width, int src_height, static void ScalePlaneAnySize(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
...@@ -3215,14 +3183,12 @@ static void ScalePlaneAnySize(int src_width, int src_height, ...@@ -3215,14 +3183,12 @@ static void ScalePlaneAnySize(int src_width, int src_height,
} }
} }
/** // Scale plane down, any size
* Scale plane down, any size //
* // This is an optimized version for scaling down a plane to any size.
* This is an optimized version for scaling down a plane to any size. // The current implementation is ~10 times faster compared to the
* The current implementation is ~10 times faster compared to the // reference implementation for e.g. XGA->LowResPAL
* reference implementation for e.g. XGA->LowResPAL
*
*/
static void ScalePlaneDown(int src_width, int src_height, static void ScalePlaneDown(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
......
...@@ -44,10 +44,6 @@ void ScaleARGBFilterRows_NEON(uint8* dst_ptr, ...@@ -44,10 +44,6 @@ void ScaleARGBFilterRows_NEON(uint8* dst_ptr,
int dst_width, int source_y_fraction); int dst_width, int source_y_fraction);
#endif #endif
/**
* SSE2 downscalers with bilinear interpolation.
*/
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SCALEARGBROWDOWN2_SSE2 #define HAS_SCALEARGBROWDOWN2_SSE2
...@@ -880,13 +876,10 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb, ...@@ -880,13 +876,10 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb,
dst_argb[3] = dst_argb[-1]; dst_argb[3] = dst_argb[-1];
} }
/** // ScaleARGB ARGB, 1/2
* ScaleARGB ARGB, 1/2 // This is an optimized version for scaling down a ARGB to 1/2 of
* // its original size.
* This is an optimized version for scaling down a ARGB to 1/2 of
* its original size.
*
*/
static void ScaleARGBDown2(int /* src_width */, int /* src_height */, static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
...@@ -918,13 +911,10 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, ...@@ -918,13 +911,10 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
} }
} }
/** // ScaleARGB ARGB Even
* ScaleARGB ARGB Even // This is an optimized version for scaling down a ARGB to even
* // multiple of its original size.
* This is an optimized version for scaling down a ARGB to even
* multiple of its original size.
*
*/
static void ScaleARGBDownEven(int src_width, int src_height, static void ScaleARGBDownEven(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
...@@ -959,10 +949,9 @@ static void ScaleARGBDownEven(int src_width, int src_height, ...@@ -959,10 +949,9 @@ static void ScaleARGBDownEven(int src_width, int src_height,
dst_argb += dst_stride; dst_argb += dst_stride;
} }
} }
/**
* ScaleARGB ARGB to/from any dimensions, with bilinear // ScaleARGB ARGB to/from any dimensions, with bilinear
* interpolation. // interpolation.
*/
// Maximum width handled by 2 pass Bilinear. // Maximum width handled by 2 pass Bilinear.
static const int kMaxInputWidth = 2560; static const int kMaxInputWidth = 2560;
...@@ -1033,12 +1022,11 @@ static void ScaleARGBCols(uint8* dst_argb, const uint8* src_argb, ...@@ -1033,12 +1022,11 @@ static void ScaleARGBCols(uint8* dst_argb, const uint8* src_argb,
} }
} }
/**
* ScaleARGB ARGB to/from any dimensions, without interpolation. // ScaleARGB ARGB to/from any dimensions, without interpolation.
* Fixed point math is used for performance: The upper 16 bits // Fixed point math is used for performance: The upper 16 bits
* of x and dx is the integer part of the source position and // of x and dx is the integer part of the source position and
* the lower 16 bits are the fixed decimal part. // the lower 16 bits are the fixed decimal part.
*/
static void ScaleARGBSimple(int src_width, int src_height, static void ScaleARGBSimple(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
...@@ -1056,9 +1044,8 @@ static void ScaleARGBSimple(int src_width, int src_height, ...@@ -1056,9 +1044,8 @@ static void ScaleARGBSimple(int src_width, int src_height,
} }
} }
/** // ScaleARGB ARGB to/from any dimensions.
* ScaleARGB ARGB to/from any dimensions.
*/
static void ScaleARGBAnySize(int src_width, int src_height, static void ScaleARGBAnySize(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
......
...@@ -167,6 +167,72 @@ TEST_F(libyuvTest, ARGBAttenuate_Opt) { ...@@ -167,6 +167,72 @@ TEST_F(libyuvTest, ARGBAttenuate_Opt) {
EXPECT_LE(max_diff, 2); EXPECT_LE(max_diff, 2);
} }
// Compares the portable C implementation of ARGBUnattenuate against the
// CPU-optimized path on randomized premultiplied input.
// Returns the maximum absolute per-byte difference between the two outputs.
// invert: +1 for normal orientation, -1 passes a negative height to
//         exercise the vertical-flip path.
// off: byte offset applied to the source pointer to exercise unaligned reads.
static int TestUnattenuateI(int width, int height, int benchmark_iterations,
                            int invert, int off) {
  const int kBpp = 4;  // ARGB is 4 bytes per pixel.
  const int kStride = (width * kBpp + 15) & ~15;  // round row up to 16 bytes.
  align_buffer_64(src_argb, kStride * height + off);
  align_buffer_64(dst_argb_c, kStride * height);
  align_buffer_64(dst_argb_opt, kStride * height);
  // NOTE(review): seeding with time(NULL) makes any failure hard to
  // reproduce; a fixed seed would be preferable for a regression test.
  srandom(time(NULL));
  for (int i = 0; i < kStride * height; ++i) {
    src_argb[i + off] = (random() & 0xff);
  }
  // Attenuate in place first so the source is valid premultiplied ARGB
  // before it is unattenuated.
  ARGBAttenuate(src_argb + off, kStride,
                src_argb + off, kStride,
                width, height);
  memset(dst_argb_c, 0, kStride * height);
  memset(dst_argb_opt, 0, kStride * height);
  // Disable all CPU feature flags to force the reference C implementation.
  MaskCpuFlags(0);
  ARGBUnattenuate(src_argb + off, kStride,
                  dst_argb_c, kStride,
                  width, invert * height);
  // Re-enable CPU feature detection; repeated iterations double as a
  // benchmark of the optimized path.
  MaskCpuFlags(-1);
  for (int i = 0; i < benchmark_iterations; ++i) {
    ARGBUnattenuate(src_argb + off, kStride,
                    dst_argb_opt, kStride,
                    width, invert * height);
  }
  // Find the worst per-byte divergence between C and optimized outputs.
  int max_diff = 0;
  for (int i = 0; i < kStride * height; ++i) {
    int abs_diff =
        abs(static_cast<int>(dst_argb_c[i]) -
            static_cast<int>(dst_argb_opt[i]));
    if (abs_diff > max_diff) {
      max_diff = abs_diff;
    }
  }
  free_aligned_buffer_64(src_argb)
  free_aligned_buffer_64(dst_argb_c)
  free_aligned_buffer_64(dst_argb_opt)
  return max_diff;
}
// Odd width (benchmark_width_ - 1) is not a multiple of 8, so the SIMD
// fast path is bypassed and the any-width fallback is exercised.
TEST_F(libyuvTest, ARGBUnattenuate_Any) {
  int max_diff = TestUnattenuateI(benchmark_width_ - 1, benchmark_height_,
                                  benchmark_iterations_, +1, 0);
  EXPECT_LE(max_diff, 2);  // allow small rounding differences vs C path.
}
// off=1 misaligns the source buffer by one byte to exercise unaligned loads.
TEST_F(libyuvTest, ARGBUnattenuate_Unaligned) {
  int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
                                  benchmark_iterations_, +1, 1);
  EXPECT_LE(max_diff, 2);  // allow small rounding differences vs C path.
}
// Negative height exercises the vertical-flip (inverted) code path.
TEST_F(libyuvTest, ARGBUnattenuate_Invert) {
  int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
                                  benchmark_iterations_, -1, 0);
  EXPECT_LE(max_diff, 2);  // allow small rounding differences vs C path.
}
// Default dimensions/alignment: the fully optimized SIMD path.
TEST_F(libyuvTest, ARGBUnattenuate_Opt) {
  int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
                                  benchmark_iterations_, +1, 0);
  EXPECT_LE(max_diff, 2);  // allow small rounding differences vs C path.
}
TEST_F(libyuvTest, TestARGBComputeCumulativeSum) { TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
SIMD_ALIGNED(uint8 orig_pixels[16][16][4]); SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
SIMD_ALIGNED(int32 added_pixels[16][16][4]); SIMD_ALIGNED(int32 added_pixels[16][16][4]);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment