Commit 810cd910 authored by fbarchard@google.com's avatar fbarchard@google.com

ARGBUnattenuateRow_SSE2: use a reciprocal table and pmulhuw

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/497001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@244 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f2c86d01
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "libyuv/planar_functions.h" #include "libyuv/planar_functions.h"
#include <stdio.h> // printf()
#include <string.h> // for memset() #include <string.h> // for memset()
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
...@@ -909,80 +910,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, ...@@ -909,80 +910,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
// Divide source RGB by alpha and store to destination.
// b = (b * 255 + (a / 2)) / a;
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values. ie 125
// 8.16 fixed point inverse table
// T(a) = 2^24 / a.  Consumers compute (x * T(a) + 0x8000) >> 16, which is
// approximately x * 256 / a rather than x * 255 / a -- hence the off-by-1
// note above; results are clamped to 255 by the caller.
// Entry 0 is 0 because T(0) would divide by zero; the a == 0 case must be
// special-cased by the caller (see ARGBUnattenuateRow_C).
#define T(a) 0x1000000 / a
static uint32 fixed_invtbl[256] = {
0, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), T(0xff) };
#undef T
static void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb,
int width) {
for (int i = 0; i < width; ++i) {
uint32 b = src_argb[0];
uint32 g = src_argb[1];
uint32 r = src_argb[2];
const uint32 a = src_argb[3];
if (a) {
const uint32 ia = fixed_invtbl[a]; // 8.16 fixed point
b = (b * ia + 0x8000) >> 16;
g = (g * ia + 0x8000) >> 16;
r = (r * ia + 0x8000) >> 16;
// Clamping should not be necessary but is free in assembly.
if (b > 255) {
b = 255;
}
if (g > 255) {
g = 255;
}
if (r > 255) {
r = 255;
}
}
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = a;
src_argb += 4;
dst_argb += 4;
}
}
// Convert unattentuated ARGB values to preattenuated ARGB. // Convert unattentuated ARGB values to preattenuated ARGB.
int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
...@@ -1010,7 +937,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, ...@@ -1010,7 +937,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -67,11 +67,8 @@ extern "C" { ...@@ -67,11 +67,8 @@ extern "C" {
#define HAS_ARGBBLENDROW_SSE2 #define HAS_ARGBBLENDROW_SSE2
#define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBATTENUATE_SSE2 #define HAS_ARGBATTENUATE_SSE2
#endif
// The following are available on Windows 32 bit
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBATTENUATE_SSSE3 #define HAS_ARGBATTENUATE_SSSE3
#define HAS_ARGBUNATTENUATE_SSE2
#endif #endif
// The following are available on Neon platforms // The following are available on Neon platforms
...@@ -312,11 +309,11 @@ void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); ...@@ -312,11 +309,11 @@ void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, void ARGBToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void I420ToARGBRow_Any_NEON(const uint8* y_buf, void I420ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
...@@ -370,6 +367,9 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); ...@@ -370,6 +367,9 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -700,6 +700,79 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -700,6 +700,79 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
} }
} }
// Divide source RGB by alpha and store to destination.
// b = (b * 255 + (a / 2)) / a;
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values. ie 125
// 8.16 fixed point inverse table
// T(a) = 2^24 / a.  (x * T(a) + 0x8000) >> 16 approximates x * 256 / a,
// slightly overshooting x * 255 / a; ARGBUnattenuateRow_C clamps to 255.
// Entry 0 is 0 (T(0) would divide by zero); a == 0 is special-cased by
// the caller.
#define T(a) 0x1000000 / a
static uint32 fixed_invtbl[256] = {
0, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), T(0xff) };
#undef T
void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
for (int i = 0; i < width; ++i) {
uint32 b = src_argb[0];
uint32 g = src_argb[1];
uint32 r = src_argb[2];
const uint32 a = src_argb[3];
if (a) {
const uint32 ia = fixed_invtbl[a]; // 8.16 fixed point
b = (b * ia + 0x8000) >> 16;
g = (g * ia + 0x8000) >> 16;
r = (r * ia + 0x8000) >> 16;
// Clamping should not be necessary but is free in assembly.
if (b > 255) {
b = 255;
}
if (g > 255) {
g = 255;
}
if (r > 255) {
r = 255;
}
}
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = a;
src_argb += 4;
dst_argb += 4;
}
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -1730,6 +1730,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { ...@@ -1730,6 +1730,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
asm volatile ( asm volatile (
"sub %0,%1 \n" "sub %0,%1 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n" "movdqa 0x10(%0),%%xmm1 \n"
...@@ -2192,9 +2193,9 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2192,9 +2193,9 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"movdqu 0x10(%0),%%xmm3 \n" "movdqu 0x10(%0),%%xmm3 \n"
"lea 0x20(%0),%0 \n" "lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm2 \n" "psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n" "paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n" "paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n" "sub $0x4,%3 \n"
"movdqa %%xmm0,(%2) \n" "movdqa %%xmm0,(%2) \n"
"jle 9f \n" "jle 9f \n"
...@@ -2242,6 +2243,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2242,6 +2243,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"pslld $0x18,%%xmm4 \n" "pslld $0x18,%%xmm4 \n"
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x8,%%xmm5 \n" "psrld $0x8,%%xmm5 \n"
// 4 pixel loop // 4 pixel loop
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
...@@ -2254,13 +2256,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2254,13 +2256,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"pshufhw $0xff,%%xmm1,%%xmm2 \n" "pshufhw $0xff,%%xmm1,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n" "pmulhuw %%xmm2,%%xmm1 \n"
"movdqa (%0),%%xmm3 \n" "movdqa (%0),%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm0 \n"
"pand %%xmm4,%%xmm3 \n" "pand %%xmm4,%%xmm2 \n"
"psrlw $0x8,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm0 \n"
"por %%xmm3,%%xmm0 \n" "por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n" "sub $0x4,%2 \n"
"movdqa %%xmm0,(%0,%1,1) \n" "movdqa %%xmm0,(%0,%1,1) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
...@@ -2277,6 +2279,156 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2277,6 +2279,156 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
} }
#endif // HAS_ARGBATTENUATE_SSE2 #endif // HAS_ARGBATTENUATE_SSE2
#ifdef HAS_ARGBATTENUATE_SSSE3
// Shuffle table duplicating alpha
// Broadcasts the alpha byte of pixels 0 and 1 (source offsets 3 and 7)
// across the six color bytes of each pixel; 128 (bit 7 set) zeroes the
// destination byte, so the alpha word positions become zero.
CONST uvec8 kShuffleAlpha0 = {
3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
// Same for pixels 2 and 3 (alpha bytes at offsets 11 and 15).
CONST uvec8 kShuffleAlpha1 = {
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Attenuate 4 pixels at a time.
// aligned to 16 bytes
// Premultiplies each color channel by its alpha; the original alpha byte is
// kept (masked with 0xff000000 and OR'ed back).  src/dst must be 16-byte
// aligned (movdqa); width is presumably a positive multiple of 4 -- no tail
// handling here, TODO confirm with callers.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"sub %0,%1 \n"
// xmm3 = 0xff000000 mask used to preserve original alpha bytes.
"pcmpeqb %%xmm3,%%xmm3 \n"
"pslld $0x18,%%xmm3 \n"
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
// 4 pixel loop
"1: \n"
// First 2 pixels: shuffle alphas into the color word lanes, widen the
// pixel bytes (punpcklbw with itself), then take the high product.
"movdqa (%0),%%xmm0 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqa (%0),%%xmm1 \n"
"punpcklbw %%xmm1,%%xmm1 \n"
"pmulhuw %%xmm1,%%xmm0 \n"
// Next 2 pixels, same pattern via the high-half shuffle table.
"movdqa (%0),%%xmm1 \n"
"pshufb %%xmm5,%%xmm1 \n"
"movdqa (%0),%%xmm2 \n"
"punpckhbw %%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
// Reload the source to recover the untouched alpha bytes.
"movdqa (%0),%%xmm2 \n"
"pand %%xmm3,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%0,%1,1) \n"
"lea 0x10(%0),%0 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kShuffleAlpha0), // %3
"m"(kShuffleAlpha1) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
#endif  // HAS_ARGBATTENUATE_SSSE3
#ifdef HAS_ARGBUNATTENUATE_SSE2
// Divide source RGB by alpha and store to destination.
// b = (b * 255 + (a / 2)) / a;
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values. ie 125
// 8.16 fixed point inverse table
// NOTE(review): unlike the C version's 2^24/a table, entries here are
// 0x10000 / a so they fit the 16-bit pmulhuw multiplier; the "8.16" label
// above appears inherited from that version.  Entry 0 is 0x100 so a fully
// transparent pixel's color bytes pass through roughly unchanged, and
// entry 1 is saturated to 0xffff because 0x10000 does not fit in 16 bits.
#define T(a) 0x10000 / a
CONST uint32 fixed_invtbl8[256] = {
0x100, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x100 };
#undef T
// Unattenuate 4 pixels at a time.
// aligned to 16 bytes
// Divides each color channel by its alpha using fixed_invtbl8 reciprocals:
// color bytes are widened with punpck*bw and multiplied by the per-pixel
// reciprocal via pmulhuw.  Original alpha bytes are preserved (masked and
// OR'ed back).  src/dst must be 16-byte aligned; width presumably a
// positive multiple of 4 -- TODO confirm with callers.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
uintptr_t alpha = 0;
asm volatile (
"sub %0,%1 \n"
// xmm4 = 0xff000000 mask used to re-insert original alpha bytes.
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
// 4 pixel loop
"1: \n"
// Pixels 0 and 1: fetch each alpha byte, look up its reciprocal, and
// broadcast it over the b/g/r words (pshuflw 0xc0 -> [w0,w0,w0,w3]).
"movdqa (%0),%%xmm0 \n"
"movzb 0x3(%0),%3 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movd 0x0(%4,%3,4),%%xmm2 \n"
"movzb 0x7(%0),%3 \n"
"movd 0x0(%4,%3,4),%%xmm3 \n"
"pshuflw $0xc0,%%xmm2,%%xmm2 \n"
"pshuflw $0xc0,%%xmm3,%%xmm3 \n"
"movlhps %%xmm3,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
// Pixels 2 and 3: same pattern using the high half of the register.
"movdqa (%0),%%xmm1 \n"
"movzb 0xb(%0),%3 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"movd 0x0(%4,%3,4),%%xmm2 \n"
"movzb 0xf(%0),%3 \n"
"movd 0x0(%4,%3,4),%%xmm3 \n"
"pshuflw $0xc0,%%xmm2,%%xmm2 \n"
"pshuflw $0xc0,%%xmm3,%%xmm3 \n"
"movlhps %%xmm3,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
// Reload the source to recover untouched alpha bytes, then merge.
"movdqa (%0),%%xmm2 \n"
"pand %%xmm4,%%xmm2 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%0,%1,1) \n"
"lea 0x10(%0),%0 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width), // %2
"+r"(alpha) // %3
: "r"(fixed_invtbl8) // %4
: "memory", "cc"
// NOTE(review): xmm5 appears unused in this routine but is declared
// clobbered -- harmless, though it could be dropped.
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
#endif  // HAS_ARGBUNATTENUATE_SSE2
#endif // defined(__x86_64__) || defined(__i386__) #endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -2319,13 +2319,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2319,13 +2319,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
pshufhw xmm2, xmm1,0FFh // 8 alpha words pshufhw xmm2, xmm1,0FFh // 8 alpha words
pshuflw xmm2, xmm2,0FFh pshuflw xmm2, xmm2,0FFh
pmulhuw xmm1, xmm2 // rgb * a pmulhuw xmm1, xmm2 // rgb * a
movdqa xmm3, [eax] // alphas movdqa xmm2, [eax] // alphas
psrlw xmm0, 8 psrlw xmm0, 8
pand xmm3, xmm4 pand xmm2, xmm4
psrlw xmm1, 8 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
pand xmm0, xmm5 // keep original alphas pand xmm0, xmm5 // keep original alphas
por xmm0, xmm3 por xmm0, xmm2
sub ecx, 4 sub ecx, 4
movdqa [eax + edx], xmm0 movdqa [eax + edx], xmm0
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -2347,7 +2347,6 @@ static const uvec8 kShuffleAlpha1 = { ...@@ -2347,7 +2347,6 @@ static const uvec8 kShuffleAlpha1 = {
}; };
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] // src_argb0 mov eax, [esp + 4] // src_argb0
mov edx, [esp + 8] // dst_argb mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
...@@ -2360,7 +2359,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2360,7 +2359,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
align 16 align 16
convertloop: convertloop:
movdqa xmm0, [eax] // read 4 pixels movdqa xmm0, [eax] // read 4 pixels
pshufb xmm0, xmm4 // isolate first 2 alphas pshufb xmm0, xmm4 // isolate first 2 alphas
movdqa xmm1, [eax] // read 4 pixels movdqa xmm1, [eax] // read 4 pixels
punpcklbw xmm1, xmm1 // first 2 pixel rgbs punpcklbw xmm1, xmm1 // first 2 pixel rgbs
pmulhuw xmm0, xmm1 // rgb * a pmulhuw xmm0, xmm1 // rgb * a
...@@ -2383,9 +2382,105 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2383,9 +2382,105 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
ret ret
} }
} }
#endif // HAS_ARGBATTENUATE_SSSE3 #endif // HAS_ARGBATTENUATE_SSSE3
#ifdef HAS_ARGBUNATTENUATE_SSE2
// Divide source RGB by alpha and store to destination.
// b = (b * 255 + (a / 2)) / a;
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values. ie 125
// 8.16 fixed point inverse table
// NOTE(review): entries are 0x10000 / a, sized to fit the 16-bit pmulhuw
// multiplier (the "8.16" label is inherited from the C version's 2^24/a
// table).  Entry 0 is 0x100 so a == 0 pixels pass color bytes through
// roughly unchanged; entry 1 is saturated to 0xffff since 0x10000 does
// not fit in 16 bits.
#define T(a) 0x10000 / a
static uint32 fixed_invtbl8[256] = {
0x100, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x100 };
#undef T
// Unattenuate 4 pixels at a time.
// aligned to 16 bytes
// Divides each color channel by its alpha via the fixed_invtbl8 reciprocal
// table and pmulhuw; original alpha bytes are preserved.  src/dst must be
// 16-byte aligned; width presumably a positive multiple of 4 -- TODO
// confirm with callers.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb0
mov edx, [esp + 8 + 8] // dst_argb
mov ecx, [esp + 8 + 12] // width
sub edx, eax
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24
align 16
convertloop:
movdqa xmm0, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 3] // first alpha
movzx edi, byte ptr [eax + 7] // second alpha
punpcklbw xmm0, xmm0 // first 2
movd xmm2, dword ptr fixed_invtbl8[esi * 4]
movd xmm3, dword ptr fixed_invtbl8[edi * 4]
pshuflw xmm2, xmm2,0C0h // broadcast inv_alpha to the b/g/r words
pshuflw xmm3, xmm3,0C0h // same for the second pixel
movlhps xmm2, xmm3
pmulhuw xmm0, xmm2 // rgb * inv_a (fixed point divide by alpha)
movdqa xmm1, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 11] // third alpha
movzx edi, byte ptr [eax + 15] // fourth alpha
punpckhbw xmm1, xmm1 // next 2
movd xmm2, dword ptr fixed_invtbl8[esi * 4]
movd xmm3, dword ptr fixed_invtbl8[edi * 4]
pshuflw xmm2, xmm2,0C0h // broadcast inv_alpha to the b/g/r words
pshuflw xmm3, xmm3,0C0h // same for the fourth pixel
movlhps xmm2, xmm3
pmulhuw xmm1, xmm2 // rgb * inv_a (fixed point divide by alpha)
movdqa xmm2, [eax] // alphas
pand xmm2, xmm4
packuswb xmm0, xmm1
por xmm0, xmm2 // restore the original alpha bytes
sub ecx, 4
movdqa [eax + edx], xmm0
lea eax, [eax + 16]
jg convertloop

pop edi
pop esi
ret
}
}
#endif  // HAS_ARGBUNATTENUATE_SSE2
#endif // _M_IX86 #endif // _M_IX86
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -63,7 +63,7 @@ TEST_F(libyuvTest, BenchmakDjb2_C) { ...@@ -63,7 +63,7 @@ TEST_F(libyuvTest, BenchmakDjb2_C) {
uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381); uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
uint32 h1; uint32 h1;
MaskCpuFlags(kCpuInitialized); MaskCpuFlags(kCpuInitialized);
for (int i = 0; i < _benchmark_iterations; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
h1 = HashDjb2(src_a, kMaxTest, 5381); h1 = HashDjb2(src_a, kMaxTest, 5381);
} }
MaskCpuFlags(-1); MaskCpuFlags(-1);
...@@ -80,7 +80,7 @@ TEST_F(libyuvTest, BenchmakDjb2_OPT) { ...@@ -80,7 +80,7 @@ TEST_F(libyuvTest, BenchmakDjb2_OPT) {
} }
uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381); uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
uint32 h1; uint32 h1;
for (int i = 0; i < _benchmark_iterations; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
h1 = HashDjb2(src_a, kMaxTest, 5381); h1 = HashDjb2(src_a, kMaxTest, 5381);
} }
EXPECT_EQ(h1, h2); EXPECT_EQ(h1, h2);
...@@ -96,7 +96,7 @@ TEST_F(libyuvTest, BenchmakDjb2_Unaligned_OPT) { ...@@ -96,7 +96,7 @@ TEST_F(libyuvTest, BenchmakDjb2_Unaligned_OPT) {
} }
uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381); uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381);
uint32 h1; uint32 h1;
for (int i = 0; i < _benchmark_iterations; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
h1 = HashDjb2(src_a + 1, kMaxTest, 5381); h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
} }
EXPECT_EQ(h1, h2); EXPECT_EQ(h1, h2);
...@@ -110,7 +110,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_C) { ...@@ -110,7 +110,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_C) {
align_buffer_16(src_b, max_width) align_buffer_16(src_b, max_width)
MaskCpuFlags(kCpuInitialized); MaskCpuFlags(kCpuInitialized);
for (int i = 0; i < _benchmark_iterations; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
ComputeSumSquareError(src_a, src_b, max_width); ComputeSumSquareError(src_a, src_b, max_width);
} }
...@@ -128,7 +128,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_OPT) { ...@@ -128,7 +128,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_OPT) {
align_buffer_16(src_a, max_width) align_buffer_16(src_a, max_width)
align_buffer_16(src_b, max_width) align_buffer_16(src_b, max_width)
for (int i = 0; i < _benchmark_iterations; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
ComputeSumSquareError(src_a, src_b, max_width); ComputeSumSquareError(src_a, src_b, max_width);
} }
...@@ -183,18 +183,18 @@ TEST_F(libyuvTest, SumSquareError) { ...@@ -183,18 +183,18 @@ TEST_F(libyuvTest, SumSquareError) {
} }
TEST_F(libyuvTest, BenchmarkPsnr_C) { TEST_F(libyuvTest, BenchmarkPsnr_C) {
align_buffer_16(src_a, _benchmark_width * _benchmark_height) align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
align_buffer_16(src_b, _benchmark_width * _benchmark_height) align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
MaskCpuFlags(kCpuInitialized); MaskCpuFlags(kCpuInitialized);
double c_time = get_time(); double c_time = get_time();
for (int i = 0; i < _benchmark_iterations; ++i) for (int i = 0; i < benchmark_iterations_; ++i)
CalcFramePsnr(src_a, _benchmark_width, CalcFramePsnr(src_a, benchmark_width_,
src_b, _benchmark_width, src_b, benchmark_width_,
_benchmark_width, _benchmark_height); benchmark_width_, benchmark_height_);
c_time = (get_time() - c_time) / _benchmark_iterations; c_time = (get_time() - c_time) / benchmark_iterations_;
printf("BenchmarkPsnr_C - %8.2f us c\n", c_time * 1e6); printf("BenchmarkPsnr_C - %8.2f us c\n", c_time * 1e6);
MaskCpuFlags(-1); MaskCpuFlags(-1);
...@@ -206,18 +206,18 @@ TEST_F(libyuvTest, BenchmarkPsnr_C) { ...@@ -206,18 +206,18 @@ TEST_F(libyuvTest, BenchmarkPsnr_C) {
} }
TEST_F(libyuvTest, BenchmarkPsnr_OPT) { TEST_F(libyuvTest, BenchmarkPsnr_OPT) {
align_buffer_16(src_a, _benchmark_width * _benchmark_height) align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
align_buffer_16(src_b, _benchmark_width * _benchmark_height) align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
MaskCpuFlags(-1); MaskCpuFlags(-1);
double opt_time = get_time(); double opt_time = get_time();
for (int i = 0; i < _benchmark_iterations; ++i) for (int i = 0; i < benchmark_iterations_; ++i)
CalcFramePsnr(src_a, _benchmark_width, CalcFramePsnr(src_a, benchmark_width_,
src_b, _benchmark_width, src_b, benchmark_width_,
_benchmark_width, _benchmark_height); benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / _benchmark_iterations; opt_time = (get_time() - opt_time) / benchmark_iterations_;
printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6); printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6);
EXPECT_EQ(0, 0); EXPECT_EQ(0, 0);
...@@ -304,18 +304,18 @@ TEST_F(libyuvTest, Psnr) { ...@@ -304,18 +304,18 @@ TEST_F(libyuvTest, Psnr) {
} }
TEST_F(libyuvTest, BenchmarkSsim_C) { TEST_F(libyuvTest, BenchmarkSsim_C) {
align_buffer_16(src_a, _benchmark_width * _benchmark_height) align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
align_buffer_16(src_b, _benchmark_width * _benchmark_height) align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
MaskCpuFlags(kCpuInitialized); MaskCpuFlags(kCpuInitialized);
double c_time = get_time(); double c_time = get_time();
for (int i = 0; i < _benchmark_iterations; ++i) for (int i = 0; i < benchmark_iterations_; ++i)
CalcFrameSsim(src_a, _benchmark_width, CalcFrameSsim(src_a, benchmark_width_,
src_b, _benchmark_width, src_b, benchmark_width_,
_benchmark_width, _benchmark_height); benchmark_width_, benchmark_height_);
c_time = (get_time() - c_time) / _benchmark_iterations; c_time = (get_time() - c_time) / benchmark_iterations_;
printf("BenchmarkSsim_C - %8.2f us c\n", c_time * 1e6); printf("BenchmarkSsim_C - %8.2f us c\n", c_time * 1e6);
MaskCpuFlags(-1); MaskCpuFlags(-1);
...@@ -327,18 +327,18 @@ TEST_F(libyuvTest, BenchmarkSsim_C) { ...@@ -327,18 +327,18 @@ TEST_F(libyuvTest, BenchmarkSsim_C) {
} }
TEST_F(libyuvTest, BenchmarkSsim_OPT) { TEST_F(libyuvTest, BenchmarkSsim_OPT) {
align_buffer_16(src_a, _benchmark_width * _benchmark_height) align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
align_buffer_16(src_b, _benchmark_width * _benchmark_height) align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
MaskCpuFlags(-1); MaskCpuFlags(-1);
double opt_time = get_time(); double opt_time = get_time();
for (int i = 0; i < _benchmark_iterations; ++i) for (int i = 0; i < benchmark_iterations_; ++i)
CalcFrameSsim(src_a, _benchmark_width, CalcFrameSsim(src_a, benchmark_width_,
src_b, _benchmark_width, src_b, benchmark_width_,
_benchmark_width, _benchmark_height); benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / _benchmark_iterations; opt_time = (get_time() - opt_time) / benchmark_iterations_;
printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6); printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6);
EXPECT_EQ(0, 0); EXPECT_EQ(0, 0);
......
...@@ -8,14 +8,13 @@ ...@@ -8,14 +8,13 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include "unit_test.h"
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/version.h" #include "libyuv/version.h"
#include "unit_test/unit_test.h"
namespace libyuv { namespace libyuv {
......
...@@ -8,8 +8,6 @@ ...@@ -8,8 +8,6 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include "unit_test.h"
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
...@@ -17,6 +15,7 @@ ...@@ -17,6 +15,7 @@
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h" #include "libyuv/planar_functions.h"
#include "libyuv/rotate.h" #include "libyuv/rotate.h"
#include "unit_test/unit_test.h"
#if defined(_MSC_VER) #if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var #define SIMD_ALIGNED(var) __declspec(align(16)) var
...@@ -26,20 +25,20 @@ ...@@ -26,20 +25,20 @@
namespace libyuv { namespace libyuv {
TEST_F (libyuvTest, BenchmarkI420ToARGB_C) { TEST_F(libyuvTest, BenchmarkI420ToARGB_C) {
align_buffer_16(src_y, _benchmark_width * _benchmark_height); align_buffer_16(src_y, benchmark_width_ * benchmark_height_);
align_buffer_16(src_u, ((_benchmark_width * _benchmark_height) >> 2)); align_buffer_16(src_u, (benchmark_width_ * benchmark_height_) >> 2);
align_buffer_16(src_v, ((_benchmark_width * _benchmark_height) >> 2)); align_buffer_16(src_v, (benchmark_width_ * benchmark_height_) >> 2);
align_buffer_16(dst_argb, ((_benchmark_width << 2) * _benchmark_height)); align_buffer_16(dst_argb, (benchmark_width_ << 2) * benchmark_height_);
MaskCpuFlags(kCpuInitialized); MaskCpuFlags(kCpuInitialized);
for (int i = 0; i < _benchmark_iterations; ++i) for (int i = 0; i < benchmark_iterations_; ++i)
I420ToARGB(src_y, _benchmark_width, I420ToARGB(src_y, benchmark_width_,
src_u, _benchmark_width >> 1, src_u, benchmark_width_ >> 1,
src_v, _benchmark_width >> 1, src_v, benchmark_width_ >> 1,
dst_argb, _benchmark_width << 2, dst_argb, benchmark_width_ << 2,
_benchmark_width, _benchmark_height); benchmark_width_, benchmark_height_);
MaskCpuFlags(-1); MaskCpuFlags(-1);
...@@ -51,18 +50,18 @@ TEST_F (libyuvTest, BenchmarkI420ToARGB_C) { ...@@ -51,18 +50,18 @@ TEST_F (libyuvTest, BenchmarkI420ToARGB_C) {
free_aligned_buffer_16(dst_argb) free_aligned_buffer_16(dst_argb)
} }
TEST_F (libyuvTest, BenchmarkI420ToARGB_OPT) { TEST_F(libyuvTest, BenchmarkI420ToARGB_OPT) {
align_buffer_16(src_y, _benchmark_width * _benchmark_height); align_buffer_16(src_y, benchmark_width_ * benchmark_height_);
align_buffer_16(src_u, (_benchmark_width * _benchmark_height) >> 2); align_buffer_16(src_u, (benchmark_width_ * benchmark_height_) >> 2);
align_buffer_16(src_v, (_benchmark_width * _benchmark_height) >> 2); align_buffer_16(src_v, (benchmark_width_ * benchmark_height_) >> 2);
align_buffer_16(dst_argb, (_benchmark_width << 2) * _benchmark_height); align_buffer_16(dst_argb, (benchmark_width_ << 2) * benchmark_height_);
for (int i = 0; i < _benchmark_iterations; ++i) for (int i = 0; i < benchmark_iterations_; ++i)
I420ToARGB(src_y, _benchmark_width, I420ToARGB(src_y, benchmark_width_,
src_u, _benchmark_width >> 1, src_u, benchmark_width_ >> 1,
src_v, _benchmark_width >> 1, src_v, benchmark_width_ >> 1,
dst_argb, _benchmark_width << 2, dst_argb, benchmark_width_ << 2,
_benchmark_width, _benchmark_height); benchmark_width_, benchmark_height_);
free_aligned_buffer_16(src_y) free_aligned_buffer_16(src_y)
free_aligned_buffer_16(src_u) free_aligned_buffer_16(src_u)
...@@ -71,7 +70,7 @@ TEST_F (libyuvTest, BenchmarkI420ToARGB_OPT) { ...@@ -71,7 +70,7 @@ TEST_F (libyuvTest, BenchmarkI420ToARGB_OPT) {
} }
#define TESTI420TO(FMT) \ #define TESTI420TO(FMT) \
TEST_F (libyuvTest, I420To##FMT##_CvsOPT) { \ TEST_F(libyuvTest, I420To##FMT##_CvsOPT) { \
const int src_width = 1280; \ const int src_width = 1280; \
const int src_height = 720; \ const int src_height = 720; \
align_buffer_16(src_y, src_width * src_height); \ align_buffer_16(src_y, src_width * src_height); \
...@@ -103,8 +102,8 @@ TEST_F (libyuvTest, I420To##FMT##_CvsOPT) { \ ...@@ -103,8 +102,8 @@ TEST_F (libyuvTest, I420To##FMT##_CvsOPT) { \
int err = 0; \ int err = 0; \
for (int i = 0; i < src_height; ++i) { \ for (int i = 0; i < src_height; ++i) { \
for (int j = 0; j < src_width << 2; ++j) { \ for (int j = 0; j < src_width << 2; ++j) { \
int diff = (int)(dst_rgb_c[i * src_height + j]) - \ int diff = static_cast<int>(dst_rgb_c[i * src_height + j]) - \
(int)(dst_rgb_opt[i * src_height + j]); \ static_cast<int>(dst_rgb_opt[i * src_height + j]); \
if (abs(diff) > 2) \ if (abs(diff) > 2) \
err++; \ err++; \
} \ } \
...@@ -121,11 +120,48 @@ TESTI420TO(ARGB) ...@@ -121,11 +120,48 @@ TESTI420TO(ARGB)
TESTI420TO(BGRA) TESTI420TO(BGRA)
TESTI420TO(ABGR) TESTI420TO(ABGR)
TEST_F (libyuvTest, TestAttenuate) { TEST_F(libyuvTest, TestAttenuate) {
SIMD_ALIGNED(uint8 orig_pixels[256][4]); SIMD_ALIGNED(uint8 orig_pixels[256][4]);
SIMD_ALIGNED(uint8 atten_pixels[256][4]); SIMD_ALIGNED(uint8 atten_pixels[256][4]);
SIMD_ALIGNED(uint8 unatten_pixels[256][4]); SIMD_ALIGNED(uint8 unatten_pixels[256][4]);
SIMD_ALIGNED(uint8 atten2_pixels[256][4]); SIMD_ALIGNED(uint8 atten2_pixels[256][4]);
// Test unattenuation clamps
orig_pixels[0][0] = 200u;
orig_pixels[0][1] = 129u;
orig_pixels[0][2] = 127u;
orig_pixels[0][3] = 128u;
// Test unattenuation transparent and opaque are unaffected
orig_pixels[1][0] = 16u;
orig_pixels[1][1] = 64u;
orig_pixels[1][2] = 192u;
orig_pixels[1][3] = 0u;
orig_pixels[2][0] = 16u;
orig_pixels[2][1] = 64u;
orig_pixels[2][2] = 192u;
orig_pixels[2][3] = 255u;
orig_pixels[3][0] = 16u;
orig_pixels[3][1] = 64u;
orig_pixels[3][2] = 192u;
orig_pixels[3][3] = 128u;
ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 4, 1);
EXPECT_EQ(255u, unatten_pixels[0][0]);
EXPECT_EQ(255u, unatten_pixels[0][1]);
EXPECT_EQ(254u, unatten_pixels[0][2]);
EXPECT_EQ(128u, unatten_pixels[0][3]);
EXPECT_EQ(16u, unatten_pixels[1][0]);
EXPECT_EQ(64u, unatten_pixels[1][1]);
EXPECT_EQ(192u, unatten_pixels[1][2]);
EXPECT_EQ(0u, unatten_pixels[1][3]);
EXPECT_EQ(16u, unatten_pixels[2][0]);
EXPECT_EQ(64u, unatten_pixels[2][1]);
EXPECT_EQ(192u, unatten_pixels[2][2]);
EXPECT_EQ(255u, unatten_pixels[2][3]);
EXPECT_EQ(32u, unatten_pixels[3][0]);
EXPECT_EQ(128u, unatten_pixels[3][1]);
EXPECT_EQ(255u, unatten_pixels[3][2]);
EXPECT_EQ(128u, unatten_pixels[3][3]);
for (int i = 0; i < 256; ++i) { for (int i = 0; i < 256; ++i) {
orig_pixels[i][0] = i; orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2; orig_pixels[i][1] = i / 2;
...@@ -156,17 +192,5 @@ TEST_F (libyuvTest, TestAttenuate) { ...@@ -156,17 +192,5 @@ TEST_F (libyuvTest, TestAttenuate) {
EXPECT_EQ(127, atten_pixels[255][1]); EXPECT_EQ(127, atten_pixels[255][1]);
EXPECT_EQ(85, atten_pixels[255][2]); EXPECT_EQ(85, atten_pixels[255][2]);
EXPECT_EQ(255, atten_pixels[255][3]); EXPECT_EQ(255, atten_pixels[255][3]);
// Test unattenuation clamps
orig_pixels[0][0] = 200;
orig_pixels[0][1] = 129;
orig_pixels[0][2] = 127;
orig_pixels[0][3] = 128;
ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 1, 1);
EXPECT_EQ(255, unatten_pixels[0][0]);
EXPECT_EQ(255, unatten_pixels[0][1]);
EXPECT_EQ(254, unatten_pixels[0][2]);
EXPECT_EQ(128, unatten_pixels[0][3]);
} }
} }
...@@ -8,13 +8,12 @@ ...@@ -8,13 +8,12 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include "unit_test.h"
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
#include "libyuv/rotate.h"
#include "../source/rotate_priv.h" #include "../source/rotate_priv.h"
#include "libyuv/rotate.h"
#include "unit_test/unit_test.h"
namespace libyuv { namespace libyuv {
...@@ -33,8 +32,8 @@ TEST_F(libyuvTest, Transpose) { ...@@ -33,8 +32,8 @@ TEST_F(libyuvTest, Transpose) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (iw = 8; iw < rotate_max_w_ && !err; ++iw)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = ih; ow = ih;
oh = iw; oh = iw;
...@@ -77,8 +76,8 @@ TEST_F(libyuvTest, TransposeUV) { ...@@ -77,8 +76,8 @@ TEST_F(libyuvTest, TransposeUV) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (iw = 16; iw < rotate_max_w_ && !err; iw += 2)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = ih; ow = ih;
...@@ -134,8 +133,8 @@ TEST_F(libyuvTest, RotatePlane90) { ...@@ -134,8 +133,8 @@ TEST_F(libyuvTest, RotatePlane90) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (iw = 8; iw < rotate_max_w_ && !err; ++iw)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = ih; ow = ih;
...@@ -191,8 +190,8 @@ TEST_F(libyuvTest, RotateUV90) { ...@@ -191,8 +190,8 @@ TEST_F(libyuvTest, RotateUV90) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (iw = 16; iw < rotate_max_w_ && !err; iw += 2)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = ih; ow = ih;
...@@ -265,8 +264,8 @@ TEST_F(libyuvTest, RotateUV180) { ...@@ -265,8 +264,8 @@ TEST_F(libyuvTest, RotateUV180) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (iw = 16; iw < rotate_max_w_ && !err; iw += 2)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = iw >> 1; ow = iw >> 1;
...@@ -339,8 +338,8 @@ TEST_F(libyuvTest, RotateUV270) { ...@@ -339,8 +338,8 @@ TEST_F(libyuvTest, RotateUV270) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (iw = 16; iw < rotate_max_w_ && !err; iw += 2)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = ih; ow = ih;
...@@ -414,8 +413,8 @@ TEST_F(libyuvTest, RotatePlane180) { ...@@ -414,8 +413,8 @@ TEST_F(libyuvTest, RotatePlane180) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (iw = 8; iw < rotate_max_w_ && !err; ++iw)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = iw; ow = iw;
...@@ -459,8 +458,8 @@ TEST_F(libyuvTest, RotatePlane270) { ...@@ -459,8 +458,8 @@ TEST_F(libyuvTest, RotatePlane270) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (iw = 8; iw < rotate_max_w_ && !err; ++iw)
for (ih = 8; ih < _rotate_max_h && !err; ++ih) { for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i; int i;
ow = ih; ow = ih;
...@@ -516,8 +515,8 @@ TEST_F(libyuvTest, RotatePlane90and270) { ...@@ -516,8 +515,8 @@ TEST_F(libyuvTest, RotatePlane90and270) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 4) for (iw = 16; iw < rotate_max_w_ && !err; iw += 4)
for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
int i; int i;
ow = ih; ow = ih;
...@@ -561,8 +560,8 @@ TEST_F(libyuvTest, RotatePlane90Pitch) { ...@@ -561,8 +560,8 @@ TEST_F(libyuvTest, RotatePlane90Pitch) {
int iw, ih; int iw, ih;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 4) for (iw = 16; iw < rotate_max_w_ && !err; iw += 4)
for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
int i; int i;
int ow = ih; int ow = ih;
...@@ -618,8 +617,8 @@ TEST_F(libyuvTest, RotatePlane270Pitch) { ...@@ -618,8 +617,8 @@ TEST_F(libyuvTest, RotatePlane270Pitch) {
int iw, ih, ow, oh; int iw, ih, ow, oh;
int err = 0; int err = 0;
for (iw = 16; iw < _rotate_max_w && !err; iw += 4) for (iw = 16; iw < rotate_max_w_ && !err; iw += 4)
for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
int i; int i;
ow = ih; ow = ih;
......
...@@ -8,13 +8,12 @@ ...@@ -8,13 +8,12 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include "unit_test.h"
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/scale.h" #include "libyuv/scale.h"
#include "unit_test/unit_test.h"
namespace libyuv { namespace libyuv {
......
...@@ -8,15 +8,13 @@ ...@@ -8,15 +8,13 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include "unit_test/unit_test.h"
#include <cstring> #include <cstring>
#include "unit_test.h"
libyuvTest::libyuvTest() : libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128),
_rotate_max_w(128), benchmark_iterations_(1000), benchmark_width_(1280),
_rotate_max_h(128), benchmark_height_(720) {
_benchmark_iterations(1000),
_benchmark_width(1280),
_benchmark_height(720) {
} }
int main(int argc, char** argv) { int main(int argc, char** argv) {
......
...@@ -8,17 +8,17 @@ ...@@ -8,17 +8,17 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#ifndef UINIT_TEST_H_ #ifndef UNIT_TEST_UNIT_TEST_H_
#define UINIT_TEST_H_ #define UNIT_TEST_UNIT_TEST_H_
#include <gtest/gtest.h> #include <gtest/gtest.h>
#define align_buffer_16(var, size) \ #define align_buffer_16(var, size) \
uint8 *var; \ uint8* var; \
uint8 *var##_mem; \ uint8* var##_mem; \
var##_mem = reinterpret_cast<uint8*>(calloc((size)+15, sizeof(uint8))); \ var##_mem = reinterpret_cast<uint8*>(calloc((size) + 15, sizeof(uint8))); \
var = reinterpret_cast<uint8*> \ var = reinterpret_cast<uint8*> \
((reinterpret_cast<intptr_t>(var##_mem) + 15) & (~0x0f)); ((reinterpret_cast<intptr_t>(var##_mem) + 15) & (~0x0f)); \
#define free_aligned_buffer_16(var) \ #define free_aligned_buffer_16(var) \
free(var##_mem); \ free(var##_mem); \
...@@ -27,12 +27,11 @@ ...@@ -27,12 +27,11 @@
#ifdef WIN32 #ifdef WIN32
#include <windows.h> #include <windows.h>
static double get_time() static double get_time() {
{ LARGE_INTEGER t, f;
LARGE_INTEGER t, f; QueryPerformanceCounter(&t);
QueryPerformanceCounter(&t); QueryPerformanceFrequency(&f);
QueryPerformanceFrequency(&f); return static_cast<double>(t.QuadPart) / static_cast<double>(f.QuadPart);
return double(t.QuadPart)/double(f.QuadPart);
} }
#define random rand #define random rand
...@@ -46,7 +45,7 @@ static double get_time() { ...@@ -46,7 +45,7 @@ static double get_time() {
struct timeval t; struct timeval t;
struct timezone tzp; struct timezone tzp;
gettimeofday(&t, &tzp); gettimeofday(&t, &tzp);
return t.tv_sec + t.tv_usec*1e-6; return t.tv_sec + t.tv_usec * 1e-6;
} }
#endif #endif
...@@ -55,13 +54,12 @@ class libyuvTest : public ::testing::Test { ...@@ -55,13 +54,12 @@ class libyuvTest : public ::testing::Test {
protected: protected:
libyuvTest(); libyuvTest();
const int _rotate_max_w; const int rotate_max_w_;
const int _rotate_max_h; const int rotate_max_h_;
const int _benchmark_iterations;
const int _benchmark_width;
const int _benchmark_height;
const int benchmark_iterations_;
const int benchmark_width_;
const int benchmark_height_;
}; };
#endif // UNIT_TEST_H_ #endif // UNIT_TEST_UNIT_TEST_H_
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment