Commit c4c578e3 authored by fbarchard@google.com

Flat shade an ARGB image

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/683004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@298 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c4500c9f
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 297
Version: 298
License: BSD
License File: LICENSE
......
......@@ -15,7 +15,7 @@
// TODO(fbarchard): Remove the following headers includes
#include "libyuv/convert.h"
#include "libyuv/planar_functions.h"
#include "libyuv/convert_argb.h"
#ifdef __cplusplus
namespace libyuv {
......@@ -188,11 +188,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Multiply ARGB image by ARGB value.
int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value);
// Convert MJPG to ARGB.
int MJPGToARGB(const uint8* sample, size_t sample_size,
uint8* argb, int argb_stride,
......@@ -212,6 +207,11 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
int width, int height, int radius);
// Multiply ARGB image by ARGB value.
int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 297
#define LIBYUV_VERSION 298
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -784,11 +784,9 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
return 0;
}
// Visual C for x86 defines these.
#if defined(_M_X64) || defined(_M_IX86)
#define LIBYUV_LITTLE_ENDIAN
// GCC provided macros.
#elif __BYTE_ORDER == __ORDER_LITTLE_ENDIAN__ || __BYTE_ORDER == __LITTLE_ENDIAN
// Visual C x86 or GCC little endian.
#if defined(_M_X64) || defined(_M_IX86) || (defined(__BYTE_ORDER) && \
(__BYTE_ORDER == __ORDER_LITTLE_ENDIAN__ || __BYTE_ORDER == __LITTLE_ENDIAN))
#define LIBYUV_LITTLE_ENDIAN
#endif
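
This hunk (and the identical one in the next file) folds the previous MSVC/GCC split into a single little-endian test that guards the fast single-store WRITEWORD path. A standalone sketch of the same idea follows; the DEMO_LITTLE_ENDIAN and WriteWordDemo names are hypothetical, and the GCC check uses the compiler-builtin __BYTE_ORDER__ spelling as an assumption for a self-contained example rather than libyuv's exact condition.

#include <stdint.h>

// Illustration only: not libyuv code.
#if defined(_M_X64) || defined(_M_IX86) || \
    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define DEMO_LITTLE_ENDIAN
#endif

static inline void WriteWordDemo(uint8_t* p, uint32_t v) {
#ifdef DEMO_LITTLE_ENDIAN
  *reinterpret_cast<uint32_t*>(p) = v;           // single 32-bit store
#else
  p[0] = static_cast<uint8_t>(v & 255);          // byte-by-byte, endian-safe
  p[1] = static_cast<uint8_t>((v >> 8) & 255);
  p[2] = static_cast<uint8_t>((v >> 16) & 255);
  p[3] = static_cast<uint8_t>((v >> 24) & 255);
#endif
}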
......
......@@ -401,18 +401,15 @@ static void I42xToUYVYRow_C(const uint8* src_y,
}
}
// Visual C for x86 defines these.
#if defined(_M_X64) || defined(_M_IX86)
#define LIBYUV_LITTLE_ENDIAN
// GCC provided macros.
#elif __BYTE_ORDER == __ORDER_LITTLE_ENDIAN__ || __BYTE_ORDER == __LITTLE_ENDIAN
// Visual C x86 or GCC little endian.
#if defined(_M_X64) || defined(_M_IX86) || (defined(__BYTE_ORDER) && \
(__BYTE_ORDER == __ORDER_LITTLE_ENDIAN__ || __BYTE_ORDER == __LITTLE_ENDIAN))
#define LIBYUV_LITTLE_ENDIAN
#endif
#ifdef LIBYUV_LITTLE_ENDIAN
#define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v
#else
static inline void WRITEWORD(uint8* p, uint32 v) {
p[0] = (uint8)(v & 255);
p[1] = (uint8)((v >> 8) & 255);
......
......@@ -60,8 +60,7 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
uint8*, int,
uint8*, int,
int width, int height) {
if (!src_y || !dst_y ||
width <= 0 || height == 0) {
if (!src_y || !dst_y || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
......@@ -112,8 +111,7 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
if (!src_y || !src_u || !src_v ||
!dst_y || !dst_u || !dst_v ||
if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
return -1;
}
......@@ -143,9 +141,7 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
int ARGBMirror(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_argb ||
!dst_argb ||
width <= 0 || height == 0) {
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
......@@ -224,6 +220,9 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
int width, int height) {
if (!src_argb || !dst_y || width <= 0 || height == 0) {
return -1;
}
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
......@@ -255,6 +254,9 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
......@@ -298,6 +300,9 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height) {
if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
return -1;
}
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
......@@ -330,6 +335,9 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
uint8* dst_raw, int dst_stride_raw,
int width, int height) {
if (!src_argb || !dst_raw || width <= 0 || height == 0) {
return -1;
}
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
......@@ -362,6 +370,9 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height) {
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
......@@ -393,6 +404,9 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb1555, int dst_stride_argb1555,
int width, int height) {
if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
return -1;
}
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
......@@ -424,6 +438,9 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb4444, int dst_stride_argb4444,
int width, int height) {
if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
return -1;
}
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
......@@ -457,6 +474,9 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height) {
if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
......@@ -499,6 +519,9 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height) {
if (!src_y || !src_vu || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
......@@ -762,6 +785,9 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
......@@ -796,6 +822,9 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
......@@ -866,7 +895,8 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
const int8* matrix_argb,
int dst_x, int dst_y, int width, int height) {
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 ||
dst_x < 0 || dst_y < 0) {
return -1;
}
void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb,
......@@ -890,7 +920,8 @@ int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
const uint8* table_argb,
int dst_x, int dst_y, int width, int height) {
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
dst_x < 0 || dst_y < 0) {
return -1;
}
void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
......@@ -972,6 +1003,9 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
int width, int height, int radius) {
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
void (*CumulativeSumToAverage)(const int32* topleft, const int32* botleft,
......@@ -1052,6 +1086,30 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value) {
if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
return -1;
}
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
int width, uint32 value) = ARGBShadeRow_C;
#if defined(HAS_ARGBSHADE_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBShadeRow = ARGBShadeRow_SSE2;
}
#endif
for (int y = 0; y < height; ++y) {
ARGBShadeRow(src_argb, dst_argb, width, value);
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
return 0;
}
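
A minimal usage sketch of the new entry point (the 64x64 dimensions, the HalfShade wrapper name, and the shade constant are illustrative assumptions, not part of this change): the 32-bit value packs per-channel scale factors as 0xAARRGGBB, so 0x80808080 multiplies every channel by roughly one half.

#include "libyuv/planar_functions.h"

bool HalfShade(const uint8* src_argb, int src_stride_argb,
               uint8* dst_argb, int dst_stride_argb) {
  const int kWidth = 64;
  const int kHeight = 64;
  // 0x80 in each of A, R, G and B scales that channel by 128/256.
  return libyuv::ARGBShade(src_argb, src_stride_argb,
                           dst_argb, dst_stride_argb,
                           kWidth, kHeight, 0x80808080) == 0;
}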
#ifdef __cplusplus
......
......@@ -85,6 +85,7 @@ extern "C" {
// The following are Windows only:
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBCOLORTABLEROW_X86
#define HAS_ARGBSHADE_SSE2
#endif
// The following are disabled when SSSE3 is available:
......@@ -516,6 +517,11 @@ void CumulativeSumToAverage_C(const int32* topleft, const int32* botleft,
void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
const int32* previous_cumsum, int width);
void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value);
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -956,6 +956,32 @@ void CumulativeSumToAverage_C(const int32* tl, const int32* bl,
}
}
#define REPEAT8(v) (v) | ((v) << 8)
#define SHADE(f, v) v * f >> 24
void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
const uint32 b_scale = REPEAT8(value & 0xff);
const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
const uint32 a_scale = REPEAT8(value >> 24);
for (int i = 0; i < width; ++i) {
const uint32 b = REPEAT8(src_argb[0]);
const uint32 g = REPEAT8(src_argb[1]);
const uint32 r = REPEAT8(src_argb[2]);
const uint32 a = REPEAT8(src_argb[3]);
dst_argb[0] = SHADE(b, b_scale);
dst_argb[1] = SHADE(g, g_scale);
dst_argb[2] = SHADE(r, r_scale);
dst_argb[3] = SHADE(a, a_scale);
src_argb += 4;
dst_argb += 4;
}
}
#undef REPEAT8
#undef SHADE
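
To make the REPEAT8/SHADE fixed-point math concrete, a small standalone check (plain stdint types; the values mirror the TestShade expectations added later in this change): scaling a channel of 10 by 0x80 yields 5.

#include <stdint.h>
#include <stdio.h>

int main() {
  uint32_t v = 10;                    // source channel value
  uint32_t s = 0x80;                  // shade value for that channel (~50%)
  uint32_t f = v | (v << 8);          // REPEAT8(v)  == 0x0A0A == v * 257
  uint32_t scale = s | (s << 8);      // REPEAT8(s)  == 0x8080 == s * 257
  uint32_t shaded = f * scale >> 24;  // SHADE: (v*257 * s*257) >> 24 == 5
  printf("%u\n", shaded);             // prints 5
  return 0;
}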
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -15,12 +15,12 @@ namespace libyuv {
extern "C" {
#endif
// This module is for Visual C x86
// This module is for Visual C x86.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#ifdef HAS_ARGBTOYROW_SSSE3
// Constants for ARGB
// Constants for ARGB.
static const vec8 kARGBToY = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
......@@ -33,7 +33,7 @@ static const vec8 kARGBToV = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
// Constants for BGRA
// Constants for BGRA.
static const vec8 kBGRAToY = {
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};
......@@ -46,7 +46,7 @@ static const vec8 kBGRAToV = {
0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};
// Constants for ABGR
// Constants for ABGR.
static const vec8 kABGRToY = {
33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};
......@@ -247,13 +247,13 @@ __asm {
}
}
// pmul method to replicate bits
// Math to replicate bits
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions
// 20 instructions.
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
int pix) {
......@@ -358,7 +358,7 @@ __asm {
}
}
// 18 instructions
// 18 instructions.
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
int pix) {
......@@ -514,7 +514,7 @@ __asm {
}
}
// TODO(fbarchard): Improve sign extension/packing
// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
......@@ -587,7 +587,7 @@ __asm {
}
}
// Convert 16 ARGB pixels (64 bytes) to 16 Y values
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
......@@ -1249,8 +1249,9 @@ static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
// Read 8 UV from 411
// Read 8 UV from 411.
#define READYUV444 __asm { \
__asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
__asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
......@@ -1258,7 +1259,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
__asm punpcklbw xmm0, xmm1 /* UV */ \
}
// Read 4 UV from 422, upsample to 8 UV
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 __asm { \
__asm movd xmm0, [esi] /* U */ \
__asm movd xmm1, [esi + edi] /* V */ \
......@@ -1267,7 +1268,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
}
// Read 2 UV from 411, upsample to 8 UV
// Read 2 UV from 411, upsample to 8 UV.
#define READYUV411 __asm { \
__asm movd xmm0, [esi] /* U */ \
__asm movd xmm1, [esi + edi] /* V */ \
......@@ -1277,14 +1278,14 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
__asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
}
// Read 4 UV from NV12, upsample to 8 UV
// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 __asm { \
__asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
__asm lea esi, [esi + 8] \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
}
// Convert 8 pixels: 8 UV and 8 Y
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
__asm movdqa xmm1, xmm0 \
......@@ -1312,7 +1313,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
__asm packuswb xmm2, xmm2 /* R */ \
}
// Convert 8 pixels: 8 VU and 8 Y
// Convert 8 pixels: 8 VU and 8 Y.
#define YVUTORGB __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
__asm movdqa xmm1, xmm0 \
......@@ -1341,7 +1342,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
}
// 8 pixels, dest aligned 16.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes)
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
......@@ -1384,7 +1385,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
}
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
......@@ -1427,7 +1428,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
}
// 8 pixels, dest aligned 16.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16))
void I411ToARGBRow_SSSE3(const uint8* y_buf,
......@@ -1471,7 +1472,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
}
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
......@@ -1509,7 +1510,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
}
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
......@@ -1547,7 +1548,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
}
// 8 pixels, unaligned.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes)
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
......@@ -1590,7 +1591,7 @@ void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
}
// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
......@@ -1633,7 +1634,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
}
// 8 pixels, unaligned.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16))
void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
......@@ -1678,7 +1679,7 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
......@@ -1716,7 +1717,7 @@ void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
}
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
......@@ -2127,7 +2128,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
#endif // HAS_SPLITUV_SSE2
#ifdef HAS_COPYROW_SSE2
// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time
// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
__declspec(naked) __declspec(align(16))
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
__asm {
......@@ -2574,13 +2575,13 @@ static const uvec8 kShuffleAlpha = {
3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
// Same as SSE2, but replaces
// Same as SSE2, but replaces:
// psrlw xmm3, 8 // alpha
// pshufhw xmm3, xmm3,0F5h // 8 alpha words
// pshuflw xmm3, xmm3,0F5h
// with..
// pshufb xmm3, kShuffleAlpha // alpha
// Blend 8 pixels at a time
// Blend 8 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
......@@ -2698,7 +2699,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBATTENUATE_SSE2
// Attenuate 4 pixels at a time.
// aligned to 16 bytes
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
......@@ -2741,7 +2742,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
#endif // HAS_ARGBATTENUATE_SSE2
#ifdef HAS_ARGBATTENUATE_SSSE3
// Shuffle table duplicating alpha
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {
3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
......@@ -2791,7 +2792,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBUNATTENUATE_SSE2
// Unattenuate 4 pixels at a time.
// aligned to 16 bytes
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
......@@ -2845,12 +2846,12 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
#endif // HAS_ARGBUNATTENUATE_SSE2
#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
// Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
static const vec8 kARGBToGray = {
14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
__declspec(naked) __declspec(align(16))
void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) {
__asm {
......@@ -2893,7 +2894,7 @@ void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) {
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone
// Constant for ARGB color to sepia tone.
static const vec8 kARGBToSepiaB = {
17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};
......@@ -3071,7 +3072,7 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// aligned to 16 bytes
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
......@@ -3306,6 +3307,42 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
#ifdef HAS_ARGBSHADE_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
movd xmm2, [esp + 16] // value
sub edx, eax
punpcklbw xmm2, xmm2
punpcklqdq xmm2, xmm2
align 16
convertloop:
movdqa xmm0, [eax] // read 4 pixels
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0 // first 2
punpckhbw xmm1, xmm1 // next 2
pmulhuw xmm0, xmm2 // argb * value
pmulhuw xmm1, xmm2 // argb * value
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 4
movdqa [eax + edx], xmm0
lea eax, [eax + 16]
jg convertloop
ret
}
}
#endif // HAS_ARGBSHADE_SSE2
#endif // _M_IX86
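
For readers who prefer intrinsics to inline assembly, a rough equivalent of the 4-pixel ARGBShadeRow_SSE2 kernel above; a sketch only, assuming width is a multiple of 4 and using unaligned loads/stores, whereas the __asm path requires 16-byte aligned, 16-byte padded buffers.

#include <emmintrin.h>  // SSE2
#include <stdint.h>

void ARGBShadeRow_SSE2_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                              int width, uint32_t value) {
  __m128i scale = _mm_cvtsi32_si128(static_cast<int>(value));
  scale = _mm_unpacklo_epi8(scale, scale);   // b,g,r,a as 16-bit lanes, each s*257
  scale = _mm_unpacklo_epi64(scale, scale);  // replicate to cover 2 pixels
  for (int x = 0; x < width; x += 4) {       // 4 pixels (16 bytes) per pass
    __m128i p = _mm_loadu_si128(
        reinterpret_cast<const __m128i*>(src_argb + x * 4));
    __m128i lo = _mm_unpacklo_epi8(p, p);    // first 2 pixels, each channel p*257
    __m128i hi = _mm_unpackhi_epi8(p, p);    // next 2 pixels
    lo = _mm_srli_epi16(_mm_mulhi_epu16(lo, scale), 8);  // (p*257 * s*257) >> 24
    hi = _mm_srli_epi16(_mm_mulhi_epu16(hi, scale), 8);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst_argb + x * 4),
                     _mm_packus_epi16(lo, hi));
  }
}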
......
......@@ -331,7 +331,6 @@ TESTATOBRANDOM(RGB565, 2, 2, ARGB, 4)
TESTATOBRANDOM(ARGB1555, 2, 2, ARGB, 4)
TESTATOBRANDOM(ARGB4444, 2, 2, ARGB, 4)
TEST_F(libyuvTest, TestAttenuate) {
SIMD_ALIGNED(uint8 orig_pixels[256][4]);
SIMD_ALIGNED(uint8 atten_pixels[256][4]);
......@@ -649,4 +648,56 @@ TEST_F(libyuvTest, TestARGBMirror) {
}
}
TEST_F(libyuvTest, TestShade) {
SIMD_ALIGNED(uint8 orig_pixels[256][4]);
SIMD_ALIGNED(uint8 shade_pixels[256][4]);
// Test unattenuation clamps
orig_pixels[0][0] = 10u;
orig_pixels[0][1] = 20u;
orig_pixels[0][2] = 40u;
orig_pixels[0][3] = 80u;
// Test unattenuation transparent and opaque are unaffected
orig_pixels[1][0] = 0u;
orig_pixels[1][1] = 0u;
orig_pixels[1][2] = 0u;
orig_pixels[1][3] = 255u;
orig_pixels[2][0] = 0u;
orig_pixels[2][1] = 0u;
orig_pixels[2][2] = 0u;
orig_pixels[2][3] = 0u;
orig_pixels[3][0] = 0u;
orig_pixels[3][1] = 0u;
orig_pixels[3][2] = 0u;
orig_pixels[3][3] = 0u;
ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80ffffff);
EXPECT_EQ(10u, shade_pixels[0][0]);
EXPECT_EQ(20u, shade_pixels[0][1]);
EXPECT_EQ(40u, shade_pixels[0][2]);
EXPECT_EQ(40u, shade_pixels[0][3]);
EXPECT_EQ(0u, shade_pixels[1][0]);
EXPECT_EQ(0u, shade_pixels[1][1]);
EXPECT_EQ(0u, shade_pixels[1][2]);
EXPECT_EQ(128u, shade_pixels[1][3]);
EXPECT_EQ(0u, shade_pixels[2][0]);
EXPECT_EQ(0u, shade_pixels[2][1]);
EXPECT_EQ(0u, shade_pixels[2][2]);
EXPECT_EQ(0u, shade_pixels[2][3]);
EXPECT_EQ(0u, shade_pixels[3][0]);
EXPECT_EQ(0u, shade_pixels[3][1]);
EXPECT_EQ(0u, shade_pixels[3][2]);
EXPECT_EQ(0u, shade_pixels[3][3]);
ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80808080);
EXPECT_EQ(5u, shade_pixels[0][0]);
EXPECT_EQ(10u, shade_pixels[0][1]);
EXPECT_EQ(20u, shade_pixels[0][2]);
EXPECT_EQ(40u, shade_pixels[0][3]);
for (int i = 0; i < 1000 * 1280 * 720 / 256; ++i) {
ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 256, 1,
0x80808080);
}
}
} // namespace libyuv