ARGBcolorMatrix for applying transforms such as grey and sepia in a more general…

ARGBcolorMatrix for applying transforms such as grey and sepia in a more general form. Unittest does sepia for comparison. BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/656004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@288 16f28f9a-4ce2-e073-06de-1de4eb20be90

ARGBcolorMatrix for applying transforms such as grey and sepia in a more general…
ARGBcolorMatrix for applying transforms such as grey and sepia in a more general form. Unittest does sepia for comparison. BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/656004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@288 16f28f9a-4ce2-e073-06de-1de4eb20be90
e442dc4c · fbarchard@google.com · 794fe123 · e442dc4c · e442dc4c · e442dc4c
Commit e442dc4c authored Jun 18, 2012 by fbarchard@google.com
9 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 287
+Version: 288
 License: BSD
 License File: LICENSE

--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -216,6 +216,18 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
 int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
              int x, int y, int width, int height);
+// Apply a 4x3 matrix rotation to each ARGB pixel of a rectangle, in place.
+// matrix_argb is 3 signed ARGB values (12 coefficients): one row of 4 for
+// each of the output B, G and R channels, each row weighting the source
+// B, G, R and A in that order.  -128 to 127 representing -1 to 1.
+// Results are clamped to 0..255.  Alpha is not modified.
+// x, y, width and height select the rectangle inside dst_argb, in pixels.
+// Returns 0 on success, or -1 if dst_argb is NULL, width or height is not
+// positive, or x or y is negative.
+int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                    const int8* matrix_argb,
+                    int x, int y, int width, int height);
+// Apply a color table to each ARGB pixel of a rectangle, in place.
+// Table contains 256 ARGB values (4 bytes per entry).  Each channel,
+// including alpha, is replaced by the same channel of the table entry that
+// the channel value indexes.
+// x, y, width and height select the rectangle inside dst_argb, in pixels.
+// Returns 0 on success, or -1 if dst_argb is NULL, width or height is not
+// positive, or x or y is negative.
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                   const uint8* table_argb,
+                   int x, int y, int width, int height);
 // Copy ARGB to ARGB.
 int ARGBCopy(const uint8* src_argb, int src_stride_argb,
             uint8* dst_argb, int dst_stride_argb,

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 287
+#define LIBYUV_VERSION 288
 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1468,7 +1468,51 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
  return 0;
 }
+// Apply a 4x3 matrix rotation to each ARGB pixel of a rectangle, in place.
+// matrix_argb holds 12 signed coefficients: one row of 4 (weights for source
+// B, G, R, A) for each of the output B, G and R channels.
+// dst_x, dst_y, width and height select the rectangle, in pixels.
+// Returns 0 on success, or -1 if a pointer is NULL, width or height is not
+// positive, or dst_x or dst_y is negative.
+int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                    const int8* matrix_argb,
+                    int dst_x, int dst_y, int width, int height) {
+  if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // First pixel of the rectangle; this is the pointer the row function sees.
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb,
+                             int width) = ARGBColorMatrixRow_C;
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+  // The SSSE3 row uses aligned 16 byte loads and stores and steps 8 pixels at
+  // a time, so the alignment test must be made on dst (which includes the
+  // dst_x/dst_y offset), not on the base pointer.
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
+  }
+#endif
+  for (int y = 0; y < height; ++y) {
+    ARGBColorMatrixRow(dst, matrix_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+// Apply a color table to each ARGB pixel of a rectangle, in place.
+// Table contains 256 ARGB values; every channel of every pixel is replaced
+// by the matching channel of the table entry it indexes.
+// dst_x, dst_y, width and height select the rectangle, in pixels.
+// Returns 0 on success, or -1 if a pointer is NULL, width or height is not
+// positive, or dst_x or dst_y is negative.
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                   const uint8* table_argb,
+                   int dst_x, int dst_y, int width, int height) {
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+                            int width) = ARGBColorTableRow_C;
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBColorTableRow = ARGBColorTableRow_X86;
+  }
+#endif
+  // First pixel of the rectangle.
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  for (int y = 0; y < height; ++y) {
+    ARGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
 #ifdef HAVE_JPEG
 struct ARGBBuffers {
  uint8* argb;

--- a/source/row.h
+++ b/source/row.h
@@ -75,10 +75,17 @@ extern "C" {
 #define HAS_YUY2TOYROW_SSE2
 #define HAS_ARGBGRAYROW_SSSE3
 #define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGE_SSE2
 #endif
+// The following are Windows only: they are enabled for 32 bit x86 Visual C++
+// builds (_M_IX86) unless assembly is turned off with YUV_DISABLE_ASM.
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_ARGBCOLORTABLEROW_X86
+#endif
 // The following are disabled when SSSE3 is available:
 #if !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
@@ -482,6 +489,14 @@ void ARGBGrayRow_SSSE3(uint8* dst_argb, int width);
 void ARGBSepiaRow_C(uint8* dst_argb, int width);
 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
+// Apply a signed color matrix (12 coefficients) to a row of ARGB pixels, in
+// place.  The SSSE3 version uses aligned 16 byte loads and stores and handles
+// 8 pixels per loop iteration.
+void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
+                              int width);
+// Replace every channel of a row of ARGB pixels with the matching channel of
+// the color table entry it indexes, in place.
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+                           int width);
 // Used for blur.
 void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
                                 int width, int area, uint8* dst, int count);

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -293,16 +293,54 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
    int b = dst_argb[0];
    int g = dst_argb[1];
    int r = dst_argb[2];
-    int sb = (r * 35 + g * 68 + b * 17) >> 7;
+    int sb = (b * 17 + g * 68 + r * 35) >> 7;
-    int sg = (r * 45 + g * 88 + b * 22) >> 7;
+    int sg = (b * 22 + g * 88 + r * 45) >> 7;
-    int sr = (r * 50 + g * 98 + b * 24) >> 7;
+    int sr = (b * 24 + g * 98 + r * 50) >> 7;
+    // b does not over flow.  a is preserved from original.
+    if (sg > 255) {
+      sg = 255;
+    }
    if (sr > 255) {
      sr = 255;
    }
+    dst_argb[0] = sb;
+    dst_argb[1] = sg;
+    dst_argb[2] = sr;
+    dst_argb += 4;
+  }
+}
+// Apply color matrix to a row of image.  Matrix is signed.
+void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
+  for (int x = 0; x < width; ++x) {
+    int b = dst_argb[0];
+    int g = dst_argb[1];
+    int r = dst_argb[2];
+    int a = dst_argb[3];
+    int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
+              r * matrix_argb[2] + a * matrix_argb[3]) >> 7;
+    int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
+              r * matrix_argb[6] + a * matrix_argb[7]) >> 7;
+    int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
+              r * matrix_argb[10] + a * matrix_argb[11]) >> 7;
+    if (sb < 0) {
+      sb = 0;
+    }
+    if (sb > 255) {
+      sb = 255;
+    }
+    if (sg < 0) {
+      sg = 0;
+    }
    if (sg > 255) {
      sg = 255;
    }
-    // b does not over flow.  a is preserved from original.
+    if (sr < 0) {
+      sr = 0;
+    }
+    if (sr > 255) {
+      sr = 255;
+    }
    dst_argb[0] = sb;
    dst_argb[1] = sg;
    dst_argb[2] = sr;
@@ -310,6 +348,21 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
  }
 }
+// Remap one row of ARGB pixels through a color table, in place.
+// table_argb holds 256 entries of 4 bytes.  Each channel value selects an
+// entry, and the byte for that same channel replaces the original value.
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+  uint8* pixel = dst_argb;
+  for (int i = 0; i < width; ++i, pixel += 4) {
+    // Channels are independent, so each one can be looked up and stored in
+    // turn: 0 = B, 1 = G, 2 = R, 3 = A.
+    for (int channel = 0; channel < 4; ++channel) {
+      pixel[channel] = table_argb[pixel[channel] * 4 + channel];
+    }
+  }
+}
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
  // Copy a Y to RGB.
  for (int x = 0; x < width; ++x) {
@@ -790,9 +843,9 @@ YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
 YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
 #endif
 #if defined(HAS_I422TOARGBROW_NEON)
-YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C)
+YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
-YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C)
+YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
-YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C)
+YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
 #endif
 #undef YANY

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2800,7 +2800,7 @@ CONST vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
 };
-// Convert 8 ARGB pixels (64 bytes) to 8 Sepia ARGB pixels
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"
@@ -2859,6 +2859,69 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
 }
 #endif  // HAS_ARGBSEPIAROW_SSSE3
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// dst_argb must be 16 byte aligned and width a multiple of 8.
+// Each matrix row (4 signed bytes) is broadcast to a register.  pmaddubsw
+// multiplies the unsigned pixel bytes by the signed coefficients, and the
+// sums are treated as signed: phaddsw saturates instead of wrapping and
+// psraw keeps the sign, so packuswb clamps negative results to 0 and large
+// results to 255, as the C version does.
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
+                              int width) {
+  asm volatile (
+    "movd      (%2),%%xmm2                     \n"
+    "movd      0x4(%2),%%xmm3                  \n"
+    "movd      0x8(%2),%%xmm4                  \n"
+    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+    // 8 pixel loop.
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    // B channel.
+    "movdqa    (%0),%%xmm0                     \n"
+    "movdqa    0x10(%0),%%xmm6                 \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "pmaddubsw %%xmm2,%%xmm6                   \n"
+    "phaddsw   %%xmm6,%%xmm0                   \n"
+    "psraw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    // G channel, then interleave to BG.
+    "movdqa    (%0),%%xmm5                     \n"
+    "movdqa    0x10(%0),%%xmm1                 \n"
+    "pmaddubsw %%xmm3,%%xmm5                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "phaddsw   %%xmm1,%%xmm5                   \n"
+    "psraw     $0x7,%%xmm5                     \n"
+    "packuswb  %%xmm5,%%xmm5                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    // R channel.
+    "movdqa    (%0),%%xmm5                     \n"
+    "movdqa    0x10(%0),%%xmm1                 \n"
+    "pmaddubsw %%xmm4,%%xmm5                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "phaddsw   %%xmm1,%%xmm5                   \n"
+    "psraw     $0x7,%%xmm5                     \n"
+    "packuswb  %%xmm5,%%xmm5                   \n"
+    // A channel is copied from the source, then interleaved to RA.
+    "movdqa    (%0),%%xmm6                     \n"
+    "movdqa    0x10(%0),%%xmm1                 \n"
+    "psrld     $0x18,%%xmm6                    \n"
+    "psrld     $0x18,%%xmm1                    \n"
+    "packuswb  %%xmm1,%%xmm6                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "punpcklbw %%xmm6,%%xmm5                   \n"
+    // Weave BG and RA together and store 8 pixels.
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm5,%%xmm0                   \n"
+    "punpckhwd %%xmm5,%%xmm1                   \n"
+    "sub       $0x8,%1                         \n"
+    "movdqa    %%xmm0,(%0)                     \n"
+    "movdqa    %%xmm1,0x10(%0)                 \n"
+    "lea       0x20(%0),%0                     \n"
+    "jg        1b                              \n"
+  : "+r"(dst_argb),      // %0
+    "+r"(width)          // %1
+  : "r"(matrix_argb)     // %2
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value, inclusive of the value.

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2877,7 +2877,7 @@ static const vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
 };
-// Convert 8 ARGB pixels (64 bytes) to 8 Sepia ARGB pixels
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
 __declspec(naked) __declspec(align(16))
 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  __asm {
@@ -2930,6 +2930,117 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  }
 }
 #endif  // HAS_ARGBSEPIAROW_SSSE3
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// dst_argb must be 16 byte aligned and width a multiple of 8.
+// The matrix is signed, so the sums are treated as signed: phaddsw saturates
+// instead of wrapping and psraw keeps the sign, letting packuswb clamp
+// negative results to 0 and large results to 255, as the C version does.
+// TODO(fbarchard): packuswbs only use half of the reg.  To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
+// TODO(fbarchard): phaddsw not paired.
+// TODO(fbarchard): Test data copying from mem instead of from reg.
+// TODO(fbarchard): packing and then unpacking the A - is simple pand/por faster
+__declspec(naked) __declspec(align(16))
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
+                              int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* dst_argb */
+    mov        edx, [esp + 8]   /* matrix_argb */
+    mov        ecx, [esp + 12]  /* width */
+    movd       xmm2, [edx]      // B row of matrix
+    movd       xmm3, [edx + 4]  // G row of matrix
+    movd       xmm4, [edx + 8]  // R row of matrix
+    pshufd     xmm2, xmm2, 0
+    pshufd     xmm3, xmm3, 0
+    pshufd     xmm4, xmm4, 0
+    align      16
+ convertloop:
+    movdqa     xmm0, [eax]  // B
+    movdqa     xmm6, [eax + 16]
+    pmaddubsw  xmm0, xmm2
+    pmaddubsw  xmm6, xmm2
+    phaddsw    xmm0, xmm6
+    psraw      xmm0, 7
+    packuswb   xmm0, xmm0   // 8 B values
+    movdqa     xmm5, [eax]  // G
+    movdqa     xmm1, [eax + 16]
+    pmaddubsw  xmm5, xmm3
+    pmaddubsw  xmm1, xmm3
+    phaddsw    xmm5, xmm1
+    psraw      xmm5, 7
+    packuswb   xmm5, xmm5   // 8 G values
+    punpcklbw  xmm0, xmm5   // 8 BG values
+    movdqa     xmm5, [eax]  // R
+    movdqa     xmm1, [eax + 16]
+    pmaddubsw  xmm5, xmm4
+    pmaddubsw  xmm1, xmm4
+    phaddsw    xmm5, xmm1
+    psraw      xmm5, 7
+    packuswb   xmm5, xmm5   // 8 R values
+    movdqa     xmm6, [eax]  // A
+    movdqa     xmm1, [eax + 16]
+    psrld      xmm6, 24
+    psrld      xmm1, 24
+    packuswb   xmm6, xmm1
+    packuswb   xmm6, xmm6   // 8 A values
+    punpcklbw  xmm5, xmm6   // 8 RA values
+    movdqa     xmm1, xmm0   // Weave BG, RA together
+    punpcklwd  xmm0, xmm5   // BGRA first 4
+    punpckhwd  xmm1, xmm5   // BGRA next 4
+    sub        ecx, 8
+    movdqa     [eax], xmm0
+    movdqa     [eax + 16], xmm1
+    lea        eax, [eax + 32]
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table, in place, one pixel per loop.
+// Each channel value indexes the 256 entry table (4 bytes per entry) and is
+// replaced by the same channel of that entry.
+// The function is naked, so every callee-saved register it uses as scratch
+// (ebx, esi, edi, ebp) is pushed on entry and popped before returning.
+__declspec(naked) __declspec(align(16))
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+                           int width) {
+  __asm {
+    push       ebx
+    push       esi
+    push       edi
+    push       ebp
+    mov        eax, [esp + 16 + 4]   /* dst_argb */
+    mov        edi, [esp + 16 + 8]   /* table_argb */
+    mov        ecx, [esp + 16 + 12]  /* width */
+    xor        ebx, ebx
+    xor        edx, edx
+    align      16
+ convertloop:
+    mov        ebp, dword ptr [eax]  // BGRA
+    mov        esi, ebp
+    and        ebp, 255              // B index
+    shr        esi, 8
+    and        esi, 255              // G index
+    mov        bl, [edi + ebp * 4 + 0]  // B
+    mov        dl, [edi + esi * 4 + 1]  // G
+    mov        ebp, dword ptr [eax]  // BGRA
+    mov        esi, ebp
+    shr        ebp, 16
+    shr        esi, 24               // A index
+    and        ebp, 255              // R index
+    mov        [eax], bl
+    mov        [eax + 1], dl
+    mov        bl, [edi + ebp * 4 + 2]  // R
+    mov        dl, [edi + esi * 4 + 3]  // A
+    mov        [eax + 2], bl
+    mov        [eax + 3], dl
+    lea        eax, [eax + 4]
+    sub        ecx, 1
+    jg         convertloop
+    pop        ebp
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+#endif  // HAS_ARGBCOLORTABLEROW_X86
 #ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
 // Consider float CumulativeSum.

--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -487,4 +487,66 @@ TEST_F(libyuvTest, TestARGBSepia) {
    ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 256, 1);
  }
 }
+// Applies the sepia coefficients through ARGBColorMatrix and checks the
+// result for pure blue, green, red and one mixed color, then runs a timing
+// loop over a 256 pixel row.
+TEST_F(libyuvTest, TestARGBColorMatrix) {
+  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+  // Matrix for Sepia: rows produce B, G, R; columns weight B, G, R, A.
+  static const int8 kARGBToSepia[] = {
+    17, 68, 35, 0,
+    22, 88, 45, 0,
+    24, 98, 50, 0,
+  };
+  // Test blue
+  orig_pixels[0][0] = 255u;
+  orig_pixels[0][1] = 0u;
+  orig_pixels[0][2] = 0u;
+  orig_pixels[0][3] = 128u;
+  // Test green
+  orig_pixels[1][0] = 0u;
+  orig_pixels[1][1] = 255u;
+  orig_pixels[1][2] = 0u;
+  orig_pixels[1][3] = 0u;
+  // Test red
+  orig_pixels[2][0] = 0u;
+  orig_pixels[2][1] = 0u;
+  orig_pixels[2][2] = 255u;
+  orig_pixels[2][3] = 255u;
+  // Test color
+  orig_pixels[3][0] = 16u;
+  orig_pixels[3][1] = 64u;
+  orig_pixels[3][2] = 192u;
+  orig_pixels[3][3] = 224u;
+  // 16 pixels are processed below but only 4 are set above; clear the rest so
+  // the row function never reads uninitialized memory.
+  for (int i = 4; i < 16; ++i) {
+    orig_pixels[i][0] = 0u;
+    orig_pixels[i][1] = 0u;
+    orig_pixels[i][2] = 0u;
+    orig_pixels[i][3] = 0u;
+  }
+  // Do 16 to test asm version.
+  ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 16, 1);
+  EXPECT_EQ(33u, orig_pixels[0][0]);
+  EXPECT_EQ(43u, orig_pixels[0][1]);
+  EXPECT_EQ(47u, orig_pixels[0][2]);
+  EXPECT_EQ(128u, orig_pixels[0][3]);
+  EXPECT_EQ(135u, orig_pixels[1][0]);
+  EXPECT_EQ(175u, orig_pixels[1][1]);
+  EXPECT_EQ(195u, orig_pixels[1][2]);
+  EXPECT_EQ(0u, orig_pixels[1][3]);
+  EXPECT_EQ(69u, orig_pixels[2][0]);
+  EXPECT_EQ(89u, orig_pixels[2][1]);
+  EXPECT_EQ(99u, orig_pixels[2][2]);
+  EXPECT_EQ(255u, orig_pixels[2][3]);
+  EXPECT_EQ(88u, orig_pixels[3][0]);
+  EXPECT_EQ(114u, orig_pixels[3][1]);
+  EXPECT_EQ(127u, orig_pixels[3][2]);
+  EXPECT_EQ(224u, orig_pixels[3][3]);
+  // Fill the whole row with a gradient for the timing loop.
+  for (int i = 0; i < 256; ++i) {
+    orig_pixels[i][0] = i;
+    orig_pixels[i][1] = i / 2;
+    orig_pixels[i][2] = i / 3;
+    orig_pixels[i][3] = i;
+  }
+  // Timing loop: the same number of pixels as 1000 frames of 1280x720.
+  for (int i = 0; i < 1000 * 1280 * 720 / 256; ++i) {
+    ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 256, 1);
+  }
+}
 }  // namespace libyuv