Full metal BCS

BUG=none TEST=Luma* unittest R=thorcarpenter@google.com Review URL: https://webrtc-codereview.appspot.com/3029004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@828 16f28f9a-4ce2-e073-06de-1de4eb20be90

Full metal BCS
BUG=none TEST=Luma* unittest R=thorcarpenter@google.com Review URL: https://webrtc-codereview.appspot.com/3029004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@828 16f28f9a-4ce2-e073-06de-1de4eb20be90
6f7e514c · fbarchard@google.com · fb99c030 · 6f7e514c · 6f7e514c · 6f7e514c
Commit 6f7e514c authored Oct 28, 2013 by fbarchard@google.com
7 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 827
+Version: 828
 License: BSD
 License File: LICENSE


--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -144,6 +144,7 @@ extern "C" {
 // Effects:
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_RGBCOLORTABLEROW_X86
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
 #endif

 // The following are available on all x86 platforms, including NaCL, but
@@ -173,8 +174,6 @@ extern "C" {
 // TODO(fbarchard): Port to gcc.
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 // Effects:
-// TODO(fbarchard): Optimize and enable
-// #define HAS_ARGBLUMACOLORTABLEROW_SSSE3

 // Caveat: Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 827
+#define LIBYUV_VERSION 828

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1848,7 +1848,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
  }
  // ARGBToBayer used to select G channel from ARGB.
  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
-                         uint32 selector, int pix) = ARGBToBayerRow_C;
+                         uint32 selector, int pix) = ARGBToBayerGGRow_C;
 #if defined(HAS_ARGBTOBAYERGGROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
@@ -2014,9 +2014,15 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
                   uint8* dst_argb, int dst_stride_argb,
                   const float* poly,
                   int width, int height) {
-  if (!src_argb || !dst_argb || !poly || width <= 0 || height <= 0) {
+  if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
    return -1;
  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
  // Coalesce rows.
  if (src_stride_argb == width * 4 &&
      dst_stride_argb == width * 4) {
@@ -2052,9 +2058,15 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_argb, int dst_stride_argb,
                       const uint8* luma,
                       int width, int height) {
-  if (!src_argb || !dst_argb || !luma || width <= 0 || height <= 0) {
+  if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
    return -1;
  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
  // Coalesce rows.
  if (src_stride_argb == width * 4 &&
      dst_stride_argb == width * 4) {

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1864,7 +1864,7 @@ void ARGBToBayerRow_C(const uint8* src_argb,
 // Select G channel from ARGB.  e.g.  GGGGGGGG
 void ARGBToBayerGGRow_C(const uint8* src_argb,
                        uint8* dst_bayer, uint32 /*selector*/, int pix) {
-  // Copy a row of Bayer.
+  // Copy a row of G.
  for (int x = 0; x < pix - 1; x += 2) {
    dst_bayer[0] = src_argb[1];
    dst_bayer[1] = src_argb[5];

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -6336,7 +6336,7 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Tranform RGB pixels with color table.
 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
-    uintptr_t pixel_temp = 0u;
+  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    ".p2align  4                               \n"
@@ -6361,6 +6361,104 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
 }
 #endif  // HAS_RGBCOLORTABLEROW_X86

+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform ARGB pixels with a luma-selected table row; alpha is copied as is.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
+                                 uint8* dst_argb, const uint8* luma,
+                                 int width) {
+  uintptr_t pixel_temp = 0u;
+  uintptr_t table_temp = 0u;
+  asm volatile (
+    "movdqa    %6,%%xmm3                       \n"  // kARGBToYJ weights
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // generate mask 0xff00ff00
+    "psllw     $0x8,%%xmm4                     \n"
+    "pxor      %%xmm5,%%xmm5                   \n"  // zero for unpack
+
+    // 4 pixel loop.
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    "movdqu    (%2),%%xmm0                     \n"  // all 4 pixels (16 bytes)
+    "pmaddubsw %%xmm3,%%xmm0                   \n"  // weight channel pairs
+    "phaddw    %%xmm0,%%xmm0                   \n"  // one sum per pixel
+    "pand      %%xmm4,%%xmm0                   \n"  // mask out low bits
+    "punpcklwd %%xmm5,%%xmm0                   \n"  // 4 x 32 bit offsets
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"  // add table base
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate right 32
+
+    "movzb     (%2),%0                         \n"
+    "movzb     (%1,%0,1),%0                    \n"
+    "mov       %b0,(%3)                        \n"
+    "movzb     0x1(%2),%0                      \n"
+    "movzb     (%1,%0,1),%0                    \n"
+    "mov       %b0,0x1(%3)                     \n"
+    "movzb     0x2(%2),%0                      \n"
+    "movzb     (%1,%0,1),%0                    \n"
+    "mov       %b0,0x2(%3)                     \n"
+    "movzb     0x3(%2),%0                      \n"  // copy alpha.
+    "mov       %b0,0x3(%3)                     \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+
+    "movzb     0x4(%2),%0                      \n"
+    "movzb     (%1,%0,1),%0                    \n"
+    "mov       %b0,0x4(%3)                     \n"
+    "movzb     0x5(%2),%0                      \n"
+    "movzb     (%1,%0,1),%0                    \n"
+    "mov       %b0,0x5(%3)                     \n"
+    "movzb     0x6(%2),%0                      \n"
+    "movzb     (%1,%0,1),%0                    \n"
+    "mov       %b0,0x6(%3)                     \n"
+    "movzb     0x7(%2),%0                      \n"  // copy alpha.
+    "mov       %b0,0x7(%3)                     \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+
+    "movzb     0x8(%2),%0                      \n"
+    "movzb     (%1,%0,1),%0                    \n"
+    "mov       %b0,0x8(%3)                     \n"
+    "movzb     0x9(%2),%0                      \n"
+    "movzb     (%1,%0,1),%0                    \n"
+    "mov       %b0,0x9(%3)                     \n"
+    "movzb     0xa(%2),%0                      \n"
+    "movzb     (%1,%0,1),%0                    \n"
+    "mov       %b0,0xa(%3)                     \n"
+    "movzb     0xb(%2),%0                      \n"  // copy alpha.
+    "mov       %b0,0xb(%3)                     \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+
+    "movzb     0xc(%2),%0                      \n"
+    "movzb     (%1,%0,1),%0                    \n"
+    "mov       %b0,0xc(%3)                     \n"
+    "movzb     0xd(%2),%0                      \n"
+    "movzb     (%1,%0,1),%0                    \n"
+    "mov       %b0,0xd(%3)                     \n"
+    "movzb     0xe(%2),%0                      \n"
+    "movzb     (%1,%0,1),%0                    \n"
+    "mov       %b0,0xe(%3)                     \n"
+    "movzb     0xf(%2),%0                      \n"  // copy alpha.
+    "mov       %b0,0xf(%3)                     \n"
+    "sub       $0x4,%4                         \n"  // sets flags for jg
+    "lea       0x10(%2),%2                     \n"  // lea keeps flags
+    "lea       0x10(%3),%3                     \n"
+    "jg        1b                              \n"
+  : "+d"(pixel_temp),  // %0
+    "+b"(table_temp),  // %1
+    "+r"(src_argb),    // %2
+    "+r"(dst_argb),    // %3
+    "+rm"(width)       // %4
+  : "rm"(luma),        // %5
+    "m"(kARGBToYJ)     // %6
+  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
 #endif  // defined(__x86_64__) || defined(__i386__)

 #ifdef __cplusplus

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -5202,83 +5202,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 }
 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3

-#ifdef HAS_ARGBCOLORTABLEROW_X86
-
-static uvec8 kMaskB = {
-  255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
-};
-static uvec8 kMaskG = {
-  0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0,
-};
-static uvec8 kMaskR = {
-  0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0,
-};
-static uvec8 kMaskA = {
-  0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255,
-};
-
-// Tranform ARGB pixels with color table.
-__declspec(naked) __declspec(align(16))
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
-                           int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   /* dst_argb */
-    mov        esi, [esp + 4 + 8]   /* table_argb */
-    mov        ecx, [esp + 4 + 12]  /* width */
-
-  convertloop:
-    movzx      edx, byte ptr [eax]
-    lea        eax, [eax + 4]
-    movzx      edx, byte ptr [esi + edx * 4]
-    mov        byte ptr [eax - 4], dl
-    movzx      edx, byte ptr [eax - 4 + 1]
-    movzx      edx, byte ptr [esi + edx * 4 + 1]
-    mov        byte ptr [eax - 4 + 1], dl
-    movzx      edx, byte ptr [eax - 4 + 2]
-    movzx      edx, byte ptr [esi + edx * 4 + 2]
-    mov        byte ptr [eax - 4 + 2], dl
-    movzx      edx, byte ptr [eax - 4 + 3]
-    movzx      edx, byte ptr [esi + edx * 4 + 3]
-    mov        byte ptr [eax - 4 + 3], dl
-    dec        ecx
-    jg         convertloop
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_ARGBCOLORTABLEROW_X86
-
-#ifdef HAS_RGBCOLORTABLEROW_X86
-// Tranform RGB pixels with color table.
-__declspec(naked) __declspec(align(16))
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   /* dst_argb */
-    mov        esi, [esp + 4 + 8]   /* table_argb */
-    mov        ecx, [esp + 4 + 12]  /* width */
-
-  convertloop:
-    movzx      edx, byte ptr [eax]
-    lea        eax, [eax + 4]
-    movzx      edx, byte ptr [esi + edx * 4]
-    mov        byte ptr [eax - 4], dl
-    movzx      edx, byte ptr [eax - 4 + 1]
-    movzx      edx, byte ptr [esi + edx * 4 + 1]
-    mov        byte ptr [eax - 4 + 1], dl
-    movzx      edx, byte ptr [eax - 4 + 2]
-    movzx      edx, byte ptr [esi + edx * 4 + 2]
-    mov        byte ptr [eax - 4 + 2], dl
-    dec        ecx
-    jg         convertloop
-
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_RGBCOLORTABLEROW_X86
-
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
 // Aligned to 16 bytes.
@@ -7149,72 +7072,171 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2

+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+__declspec(naked) __declspec(align(16))
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+                           int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   /* dst_argb */
+    mov        esi, [esp + 4 + 8]   /* table_argb */
+    mov        ecx, [esp + 4 + 12]  /* width */

-// RGB to Luminance.
-// Leverage the fact that we want shifted left by 8 by the caller.
-//
-// Borrowed from libyuv/files/source/row_common.cc.
-// JPeg 7 bit Y:
-// b 0.11400 * 128 = 14.592 = 15
-// g 0.58700 * 128 = 75.136 = 75
-// r 0.29900 * 128 = 38.272 = 38
+    // 1 pixel loop.
+    align      4
+  convertloop:
+    movzx      edx, byte ptr [eax]
+    lea        eax, [eax + 4]
+    movzx      edx, byte ptr [esi + edx * 4]
+    mov        byte ptr [eax - 4], dl
+    movzx      edx, byte ptr [eax - 4 + 1]
+    movzx      edx, byte ptr [esi + edx * 4 + 1]
+    mov        byte ptr [eax - 4 + 1], dl
+    movzx      edx, byte ptr [eax - 4 + 2]
+    movzx      edx, byte ptr [esi + edx * 4 + 2]
+    mov        byte ptr [eax - 4 + 2], dl
+    movzx      edx, byte ptr [eax - 4 + 3]
+    movzx      edx, byte ptr [esi + edx * 4 + 3]
+    mov        byte ptr [eax - 4 + 3], dl
+    dec        ecx
+    jg         convertloop
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBCOLORTABLEROW_X86

-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
 __declspec(naked) __declspec(align(16))
-void ARGBToYJx4_SSSE3(const uint8* src_argb, const uint8* luma, uint8** lut) {
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    movdqa     xmm0, [eax]
-    pmaddubsw  xmm0, kARGBToYJ
-    movd       xmm1, [esp + 8]  /* luma */
-    mov        edx, [esp + 12]  /* lut */
-    phaddw     xmm0, xmm0
-    pshufd     xmm1, xmm1, 0
-    pxor       xmm2, xmm2
-    psrlw      xmm0, 8
-    psllw      xmm0, 8     // 0y0y0y0y
-    punpcklwd  xmm0, xmm2  // 000y000y000y000y
-    paddd      xmm0, xmm1  // lum0lum1lum2lum3
-    movdqa     [edx], xmm0
+    push       esi
+    mov        eax, [esp + 4 + 4]   /* dst_argb */
+    mov        esi, [esp + 4 + 8]   /* table_argb */
+    mov        ecx, [esp + 4 + 12]  /* width */
+
+    // 1 pixel loop.
+    align      4
+  convertloop:
+    movzx      edx, byte ptr [eax]
+    lea        eax, [eax + 4]
+    movzx      edx, byte ptr [esi + edx * 4]
+    mov        byte ptr [eax - 4], dl
+    movzx      edx, byte ptr [eax - 4 + 1]
+    movzx      edx, byte ptr [esi + edx * 4 + 1]
+    mov        byte ptr [eax - 4 + 1], dl
+    movzx      edx, byte ptr [eax - 4 + 2]
+    movzx      edx, byte ptr [esi + edx * 4 + 2]
+    mov        byte ptr [eax - 4 + 2], dl
+    dec        ecx
+    jg         convertloop
+
+    pop        esi
    ret
  }
 }
+#endif  // HAS_RGBCOLORTABLEROW_X86

+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform ARGB pixels with a luma-selected table row; alpha is copied as is.
+__declspec(naked) __declspec(align(16))
 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
                                 uint8* dst_argb, const uint8* luma,
                                 int width) {
-  SIMD_ALIGNED(uint8* lut4[4]);
-  for (int i = 0; i < width - 3; i += 4) {
-    ARGBToYJx4_SSSE3(src_argb, luma, lut4);
-    // Luminance in rows, color values in columns.
-    const uint8* luma0 = lut4[0];
-    dst_argb[0] = luma0[src_argb[0]];
-    dst_argb[1] = luma0[src_argb[1]];
-    dst_argb[2] = luma0[src_argb[2]];
-    dst_argb[3] = src_argb[3];
-
-    luma0 = lut4[1];
-    dst_argb[4] = luma0[src_argb[4]];
-    dst_argb[5] = luma0[src_argb[5]];
-    dst_argb[6] = luma0[src_argb[6]];
-    dst_argb[7] = src_argb[7];
-
-    luma0 = lut4[2];
-    dst_argb[8] = luma0[src_argb[8]];
-    dst_argb[9] = luma0[src_argb[9]];
-    dst_argb[10] = luma0[src_argb[10]];
-    dst_argb[11] = src_argb[11];
-
-    luma0 = lut4[3];
-    dst_argb[12] = luma0[src_argb[12]];
-    dst_argb[13] = luma0[src_argb[13]];
-    dst_argb[14] = luma0[src_argb[14]];
-    dst_argb[15] = src_argb[15];
-
-    src_argb += 16;
-    dst_argb += 16;
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   /* src_argb */
+    mov        edi, [esp + 8 + 8]   /* dst_argb */
+    movd       xmm2, dword ptr [esp + 8 + 12]  /* luma */
+    pshufd     xmm2, xmm2, 0    // broadcast table base to all 4 lanes
+    mov        ecx, [esp + 8 + 16]  /* width */
+    movdqa     xmm3, kARGBToYJ
+    pcmpeqb    xmm4, xmm4       // generate mask 0xff00ff00
+    psllw      xmm4, 8
+    pxor       xmm5, xmm5
+
+    // 4 pixel loop.
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]  // load all 4 pixels (16 bytes) for 4 luma ptrs
+    pmaddubsw  xmm0, xmm3
+    phaddw     xmm0, xmm0
+    pand       xmm0, xmm4  // mask out low bits
+    punpcklwd  xmm0, xmm5
+    paddd      xmm0, xmm2  // add table base
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi], dl
+    movzx      edx, byte ptr [eax + 1]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 1], dl
+    movzx      edx, byte ptr [eax + 2]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 2], dl
+    movzx      edx, byte ptr [eax + 3]  // copy alpha.
+    mov        byte ptr [edi + 3], dl
+
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax + 4]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 4], dl
+    movzx      edx, byte ptr [eax + 5]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 5], dl
+    movzx      edx, byte ptr [eax + 6]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 6], dl
+    movzx      edx, byte ptr [eax + 7]  // copy alpha.
+    mov        byte ptr [edi + 7], dl
+
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax + 8]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 8], dl
+    movzx      edx, byte ptr [eax + 9]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 9], dl
+    movzx      edx, byte ptr [eax + 10]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 10], dl
+    movzx      edx, byte ptr [eax + 11]  // copy alpha.
+    mov        byte ptr [edi + 11], dl
+
+    movd       esi, xmm0
+
+    movzx      edx, byte ptr [eax + 12]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 12], dl
+    movzx      edx, byte ptr [eax + 13]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 13], dl
+    movzx      edx, byte ptr [eax + 14]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 14], dl
+    movzx      edx, byte ptr [eax + 15]  // copy alpha.
+    mov        byte ptr [edi + 15], dl
+
+    sub        ecx, 4
+    lea        eax, [eax + 16]
+    lea        edi, [edi + 16]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
  }
 }
+#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)