implement I444ToABGR by swapping uv and transpose matrix

U contributes to B and G. V contributes to R and G. By swapping U and V, they contribute to the opposite channels. Adjust the matrix so the U contribution is in the matrix location such that it till contribute to the new B channel and vice versa. This allows ABGR versions of YUV conversion to use the same low level code as ARGB, just using a different matrix and swapping U and V pointers. As a result the existing I444ToABGRRow functions are no longer needed and are removed. Previously this function was only Intel AVX2 optimized for Windwos. Now it is also optimized for Arm and GCC. ARMv7 Neon Was LibYUVConvertTest.I444ToABGR_Opt (75971 ms) Now LibYUVConvertTest.I444ToABGR_Opt (3672 ms) 20.6 times faster. R=xhwang@chromium.org BUG=libyuv:515 Review URL: https://codereview.chromium.org/1414133006 .

implement I444ToABGR by swapping uv and transpose matrix
U contributes to B and G. V contributes to R and G. By swapping U and V, they contribute to the opposite channels. Adjust the matrix so the U contribution is in the matrix location such that it till contribute to the new B channel and vice versa. This allows ABGR versions of YUV conversion to use the same low level code as ARGB, just using a different matrix and swapping U and V pointers. As a result the existing I444ToABGRRow functions are no longer needed and are removed. Previously this function was only Intel AVX2 optimized for Windwos. Now it is also optimized for Arm and GCC. ARMv7 Neon Was LibYUVConvertTest.I444ToABGR_Opt (75971 ms) Now LibYUVConvertTest.I444ToABGR_Opt (3672 ms) 20.6 times faster. R=xhwang@chromium.org BUG=libyuv:515 Review URL: https://codereview.chromium.org/1414133006 .
cf160cdb · Frank Barchard · e8ee1755 · cf160cdb · cf160cdb · cf160cdb
Commit cf160cdb authored Oct 27, 2015 by Frank Barchard
8 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1524
+Version: 1525
 License: BSD
 License File: LICENSE


--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -120,7 +120,6 @@ extern "C" {
 #define HAS_I422TORGBAROW_SSSE3
 #define HAS_I422TOUYVYROW_SSE2
 #define HAS_I422TOYUY2ROW_SSE2
-#define HAS_I444TOABGRROW_SSSE3
 #define HAS_I444TOARGBROW_SSSE3
 #define HAS_J400TOARGBROW_SSE2
 #define HAS_J422TOABGRROW_SSSE3
@@ -246,7 +245,6 @@ extern "C" {
 #define HAS_I422TOARGB1555ROW_AVX2
 #define HAS_I422TOARGB4444ROW_AVX2
 #define HAS_I422TORGB565ROW_AVX2
-#define HAS_I444TOABGRROW_AVX2
 #define HAS_I444TOARGBROW_AVX2
 #define HAS_J400TOARGBROW_AVX2
 #define HAS_NV12TORGB565ROW_AVX2
@@ -460,6 +458,7 @@ struct YuvConstants {
 extern const struct YuvConstants kYuvIConstants;  // BT.601
 extern const struct YuvConstants kYuvJConstants;  // JPeg color space
 extern const struct YuvConstants kYuvHConstants;  // BT.709
+extern const struct YuvConstants kYvuIConstants;  // YVU to BGR BT.601

 #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
 #define OMITFP
@@ -1035,12 +1034,6 @@ void I444ToARGBRow_C(const uint8* src_y,
                     uint8* dst_argb,
                     const struct YuvConstants* yuvconstants,
                     int width);
-void I444ToABGRRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
 void I422ToARGBRow_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,
@@ -1210,30 +1203,6 @@ void I444ToARGBRow_AVX2(const uint8* src_y,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
-void I444ToABGRRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_abgr,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I444ToABGRRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_abgr,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I444ToABGRRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_abgr,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I444ToABGRRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_abgr,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
 void I422ToARGBRow_SSSE3(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -1452,18 +1421,6 @@ void I444ToARGBRow_Any_AVX2(const uint8* src_y,
                            uint8* dst_argb,
                            const struct YuvConstants* yuvconstants,
                            int width);
-void I444ToABGRRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_abgr,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void I444ToABGRRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_abgr,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
 void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1524
+#define LIBYUV_VERSION 1525

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -129,7 +129,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
                          width, height);
 }

-
 // Convert J444 to ARGB.
 LIBYUV_API
 int J444ToARGB(const uint8* src_y, int src_stride_y,
@@ -145,7 +144,6 @@ int J444ToARGB(const uint8* src_y, int src_stride_y,
                          width, height);
 }

-
 // Convert I444 to ABGR.
 LIBYUV_API
 int I444ToABGR(const uint8* src_y, int src_stride_y,
@@ -153,66 +151,12 @@ int I444ToABGR(const uint8* src_y, int src_stride_y,
               const uint8* src_v, int src_stride_v,
               uint8* dst_abgr, int dst_stride_abgr,
               int width, int height) {
-  int y;
-  void (*I444ToABGRRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I444ToABGRRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_abgr ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
-    dst_stride_abgr = -dst_stride_abgr;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u == width &&
-      src_stride_v == width &&
-      dst_stride_abgr == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
-  }
-#if defined(HAS_I444TOABGRROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I444ToABGRRow = I444ToABGRRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I444ToABGRRow = I444ToABGRRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I444TOABGRROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I444ToABGRRow = I444ToABGRRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I444ToABGRRow = I444ToABGRRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I444TOABGRROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I444ToABGRRow = I444ToABGRRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I444ToABGRRow = I444ToABGRRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I444ToABGRRow(src_y, src_u, src_v, dst_abgr, &kYuvIConstants, width);
-    dst_abgr += dst_stride_abgr;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-  }
-  return 0;
+  return I444ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuIConstants,
+                          width, height);
 }

 // Convert I422 to ARGB.

--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -126,9 +126,6 @@ ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
 ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
 ANY31C(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7)
 #endif  // HAS_I444TOARGBROW_SSSE3
-#ifdef HAS_I444TOABGRROW_SSSE3
-ANY31C(I444ToABGRRow_Any_SSSE3, I444ToABGRRow_SSSE3, 0, 0, 4, 7)
-#endif
 #ifdef HAS_I422TORGB24ROW_AVX2
 ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
 #endif
@@ -150,9 +147,6 @@ ANY31C(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15)
 #ifdef HAS_I444TOARGBROW_AVX2
 ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
 #endif
-#ifdef HAS_I444TOABGRROW_AVX2
-ANY31C(I444ToABGRRow_Any_AVX2, I444ToABGRRow_AVX2, 0, 0, 4, 15)
-#endif
 #ifdef HAS_I411TOARGBROW_AVX2
 ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
 #endif

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1018,8 +1018,6 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
 // BT.601 constants for YUV to RGB.
 // TODO(fbarchard): Unify these structures to be platform independent.
 // TODO(fbarchard): Generate SIMD structures from float matrix.
-
-// BT601 constants for YUV to RGB.
 #if defined(__aarch64__)
 const YuvConstants SIMD_ALIGNED(kYuvIConstants) = {
  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
@@ -1029,7 +1027,6 @@ const YuvConstants SIMD_ALIGNED(kYuvIConstants) = {
  { BB, BG, BR, 0, 0, 0, 0, 0 },
  { 0x0101 * YG, 0, 0, 0 }
 };
-
 #elif defined(__arm__)
 const YuvConstants SIMD_ALIGNED(kYuvIConstants) = {
  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
@@ -1099,6 +1096,42 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
  *r = Clamp((int32)(y1 + YGB) >> 6);
 }

+// BT.601 constants for YVU to BGR.
+// Allows YUV TO RGB code to implement YUV to BGR by swapping UV and using this
+// matrix.
+
+#if defined(__aarch64__)
+const YuvConstants SIMD_ALIGNED(kYvuIConstants) = {
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+
+#elif defined(__arm__)
+const YuvConstants SIMD_ALIGNED(kYvuIConstants) = {
+  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#else
+const YuvConstants SIMD_ALIGNED(kYvuIConstants) = {
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, UB, 0 },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, VR },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+#endif
+
 #undef BB
 #undef BG
 #undef BR
@@ -1279,34 +1312,6 @@ void I444ToARGBRow_C(const uint8* src_y,
    rgb_buf[3] = 255;
  }
 }
-
-void I444ToABGRRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
-    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
-    YuvPixel(src_y[0], u, v, rgb_buf + 2, rgb_buf + 1, rgb_buf + 0,
-             yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], u, v, rgb_buf + 6, rgb_buf + 5, rgb_buf + 4,
-             yuvconstants);
-    rgb_buf[7] = 255;
-    src_y += 2;
-    src_u += 2;
-    src_v += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0, yuvconstants);
-    rgb_buf[3] = 255;
-  }
-}
 #else
 void I444ToARGBRow_C(const uint8* src_y,
                     const uint8* src_u,
@@ -1325,24 +1330,6 @@ void I444ToARGBRow_C(const uint8* src_y,
    rgb_buf += 4;  // Advance 1 pixel.
  }
 }
-
-void I444ToABGRRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0, yuvconstants);
-    rgb_buf[3] = 255;
-    src_y += 1;
-    src_u += 1;
-    src_v += 1;
-    rgb_buf += 4;  // Advance 1 pixel.
-  }
-}
 #endif

 // Also used for 420

--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1619,33 +1619,6 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
  );
 }

-void OMITFP I444ToABGRRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_abgr,
-                                const struct YuvConstants* yuvconstants,
-                                int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV444
-    YUVTORGB(yuvconstants)
-    STOREABGR
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-
 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2267,45 +2267,6 @@ void I444ToARGBRow_AVX2(const uint8* y_buf,
 }
 #endif  // HAS_I444TOARGBROW_AVX2

-#ifdef HAS_I444TOABGRROW_AVX2
-// 16 pixels
-// 16 UV values with 16 Y producing 16 ABGR (64 bytes).
-__declspec(naked)
-void I444ToABGRRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_abgr,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
-    mov        edi, [esp + 12 + 12]  // V
-    mov        edx, [esp + 12 + 16]  // abgr
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
- convertloop:
-    READYUV444_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREABGR_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I444TOABGRROW_AVX2
-
 #ifdef HAS_I411TOARGBROW_AVX2
 // 16 pixels
 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
@@ -2870,43 +2831,6 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
  }
 }

-// 8 pixels.
-// 8 UV values, mixed with 8 Y producing 8 ABGR (32 bytes).
-__declspec(naked)
-void I444ToABGRRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_abgr,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
-    mov        edi, [esp + 12 + 12]  // V
-    mov        edx, [esp + 12 + 16]  // abgr
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
-
- convertloop:
-    READYUV444
-    YUVTORGB(ebx)
-    STOREABGR
-
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
 __declspec(naked)