I420ToARGB conversion with matrix.

Take color conversion constants as a parameter to row function for I420ToARGBMatrixRow_SSSE3. Allows future variations of color space using a single low level. R=harryjin@google.com BUG=libyuv:488 Review URL: https://webrtc-codereview.appspot.com/56669004 .

I420ToARGB conversion with matrix.
Take color conversion constants as a parameter to row function for I420ToARGBMatrixRow_SSSE3. Allows future variations of color space using a single low level. R=harryjin@google.com BUG=libyuv:488 Review URL: https://webrtc-codereview.appspot.com/56669004 .
925c3d9e · Frank Barchard · 0bc626a5 · 925c3d9e · 925c3d9e · 925c3d9e
Commit 925c3d9e authored Sep 02, 2015 by Frank Barchard
6 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1475
+Version: 1476
 License: BSD
 License File: LICENSE

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -88,6 +88,7 @@ extern "C" {
 #define HAS_I422TOARGB1555ROW_SSSE3
 #define HAS_I422TOARGB4444ROW_SSSE3
 #define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TOARGBMATRIXROW_SSSE3
 #define HAS_I422TOBGRAROW_SSSE3
 #define HAS_I422TORAWROW_SSSE3
 #define HAS_I422TORGB24ROW_SSSE3
@@ -161,6 +162,7 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
    (!defined(__clang__) || defined(__SSSE3__))
 #define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TOARGBMATRIXROW_SSSE3
 #endif
 // GCC >= 4.7.0 required for AVX2.
@@ -223,6 +225,7 @@ extern "C" {
 #define HAS_I400TOARGBROW_AVX2
 #define HAS_I422TOABGRROW_AVX2
 #define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TOARGBMATRIXROW_AVX2
 #define HAS_I422TOBGRAROW_AVX2
 #define HAS_I422TORAWROW_AVX2
 #define HAS_I422TORGB24ROW_AVX2
@@ -290,6 +293,8 @@ extern "C" {
 #define HAS_I422TOARGB1555ROW_NEON
 #define HAS_I422TOARGB4444ROW_NEON
 #define HAS_I422TOARGBROW_NEON
+// TODO(fbarchard): Implement NEON version of I422ToARGBMatrixRow.
+#define HAS_I422TOARGBMATRIXROW_NEON
 #define HAS_I422TOBGRAROW_NEON
 #define HAS_I422TORAWROW_NEON
 #define HAS_I422TORGB24ROW_NEON
@@ -414,6 +419,21 @@ typedef uint32 ulvec32[8];
 typedef uint8 ulvec8[32];
 #endif
+// Intel (x86/x64) YUV to RGB constants; asm relies on the byte offsets below.
+#if defined(_M_IX86) || defined(_M_X64) || \
+    defined(__x86_64__) || defined(__i386__)
+struct YuvConstants {
+  lvec8 kUVToB;     // Offset 0: U,V multipliers for blue.
+  lvec8 kUVToG;     // Offset 32: U,V multipliers for green.
+  lvec8 kUVToR;     // Offset 64: U,V multipliers for red.
+  lvec16 kUVBiasB;  // Offset 96: bias the blue UV product is subtracted from.
+  lvec16 kUVBiasG;  // Offset 128: same, for green.
+  lvec16 kUVBiasR;  // Offset 160: same, for red.
+  lvec16 kYToRgb;   // Offset 192: Y multiplier (high 16 bits of product kept).
+};
+#endif  // Intel targets only.
 #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
 #define OMITFP
 #else
@@ -509,6 +529,12 @@ void I422ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width);
+void I422ToARGBMatrixRow_NEON(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              struct YuvConstants* YuvConstants,
+                              int width);
 void I411ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
@@ -962,6 +988,12 @@ void I422ToARGBRow_C(const uint8* src_y,
                     const uint8* src_v,
                     uint8* dst_argb,
                     int width);
+void I422ToARGBMatrixRow_C(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb,
+                           struct YuvConstants* YuvConstants,
+                           int width);
 void I411ToARGBRow_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,
@@ -1039,6 +1071,12 @@ void I422ToARGBRow_AVX2(const uint8* src_y,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width);
+void I422ToARGBMatrixRow_AVX2(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              struct YuvConstants* YuvConstants,
+                              int width);
 void I422ToBGRARow_AVX2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
@@ -1069,6 +1107,12 @@ void I422ToARGBRow_SSSE3(const uint8* src_y,
                         const uint8* src_v,
                         uint8* dst_argb,
                         int width);
+void I422ToARGBMatrixRow_SSSE3(const uint8* src_y,
+                               const uint8* src_u,
+                               const uint8* src_v,
+                               uint8* dst_argb,
+                               struct YuvConstants* YuvConstants,
+                               int width);
 void I411ToARGBRow_SSSE3(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -1203,6 +1247,12 @@ void I422ToARGBRow_Any_AVX2(const uint8* src_y,
                            const uint8* src_v,
                            uint8* dst_argb,
                            int width);
+void I422ToARGBMatrixRow_Any_AVX2(const uint8* src_y,
+                                  const uint8* src_u,
+                                  const uint8* src_v,
+                                  uint8* dst_argb,
+                                  struct YuvConstants* YuvConstants,
+                                  int width);
 void I422ToBGRARow_Any_AVX2(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
@@ -1233,6 +1283,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
                             const uint8* src_v,
                             uint8* dst_argb,
                             int width);
+void I422ToARGBMatrixRow_Any_SSSE3(const uint8* src_y,
+                                   const uint8* src_u,
+                                   const uint8* src_v,
+                                   uint8* dst_argb,
+                                   struct YuvConstants* YuvConstants,
+                                   int width);
 void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
@@ -1463,7 +1519,13 @@ void I422ToARGBRow_Any_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb,
                            int width);
+void I422ToARGBMatrixRow_Any_NEON(const uint8* src_y,
+                                  const uint8* src_u,
+                                  const uint8* src_v,
+                                  uint8* dst_argb,
+                                  struct YuvConstants* YuvConstants,
+                                  int width);
 void I411ToARGBRow_Any_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1475
+#define LIBYUV_VERSION 1476
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2156,6 +2156,51 @@ void I422ToUYVYRow_C(const uint8* src_y,
  }
 }
+#if defined(HAS_I422TOARGBMATRIXROW_SSSE3)
+extern struct YuvConstants kYuvConstants;   // BT.601 YUV to RGB constants.
+extern struct YuvConstants kYuvJConstants;  // JPEG; TODO confirm non-static in row_gcc.cc
+// JPEG color space I422ToARGB: forwards to the matrix row with kYuvJConstants.
+void J422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+  I422ToARGBMatrixRow_SSSE3(y_buf, u_buf, v_buf, dst_argb,
+                            &kYuvJConstants, width);
+}
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {  // Forwards with BT.601 kYuvConstants.
+  I422ToARGBMatrixRow_SSSE3(y_buf, u_buf, v_buf, dst_argb,
+                            &kYuvConstants, width);
+}
+#if defined(HAS_I422TOARGBMATRIXROW_AVX2)  // Nested inside the SSSE3 guard.
+// JPEG color space I422ToARGB: forwards to the matrix row with kYuvJConstants.
+void J422ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  I422ToARGBMatrixRow_AVX2(y_buf, u_buf, v_buf, dst_argb,
+                           &kYuvJConstants, width);
+}
+void I422ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {  // Forwards with BT.601 kYuvConstants.
+  I422ToARGBMatrixRow_AVX2(y_buf, u_buf, v_buf, dst_argb,
+                           &kYuvConstants, width);
+}
+#endif  // HAS_I422TOARGBMATRIXROW_AVX2
+#endif  // HAS_I422TOARGBMATRIXROW_SSSE3
 // Maximum temporary width for wrappers to process at a time, in pixels.
 #define MAXTWIDTH 2048

--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1319,16 +1319,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
-struct YuvConstants {
-  lvec8 kUVToB;     // 0
-  lvec8 kUVToG;     // 32
-  lvec8 kUVToR;     // 64
-  lvec16 kUVBiasB;  // 96
-  lvec16 kUVBiasG;  // 128
-  lvec16 kUVBiasR;  // 160
-  lvec16 kYToRgb;   // 192
-};
 // BT.601 YUV to RGB reference
 //  R = (Y - 16) * 1.164              - V * -1.596
 //  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
@@ -1351,7 +1341,7 @@ struct YuvConstants {
 #define BR            (VR * 128 + YGB)
 // BT601 constants for YUV to RGB.
-static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+YuvConstants SIMD_ALIGNED(kYuvConstants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
@@ -1365,7 +1355,7 @@ static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
 };
 // BT601 constants for NV21 where chroma plane is VU instead of UV.
-static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+YuvConstants SIMD_ALIGNED(kYvuConstants) = {
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
@@ -1658,11 +1648,12 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
  );
 }
-void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
+void OMITFP I422ToARGBMatrixRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
+                                      const uint8* u_buf,
-                                const uint8* v_buf,
+                                      const uint8* v_buf,
-                                uint8* dst_argb,
+                                      uint8* dst_argb,
-                                int width) {
+                                      struct YuvConstants* YuvConstants,
+                                      int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
@@ -1678,33 +1669,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : [kYuvConstants]"r"(YuvConstants)  // %[YuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
-                                int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(kYuvConstants)
-    STOREARGB
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
@@ -1939,56 +1904,15 @@ void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
 }
 #endif  // HAS_I422TOBGRAROW_AVX2
-#if defined(HAS_I422TOARGBROW_AVX2)
+#if defined(HAS_I422TOARGBMATRIXROW_AVX2)
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
-                               int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
-    // Step 3: Weave into ARGB
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
-    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
-    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
-    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
-    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
-    "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
-}
-#endif  // HAS_I422TOARGBROW_AVX2
-#if defined(HAS_J422TOARGBROW_AVX2)
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
+void OMITFP I422ToARGBMatrixRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
+                                     const uint8* u_buf,
-                               const uint8* v_buf,
+                                     const uint8* v_buf,
-                               uint8* dst_argb,
+                                     uint8* dst_argb,
-                               int width) {
+                                     struct YuvConstants* YuvConstants,
+                                     int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
@@ -2016,12 +1940,12 @@ void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB)  // %[kYuvConstants]
+  : [kYuvConstants]"r"(YuvConstants)  // %[YuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
 }
-#endif  // HAS_J422TOARGBROW_AVX2
+#endif  // HAS_I422TOARGBMATRIXROW_AVX2
 #if defined(HAS_I422TOABGRROW_AVX2)
 // 16 pixels

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -25,16 +25,6 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && \
    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
-struct YuvConstants {
-  lvec8 kUVToB;
-  lvec8 kUVToG;
-  lvec8 kUVToR;
-  lvec16 kUVBiasB;
-  lvec16 kUVBiasG;
-  lvec16 kUVBiasR;
-  lvec16 kYToRgb;
-};
 #define KUVTOB   0
 #define KUVTOG   32
 #define KUVTOR   64
@@ -65,7 +55,7 @@ struct YuvConstants {
 #define BR            (VR * 128 + YGB)
 // BT601 constants for YUV to RGB.
-static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+YuvConstants SIMD_ALIGNED(kYuvConstants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
@@ -79,7 +69,7 @@ static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
 };
 // BT601 constants for NV21 where chroma plane is VU instead of UV.
-static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+YuvConstants SIMD_ALIGNED(kYvuConstants) = {
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
@@ -124,7 +114,7 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
 #define BRJ             (VRJ * 128 + YGBJ)
 // JPEG constants for YUV to RGB.
-static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
@@ -155,12 +145,13 @@ static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
 // 64 bit
 #if defined(_M_X64)
-#if defined(HAS_I422TOARGBROW_SSSE3)
+#if defined(HAS_I422TOARGBMATRIXROW_SSSE3)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
+void I422ToARGBMatrixRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
+                               const uint8* u_buf,
-                         const uint8* v_buf,
+                               const uint8* v_buf,
-                         uint8* dst_argb,
+                               uint8* dst_argb,
-                         int width) {
+                               struct YuvConstants* YuvConstants,
+                               int width) {
  __m128i xmm0, xmm1, xmm2, xmm3;
  const __m128i xmm5 = _mm_set1_epi8(-1);
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
@@ -172,15 +163,15 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
    xmm1 = _mm_loadu_si128(&xmm0);
    xmm2 = _mm_loadu_si128(&xmm0);
-    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
+    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)YuvConstants->kUVToB);
-    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
+    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)YuvConstants->kUVToG);
-    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
+    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)YuvConstants->kUVToR);
-    xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
+    xmm0 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasB, xmm0);
-    xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
+    xmm1 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasG, xmm1);
-    xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
+    xmm2 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasR, xmm2);
    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
    xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
-    xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
+    xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)YuvConstants->kYToRgb);
    xmm0 = _mm_adds_epi16(xmm0, xmm3);
    xmm1 = _mm_adds_epi16(xmm1, xmm3);
    xmm2 = _mm_adds_epi16(xmm2, xmm3);
@@ -2012,77 +2003,45 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
    __asm lea        edx,  [edx + 64]                                          \
  }
-#ifdef HAS_I422TOARGBROW_AVX2
+#ifdef HAS_I422TOARGBMATRIXROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
 __declspec(naked)
-void I422ToARGBRow_AVX2(const uint8* y_buf,
+void I422ToARGBMatrixRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
+                              const uint8* u_buf,
-                        const uint8* v_buf,
+                              const uint8* v_buf,
-                        uint8* dst_argb,
+                              uint8* dst_argb,
-                        int width) {
+                              struct YuvConstants* YuvConstants,
+                              int width) {
  __asm {
    push       esi
    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
+    push       ebp                   // 3 pushes: args start at esp + 12 + 4
-    mov        esi, [esp + 8 + 8]   // U
+    mov        eax, [esp + 12 + 4]   // Y
-    mov        edi, [esp + 8 + 12]  // V
+    mov        esi, [esp + 12 + 8]   // U
-    mov        edx, [esp + 8 + 16]  // argb
+    mov        edi, [esp + 12 + 12]  // V
-    mov        ecx, [esp + 8 + 20]  // width
+    mov        edx, [esp + 12 + 16]  // argb
-    sub        edi, esi
+    mov        ebp, [esp + 12 + 20]  // YuvConstants (5th argument)
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    mov        ecx, [esp + 12 + 24]  // width (6th argument)
- convertloop:
-    READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvConstants)
-    STOREARGB_AVX2
-    sub        ecx, 16
-    jg         convertloop
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I422TOARGBROW_AVX2
-#ifdef HAS_J422TOARGBROW_AVX2
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void J422ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 convertloop:
    READYUV422_AVX2
-    YUVTORGB_AVX2(kYuvJConstants)
+    YUVTORGB_AVX2(ebp)
    STOREARGB_AVX2
    sub        ecx, 16
    jg         convertloop
+    pop        ebp
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
 }
-#endif  // HAS_J422TOARGBROW_AVX2
+#endif  // HAS_I422TOARGBMATRIXROW_AVX2
 #ifdef HAS_I444TOARGBROW_AVX2
 // 16 pixels
@@ -2691,11 +2650,12 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 __declspec(naked)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
+void I422ToARGBMatrixRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
+                               const uint8* u_buf,
-                         const uint8* v_buf,
+                               const uint8* v_buf,
-                         uint8* dst_argb,
+                               uint8* dst_argb,
-                         int width) {
+                               struct YuvConstants* YuvConstants,
+                               int width) {
  __asm {
    push       esi
    push       edi
@@ -2704,8 +2664,9 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
-    mov        ecx, [esp + 12 + 20]  // width
+    mov        ebp, [esp + 12 + 20]  // YuvConstants
-    lea        ebp, kYuvConstants
+    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
@@ -2724,40 +2685,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
  }
 }
-// 8 pixels.
-// JPeg color space version of I422ToARGB
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void J422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
- convertloop:
-    READYUV422
-    YUVTORGB(kYuvJConstants)
-    STOREARGB
-    sub        ecx, 8
-    jg         convertloop
-    pop        edi
-    pop        esi
-    ret
-  }
-}
 // 8 pixels.
 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 // Similar to I420 but duplicate UV once more.