Commit 966233e5 authored by fbarchard@google.com

Remove sub 16 from yuv conversions and change bias to include it.

BUG=388
TESTED=out\release\libyuv_unittest --gtest_catch_exceptions=0 --gtest_filter=*420ToARGB_Opt  | sortms
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/34609004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1216 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8723fc11
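
The change is easiest to see in scalar form: the per-pixel `Y - 16` subtract disappears, and the equivalent `YG * 16` term moves into the per-channel bias constants. A minimal C sketch of the before/after math, using the 6-bit fixed-point coefficients from the diff below (the function names are illustrative, not libyuv API; only the blue channel is shown, and the final `>> 6` shift and clamp are omitted):

```c
#include <assert.h>
#include <stdint.h>

enum { YG = 74, UB = 127, VB = 0 };  /* 6-bit fixed-point coefficients */

/* Before r1216: subtract 16 from Y per pixel; bias covers the UV terms only. */
static int blue_before(uint8_t y, uint8_t u, uint8_t v) {
  const int bb = UB * 128 + VB * 128;
  return (y - 16) * YG + (UB * u + VB * v - bb);
}

/* After r1216: YG * 16 is folded into the bias, so Y is scaled directly
 * and the per-pixel subtract on the Y vector can be dropped. */
static int blue_after(uint8_t y, uint8_t u, uint8_t v) {
  const int bb = UB * 128 + VB * 128 + YG * 16;
  return y * YG + (UB * u + VB * v - bb);
}

int main(void) {
  /* The transform is linear, so the two forms agree for every input. */
  for (int y = 0; y < 256; ++y)
    for (int u = 0; u < 256; ++u)
      assert(blue_before((uint8_t)y, (uint8_t)u, 0) ==
             blue_after((uint8_t)y, (uint8_t)u, 0));
  return 0;
}
```

The payoff is one fewer instruction (`psubsw`/`vpsubsw`) per eight or sixteen pixels in every SSSE3 and AVX2 row function touched below.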
README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1215
+Version: 1216
 License: BSD
 License File: LICENSE
include/libyuv/row.h
@@ -96,8 +96,7 @@ extern "C" {
 #define HAS_I422TOUYVYROW_SSE2
 #define HAS_I422TOYUY2ROW_SSE2
 #define HAS_I444TOARGBROW_SSSE3
-// TODO(fbarchard): Implement SSSE3 version of J422ToARGB
-//#define HAS_J422TOARGBROW_SSSE3
+// #define HAS_J422TOARGBROW_SSSE3
 #define HAS_MERGEUVROW_SSE2
 #define HAS_MIRRORROW_SSE2
 #define HAS_MIRRORROW_SSSE3
include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1215
+#define LIBYUV_VERSION 1216

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
source/row_any.cc
@@ -65,7 +65,7 @@ YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
 #ifdef HAS_J422TOARGBROW_SSSE3
 YANY(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, J422ToARGBRow_C,
      1, 4, 7)
-#endif  // HAS_I422TOARGBROW_SSSE3
+#endif  // HAS_J422TOARGBROW_SSSE3
 #ifdef HAS_I422TOARGBROW_AVX2
 YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
 #endif  // HAS_I422TOARGBROW_AVX2
source/row_posix.cc
@@ -881,11 +881,6 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 #endif  // HAS_ARGBTOYJROW_AVX2

 #ifdef HAS_ARGBTOUVROW_SSSE3
-// TODO(fbarchard): pass xmm constants to single block of assembly.
-// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
-// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
-// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
-// and considered unsafe.
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
@@ -1523,20 +1518,20 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
 }

 #ifdef HAS_I422TOARGBROW_SSSE3
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define YG 74 /* (int8)round(1.164 * 64 + 0.5) */
+
+#define UB 127 /* min(63,(int8)round(2.018 * 64)) */
+#define UG -25 /* (int8)round(-0.391 * 64 - 0.5) */
 #define UR 0

 #define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+#define VG -52 /* (int8)round(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)round(1.596 * 64 + 0.5) */

 // Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+#define BB (UB * 128 + VB * 128 + YG * 16)
+#define BG (UG * 128 + VG * 128 + YG * 16)
+#define BR (UR * 128 + VR * 128 + YG * 16)

 struct {
   vec8 kUVToB;  // 0
@@ -1545,11 +1540,10 @@ struct {
   vec16 kUVBiasB;  // 48
   vec16 kUVBiasG;  // 64
   vec16 kUVBiasR;  // 80
-  vec16 kYSub16;   // 96
-  vec16 kYToRgb;   // 112
-  vec8 kVUToB;     // 128
-  vec8 kVUToG;     // 144
-  vec8 kVUToR;     // 160
+  vec16 kYToRgb;   // 96
+  vec8 kVUToB;     // 112
+  vec8 kVUToG;     // 128
+  vec8 kVUToR;     // 144
 } static SIMD_ALIGNED(kYuvConstants) = {
   { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
   { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
@@ -1557,7 +1551,6 @@ struct {
   { BB, BB, BB, BB, BB, BB, BB, BB },
   { BG, BG, BG, BG, BG, BG, BG, BG },
   { BR, BR, BR, BR, BR, BR, BR, BR },
-  { 16, 16, 16, 16, 16, 16, 16, 16 },
   { YG, YG, YG, YG, YG, YG, YG, YG },
   { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
   { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
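
Two details worth noting about the reordered constants: `YG` now has to be defined before the bias macros because `BB`/`BG`/`BR` reference it, and the folded biases must still fit in the signed 16-bit lanes of `kUVBiasB/G/R`. A quick check of the latter (not part of the commit; the values follow from the defines above):

```c
#include <stdint.h>

#define YG 74
#define UB 127
#define UG -25
#define UR 0
#define VB 0
#define VG -52
#define VR 102
#define BB (UB * 128 + VB * 128 + YG * 16)  /* 16256 +     0 + 1184 =  17440 */
#define BG (UG * 128 + VG * 128 + YG * 16)  /* -3200 -  6656 + 1184 =  -8672 */
#define BR (UR * 128 + VR * 128 + YG * 16)  /*     0 + 13056 + 1184 =  14240 */

/* All three biases stay well inside int16 range, so the vec16 lanes
 * hold them exactly. */
_Static_assert(BB >= INT16_MIN && BB <= INT16_MAX, "BB fits in int16");
_Static_assert(BG >= INT16_MIN && BG <= INT16_MAX, "BG fits in int16");
_Static_assert(BR >= INT16_MIN && BR <= INT16_MAX, "BR fits in int16");
```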
@@ -1607,8 +1600,7 @@ struct {
     "movq       " MEMACCESS([y_buf]) ",%%xmm3                    \n" \
     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]                \n" \
     "punpcklbw  %%xmm4,%%xmm3                                    \n" \
-    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3       \n" \
-    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3      \n" \
+    "pmullw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3       \n" \
     "paddsw     %%xmm3,%%xmm0                                    \n" \
     "paddsw     %%xmm3,%%xmm1                                    \n" \
     "paddsw     %%xmm3,%%xmm2                                    \n" \
@@ -1623,17 +1615,16 @@ struct {
 #define YVUTORGB                                                      \
     "movdqa     %%xmm0,%%xmm1                                    \n" \
     "movdqa     %%xmm0,%%xmm2                                    \n" \
-    "pmaddubsw  " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0      \n" \
-    "pmaddubsw  " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1      \n" \
-    "pmaddubsw  " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2      \n" \
+    "pmaddubsw  " MEMACCESS2(112, [kYuvConstants]) ",%%xmm0      \n" \
+    "pmaddubsw  " MEMACCESS2(128, [kYuvConstants]) ",%%xmm1      \n" \
+    "pmaddubsw  " MEMACCESS2(144, [kYuvConstants]) ",%%xmm2      \n" \
     "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0       \n" \
     "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1       \n" \
     "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2       \n" \
     "movq       " MEMACCESS([y_buf]) ",%%xmm3                    \n" \
     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]                \n" \
     "punpcklbw  %%xmm4,%%xmm3                                    \n" \
-    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3       \n" \
-    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3      \n" \
+    "pmullw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3       \n" \
     "paddsw     %%xmm3,%%xmm0                                    \n" \
     "paddsw     %%xmm3,%%xmm1                                    \n" \
     "paddsw     %%xmm3,%%xmm2                                    \n" \
@@ -1767,7 +1758,7 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
     [dst_raw]"+r"(dst_raw),  // %[dst_raw]
 // TODO(fbarchard): Make width a register for 32 bit.
 #if defined(__APPLE__) && defined(__i386__)
     [width]"+m"(width)     // %[width]
 #else
     [width]"+rm"(width)    // %[width]
 #endif
@@ -2059,8 +2050,7 @@ struct {
   lvec16 kUVBiasB_AVX;  // 96
   lvec16 kUVBiasG_AVX;  // 128
   lvec16 kUVBiasR_AVX;  // 160
-  lvec16 kYSub16_AVX;   // 192
-  lvec16 kYToRgb_AVX;   // 224
+  lvec16 kYToRgb_AVX;   // 192
 } static SIMD_ALIGNED(kYuvConstants_AVX) = {
   { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
     UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
@@ -2074,8 +2064,6 @@ struct {
     BG, BG, BG, BG, BG, BG, BG, BG },
   { BR, BR, BR, BR, BR, BR, BR, BR,
     BR, BR, BR, BR, BR, BR, BR, BR },
-  { 16, 16, 16, 16, 16, 16, 16, 16,
-    16, 16, 16, 16, 16, 16, 16, 16 },
   { YG, YG, YG, YG, YG, YG, YG, YG,
     YG, YG, YG, YG, YG, YG, YG, YG }
 };
@@ -2102,8 +2090,7 @@ struct {
     "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]               \n" \
     "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
     "vpunpcklbw %%ymm4,%%ymm3,%%ymm3                             \n" \
-    "vpsubsw    " MEMACCESS2(192, [kYuvConstants]) ",%%ymm3,%%ymm3 \n" \
-    "vpmullw    " MEMACCESS2(224, [kYuvConstants]) ",%%ymm3,%%ymm3 \n" \
+    "vpmullw    " MEMACCESS2(192, [kYuvConstants]) ",%%ymm3,%%ymm3 \n" \
     "vpaddsw    %%ymm3,%%ymm0,%%ymm0                             \n" \
     "vpaddsw    %%ymm3,%%ymm1,%%ymm1                             \n" \
     "vpaddsw    %%ymm3,%%ymm2,%%ymm2                             \n" \
source/row_win.cc
@@ -24,20 +24,20 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
     (defined(_M_IX86) || defined(_M_X64))

-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+#define YG 74 /* (int8)round(1.164 * 64 + 0.5) */

-#define UB 127 /* min(127,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define UB 127 /* min(63,(int8)round(2.018 * 64)) */
+#define UG -25 /* (int8)round(-0.391 * 64 - 0.5) */
 #define UR 0

 #define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+#define VG -52 /* (int8)round(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)round(1.596 * 64 + 0.5) */

 // Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
+#define BB (UB * 128 + VB * 128 + YG * 16)
+#define BG (UG * 128 + VG * 128 + YG * 16)
+#define BR (UR * 128 + VR * 128 + YG * 16)

 static const vec8 kUVToB = {
   UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
@@ -64,7 +64,6 @@ static const vec8 kVUToG = {
 };

 static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
 static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
@@ -98,7 +97,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
     xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
     xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
     xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
-    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
     xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
     xmm0 = _mm_adds_epi16(xmm0, xmm3);
     xmm1 = _mm_adds_epi16(xmm1, xmm3);
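
The same saving, isolated in intrinsics form. A sketch assuming `y16` holds eight Y samples zero-extended to 16 bits, as `xmm3` does after the `_mm_unpacklo_epi8` above (the helper names and the passing of the constant vectors as arguments are illustrative, not libyuv's structure):

```c
#include <emmintrin.h>  /* SSE2 */

/* Before: saturating subtract of the splatted 16, then scale by YG. */
static __m128i scale_y_before(__m128i y16, __m128i ysub16, __m128i ytorgb) {
  y16 = _mm_subs_epi16(y16, ysub16);    /* Y - 16 */
  return _mm_mullo_epi16(y16, ytorgb);  /* (Y - 16) * YG */
}

/* After: the -YG * 16 term lives in kUVBiasB/G/R, so one multiply suffices. */
static __m128i scale_y_after(__m128i y16, __m128i ytorgb) {
  return _mm_mullo_epi16(y16, ytorgb);  /* Y * YG */
}
```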
@@ -1489,9 +1487,6 @@ static const lvec8 kUVToG_AVX = {
 static const lvec16 kYToRgb_AVX = {
   YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
 };
-static const lvec16 kYSub16_AVX = {
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-};
 static const lvec16 kUVBiasB_AVX = {
   BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
 };
@@ -1527,7 +1522,6 @@ static const lvec16 kUVBiasR_AVX = {
     __asm lea        eax, [eax + 16]                                     \
     __asm vpermq     ymm3, ymm3, 0xd8                                    \
     __asm vpunpcklbw ymm3, ymm3, ymm4                                    \
-    __asm vpsubsw    ymm3, ymm3, kYSub16_AVX                             \
     __asm vpmullw    ymm3, ymm3, kYToRgb_AVX                             \
     __asm vpaddsw    ymm0, ymm0, ymm3  /* B += Y */                      \
     __asm vpaddsw    ymm1, ymm1, ymm3  /* G += Y */                      \
@@ -1727,7 +1721,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
 }
 #endif  // HAS_I422TOABGRROW_AVX2

-#ifdef HAS_I422TOARGBROW_SSSE3
+#if defined(HAS_I422TOARGBROW_SSSE3)
 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.

 // Read 8 UV from 444.
@@ -1781,7 +1775,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm movq       xmm3, qword ptr [eax] /* NOLINT */                  \
     __asm lea        eax, [eax + 8]                                      \
     __asm punpcklbw  xmm3, xmm4                                          \
-    __asm psubsw     xmm3, kYSub16                                       \
     __asm pmullw     xmm3, kYToRgb                                       \
     __asm paddsw     xmm0, xmm3  /* B += Y */                            \
     __asm paddsw     xmm1, xmm3  /* G += Y */                            \
@@ -1809,7 +1802,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm movq       xmm3, qword ptr [eax] /* NOLINT */                  \
     __asm lea        eax, [eax + 8]                                      \
     __asm punpcklbw  xmm3, xmm4                                          \
-    __asm psubsw     xmm3, kYSub16                                       \
     __asm pmullw     xmm3, kYToRgb                                       \
     __asm paddsw     xmm0, xmm3  /* B += Y */                            \
     __asm paddsw     xmm1, xmm3  /* G += Y */                            \