remove row table, make C use math that mimics SIMD for exactness. Also 2x…

remove row table, make C use math that mimics SIMD for exactness. Also 2x faster than old code which mimicked old SIMD via tables. 9000 ms instead of 20000 ms BUG=none TEST=none Review URL: http://webrtc-codereview.appspot.com/267020 git-svn-id: http://libyuv.googlecode.com/svn/trunk@85 16f28f9a-4ce2-e073-06de-1de4eb20be90

remove row table, make C use math that mimics SIMD for exactness. Also 2x…
remove row table, make C use math that mimics SIMD for exactness. Also 2x faster than old code which mimicked old SIMD via tables. 9000 ms instead of 20000 ms BUG=none TEST=none Review URL: http://webrtc-codereview.appspot.com/267020 git-svn-id: http://libyuv.googlecode.com/svn/trunk@85 16f28f9a-4ce2-e073-06de-1de4eb20be90
bc8f28eb · fbarchard@google.com · 15c3d45c · bc8f28eb · bc8f28eb · bc8f28eb
Commit bc8f28eb authored Nov 18, 2011 by fbarchard@google.com
6 changed files
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 84
+Version: 85
 License: BSD
 License File: LICENSE


--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -44,7 +44,6 @@
        'source/planar_functions.cc',
        'source/rotate.cc',
        'source/row_common.cc',
-        'source/row_table.cc',
        'source/scale.cc',
        'source/video_common.cc',
      ],

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -220,43 +220,41 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
 }

 // C reference code that mimics the YUV assembly.
-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
-    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
-
-static inline void YuvPixel(uint8 y,
-                            uint8 u,
-                            uint8 v,
-                            uint8* rgb_buf,
-                            int ashift,
-                            int rshift,
-                            int gshift,
-                            int bshift) {
-
-  int b = kCoefficientsRgbY[256+u][0];
-  int g = kCoefficientsRgbY[256+u][1];
-  int r = kCoefficientsRgbY[256+u][2];
-  int a = kCoefficientsRgbY[256+u][3];
-
-  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
-  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
-  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
-  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
-
-  b = paddsw(b, kCoefficientsRgbY[y][0]);
-  g = paddsw(g, kCoefficientsRgbY[y][1]);
-  r = paddsw(r, kCoefficientsRgbY[y][2]);
-  a = paddsw(a, kCoefficientsRgbY[y][3]);
-
-  b >>= 6;
-  g >>= 6;
-  r >>= 6;
-  a >>= 6;
-
-  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
-                                        (packuswb(g) << gshift) |
-                                        (packuswb(r) << rshift) |
-                                        (packuswb(a) << ashift);
+
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5); luma gain with 6 fractional bits */
+
+#define UB 127 /* 2.018 * 64 is about 129; 127 is used instead, presumably clamped to the int8 maximum */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0 /* U has no red contribution */
+
+#define VB 0 /* V has no blue contribution */
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias: the chroma terms evaluated at u = v = 128; YuvPixel subtracts it so neutral chroma contributes zero. The expansions are unparenthesized, so wrap each use in parentheses.
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+static inline uint32 Clip(int32 val) {  // Clamp val to the 8-bit range [0, 255].
+  if (val < 0) {
+    return (uint32) 0;  // Below the range: clamp to 0.
+  } else if (val > 255){
+    return (uint32) 255;  // Above the range: clamp to 255.
+  }
+  return (uint32) val;  // Already within [0, 255]; returned unchanged.
+}
+
+static inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,  // Convert one (y, u, v) sample to a packed 32-bit pixel written at rgb_buf.
+                            int ashift, int rshift, int gshift, int bshift) {  // Bit offsets of the A, R, G, B bytes within the output word.
+  int32 y1 = (static_cast<int32>(y) - 16) * YG;  // Luma term: subtract the 16 offset, then scale by YG (coefficients carry 6 fractional bits).
+  uint32 b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);  // Chroma minus bias plus luma; >> 6 drops the fraction, Clip clamps to 0..255.
+  uint32 g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);  // Same formula with the green coefficients.
+  uint32 r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);  // Same formula with the red coefficients.
+  *reinterpret_cast<uint32*>(rgb_buf) = (b << bshift) |  // One 32-bit store. NOTE(review): presumably relies on rgb_buf being suitably aligned for uint32 - confirm.
+                                        (g << gshift) |
+                                        (r << rshift) |
+                                        (255u << ashift);  // Alpha is always written as 255.
 }

 void FastConvertYUVToARGBRow_C(const uint8* y_buf,

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -381,9 +381,9 @@ struct {
  "punpcklbw   %%xmm4,%%xmm3                   \n"                             \
  "psubsw      96(%5),%%xmm3                   \n"                             \
  "pmullw      112(%5),%%xmm3                  \n"                             \
-  "paddw       %%xmm3,%%xmm0                   \n"                             \
-  "paddw       %%xmm3,%%xmm1                   \n"                             \
-  "paddw       %%xmm3,%%xmm2                   \n"                             \
+  "paddsw      %%xmm3,%%xmm0                   \n"                             \
+  "paddsw      %%xmm3,%%xmm1                   \n"                             \
+  "paddsw      %%xmm3,%%xmm2                   \n"                             \
  "psraw       $0x6,%%xmm0                     \n"                             \
  "psraw       $0x6,%%xmm1                     \n"                             \
  "psraw       $0x6,%%xmm2                     \n"                             \
@@ -528,9 +528,9 @@ void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,  // rdi
    "punpcklbw   %%xmm4,%%xmm3                 \n"
    "psubsw      96(%5),%%xmm3                 \n"
    "pmullw      112(%5),%%xmm3                \n"
-    "paddw       %%xmm3,%%xmm0                 \n"
-    "paddw       %%xmm3,%%xmm1                 \n"
-    "paddw       %%xmm3,%%xmm2                 \n"
+    "paddsw      %%xmm3,%%xmm0                 \n"
+    "paddsw      %%xmm3,%%xmm1                 \n"
+    "paddsw      %%xmm3,%%xmm2                 \n"
    "psraw       $0x6,%%xmm0                   \n"
    "psraw       $0x6,%%xmm1                   \n"
    "psraw       $0x6,%%xmm2                   \n"

--- a/source/row_table.cc
+++ b/source/row_table.cc
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -574,9 +574,9 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
    __asm punpcklbw  xmm3, xmm4                                                \
    __asm psubsw     xmm3, kYSub16                                             \
    __asm pmullw     xmm3, kYToRgb                                             \
-    __asm paddw      xmm0, xmm3           /* B += Y */                         \
-    __asm paddw      xmm1, xmm3           /* G += Y */                         \
-    __asm paddw      xmm2, xmm3           /* R += Y */                         \
+    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
+    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
+    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
    __asm psraw      xmm0, 6                                                   \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
@@ -744,9 +744,9 @@ void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
    punpcklbw  xmm3, xmm4
    psubsw     xmm3, kYSub16
    pmullw     xmm3, kYToRgb
-    paddw      xmm0, xmm3           // B += Y
-    paddw      xmm1, xmm3           // G += Y
-    paddw      xmm2, xmm3           // R += Y
+    paddsw     xmm0, xmm3           // B += Y
+    paddsw     xmm1, xmm3           // G += Y
+    paddsw     xmm2, xmm3           // R += Y
    psraw      xmm0, 6
    psraw      xmm1, 6
    psraw      xmm2, 6