AR30ToARGB using shifts and masking to vectorize

AR30ToARGB will vectorize if the output channels are combined with shifts and masks into a single int store instead of 4 byte stores. Performance is 2x faster. Was: AR30ToARGB_Opt (1585 ms). Now: AR30ToARGB_Opt (746 ms). Bug: libyuv:777 Test: LibYUVConvertTest.AR30ToARGB_Opt Change-Id: Idd47ae599d5d125207bb53e618d6d7e784d4a37c Reviewed-on: https://chromium-review.googlesource.com/923169 Reviewed-by: Miguel Casas <mcasas@chromium.org> Commit-Queue: Frank Barchard <fbarchard@chromium.org>

AR30ToARGB using shifts and masking to vectorize
AR30ToARGB will vectorize if the output channels are combined with shifts and masks into a single int store instead of 4 byte stores. Performance is 2x faster. Was: AR30ToARGB_Opt (1585 ms). Now: AR30ToARGB_Opt (746 ms). Bug: libyuv:777 Test: LibYUVConvertTest.AR30ToARGB_Opt Change-Id: Idd47ae599d5d125207bb53e618d6d7e784d4a37c Reviewed-on: https://chromium-review.googlesource.com/923169 Reviewed-by: Miguel Casas <mcasas@chromium.org> Commit-Queue: Frank Barchard <fbarchard@chromium.org>
3d6b5658 · Frank Barchard · Commit Bot · 9c9215b2 · 3d6b5658 · 3d6b5658
Commit 3d6b5658 authored Feb 16, 2018 by Frank Barchard Committed by Commit Bot Feb 16, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 26 additions and 19 deletions

convert_from_argb.h include/libyuv/convert_from_argb.h +4 -0

row_common.cc source/row_common.cc +12 -19

convert_test.cc unit_test/convert_test.cc +10 -0

No files found.
--- a/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@@ -55,6 +55,10 @@ int ARGBToRGBA(const uint8_t* src_argb,
               int width,
               int height);

+// Aliases
+#define ARGBToAB30 ABGRToAR30
+#define ABGRToAB30 ARGBToAR30
+
 // Convert ABGR To AR30.
 LIBYUV_API
 int ABGRToAR30(const uint8_t* src_abgr,

--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -182,14 +182,11 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30 = *(uint32_t*)src_ar30;
-    uint32_t b = ar30 & 0x3ff;
-    uint32_t g = (ar30 >> 10) & 0x3ff;
-    uint32_t r = (ar30 >> 20) & 0x3ff;
-    uint32_t a = (ar30 >> 30) & 0x3;
-    dst_argb[0] = b >> 2;
-    dst_argb[1] = g >> 2;
-    dst_argb[2] = r >> 2;
-    dst_argb[3] = a * 0x55;
+    uint32_t b = (ar30 >> 2) & 0xff;
+    uint32_t g = (ar30 >> 12) & 0xff;
+    uint32_t r = (ar30 >> 22) & 0xff;
+    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
+    *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24);
    dst_argb += 4;
    src_ar30 += 4;
  }
@@ -199,14 +196,11 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30 = *(uint32_t*)src_ar30;
-    uint32_t b = ar30 & 0x3ff;
-    uint32_t g = (ar30 >> 10) & 0x3ff;
-    uint32_t r = (ar30 >> 20) & 0x3ff;
-    uint32_t a = (ar30 >> 30) & 0x3;
-    dst_abgr[0] = r >> 2;
-    dst_abgr[1] = g >> 2;
-    dst_abgr[2] = b >> 2;
-    dst_abgr[3] = a * 0x55;
+    uint32_t b = (ar30 >> 2) & 0xff;
+    uint32_t g = (ar30 >> 12) & 0xff;
+    uint32_t r = (ar30 >> 22) & 0xff;
+    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
+    *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24);
    dst_abgr += 4;
    src_ar30 += 4;
  }
@@ -217,10 +211,9 @@ void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
  for (x = 0; x < width; ++x) {
    uint32_t ar30 = *(uint32_t*)src_ar30;
    uint32_t b = ar30 & 0x3ff;
-    uint32_t g = (ar30 >> 10) & 0x3ff;
+    uint32_t ga = ar30 & 0xc00ffc00;
    uint32_t r = (ar30 >> 20) & 0x3ff;
-    uint32_t a = (ar30 >> 30) & 0x3;
-    *(uint32_t*)(dst_ab30) = r | (g << 10) | (b << 20) | (a << 30);
+    *(uint32_t*)(dst_ab30) = r | ga | (b << 20);
    dst_ab30 += 4;
    src_ar30 += 4;
  }

--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -513,15 +513,21 @@ TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
    memset(dst_argb_c + OFF, 1, kStrideB * kHeight);                           \
    memset(dst_argb_opt + OFF, 101, kStrideB * kHeight);                       \
    MaskCpuFlags(disable_cpu_flags_);                                          \
+    double time0 = get_time();                                                 \
    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV,         \
                          src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB,  \
                          kWidth, NEG kHeight);                                \
+    double time1 = get_time();                                                 \
    MaskCpuFlags(benchmark_cpu_info_);                                         \
    for (int i = 0; i < benchmark_iterations_; ++i) {                          \
      FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV,       \
                            src_v + OFF, kStrideUV, dst_argb_opt + OFF,        \
                            kStrideB, kWidth, NEG kHeight);                    \
    }                                                                          \
+    double time2 = get_time();                                                 \
+    printf(" %8d us C - %8d us OPT\n",                                         \
+         static_cast<int>((time1 - time0) * 1e6),                              \
+         static_cast<int>((time2 - time1) * 1e6 / benchmark_iterations_));     \
    int max_diff = 0;                                                          \
    /* Convert to ARGB so 565 is expanded to bytes that can be compared. */    \
    align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight);               \
@@ -1952,6 +1958,10 @@ TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
 TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
 TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
 TESTPLANETOE(AR30, 1, 4, ABGR, 1, 4, ARGB, 4)
+TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4)
+TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4)
+TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4)
+TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4)

 TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
  // 2x2 frames