Commit 3d6b5658 authored by Frank Barchard, committed by Commit Bot

AR30ToARGB using shifts and masking to vectorize

AR30ToARGB vectorizes when the output channels are combined into a
single 32-bit int store instead of 4 separate byte stores.
Performance is about 2x faster:
Was AR30ToARGB_Opt (1585 ms)
Now AR30ToARGB_Opt (746 ms)

Bug: libyuv:777
Test: LibYUVConvertTest.AR30ToARGB_Opt
Change-Id: Idd47ae599d5d125207bb53e618d6d7e784d4a37c
Reviewed-on: https://chromium-review.googlesource.com/923169
Reviewed-by: Miguel Casas <mcasas@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
parent 9c9215b2
......@@ -55,6 +55,10 @@ int ARGBToRGBA(const uint8_t* src_argb,
int width,
int height);
// Aliases
#define ARGBToAB30 ABGRToAR30
#define ABGRToAB30 ARGBToAR30
// Convert ABGR To AR30.
LIBYUV_API
int ABGRToAR30(const uint8_t* src_abgr,
......
......@@ -182,14 +182,11 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
uint32_t ar30 = *(uint32_t*)src_ar30;
uint32_t b = ar30 & 0x3ff;
uint32_t g = (ar30 >> 10) & 0x3ff;
uint32_t r = (ar30 >> 20) & 0x3ff;
uint32_t a = (ar30 >> 30) & 0x3;
dst_argb[0] = b >> 2;
dst_argb[1] = g >> 2;
dst_argb[2] = r >> 2;
dst_argb[3] = a * 0x55;
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits.
*(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24);
dst_argb += 4;
src_ar30 += 4;
}
......@@ -199,14 +196,11 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
int x;
for (x = 0; x < width; ++x) {
uint32_t ar30 = *(uint32_t*)src_ar30;
uint32_t b = ar30 & 0x3ff;
uint32_t g = (ar30 >> 10) & 0x3ff;
uint32_t r = (ar30 >> 20) & 0x3ff;
uint32_t a = (ar30 >> 30) & 0x3;
dst_abgr[0] = r >> 2;
dst_abgr[1] = g >> 2;
dst_abgr[2] = b >> 2;
dst_abgr[3] = a * 0x55;
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits.
*(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24);
dst_abgr += 4;
src_ar30 += 4;
}
......@@ -217,10 +211,9 @@ void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
for (x = 0; x < width; ++x) {
uint32_t ar30 = *(uint32_t*)src_ar30;
uint32_t b = ar30 & 0x3ff;
uint32_t g = (ar30 >> 10) & 0x3ff;
uint32_t ga = ar30 & 0xc00ffc00;
uint32_t r = (ar30 >> 20) & 0x3ff;
uint32_t a = (ar30 >> 30) & 0x3;
*(uint32_t*)(dst_ab30) = r | (g << 10) | (b << 20) | (a << 30);
*(uint32_t*)(dst_ab30) = r | ga | (b << 20);
dst_ab30 += 4;
src_ar30 += 4;
}
......
......@@ -513,15 +513,21 @@ TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
MaskCpuFlags(disable_cpu_flags_); \
double time0 = get_time(); \
FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \
kWidth, NEG kHeight); \
double time1 = get_time(); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
src_v + OFF, kStrideUV, dst_argb_opt + OFF, \
kStrideB, kWidth, NEG kHeight); \
} \
double time2 = get_time(); \
printf(" %8d us C - %8d us OPT\n", \
static_cast<int>((time1 - time0) * 1e6), \
static_cast<int>((time2 - time1) * 1e6 / benchmark_iterations_)); \
int max_diff = 0; \
/* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
......@@ -1952,6 +1958,10 @@ TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
TESTPLANETOE(AR30, 1, 4, ABGR, 1, 4, ARGB, 4)
TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4)
TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4)
TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4)
TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4)
TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
// 2x2 frames
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment