Commit 6825b161 authored by Frank Barchard, committed by Commit Bot

HalfFloat SSE2/AVX2 optimized port scheduling.

Uses 1 add instead of 2 leas to reduce pressure on ports 1 and 5, which are
also used for SIMD instructions.
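
In sketch form, the change precomputes the dst-src delta once and folds the
destination address into the store, so only one add is issued per iteration
(illustrative registers, mirroring the row_win.cc hunks below):

    // Before: two leas per iteration compete with SIMD uops for ports 1/5.
    //   lea    eax, [eax + 16]           // advance src
    //   movdqu [edx], xmm2
    //   lea    edx, [edx + 16]           // advance dst
    // After: one add; dst is recovered from src plus a fixed delta.
    //   sub    edx, eax                  // before the loop: edx = dst - src
    //   add    eax, 16                   // advance src only
    //   movdqu [eax + edx - 16], xmm2    // store at src + (dst - src)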

BUG=libyuv:670
TEST=~/iaca-lin64/bin/iaca.sh -arch HSW out/Release/obj/libyuv/row_gcc.o

Change-Id: I3965ee5dcb49941a535efa611b5988d977f5b65c
Reviewed-on: https://chromium-review.googlesource.com/433391
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
parent 7a54d0a3
@@ -202,9 +202,8 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
uint8 b1 = src_argb[4] >> 3;
uint8 g1 = src_argb[5] >> 2;
uint8 r1 = src_argb[6] >> 3;
WRITEWORD(
dst_rgb,
b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27));
WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
(r1 << 27));
dst_rgb += 4;
src_argb += 8;
}
@@ -238,9 +237,8 @@ void ARGBToRGB565DitherRow_C(const uint8* src_argb,
uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
WRITEWORD(
dst_rgb,
b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27));
WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
(r1 << 27));
dst_rgb += 4;
src_argb += 8;
}
@@ -5457,12 +5456,13 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
asm volatile (
"pshufd $0x0,%3,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n"
  // 8 pixel loop.
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n" // 8 shorts
"lea " MEMLEA(0x10,0) ",%0 \n"
"add $0x10,%0 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
"cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
@@ -5473,8 +5473,7 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
"psrld $0xd,%%xmm2 \n"
"psrld $0xd,%%xmm3 \n"
"packssdw %%xmm3,%%xmm2 \n"
"movdqu %%xmm2," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
@@ -5488,17 +5487,17 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
#endif // HAS_HALFFLOATROW_SSE2
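
For reference, a scalar C sketch of the bit trick the SSE2/AVX2 loops rely on
(hypothetical helper name; kExpBias as declared in row_win.cc): multiplying by
2^-112 rebiases the float exponent from the float bias (127) to the half-float
bias (15), so shifting the float bits right by 13 yields the IEEE half-float
encoding directly.

    #include <stdint.h>
    #include <string.h>

    static const float kExpBias = 1.9259299444e-34f;  // 2^-112

    // Hypothetical scalar equivalent of one lane of HalfFloatRow_SSE2:
    // cvtdq2ps + mulps, then psrld $13 (the pack just narrows to 16 bits).
    static uint16_t ScaleToHalf(uint16_t v, float scale) {
      float f = (float)v * (scale * kExpBias);
      uint32_t bits;
      memcpy(&bits, &f, sizeof(bits));  // reinterpret the float bits
      return (uint16_t)(bits >> 13);
    }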
#ifdef HAS_HALFFLOATROW_AVX2
// TODO(fbarchard): consider vpaddw instead of vmulps
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
asm volatile (
"vbroadcastss %3, %%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
"sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts
"lea " MEMLEA(0x20,0) ",%0 \n"
"add $0x20,%0 \n"
"vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
"vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
"vcvtdq2ps %%ymm3,%%ymm3 \n"
@@ -5508,10 +5507,10 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
"vpsrld $0xd,%%ymm3,%%ymm3 \n"
"vpsrld $0xd,%%ymm2,%%ymm2 \n"
"vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
"vmovdqu %%ymm2," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -5526,26 +5525,25 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
#ifdef HAS_HALFFLOATROW_F16C
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
asm volatile (
"vbroadcastss %3, %%ymm4 \n"
"vbroadcastss %3, %%ymm4 \n"
"sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
"vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
"vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"vcvtdq2ps %%ymm2,%%ymm2 \n"
"vcvtdq2ps %%ymm3,%%ymm3 \n"
"vmulps %%ymm2,%%ymm4,%%ymm2 \n"
"vmulps %%ymm3,%%ymm4,%%ymm3 \n"
"vcvtps2ph $3, %%ymm2, %%xmm2 \n"
"vcvtps2ph $3, %%ymm3, %%xmm3 \n"
"vmovdqu %%xmm2," MEMACCESS(1) " \n"
"vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
"add $0x20,%0 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -5560,22 +5558,21 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
asm volatile (
"sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
"vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
"vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"vcvtdq2ps %%ymm2,%%ymm2 \n"
"vcvtdq2ps %%ymm3,%%ymm3 \n"
"vcvtps2ph $3, %%ymm2, %%xmm2 \n"
"vcvtps2ph $3, %%ymm3, %%xmm3 \n"
"vmovdqu %%xmm2," MEMACCESS(1) " \n"
"vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
"add $0x20,%0 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
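
The F16C variants use vcvtps2ph instead of the exponent-bias trick. A rough
intrinsics model of the loop body, 8 pixels at a time (illustrative only; the
library's actual implementation stays in inline asm, and this assumes width is
a multiple of 8):

    #include <immintrin.h>
    #include <stdint.h>

    static void HalfFloatRow_F16C_model(const uint16_t* src, uint16_t* dst,
                                        float scale, int width) {
      __m256 vscale = _mm256_set1_ps(scale);            // vbroadcastss
      for (int i = 0; i < width; i += 8) {
        __m256i v = _mm256_cvtepu16_epi32(              // vpmovzxwd
            _mm_loadu_si128((const __m128i*)(src + i)));
        __m256 f = _mm256_mul_ps(_mm256_cvtepi32_ps(v), vscale);
        __m128i h = _mm256_cvtps_ph(f, 3);              // round toward zero
        _mm_storeu_si128((__m128i*)(dst + i), h);
      }
    }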
@@ -6070,11 +6070,12 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
mulss xmm4, kExpBias
pshufd xmm4, xmm4, 0
pxor xmm5, xmm5
sub edx, eax
// 8 pixel loop.
convertloop:
movdqu xmm2, xmmword ptr [eax] // 8 shorts
lea eax, [eax + 16]
add eax, 16
movdqa xmm3, xmm2
punpcklwd xmm2, xmm5
cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
@@ -6085,8 +6086,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
psrld xmm2, 13
psrld xmm3, 13
packssdw xmm2, xmm3
movdqu [edx], xmm2
lea edx, [edx + 16]
movdqu [eax + edx - 16], xmm2
sub ecx, 8
jg convertloop
ret
@@ -6108,11 +6108,12 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
vmulss xmm4, xmm4, kExpBias
vbroadcastss ymm4, xmm4
vpxor ymm5, ymm5, ymm5
sub edx, eax
// 16 pixel loop.
convertloop:
vmovdqu ymm2, [eax] // 16 shorts
lea eax, [eax + 32]
add eax, 32
vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
vpunpcklwd ymm2, ymm2, ymm5
vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
@@ -6122,8 +6123,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
vpsrld ymm2, ymm2, 13
vpackssdw ymm2, ymm2, ymm3
vmovdqu [edx], ymm2
lea edx, [edx + 32]
vmovdqu [eax + edx - 32], ymm2
sub ecx, 16
jg convertloop
vzeroupper
@@ -6142,21 +6142,21 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
mov edx, [esp + 8] /* dst */
vbroadcastss ymm4, [esp + 12] /* scale */
mov ecx, [esp + 16] /* width */
sub edx, eax
// 16 pixel loop.
convertloop:
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
lea eax, [eax + 32]
add eax, 32
vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
vcvtdq2ps ymm3, ymm3
vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
vmulps ymm3, ymm3, ymm4
vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
vcvtps2ph xmm3, ymm3, 3
vmovdqu [edx], xmm2
vmovdqu [edx + 16], xmm3
lea edx, [edx + 32]
vmovdqu [eax + edx - 32], xmm2
vmovdqu [eax + edx - 32 + 16], xmm3
sub ecx, 16
jg convertloop
vzeroupper
@@ -45,9 +45,10 @@ static void ScalePlaneDown2(int src_width,
int y;
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
filtering == kFilterNone ? ScaleRowDown2_C : (filtering == kFilterLinear
? ScaleRowDown2Linear_C
: ScaleRowDown2Box_C);
filtering == kFilterNone
? ScaleRowDown2_C
: (filtering == kFilterLinear ? ScaleRowDown2Linear_C
: ScaleRowDown2Box_C);
int row_stride = src_stride << 1;
if (!filtering) {
src_ptr += src_stride; // Point to odd rows.
@@ -30,113 +30,107 @@ namespace libyuv {
#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
align_buffer_page_end( \
src_u, \
SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end( \
src_v, \
SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
align_buffer_page_end( \
dst_u_c, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end( \
dst_v_c, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
align_buffer_page_end( \
dst_u_opt, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end( \
dst_v_opt, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
(fastrand() & 0xff); \
src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
(fastrand() & 0xff); \
} \
} \
memset(dst_y_c, 1, kWidth* kHeight); \
memset(dst_u_c, 2, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_v_c, 3, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_y_opt, 101, kWidth* kHeight); \
memset(dst_u_opt, 102, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_v_opt, 103, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
} \
int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
static_cast<int>(dst_y_opt[i * kWidth + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
} \
EXPECT_EQ(0, max_diff); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
int abs_diff = abs( \
static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
static_cast<int>( \
dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
} \
EXPECT_LE(max_diff, 3); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
int abs_diff = abs( \
static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
static_cast<int>( \
dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
} \
EXPECT_LE(max_diff, 3); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_u_c); \
free_aligned_buffer_page_end(dst_v_c); \
free_aligned_buffer_page_end(dst_y_opt); \
free_aligned_buffer_page_end(dst_u_opt); \
free_aligned_buffer_page_end(dst_v_opt); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \
#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
(fastrand() & 0xff); \
src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
(fastrand() & 0xff); \
} \
} \
memset(dst_y_c, 1, kWidth* kHeight); \
memset(dst_u_c, 2, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_v_c, 3, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_y_opt, 101, kWidth* kHeight); \
memset(dst_u_opt, 102, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_v_opt, 103, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
} \
int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
static_cast<int>(dst_y_opt[i * kWidth + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
} \
EXPECT_EQ(0, max_diff); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
int abs_diff = abs( \
static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
static_cast<int>( \
dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
} \
EXPECT_LE(max_diff, 3); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
int abs_diff = abs( \
static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
static_cast<int>( \
dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
} \
EXPECT_LE(max_diff, 3); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_u_c); \
free_aligned_buffer_page_end(dst_v_c); \
free_aligned_buffer_page_end(dst_y_opt); \
free_aligned_buffer_page_end(dst_u_opt); \
free_aligned_buffer_page_end(dst_v_opt); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \
}
#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
@@ -172,19 +166,15 @@ TESTPLANARTOP(I444, 1, 1, I444, 1, 1)
align_buffer_page_end(src_uv, \
kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
align_buffer_page_end( \
dst_u_c, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end( \
dst_v_c, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
align_buffer_page_end( \
dst_u_opt, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end( \
dst_v_opt, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
uint8* src_u = src_uv + OFF_U; \
uint8* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \
@@ -288,88 +278,84 @@ TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2)
TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
align_buffer_page_end( \
src_u, \
SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end( \
src_v, \
SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
align_buffer_page_end( \
dst_uv_c, \
SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
align_buffer_page_end( \
dst_uv_opt, \
SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
(fastrand() & 0xff); \
src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
(fastrand() & 0xff); \
} \
} \
memset(dst_y_c, 1, kWidth* kHeight); \
memset(dst_uv_c, 2, \
SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_y_opt, 101, kWidth* kHeight); \
memset(dst_uv_opt, 102, \
SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
} \
int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
static_cast<int>(dst_y_opt[i * kWidth + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
} \
EXPECT_LE(max_diff, 1); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
int abs_diff = \
abs(static_cast<int>( \
dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
static_cast<int>( \
dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
} \
EXPECT_LE(max_diff, 1); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
free_aligned_buffer_page_end(dst_uv_opt); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \
#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
(fastrand() & 0xff); \
src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
(fastrand() & 0xff); \
} \
} \
memset(dst_y_c, 1, kWidth* kHeight); \
memset(dst_uv_c, 2, \
SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_y_opt, 101, kWidth* kHeight); \
memset(dst_uv_opt, 102, \
SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
} \
int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
static_cast<int>(dst_y_opt[i * kWidth + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
} \
EXPECT_LE(max_diff, 1); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
int abs_diff = \
abs(static_cast<int>( \
dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
static_cast<int>( \
dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
} \
EXPECT_LE(max_diff, 1); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
free_aligned_buffer_page_end(dst_uv_opt); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \
}
#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
@@ -393,24 +379,19 @@ TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
align_buffer_page_end(src_uv, \
2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
align_buffer_page_end( \
dst_u_c, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end( \
dst_v_c, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
align_buffer_page_end( \
dst_u_opt, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end( \
dst_v_opt, \
SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
@@ -1388,12 +1369,10 @@ TEST_F(LibYUVConvertTest, MJPGToI420) {
const int kSize = kImageSize + kOff;
align_buffer_page_end(orig_pixels, kSize);
align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_);
align_buffer_page_end(
dst_u_opt,
SUBSAMPLE(benchmark_width_, 2) * SUBSAMPLE(benchmark_height_, 2));
align_buffer_page_end(
dst_v_opt,
SUBSAMPLE(benchmark_width_, 2) * SUBSAMPLE(benchmark_height_, 2));
align_buffer_page_end(dst_u_opt, SUBSAMPLE(benchmark_width_, 2) *
SUBSAMPLE(benchmark_height_, 2));
align_buffer_page_end(dst_v_opt, SUBSAMPLE(benchmark_width_, 2) *
SUBSAMPLE(benchmark_height_, 2));
// EOI, SOI to make MJPG appear valid.
memset(orig_pixels, 0, kSize);
@@ -1465,20 +1444,16 @@ TEST_F(LibYUVConvertTest, NV12Crop) {
uint8* src_uv = src_y + kWidth * kHeight;
align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
align_buffer_page_end(
dst_u,
SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
align_buffer_page_end(
dst_v,
SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
SUBSAMPLE(kDestHeight, SUBSAMP_Y));
align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
SUBSAMPLE(kDestHeight, SUBSAMP_Y));
align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight);
align_buffer_page_end(
dst_u_2,
SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
align_buffer_page_end(
dst_v_2,
SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
SUBSAMPLE(kDestHeight, SUBSAMP_Y));
align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
SUBSAMPLE(kDestHeight, SUBSAMP_Y));
for (int i = 0; i < kHeight * kWidth; ++i) {
src_y[i] = (fastrand() & 0xff);