Commit 05416e2d authored by fbarchard@google.com

Box filter for YUV uses rows with an accumulation buffer for better memory behavior.  The old code accumulated columns into registers and then stored the result once; this was slow from a memory point of view.  The new code processes a row of source at a time, updating an accumulation buffer on every row.  The accumulation buffer is small and should fit in cache.  Before each accumulation of N rows, the buffer needs to be reset to zero.  If the memset is a bottleneck, it would be faster to do the first row as a store (without an add) into the accumulation buffer, and then add for the remaining rows.
BUG=425
TESTED=out\release\libyuv_unittest --gtest_filter=*ScaleTo1x1*
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/52659004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1428 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent b07de879
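
For orientation, here is a minimal C sketch of the scheme the message describes. SumBoxRows is a hypothetical driver; ScaleAddRow_C is a simplified form of the function this commit adds (the real one is unrolled by two):

#include <string.h>

typedef unsigned char uint8;
typedef unsigned short uint16;

// Add one source row into the 16-bit accumulation buffer.
static void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr,
                          int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] += src_ptr[x];
  }
}

// Sum boxheight rows into row16. The buffer is src_width * 2 bytes, small
// enough to stay cache resident, and must be zeroed before each box.
static void SumBoxRows(const uint8* src, int src_stride, int boxheight,
                       uint16* row16, int src_width) {
  int k;
  memset(row16, 0, src_width * 2);
  for (k = 0; k < boxheight; ++k) {
    ScaleAddRow_C(src, row16, src_width);
    src += src_stride;
  }
}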
@@ -30,13 +30,11 @@ extern "C" {
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_FIXEDDIV1_X86
#define HAS_FIXEDDIV_X86
#define HAS_SCALEADDROWS_SSE2
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
@@ -50,17 +48,21 @@ extern "C" {
#define HAS_SCALEROWDOWN4_SSE2
#endif
// The following are available on VS2012.
// The following are available on VS2012:
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
#define HAS_SCALEADDROWS_AVX2
#define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2
#endif
// The following are available on Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
#define HAS_SCALEADDROW_SSE2
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEADDROWS_NEON
#define HAS_SCALEARGBCOLS_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
@@ -183,10 +185,8 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width);
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height);
void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint32* dst_ptr, int src_width, int src_height);
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
void ScaleARGBRowDown2_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
@@ -289,14 +289,10 @@ void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height);
void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height);
void ScaleAddRows_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height);
void ScaleAddRows_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height);
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
@@ -442,10 +438,8 @@ void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height);
void ScaleAddRows_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height);
void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
@@ -733,7 +733,7 @@ static void ScalePlaneBox(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) {
int j;
int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
@@ -750,29 +750,29 @@ static void ScalePlaneBox(int src_width, int src_height,
const uint16* src_ptr, uint8* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_C:
((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
#if defined(HAS_SCALEADDROWS_SSE2)
void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
ScaleAddRow_C;
#if defined(HAS_SCALEADDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleAddRows = ScaleAddRows_Any_SSE2;
ScaleAddRow = ScaleAddRow_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
ScaleAddRows = ScaleAddRows_SSE2;
ScaleAddRow = ScaleAddRow_SSE2;
}
}
#endif
#if defined(HAS_SCALEADDROWS_AVX2)
#if defined(HAS_SCALEADDROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleAddRows = ScaleAddRows_Any_AVX2;
ScaleAddRow = ScaleAddRow_Any_AVX2;
if (IS_ALIGNED(src_width, 32)) {
ScaleAddRows = ScaleAddRows_AVX2;
ScaleAddRow = ScaleAddRow_AVX2;
}
}
#endif
#if defined(HAS_SCALEADDROWS_NEON)
#if defined(HAS_SCALEADDROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleAddRows = ScaleAddRows_Any_NEON;
ScaleAddRow = ScaleAddRow_Any_NEON;
if (IS_ALIGNED(src_width, 16)) {
ScaleAddRows = ScaleAddRows_NEON;
ScaleAddRow = ScaleAddRow_NEON;
}
}
#endif
@@ -786,7 +786,11 @@ static void ScalePlaneBox(int src_width, int src_height,
y = max_y;
}
boxheight = MIN1((y >> 16) - iy);
ScaleAddRows(src, src_stride, (uint16*)(row16), src_width, boxheight);
memset(row16, 0, src_width * 2);
for (k = 0; k < boxheight; ++k) {
ScaleAddRow(src, (uint16*)(row16), src_width);
src += src_stride;
}
ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
dst_ptr += dst_stride;
}
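
If the memset above proved to be a bottleneck, the commit message suggests storing the first row rather than adding it. A hedged sketch of that variant follows; ScaleStoreRow_C is hypothetical and not part of this commit:

// Hypothetical: writing the first row zeroes the buffer implicitly,
// so the remaining boxheight - 1 rows can be accumulated without memset.
void ScaleStoreRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] = src_ptr[x];  // store, not add
  }
}

// The loop above would then become:
//   ScaleStoreRow_C(src, (uint16*)(row16), src_width);
//   src += src_stride;
//   for (k = 1; k < boxheight; ++k) {
//     ScaleAddRow(src, (uint16*)(row16), src_width);
//     src += src_stride;
//   }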
@@ -798,7 +802,7 @@ static void ScalePlaneBox_16(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint16* src_ptr, uint16* dst_ptr) {
int j;
int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
@@ -814,12 +818,12 @@ static void ScalePlaneBox_16(int src_width, int src_height,
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride,
uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
ScaleAddRow_16_C;
#if defined(HAS_SCALEADDROWS_16_SSE2)
#if defined(HAS_SCALEADDROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
ScaleAddRows = ScaleAddRows_16_SSE2;
ScaleAddRow = ScaleAddRow_16_SSE2;
}
#endif
@@ -832,7 +836,11 @@ static void ScalePlaneBox_16(int src_width, int src_height,
y = max_y;
}
boxheight = MIN1((y >> 16) - iy);
ScaleAddRows(src, src_stride, (uint32*)(row32), src_width, boxheight);
memset(row32, 0, src_width * 4);
for (k = 0; k < boxheight; ++k) {
ScaleAddRow(src, (uint32*)(row32), src_width);
src += src_stride;
}
ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
dst_ptr += dst_stride;
}
@@ -169,25 +169,23 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
#endif
// Add rows box filter scale down.
#define SAANY(NAMEANY, SCALEADDROWS_SIMD, SCALEADDROWS_C, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
uint16* dst_ptr, int src_width, int src_height) { \
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
int n = src_width & ~MASK; \
if (n > 0) { \
SCALEADDROWS_SIMD(src_ptr, src_stride, dst_ptr, n, src_height); \
SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
} \
SCALEADDROWS_C(src_ptr + n, src_stride, \
dst_ptr + n, src_width & MASK, src_height); \
SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
}
#ifdef HAS_SCALEADDROWS_SSE2
SAANY(ScaleAddRows_Any_SSE2, ScaleAddRows_SSE2, ScaleAddRows_C, 15)
#ifdef HAS_SCALEADDROW_SSE2
SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
#endif
#ifdef HAS_SCALEADDROWS_AVX2
SAANY(ScaleAddRows_Any_AVX2, ScaleAddRows_AVX2, ScaleAddRows_C, 31)
#ifdef HAS_SCALEADDROW_AVX2
SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
#endif
#ifdef HAS_SCALEADDROWS_NEON
SAANY(ScaleAddRows_Any_NEON, ScaleAddRows_NEON, ScaleAddRows_C, 15)
#ifdef HAS_SCALEADDROW_NEON
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#endif
#undef SAANY
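
For reference, the SSE2 instantiation above expands to roughly the following: the SIMD kernel covers the multiple-of-16 prefix and the C function handles the tail (e.g. src_width = 50 gives n = 48 for SSE2 and 2 pixels for C):

void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr,
                          int src_width) {
  int n = src_width & ~15;  // round width down to a multiple of 16
  if (n > 0) {
    ScaleAddRow_SSE2(src_ptr, dst_ptr, n);
  }
  ScaleAddRow_C(src_ptr + n, dst_ptr + n, src_width & 15);  // remainder
}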
@@ -621,39 +621,31 @@ void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
int x;
assert(src_width > 0);
assert(src_height > 0);
for (x = 0; x < src_width; ++x) {
const uint8* s = src_ptr + x;
unsigned int sum = 0u;
int y;
for (y = 0; y < src_height; ++y) {
sum += s[0];
s += src_stride;
}
// TODO(fbarchard): Consider limiting height to 256 to avoid overflow.
dst_ptr[x] = sum < 65535u ? sum : 65535u;
for (x = 0; x < src_width - 1; x += 2) {
dst_ptr[0] += src_ptr[0];
dst_ptr[1] += src_ptr[1];
src_ptr += 2;
dst_ptr += 2;
}
if (src_width & 1) {
dst_ptr[0] += src_ptr[0];
}
}
void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint32* dst_ptr, int src_width, int src_height) {
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
int x;
assert(src_width > 0);
assert(src_height > 0);
for (x = 0; x < src_width; ++x) {
const uint16* s = src_ptr + x;
unsigned int sum = 0u;
int y;
for (y = 0; y < src_height; ++y) {
sum += s[0];
s += src_stride;
}
// No risk of overflow here now
dst_ptr[x] = sum;
for (x = 0; x < src_width - 1; x += 2) {
dst_ptr[0] += src_ptr[0];
dst_ptr[1] += src_ptr[1];
src_ptr += 2;
dst_ptr += 2;
}
if (src_width & 1) {
dst_ptr[0] += src_ptr[0];
}
}
@@ -800,104 +800,61 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
}
}
// Reads 16xN bytes and produces 16 shorts at a time.
// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked)
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
__asm {
push esi
push edi
push ebx
push ebp
mov esi, [esp + 16 + 4] // src_ptr
mov edx, [esp + 16 + 8] // src_stride
mov edi, [esp + 16 + 12] // dst_ptr
mov ecx, [esp + 16 + 16] // dst_width
mov ebx, [esp + 16 + 20] // height
mov eax, esi // row pointer
mov ebp, ebx // height
pxor xmm0, xmm0 // clear accumulators
pxor xmm1, xmm1
pxor xmm4, xmm4
mov eax, [esp + 4] // src_ptr
mov edx, [esp + 8] // dst_ptr
mov ecx, [esp + 12] // src_width
pxor xmm5, xmm5
// sum rows
xloop:
movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
movdqa xmm3, xmm2
punpcklbw xmm2, xmm4
punpckhbw xmm3, xmm4
movdqu xmm3, [eax] // read 16 bytes
lea eax, [eax + 16]
movdqu xmm0, [edx] // read 16 words from destination
movdqu xmm1, [edx + 16]
movdqa xmm2, xmm3
punpcklbw xmm2, xmm5
punpckhbw xmm3, xmm5
paddusw xmm0, xmm2 // sum 16 words
paddusw xmm1, xmm3
sub ebp, 1
jg xloop
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
lea edi, [edi + 32] // dst_ptr += 16
lea esi, [esi + 16] // src_ptr += 16
mov eax, esi // row pointer
mov ebp, ebx // height
pxor xmm0, xmm0 // clear accumulators
pxor xmm1, xmm1
movdqu [edx], xmm0 // write 16 words to destination
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 16
jg xloop
pop ebp
pop ebx
pop edi
pop esi
ret
}
}
// Reads 32xN bytes and produces 32 shorts at a time.
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked)
void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
__asm {
push esi
push edi
push ebx
push ebp
mov esi, [esp + 16 + 4] // src_ptr
mov edx, [esp + 16 + 8] // src_stride
mov edi, [esp + 16 + 12] // dst_ptr
mov ecx, [esp + 16 + 16] // dst_width
mov ebx, [esp + 16 + 20] // height
mov eax, esi // row pointer
mov ebp, ebx // height
vpxor ymm0, ymm0, ymm0 // clear accumulators
vpxor ymm1, ymm1, ymm1
vpxor ymm4, ymm4, ymm4
mov eax, [esp + 4] // src_ptr
mov edx, [esp + 8] // dst_ptr
mov ecx, [esp + 12] // src_width
vpxor ymm5, ymm5, ymm5
// sum rows
xloop:
vmovdqu ymm2, [eax] // read 16 pixels
vpermq ymm2, ymm2, 0xd8 // unmutate for vpunpck
lea eax, [eax + edx] // advance to next row
vpunpckhbw ymm3, ymm2, ymm4
vpunpcklbw ymm2, ymm2, ymm4
vmovdqu ymm3, [eax] // read 32 bytes
vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
lea eax, [eax + 32]
vmovdqu ymm0, [edx] // read 32 words from destination
vmovdqu ymm1, [edx + 32]
vpunpcklbw ymm2, ymm3, ymm5
vpunpckhbw ymm3, ymm3, ymm5
vpaddusw ymm0, ymm0, ymm2 // sum 16 words
vpaddusw ymm1, ymm1, ymm3
sub ebp, 1
jg xloop
vmovdqu [edi], ymm0
vmovdqu [edi + 32], ymm1
lea edi, [edi + 64] // dst_ptr
lea esi, [esi + 32] // src_ptr
mov eax, esi // row pointer
mov ebp, ebx // height
vpxor ymm0, ymm0, ymm0 // clear accumulators
vpxor ymm1, ymm1, ymm1
vmovdqu [edx], ymm0 // write 32 words to destination
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
sub ecx, 32
jg xloop
pop ebp
pop ebx
pop edi
pop esi
vzeroupper
ret
}
@@ -78,7 +78,7 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
src_u + OFF, \
SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
@@ -211,7 +211,7 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
memset(dst_y_opt, 101, kWidth * kHeight); \
memset(dst_uv_opt, 102, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
src_u + OFF, \
SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
@@ -326,7 +326,7 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
src_uv + OFF, \
2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
@@ -435,7 +435,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
} \
memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
@@ -538,7 +538,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
} \
memset(dst_argb_c, 1, kStrideB * kHeight); \
memset(dst_argb_opt, 101, kStrideB * kHeight); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
dst_argb_c, kWidth * BPP_B, \
@@ -632,7 +632,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kStride; ++j) \
src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
dst_y_c, kWidth, \
dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
@@ -690,6 +690,8 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
#if defined(__arm__) || defined (__aarch64__)
// arm version subsamples by summing 4 pixels then multiplying by matrix with
// 4x smaller coefficients which are rounded to nearest integer.
TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4)
#else
TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0)
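
As a hedged numeric illustration of the comment above (the weight 25 is made up, not libyuv's actual coefficient): quartering it gives round(25 / 4) = 6, so four pixels summing to 402 contribute 6 * 402 = 2412, whereas averaging first gives 25 * ((402 + 2) >> 2) = 2525. Each rounded coefficient introduces a small per-term gap, which is presumably why the ARM variant of this test passes a nonzero tolerance to TESTATOPLANAR.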
@@ -738,7 +740,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
memset(dst_y_opt, 101, kWidth * kHeight); \
memset(dst_uv_opt, 102, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
dst_y_c, kWidth, \
dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
@@ -814,7 +816,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \
} \
memset(dst_argb_c, 1, kStrideB * kHeightB); \
memset(dst_argb_opt, 101, kStrideB * kHeightB); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
dst_argb_c, kStrideB, \
kWidth, NEG kHeight); \
@@ -858,7 +860,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \
} \
memset(dst_argb_c, 123, kStrideB * kHeightB); \
memset(dst_argb_opt, 123, kStrideB * kHeightB); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_A##To##FMT_B(src_argb, kStrideA, \
dst_argb_c, kStrideB, \
kWidth, kHeight); \
@@ -948,7 +950,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##Dither##N) { \
} \
memset(dst_argb_c, 1, kStrideB * kHeightB); \
memset(dst_argb_opt, 101, kStrideB * kHeightB); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, \
dst_argb_c, kStrideB, \
NULL, kWidth, NEG kHeight); \
@@ -992,7 +994,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##Dither_Random) { \
} \
memset(dst_argb_c, 123, kStrideB * kHeightB); \
memset(dst_argb_opt, 123, kStrideB * kHeightB); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_A##To##FMT_B##Dither(src_argb, kStrideA, \
dst_argb_c, kStrideB, \
NULL, kWidth, kHeight); \
@@ -1051,7 +1053,7 @@ TEST_F(libyuvTest, FMT_ATOB##_Symetric##N) { \
} \
memset(dst_argb_c, 1, kStrideA * kHeightA); \
memset(dst_argb_opt, 101, kStrideA * kHeightA); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_ATOB(src_argb + OFF, kStrideA, \
dst_argb_c, kStrideA, \
kWidth, NEG kHeight); \
@@ -1061,7 +1063,7 @@ TEST_F(libyuvTest, FMT_ATOB##_Symetric##N) { \
dst_argb_opt, kStrideA, \
kWidth, NEG kHeight); \
} \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_ATOB(dst_argb_c, kStrideA, \
dst_argb_c, kStrideA, \
kWidth, NEG kHeight); \
@@ -1470,7 +1472,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##Dither##N) { \
} \
memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
MaskCpuFlags(disable_cpu_flags_); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, \
src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \