Commit ca4749dd authored by fbarchard@google.com's avatar fbarchard@google.com

Scale Even sizes

BUG=none
TEST=build\release\libyuv_unittest.exe  --gtest_catch_exceptions=0 --gtest_filter=*ARGBScale*
Review URL: https://webrtc-codereview.appspot.com/570005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@262 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 2e786a73
......@@ -22,7 +22,7 @@ extern "C" {
enum FilterMode {
kFilterNone = 0, // Point sample; Fastest
kFilterBilinear = 1, // Faster than box, but lower quality scaling down.
kFilterBox = 2 // Highest quality
kFilterBox = 2 // Highest quality (not supported for ARGB)
};
int ARGBScale(const uint8* src_argb, int src_stride_argb,
......
......@@ -775,13 +775,14 @@ __declspec(naked) __declspec(align(16))
static void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
__asm {
push esi
push edi
push ebp
mov edi, [esp + 8 + 4] // dst
mov eax, [esp + 8 + 8] // v32
mov ebp, [esp + 8 + 12] // width
mov edx, [esp + 8 + 16] // dst_stride
mov ebx, [esp + 8 + 20] // height
mov edi, [esp + 12 + 4] // dst
mov eax, [esp + 12 + 8] // v32
mov ebp, [esp + 12 + 12] // width
mov edx, [esp + 12 + 16] // dst_stride
mov esi, [esp + 12 + 20] // height
lea ecx, [ebp * 4]
sub edx, ecx // stride - width * 4
......@@ -790,11 +791,12 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
mov ecx, ebp
rep stosd
add edi, edx
sub ebx, 1
sub esi, 1
jg convertloop
pop ebp
pop edi
pop esi
ret
}
}
......
......@@ -55,7 +55,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"vld2.u8 {q0,q1}, [%0]! \n"
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"bgt 1b \n"
......@@ -71,14 +71,14 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
// row 2 add adjacent, add row 1 to row 2
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
......@@ -1399,6 +1399,10 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
}
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
// Normal formula for bilinear interpolation is:
// source_y_fraction * row1 + (1 - source_y_fraction) * row0
// SSE2 version uses a single multiply of the difference:
// source_y_fraction * (row1 - row0) + row0
#define HAS_SCALEFILTERROWS_SSE2
__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
......@@ -1424,8 +1428,6 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
pshufd xmm5, xmm5, 0
pxor xmm4, xmm4
// f * row1 + (1 - frac) row0
// frac * (row1 - row0) + row0
align 16
xloop:
movdqa xmm0, [esi] // row0
......@@ -3677,11 +3679,13 @@ void ScalePlane(const uint8* src, int src_stride,
// optimized, 3/8
ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
} else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
} else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
filtering != kFilterBilinear) {
// optimized, 1/4
ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
} else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
} else if (8 * dst_width == src_width && 8 * dst_height == src_height &&
filtering != kFilterBilinear) {
// optimized, 1/8
ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
......
This diff is collapsed.
......@@ -20,129 +20,169 @@ namespace libyuv {
static int ARGBTestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f) {
const int b = 128;
int src_argb_plane_size = (src_width + (2 * b)) * (src_height + (2 * b)) * 4;
int src_stride_argb = (2 * b + src_width) * 4;
int b = 128;
align_buffer_16(src_argb, src_argb_plane_size)
memset(src_argb, 1, src_argb_plane_size);
int src_y_plane_size = (src_width + (2 * b)) * (src_height + (2 * b)) * 4;
int src_stride_y = (2 * b + src_width) * 4;
align_buffer_16(src_y, src_y_plane_size)
int dst_y_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b)) * 4;
int dst_stride_y = (2 * b + dst_width) * 4;
int dst_argb_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b)) * 4;
int dst_stride_argb = (2 * b + dst_width) * 4;
srandom(time(NULL));
int i, j;
for (i = b; i < (src_height + b); ++i) {
for (j = b; j < (src_width + b) * 4; ++j) {
src_y[(i * src_stride_y) + j] = (random() & 0xff);
src_argb[(i * src_stride_argb) + j] = (random() & 0xff);
}
}
const int runs = 1000;
align_buffer_16(dst_y_c, dst_y_plane_size)
align_buffer_16(dst_y_opt, dst_y_plane_size)
MaskCpuFlags(kCpuInitialized);
align_buffer_16(dst_argb_c, dst_argb_plane_size)
align_buffer_16(dst_argb_opt, dst_argb_plane_size)
memset(dst_argb_c, 2, dst_argb_plane_size);
memset(dst_argb_opt, 3, dst_argb_plane_size);
// Warm up both versions for consistent benchmarks.
MaskCpuFlags(0); // Disable all CPU optimization.
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
dst_width, dst_height, f);
MaskCpuFlags(-1); // Enable all CPU optimization.
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
dst_width, dst_height, f);
MaskCpuFlags(0); // Disable all CPU optimization.
double c_time = get_time();
for (i = 0; i < runs; ++i)
ARGBScale(src_y + (src_stride_y * b) + b * 4, src_stride_y,
for (i = 0; i < runs; ++i) {
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
dst_y_c + (dst_stride_y * b) + b * 4, dst_stride_y,
dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
dst_width, dst_height, f);
}
c_time = (get_time() - c_time) / runs;
MaskCpuFlags(-1);
MaskCpuFlags(-1); // Enable all CPU optimization.
double opt_time = get_time();
for (i = 0; i < runs; ++i)
ARGBScale(src_y + (src_stride_y * b) + b * 4, src_stride_y,
for (i = 0; i < runs; ++i) {
ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
src_width, src_height,
dst_y_opt + (dst_stride_y * b) + b * 4, dst_stride_y,
dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
dst_width, dst_height, f);
}
opt_time = (get_time() - opt_time) / runs;
printf ("filter %d - %8d us c - %8d us opt\n",
f, (int)(c_time*1e6), (int)(opt_time*1e6));
// Report performance of C vs OPT
printf("filter %d - %8d us C - %8d us OPT\n",
f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// of the buffers and look to see that the max difference isn't
// over 2.
int err = 0;
int max_diff = 0;
for (i = b; i < (dst_height + b); ++i) {
for (j = b * 4; j < (dst_width + b) * 4; ++j) {
int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
dst_y_opt[(i * dst_stride_y) + j]);
int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
dst_argb_opt[(i * dst_stride_argb) + j]);
if (abs_diff > max_diff)
max_diff = abs_diff;
}
}
if (max_diff > 2)
err++;
free_aligned_buffer_16(dst_y_c)
free_aligned_buffer_16(dst_y_opt)
free_aligned_buffer_16(src_y)
return err;
free_aligned_buffer_16(dst_argb_c)
free_aligned_buffer_16(dst_argb_opt)
free_aligned_buffer_16(src_argb)
return max_diff;
}
TEST_F(libyuvTest, ARGBScaleDownBy2) {
const int src_width = 1280;
const int src_height = 720;
const int dst_width = src_width / 2;
const int dst_height = src_height / 2;
int err = 0;
for (int f = 0; f < 2; ++f) {
err += ARGBTestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
int err = ARGBTestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_GE(1, err);
}
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, ARGBScaleDownBy4) {
const int src_width = 1280;
const int src_height = 720;
const int dst_width = src_width / 4;
const int dst_height = src_height / 4;
int err = 0;
for (int f = 0; f < 2; ++f) {
err += ARGBTestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
int err = ARGBTestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_GE(1, err);
}
}
// Scales a 1280x720 ARGB source down by 5 in each dimension, once per filter
// mode 0 and 1, and expects the value reported by ARGBTestFilter to be at
// most 1 for every mode.
TEST_F(libyuvTest, ARGBScaleDownBy5) {
  const int kSrcWidth = 1280;
  const int kSrcHeight = 720;
  const int kDivisor = 5;
  int mode = 0;
  while (mode < 2) {
    const FilterMode filter = static_cast<FilterMode>(mode);
    int err = ARGBTestFilter(kSrcWidth, kSrcHeight,
                             kSrcWidth / kDivisor, kSrcHeight / kDivisor,
                             filter);
    EXPECT_GE(1, err);
    ++mode;
  }
}
TEST_F(libyuvTest, ARGBScaleDownBy8) {
const int src_width = 1280;
const int src_height = 720;
const int dst_width = src_width / 8;
const int dst_height = src_height / 8;
EXPECT_EQ(0, err);
for (int f = 0; f < 2; ++f) {
int err = ARGBTestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_GE(1, err);
}
}
TEST_F(libyuvTest, ARGBScaleDownBy34) {
// Scales a 1280x720 ARGB source down by 16 in each dimension, once per filter
// mode 0 and 1, and expects the value reported by ARGBTestFilter to be at
// most 1 for every mode.
TEST_F(libyuvTest, ARGBScaleDownBy16) {
  const int kSrcWidth = 1280;
  const int kSrcHeight = 720;
  const int kDivisor = 16;
  int mode = 0;
  while (mode < 2) {
    const FilterMode filter = static_cast<FilterMode>(mode);
    int err = ARGBTestFilter(kSrcWidth, kSrcHeight,
                             kSrcWidth / kDivisor, kSrcHeight / kDivisor,
                             filter);
    EXPECT_GE(1, err);
    ++mode;
  }
}
TEST_F(libyuvTest, ARGBScaleDownBy34) {
const int src_width = 1280;
const int src_height = 720;
const int dst_width = src_width * 3 / 4;
const int dst_height = src_height * 3 / 4;
int err = 0;
for (int f = 0; f < 2; ++f) {
err += ARGBTestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
int err = ARGBTestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_GE(1, err);
}
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, ARGBScaleDownBy38) {
......@@ -150,31 +190,27 @@ TEST_F(libyuvTest, ARGBScaleDownBy38) {
int src_height = 720;
int dst_width = src_width * 3 / 8;
int dst_height = src_height * 3 / 8;
int err = 0;
for (int f = 0; f < 2; ++f) {
err += ARGBTestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
int err = ARGBTestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_GE(1, err);
}
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, ARGBScalePlaneBilinear) {
TEST_F(libyuvTest, ARGBScaleTo1366) {
int src_width = 1280;
int src_height = 720;
int dst_width = 1366;
int dst_height = 768;
int err = 0;
for (int f = 0; f < 2; ++f) {
err += ARGBTestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
int err = ARGBTestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_GE(1, err);
}
EXPECT_EQ(0, err);
}
} // namespace libyuv
......@@ -20,8 +20,7 @@ namespace libyuv {
static int TestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f) {
int b = 128;
const int b = 128;
int src_width_uv = (src_width + 1) >> 1;
int src_height_uv = (src_height + 1) >> 1;
......@@ -47,7 +46,6 @@ static int TestFilter(int src_width, int src_height,
srandom(time(NULL));
int i, j;
for (i = b; i < (src_height + b); ++i) {
for (j = b; j < (src_width + b); ++j) {
src_y[(i * src_stride_y) + j] = (random() & 0xff);
......@@ -69,10 +67,29 @@ static int TestFilter(int src_width, int src_height,
align_buffer_16(dst_u_opt, dst_uv_plane_size)
align_buffer_16(dst_v_opt, dst_uv_plane_size)
MaskCpuFlags(kCpuInitialized);
// Warm up both versions for consistent benchmarks.
MaskCpuFlags(0); // Disable all CPU optimization.
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height, f);
MaskCpuFlags(-1); // Enable all CPU optimization.
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height, f);
MaskCpuFlags(0); // Disable all CPU optimization.
double c_time = get_time();
for (i = 0; i < runs; ++i)
for (i = 0; i < runs; ++i) {
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
......@@ -81,13 +98,12 @@ static int TestFilter(int src_width, int src_height,
dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height, f);
}
c_time = (get_time() - c_time) / runs;
MaskCpuFlags(-1);
MaskCpuFlags(-1); // Enable all CPU optimization.
double opt_time = get_time();
for (i = 0; i < runs; ++i)
for (i = 0; i < runs; ++i) {
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
......@@ -96,24 +112,25 @@ static int TestFilter(int src_width, int src_height,
dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height, f);
}
opt_time = (get_time() - opt_time) / runs;
printf ("filter %d - %8d us c - %8d us opt\n",
f, (int)(c_time*1e6), (int)(opt_time*1e6));
// Report performance of C vs OPT
printf("filter %d - %8d us C - %8d us OPT\n",
f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// of the buffers and look to see that the max difference isn't
// over 2.
int err = 0;
int max_diff = 0;
for (i = b; i < (dst_height + b); ++i) {
for (j = b; j < (dst_width + b); ++j) {
int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
dst_y_opt[(i * dst_stride_y) + j]);
if (abs_diff > max_diff)
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
}
......@@ -121,19 +138,17 @@ static int TestFilter(int src_width, int src_height,
for (j = b; j < (dst_width_uv + b); ++j) {
int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] -
dst_u_opt[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff)
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] -
dst_v_opt[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff)
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
}
if (max_diff > 2)
err++;
free_aligned_buffer_16(dst_y_c)
free_aligned_buffer_16(dst_u_c)
free_aligned_buffer_16(dst_v_c)
......@@ -145,55 +160,91 @@ static int TestFilter(int src_width, int src_height,
free_aligned_buffer_16(src_u)
free_aligned_buffer_16(src_v)
return err;
return max_diff;
}
TEST_F(libyuvTest, ScaleDownBy2) {
const int src_width = 1280;
const int src_height = 720;
const int dst_width = src_width / 2;
const int dst_height = src_height / 2;
int err = 0;
for (int f = 0; f < 3; ++f)
err += TestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_EQ(0, err);
for (int f = 0; f < 3; ++f) {
int err = TestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_GE(1, err);
}
}
TEST_F(libyuvTest, ScaleDownBy4) {
const int src_width = 1280;
const int src_height = 720;
const int dst_width = src_width / 4;
const int dst_height = src_height / 4;
int err = 0;
for (int f = 0; f < 3; ++f)
err += TestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
for (int f = 0; f < 3; ++f) {
int err = TestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_GE(2, err); // This is the only scale factor with error of 2.
}
}
// Scales a 1280x720 source down by 5 in each dimension, once per filter
// mode 0, 1 and 2, and expects the value reported by TestFilter to be at
// most 1 for every mode.
TEST_F(libyuvTest, ScaleDownBy5) {
  const int kSrcWidth = 1280;
  const int kSrcHeight = 720;
  const int kDivisor = 5;
  int mode = 0;
  while (mode < 3) {
    const FilterMode filter = static_cast<FilterMode>(mode);
    int err = TestFilter(kSrcWidth, kSrcHeight,
                         kSrcWidth / kDivisor, kSrcHeight / kDivisor,
                         filter);
    EXPECT_GE(1, err);
    ++mode;
  }
}
// Scales a 1280x720 source down by 8 in each dimension, once per filter
// mode 0, 1 and 2, and expects the value reported by TestFilter to be at
// most 1 for every mode.
TEST_F(libyuvTest, ScaleDownBy8) {
  const int kSrcWidth = 1280;
  const int kSrcHeight = 720;
  const int kDivisor = 8;
  int mode = 0;
  while (mode < 3) {
    const FilterMode filter = static_cast<FilterMode>(mode);
    int err = TestFilter(kSrcWidth, kSrcHeight,
                         kSrcWidth / kDivisor, kSrcHeight / kDivisor,
                         filter);
    EXPECT_GE(1, err);
    ++mode;
  }
}
EXPECT_EQ(0, err);
// Scales a 1280x720 source down by 16 in each dimension, once per filter
// mode 0, 1 and 2, and expects the value reported by TestFilter to be at
// most 1 for every mode.
TEST_F(libyuvTest, ScaleDownBy16) {
  const int kSrcWidth = 1280;
  const int kSrcHeight = 720;
  const int kDivisor = 16;
  int mode = 0;
  while (mode < 3) {
    const FilterMode filter = static_cast<FilterMode>(mode);
    int err = TestFilter(kSrcWidth, kSrcHeight,
                         kSrcWidth / kDivisor, kSrcHeight / kDivisor,
                         filter);
    EXPECT_GE(1, err);
    ++mode;
  }
}
TEST_F(libyuvTest, ScaleDownBy34) {
const int src_width = 1280;
const int src_height = 720;
const int dst_width = src_width * 3 / 4;
const int dst_height = src_height * 3 / 4;
int err = 0;
for (int f = 0; f < 3; ++f)
err += TestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_EQ(0, err);
for (int f = 0; f < 3; ++f) {
int err = TestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_GE(1, err);
}
}
TEST_F(libyuvTest, ScaleDownBy38) {
......@@ -201,29 +252,27 @@ TEST_F(libyuvTest, ScaleDownBy38) {
int src_height = 720;
int dst_width = src_width * 3 / 8;
int dst_height = src_height * 3 / 8;
int err = 0;
for (int f = 0; f < 3; ++f)
err += TestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_EQ(0, err);
for (int f = 0; f < 3; ++f) {
int err = TestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_GE(1, err);
}
}
TEST_F(libyuvTest, ScalePlaneBilinear) {
TEST_F(libyuvTest, ScaleTo1366) {
int src_width = 1280;
int src_height = 720;
int dst_width = 1366;
int dst_height = 768;
int err = 0;
for (int f = 0; f < 3; ++f)
err += TestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_EQ(0, err);
for (int f = 0; f < 3; ++f) {
int err = TestFilter(src_width, src_height,
dst_width, dst_height,
static_cast<FilterMode>(f));
EXPECT_GE(1, err);
}
}
} // namespace libyuv
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment