Commit 0e9b515f authored by frkoenig@google.com's avatar frkoenig@google.com

NEON 3/8 downscaler.

Fixed up unit tests for filters to use same image generation and comparison code.

Added timing information output from doing scale.
Review URL: http://webrtc-codereview.appspot.com/244016

git-svn-id: http://libyuv.googlecode.com/svn/trunk@48 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 891091c6
......@@ -286,6 +286,244 @@ static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
);
}
// Advertises the NEON 3/8 row scalers defined below; ScalePlaneDown38 checks
// this macro before selecting them.
#define HAS_SCALEROWDOWN38_NEON
// vtbl byte indices for ScaleRowDown38_NEON: from each 32 source bytes pick
// offsets 0, 3 and 6 of every group of 8, giving 12 output bytes. The last
// four entries are padding; only the first 12 looked-up bytes are stored.
const uint8 shuf38[16] __attribute__ ((aligned(16))) =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
// vtbl byte indices for the _Int variants, applied to the 24-byte table
// {d0, d1, d2}. Indices 0-15 address the 16-bit lanes held in d0/d1 (even
// offsets, presumably the low byte of each lane on a little-endian layout),
// indices 16-19 address the already-narrowed bytes in d2. The result
// interleaves one byte from d0, one from d1 and one from d2 per output triple.
// The last four entries are padding.
const uint8 shuf38_2[16] __attribute__ ((aligned(16))) =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
// Multiplier used with vqrdmulh to divide a 16-bit sum by 6. vqrdmulh is a
// doubling multiply that returns the high half, so 65536 / 12 yields x / 6.
const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
// Multiplier used with vqrdmulh to divide a 16-bit sum by 9 (65536 / 18,
// doubled by the instruction).
const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
// 32 -> 12
// Point-samples one row down by 3/8. Every loop iteration loads 32 source
// bytes, uses the shuf38 table to select 12 of them, and stores those 12
// bytes to dst_ptr. The stride argument is unused (single-row sampler).
// dst_width is decremented by 12 per iteration and the loop continues while
// the remainder is still positive, so work is always done in whole groups.
static void ScaleRowDown38_NEON(const uint8* src_ptr, int,
                                uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
    // q3 (d6, d7) holds the shuffle table for the whole loop.
    "vld1.u8 {q3}, [%3] \n"
    "1: \n"
    "vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
    // First 8 output bytes, then the remaining 4 (upper half of d5 is unused).
    "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
    "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
    "vst1.u8 {d4}, [%1]! \n"
    "vst1.u32 {d5[0]}, [%1]! \n"
    "subs %2, #12 \n"
    "bhi 1b \n"
    : "+r"(src_ptr),   // %0
      "+r"(dst_ptr),   // %1
      "+r"(dst_width)  // %2
    : "r"(shuf38)      // %3
    // d6/d7 are written by the table load above, so they must be declared
    // as clobbered along with the working registers.
    : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "memory", "cc"
  );
}
// 32x3 -> 12x1
// Box-filters three source rows (src_ptr, src_ptr + src_stride and
// src_ptr + 2 * src_stride) down by 3/8. Each loop iteration consumes 32
// bytes from every row and stores 12 output bytes. For every group of 8
// source columns three outputs are produced: columns 0-2 and 3-5 summed over
// the three rows and divided by 9, and columns 6-7 summed over the three rows
// and divided by 6.
static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm__ volatile
(
// q4 = divide-by-6 multipliers, q5 = output shuffle table,
// q8 = divide-by-9 multipliers.
"vld1.u16 {q4}, [%4] \n"
"vld1.u8 {q5}, [%5] \n"
"vld1.u8 {q8}, [%6] \n"
// r4 = pointer to the third row, %3 = pointer to the second row.
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
"1: \n"
// Notation: first digit = column within the group of 8,
// second digit = group index.
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d12, d13, d14, d15}, [r4]! \n"
// Shuffle the input data around to align it
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
"vtrn.u8 d12, d13 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
"vtrn.u8 d14, d15 \n"
// d0 = 00+10 01+11 02+12 03+13
// d1 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
"vpaddl.u8 q6, q6 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
"vpaddl.u8 d15, d15 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 q0, q6 \n"
"vadd.u16 d4, d3, d7 \n"
"vadd.u16 d4, d15 \n"
// Third output of each group of 8:
// (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
// Only the d4 half of q2 carries these sums; the narrowed result
// lands in the low four bytes of d4.
"vqrdmulh.s16 q2, q4 \n"
"vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q7, d14 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q7 \n"
// d2 = xx 20 xx 30 xx 22 xx 32
// d3 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d2 = xx 20 xx 21 xx 22 xx 23
// d3 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits (q8 holds the divide-by-9
// multipliers).
"vqrdmulh.s16 q0, q8 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
// d10/d11 are the two halves of the shuffle table in q5.
"vtbl.u8 d3, {d0, d1, d2}, d10 \n"
"vtbl.u8 d4, {d0, d1, d2}, d11 \n"
// Store 8 + 4 = 12 output bytes.
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bhi 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(mult38_div6), // %4
"r"(shuf38_2), // %5
"r"(mult38_div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q4",
"q5", "q6", "q7", "q8", "memory", "cc"
);
}
// 32x2 -> 12x1
// Box-filters two source rows (src_ptr and src_ptr + src_stride) down by
// 3/8. Each loop iteration consumes 32 bytes from both rows and stores 12
// output bytes. For every group of 8 source columns three outputs are
// produced: columns 0-2 and 3-5 summed over the two rows and divided by 6,
// and columns 6-7 summed over the two rows and divided by 4 with rounding.
static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm__ volatile
(
// q4 = divide-by-6 multipliers, q5 = output shuffle table.
"vld1.u16 {q4}, [%4] \n"
"vld1.u8 {q5}, [%5] \n"
// %3 = pointer to the second row.
"add %3, %0 \n"
"1: \n"
// Notation: first digit = column within the group of 8,
// second digit = group index.
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
// Shuffle the input data around to align it
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d1 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 d4, d3, d7 \n"
// Third output of each group of 8:
// (s[6] + s[7] + s[6+st] + s[7+st]) / 4, rounded.
// Only the d4 half of q2 carries these sums; the narrowed result
// lands in the low four bytes of d4.
"vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
// d2 = xx 20 xx 30 xx 22 xx 32
// d3 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d2 = xx 20 xx 21 xx 22 xx 23
// d3 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits (q4 holds the divide-by-6
// multipliers).
"vqrdmulh.s16 q0, q4 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
// d10/d11 are the two halves of the shuffle table in q5.
"vtbl.u8 d3, {d0, d1, d2}, d10 \n"
"vtbl.u8 d4, {d0, d1, d2}, d11 \n"
// Store 8 + 4 = 12 output bytes.
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bhi 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(mult38_div6), // %4
"r"(shuf38_2) // %5
: "q0", "q1", "q2", "q3", "q4", "q5", "memory", "cc"
);
}
/**
* SSE2 downscalers with interpolation.
*
......@@ -3064,6 +3302,18 @@ static void ScalePlaneDown38(int src_width, int src_height,
uint8* dst_ptr, int dst_width);
void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
#if defined(HAS_SCALEROWDOWN38_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
(dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_NEON;
ScaleRowDown38_2 = ScaleRowDown38_NEON;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
}
} else
#endif
#if defined(HAS_SCALEROWDOWN38_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(dst_width % 24 == 0) && (src_stride % 16 == 0) &&
......
......@@ -27,142 +27,37 @@ using namespace libyuv;
free(var##_mem); \
var = 0;
TEST_F(libyuvTest, ScaleDownBy4) {
int b = 128;
int src_width = 1280;
int src_height = 720;
int src_width_uv = (src_width + 1) >> 1;
int src_height_uv = (src_height + 1) >> 1;
int src_y_plane_size = (src_width + (2 * b)) * (src_height + (2 * b));
int src_uv_plane_size = (src_width_uv + (2 * b)) * (src_height_uv + (2 * b));
int src_stride_y = 2 * b + src_width;
int src_stride_uv = 2 * b + src_width_uv;
align_buffer_16(src_y, src_y_plane_size)
align_buffer_16(src_u, src_uv_plane_size)
align_buffer_16(src_v, src_uv_plane_size)
int dst_width = src_width >> 2;
int dst_height = src_height >> 2;
int dst_width_uv = (dst_width + 1) >> 1;
int dst_height_uv = (dst_height + 1) >> 1;
int dst_y_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b));
int dst_uv_plane_size = (dst_width_uv + (2 * b)) * (dst_height_uv + (2 * b));
int dst_stride_y = 2 * b + dst_width;
int dst_stride_uv = 2 * b + dst_width_uv;
align_buffer_16(dst_y, dst_y_plane_size)
align_buffer_16(dst_u, dst_uv_plane_size)
align_buffer_16(dst_v, dst_uv_plane_size)
// create an image with random data reoccurring in 4x4 grid. When the image
// is filtered all the values should be the same.
srandom(time(NULL));
uint8 block_data[16];
int i, j;
// Pulling 16 random numbers there is an infinitesimally small
// chance that they are all 0. Then the output will be all 0.
// Output buffer is filled with 0, want to make sure that after the
// filtering something went into the output buffer.
// Avoid this by setting one of the values to 128. Also set the
// random data to at least 1 for when point sampling to prevent
// output all being 0.
block_data[0] = 128;
for (i = 1; i < 16; i++)
block_data[i] = (random() & 0xfe) + 1;
for (i = b; i < (src_height + b); i += 4) {
for (j = b; j < (src_width + b); j += 4) {
uint8 *ptr = src_y + (i * src_stride_y) + j;
int k, l;
for (k = 0; k < 4; ++k)
for (l = 0; l < 4; ++l)
ptr[k + src_stride_y * l] = block_data[k + 4 * l];
}
}
for (i = 1; i < 16; i++)
block_data[i] = (random() & 0xfe) + 1;
for (i = b; i < (src_height_uv + b); i += 4) {
for (j = b; j < (src_width_uv + b); j += 4) {
uint8 *ptru = src_u + (i * src_stride_uv) + j;
uint8 *ptrv = src_v + (i * src_stride_uv) + j;
int k, l;
for (k = 0; k < 4; ++k)
for (l = 0; l < 4; ++l) {
ptru[k + src_stride_uv * l] = block_data[k + 4 * l];
ptrv[k + src_stride_uv * l] = block_data[k + 4 * l];
}
}
}
int f;
int err = 0;
// currently three filter modes, defined as FilterMode in scale.h
for (f = 0; f < 3; ++f) {
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
dst_y + (dst_stride_y * b) + b, dst_stride_y,
dst_u + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height,
static_cast<FilterMode>(f));
#ifdef WIN32
#include <windows.h>
static double get_time()
{
LARGE_INTEGER t, f;
QueryPerformanceCounter(&t);
QueryPerformanceFrequency(&f);
return double(t.QuadPart)/double(f.QuadPart);
}
int value = dst_y[(dst_stride_y * b) + b];
#else
// catch the case that the output buffer is all 0
if (value == 0)
++err;
#include <sys/time.h>
#include <sys/resource.h>
for (i = b; i < (dst_height + b); ++i) {
for (j = b; j < (dst_width + b); ++j) {
if (value != dst_y[(i * dst_stride_y) + j])
++err;
}
}
static double get_time()
{
struct timeval t;
struct timezone tzp;
gettimeofday(&t, &tzp);
return t.tv_sec + t.tv_usec*1e-6;
}
value = dst_u[(dst_stride_uv * b) + b];
#endif
if (value == 0)
++err;
static int TestFilter(int src_width, int src_height,
int dst_width, int dst_height,
FilterMode f) {
for (i = b; i < (dst_height_uv + b); ++i) {
for (j = b; j < (dst_width_uv + b); ++j) {
if (value != dst_u[(i * dst_stride_uv) + j])
++err;
if (value != dst_v[(i * dst_stride_uv) + j])
++err;
}
}
}
free_aligned_buffer_16(src_y)
free_aligned_buffer_16(src_u)
free_aligned_buffer_16(src_v)
free_aligned_buffer_16(dst_y)
free_aligned_buffer_16(dst_u)
free_aligned_buffer_16(dst_v)
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, ScaleDownBy34) {
int b = 128;
int src_width = 1280;
int src_height = 720;
int src_width_uv = (src_width + 1) >> 1;
int src_height_uv = (src_height + 1) >> 1;
......@@ -176,9 +71,6 @@ TEST_F(libyuvTest, ScaleDownBy34) {
align_buffer_16(src_u, src_uv_plane_size)
align_buffer_16(src_v, src_uv_plane_size)
int dst_width = (src_width*3) >> 2;
int dst_height = (src_height*3) >> 2;
int dst_width_uv = (dst_width + 1) >> 1;
int dst_height_uv = (dst_height + 1) >> 1;
......@@ -205,12 +97,7 @@ TEST_F(libyuvTest, ScaleDownBy34) {
}
}
int f;
int err = 0;
// currently three filter modes, defined as FilterMode in scale.h
for (f = 0; f < 3; ++f) {
int max_diff = 0;
const int runs = 128;
align_buffer_16(dst_y_c, dst_y_plane_size)
align_buffer_16(dst_u_c, dst_uv_plane_size)
align_buffer_16(dst_v_c, dst_uv_plane_size)
......@@ -218,7 +105,10 @@ TEST_F(libyuvTest, ScaleDownBy34) {
align_buffer_16(dst_u_opt, dst_uv_plane_size)
align_buffer_16(dst_v_opt, dst_uv_plane_size)
libyuv::MaskCpuFlagsForTest(0);
libyuv::MaskCpuFlags(0);
double c_time = get_time();
for (i = 0; i < runs; ++i)
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
......@@ -226,10 +116,14 @@ TEST_F(libyuvTest, ScaleDownBy34) {
dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height,
static_cast<FilterMode>(f));
dst_width, dst_height, f);
c_time = (get_time() - c_time) / runs;
libyuv::MaskCpuFlags(-1);
double opt_time = get_time();
libyuv::MaskCpuFlagsForTest(-1);
for (i = 0; i < runs; ++i)
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
......@@ -237,13 +131,21 @@ TEST_F(libyuvTest, ScaleDownBy34) {
dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height,
static_cast<FilterMode>(f));
dst_width, dst_height, f);
opt_time = (get_time() - opt_time) / runs;
printf ("filter %d - %8d us c - %8d us opt\n",
f, (int)(c_time*1e6), (int)(opt_time*1e6));
::testing::Test::RecordProperty("C", (int)c_time);
::testing::Test::RecordProperty("Opt", (int)opt_time);
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// of the buffers and look to see that the max difference isn't
// over 2.
int err = 0;
int max_diff = 0;
for (i = b; i < (dst_height + b); ++i) {
for (j = b; j < (dst_width + b); ++j) {
int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
......@@ -276,11 +178,74 @@ TEST_F(libyuvTest, ScaleDownBy34) {
free_aligned_buffer_16(dst_y_opt)
free_aligned_buffer_16(dst_u_opt)
free_aligned_buffer_16(dst_v_opt)
}
free_aligned_buffer_16(src_y)
free_aligned_buffer_16(src_u)
free_aligned_buffer_16(src_v)
return err;
}
// Scales a 1280x720 frame to half its width and height with each of the
// three filter modes and expects TestFilter to report zero errors in total.
TEST_F(libyuvTest, ScaleDownBy2) {
  const int src_width = 1280;
  const int src_height = 720;
  const int dst_width = src_width >> 1;
  const int dst_height = src_height >> 1;

  int total_errors = 0;
  int mode = 0;
  while (mode < 3) {
    const FilterMode filter = static_cast<FilterMode>(mode);
    total_errors += TestFilter(src_width, src_height,
                               dst_width, dst_height, filter);
    ++mode;
  }

  EXPECT_EQ(0, total_errors);
}
// Scales a 1280x720 frame to a quarter of its width and height with each of
// the three filter modes and expects TestFilter to report zero errors in
// total.
TEST_F(libyuvTest, ScaleDownBy4) {
  const int src_width = 1280;
  const int src_height = 720;
  const int dst_width = src_width >> 2;
  const int dst_height = src_height >> 2;

  int total_errors = 0;
  int mode = 0;
  while (mode < 3) {
    const FilterMode filter = static_cast<FilterMode>(mode);
    total_errors += TestFilter(src_width, src_height,
                               dst_width, dst_height, filter);
    ++mode;
  }

  EXPECT_EQ(0, total_errors);
}
// Scales a 1280x720 frame to 3/4 of its width and height with each of the
// three filter modes and expects TestFilter to report zero errors in total.
TEST_F(libyuvTest, ScaleDownBy34) {
  const int src_width = 1280;
  const int src_height = 720;
  const int dst_width = (src_width * 3) >> 2;
  const int dst_height = (src_height * 3) >> 2;

  int total_errors = 0;
  int mode = 0;
  while (mode < 3) {
    const FilterMode filter = static_cast<FilterMode>(mode);
    total_errors += TestFilter(src_width, src_height,
                               dst_width, dst_height, filter);
    ++mode;
  }

  EXPECT_EQ(0, total_errors);
}
// Scales a 1280x720 frame to 3/8 of its width and height with each of the
// three filter modes and expects TestFilter to report zero errors in total.
TEST_F(libyuvTest, ScaleDownBy38) {
  // Declared const to match the sibling ScaleDownBy* tests; the values are
  // never modified.
  const int src_width = 1280;
  const int src_height = 720;
  const int dst_width = (src_width * 3) >> 3;
  const int dst_height = (src_height * 3) >> 3;
  int err = 0;
  for (int f = 0; f < 3; ++f) {
    err += TestFilter(src_width, src_height,
                      dst_width, dst_height,
                      static_cast<FilterMode>(f));
  }
  EXPECT_EQ(0, err);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment