Commit f626bea0 authored by frkoenig@google.com

Neon version of ScaleRowDown34.

Review URL: http://webrtc-codereview.appspot.com/250003

git-svn-id: http://libyuv.googlecode.com/svn/trunk@44 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 82ca3958
@@ -168,6 +168,124 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
);
}
#define HAS_SCALEROWDOWN34_NEON
// Down scale from 4 to 3 pixels. Uses the NEON multilane read/write to
// deinterleave the source so that every 4th pixel lands in a different
// register. Point samples 32 pixels to 24 pixels. (A scalar reference
// sketch follows this function.)
static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */,
uint8* dst_ptr, int dst_width) {
__asm__ volatile
(
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vmov d2, d3 \n" // order needs to be d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bhi 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
}
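// For reference only (not part of this change): a plain-C sketch of what the
// point-sampled path above computes. Each group of 4 source pixels keeps
// pixels 0, 1 and 3, matching the vld4 / vmov d2, d3 / vst3 sequence. The
// helper name is illustrative.
static void ScaleRowDown34_C_Ref(const uint8* src_ptr, int /* src_stride */,
                                 uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];  // keep pixel 0
    dst_ptr[1] = src_ptr[1];  // keep pixel 1
    dst_ptr[2] = src_ptr[3];  // keep pixel 3, drop pixel 2
    dst_ptr += 3;
    src_ptr += 4;
  }
}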
static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm__ volatile
(
"vmov.u8 d16, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// filter src line 0 with src line 1
// widen the bytes to shorts so the weighted
// line sums have headroom and cannot overflow
"vmovl.u8 q4, d4 \n"
"vmovl.u8 q5, d5 \n"
"vmovl.u8 q6, d6 \n"
"vmovl.u8 q7, d7 \n"
// 3 * line_0 + line_1
"vmlal.u8 q4, d0, d16 \n"
"vmlal.u8 q5, d1, d16 \n"
"vmlal.u8 q6, d2, d16 \n"
"vmlal.u8 q7, d3, d16 \n"
// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q4, #2 \n"
"vqrshrn.u16 d1, q5, #2 \n"
"vqrshrn.u16 d2, q6, #2 \n"
"vqrshrn.u16 d3, q7, #2 \n"
// a0 = (src[0] * 3 + src[1] * 1) >> 2
"vmovl.u8 q4, d1 \n"
"vmlal.u8 q4, d0, d16 \n"
"vqrshrn.u16 d0, q4, #2 \n"
// a1 = (src[1] * 1 + src[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + src[3] * 3) >> 2
"vmovl.u8 q4, d2 \n"
"vmlal.u8 q4, d3, d16 \n"
"vqrshrn.u16 d2, q4, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bhi 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "d17", "memory", "cc"
);
}
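// Reference-only scalar sketch of the filtered path above (helper name is
// illustrative, not part of this change). The two rows are first blended 3:1
// with rounding (vmlal + vqrshrn #2), then each group of 4 filtered pixels is
// reduced to 3 with 3:1, 1:1 and 1:3 horizontal blends (vrhadd rounds the
// 1:1 case).
static void ScaleRowDown34_0_Int_C_Ref(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  const uint8* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    int p0 = (3 * src_ptr[0] + s1[0] + 2) >> 2;  // (3 * line_0 + line_1) >> 2
    int p1 = (3 * src_ptr[1] + s1[1] + 2) >> 2;
    int p2 = (3 * src_ptr[2] + s1[2] + 2) >> 2;
    int p3 = (3 * src_ptr[3] + s1[3] + 2) >> 2;
    dst_ptr[0] = static_cast<uint8>((3 * p0 + p1 + 2) >> 2);
    dst_ptr[1] = static_cast<uint8>((p1 + p2 + 1) >> 1);
    dst_ptr[2] = static_cast<uint8>((p2 + 3 * p3 + 2) >> 2);
    dst_ptr += 3;
    src_ptr += 4;
    s1 += 4;
  }
}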
static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm__ volatile
(
"vmov.u8 d10, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + src[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d10 \n"
"vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + src[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + src[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d10 \n"
"vqrshrn.u16 d2, q3, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bhi 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d10", "memory", "cc"
);
}
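// Reference-only scalar sketch of the second filtered variant above (helper
// name is illustrative, not part of this change). Here the two rows are
// averaged 1:1 with rounding (vrhadd) instead of blended 3:1; the horizontal
// 4-to-3 blend is the same as in the _0_Int path.
static void ScaleRowDown34_1_Int_C_Ref(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  const uint8* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    int p0 = (src_ptr[0] + s1[0] + 1) >> 1;  // (line_0 + line_1) >> 1
    int p1 = (src_ptr[1] + s1[1] + 1) >> 1;
    int p2 = (src_ptr[2] + s1[2] + 1) >> 1;
    int p3 = (src_ptr[3] + s1[3] + 1) >> 1;
    dst_ptr[0] = static_cast<uint8>((3 * p0 + p1 + 2) >> 2);
    dst_ptr[1] = static_cast<uint8>((p1 + p2 + 1) >> 1);
    dst_ptr[2] = static_cast<uint8>((p2 + 3 * p3 + 2) >> 2);
    dst_ptr += 3;
    src_ptr += 4;
    s1 += 4;
  }
}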
/**
* SSE2 downscalers with interpolation.
*
@@ -2857,6 +2975,18 @@ static void ScalePlaneDown34(int src_width, int src_height,
uint8* dst_ptr, int dst_width);
void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
#if defined(HAS_SCALEROWDOWN34_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
(dst_width % 24 == 0) && (dst_stride % 8 == 0)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_NEON;
ScaleRowDown34_1 = ScaleRowDown34_NEON;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
}
} else
#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(dst_width % 24 == 0) && (src_stride % 16 == 0) &&
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
#include "unit_test.h"
#include <stdlib.h>
@@ -157,3 +158,129 @@ TEST_F(libyuvTest, ScaleDownBy4) {
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, ScaleDownBy34) {
int b = 128;
int src_width = 1280;
int src_height = 720;
int src_width_uv = (src_width + 1) >> 1;
int src_height_uv = (src_height + 1) >> 1;
int src_y_plane_size = (src_width + (2 * b)) * (src_height + (2 * b));
int src_uv_plane_size = (src_width_uv + (2 * b)) * (src_height_uv + (2 * b));
int src_stride_y = 2 * b + src_width;
int src_stride_uv = 2 * b + src_width_uv;
align_buffer_16(src_y, src_y_plane_size)
align_buffer_16(src_u, src_uv_plane_size)
align_buffer_16(src_v, src_uv_plane_size)
int dst_width = (src_width*3) >> 2;
int dst_height = (src_height*3) >> 2;
int dst_width_uv = (dst_width + 1) >> 1;
int dst_height_uv = (dst_height + 1) >> 1;
int dst_y_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b));
int dst_uv_plane_size = (dst_width_uv + (2 * b)) * (dst_height_uv + (2 * b));
int dst_stride_y = 2 * b + dst_width;
int dst_stride_uv = 2 * b + dst_width_uv;
srandom(time(NULL));
int i, j;
for (i = b; i < (src_height + b); ++i) {
for (j = b; j < (src_width + b); ++j) {
src_y[(i * src_stride_y) + j] = (random() & 0xff);
}
}
for (i = b; i < (src_height_uv + b); ++i) {
for (j = b; j < (src_width_uv + b); ++j) {
src_u[(i * src_stride_uv) + j] = (random() & 0xff);
src_v[(i * src_stride_uv) + j] = (random() & 0xff);
}
}
int f;
int err = 0;
// currently three filter modes, defined as FilterMode in scale.h
for (f = 0; f < 3; ++f) {
int max_diff = 0;
align_buffer_16(dst_y_c, dst_y_plane_size)
align_buffer_16(dst_u_c, dst_uv_plane_size)
align_buffer_16(dst_v_c, dst_uv_plane_size)
align_buffer_16(dst_y_opt, dst_y_plane_size)
align_buffer_16(dst_u_opt, dst_uv_plane_size)
align_buffer_16(dst_v_opt, dst_uv_plane_size)
libyuv::MaskCpuFlagsForTest(0);
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height,
static_cast<FilterMode>(f));
libyuv::MaskCpuFlagsForTest(-1);
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
dst_width, dst_height,
static_cast<FilterMode>(f));
// The C version may differ slightly from the optimized version because the
// order of operations can introduce rounding differences. Diff the two
// buffers and verify that the maximum per-pixel difference does not
// exceed 2.
for (i = b; i < (dst_height + b); ++i) {
for (j = b; j < (dst_width + b); ++j) {
int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
dst_y_opt[(i * dst_stride_y) + j]);
if (abs_diff > max_diff)
max_diff = abs_diff;
}
}
for (i = b; i < (dst_height_uv + b); ++i) {
for (j = b; j < (dst_width_uv + b); ++j) {
int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] -
dst_u_opt[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff)
max_diff = abs_diff;
abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] -
dst_v_opt[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff)
max_diff = abs_diff;
}
}
if (max_diff > 2)
err++;
free_aligned_buffer_16(dst_y_c)
free_aligned_buffer_16(dst_u_c)
free_aligned_buffer_16(dst_v_c)
free_aligned_buffer_16(dst_y_opt)
free_aligned_buffer_16(dst_u_opt)
free_aligned_buffer_16(dst_v_opt)
}
free_aligned_buffer_16(src_y)
free_aligned_buffer_16(src_u)
free_aligned_buffer_16(src_v)
EXPECT_EQ(0, err);
}