Commit 93d003f8 authored by frkoenig@google.com

YUVToARGB, YUVToBGRA, YUVToABGR optimized.


Review URL: http://webrtc-codereview.appspot.com/267022

git-svn-id: http://libyuv.googlecode.com/svn/trunk@86 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent bc8f28eb
......@@ -62,6 +62,7 @@
['arm_neon==1', {
'sources' : [
'source/rotate_neon.cc',
'source/row_neon.cc',
],
}],
],
......
......@@ -23,6 +23,7 @@
# sources
'unit_test/compare_test.cc',
'unit_test/planar_test.cc',
'unit_test/rotate_test.cc',
'unit_test/scale_test.cc',
'unit_test/unit_test.cc',
......
......@@ -1188,7 +1188,11 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && (width % 16 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
} else
#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
......@@ -1227,7 +1231,11 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSSE3)
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && (width % 16 == 0)) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_NEON;
} else
#elif defined(HAS_FASTCONVERTYUVTOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
......@@ -1266,7 +1274,11 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSSE3)
#if defined(HAS_FASTCONVERTYUVTOABGRROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && (width % 16 == 0)) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_NEON;
} else
#elif defined(HAS_FASTCONVERTYUVTOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
......
......@@ -19,6 +19,27 @@
#define YUV_DISABLE_ASM
#endif
// NEON row converters. They are declared only when the compiler targets ARM
// NEON (__ARM_NEON__) and assembly has not been turned off through
// YUV_DISABLE_ASM. Each HAS_..._NEON macro lets callers test at compile time
// whether the matching function is available.
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_NEON
// Converts a row of Y/U/V samples (y_buf, u_buf, v_buf) into 4-byte pixels
// written to rgb_buf; width is the number of pixels in the row.
void FastConvertYUVToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#define HAS_FASTCONVERTYUVTOBGRAROW_NEON
// Same inputs as the ARGB variant; the name indicates BGRA output.
void FastConvertYUVToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#define HAS_FASTCONVERTYUVTOABGRROW_NEON
// Same inputs as the ARGB variant; the name indicates ABGR output.
void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#endif
// The following are available on all x86 platforms
#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(YUV_DISABLE_ASM)
......
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "row.h"
// Inline-assembly fragment that converts 8 pixels of YUV to three 8-byte
// colour planes. It expands to a sequence of adjacent string literals and is
// meant to be pasted into an asm statement.
//
// Expected on entry:
//   %0      pointer to Y samples; 8 bytes are read, pointer post-incremented
//   %1, %2  pointers to U and V samples; 4 bytes are read from each
//   d24     signed 8-bit factors for the first/third colour (U lanes, V lanes)
//   d25     signed 8-bit factors for the green term (U lanes, V lanes)
//   d26     the byte 128 in every lane
//   q14     16-bit Y multiplier, q15 16-bit Y offset
// Produced on exit (8 bytes each, in pixel order):
//   d20     Y term + U * d24[0..3]
//   d22     Y term + V * d24[4..7]
//   d16     Y term + U * d25[0..3] + V * d25[4..7]
// Each U/V sample is applied to one even and one odd Y sample.
// Registers q0, d2, q8, q9, q10 and q11 are overwritten.
//
// The final line deliberately has no trailing backslash, so the line that
// follows the macro is never spliced into its body.
#define YUVTORGB                                                               \
    "vld1.u8 {d0}, [%0]! \n"       /* 8 Y samples */                           \
    "vld1.u32 {d2[0]}, [%1]! \n"   /* 4 U samples into the low half of d2 */   \
    "vld1.u32 {d2[1]}, [%2]! \n"   /* 4 V samples into the high half of d2 */  \
                                                                               \
    "veor.u8 d2, d26 \n"           /* subtract 128 from U and V */             \
                                                                               \
    "vmull.s8 q8, d2, d24 \n"      /* d16 = U term, d17 = V term (B/R) */      \
                                                                               \
    "vmull.s8 q9, d2, d25 \n"      /* d18 = U term, d19 = V term (G) */        \
                                                                               \
    "vmov.u8 d1, #0 \n"            /* split odd/even Y apart, widened to 16 */ \
    "vtrn.u8 d0, d1 \n"                                                        \
                                                                               \
    "vsub.s16 q0, q0, q15 \n"      /* remove the Y offset */                   \
    "vmul.s16 q0, q0, q14 \n"      /* scale Y */                               \
                                                                               \
    "vadd.s16 d18, d19 \n"         /* combine the U and V green terms */       \
                                                                               \
    "vqadd.s16 d20, d0, d16 \n"    /* even / odd Y plus the U term */          \
    "vqadd.s16 d21, d1, d16 \n"                                                \
                                                                               \
    "vqadd.s16 d22, d0, d17 \n"    /* even / odd Y plus the V term */          \
    "vqadd.s16 d23, d1, d17 \n"                                                \
                                                                               \
    "vqadd.s16 d16, d0, d18 \n"    /* even / odd Y plus the green term */      \
    "vqadd.s16 d17, d1, d18 \n"                                                \
                                                                               \
    "vqrshrun.s16 d0, q10, #6 \n"  /* drop 6 fraction bits, saturate to u8 */  \
    "vqrshrun.s16 d1, q11, #6 \n"                                              \
    "vqrshrun.s16 d2, q8, #6 \n"                                               \
                                                                               \
    "vmovl.u8 q10, d0 \n"          /* set up for re-interleave */              \
    "vmovl.u8 q11, d1 \n"                                                      \
    "vmovl.u8 q8, d2 \n"                                                       \
                                                                               \
    "vtrn.u8 d20, d21 \n"          /* merge even and odd back to pixel order */\
    "vtrn.u8 d22, d23 \n"                                                      \
    "vtrn.u8 d16, d17 \n"
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) || \
defined(HAS_FASTCONVERTYUVTOBGRAROW_NEON) || \
defined(HAS_FASTCONVERTYUVTOABGRROW_NEON)
// Signed 8-bit chroma factors shared by the three NEON row converters, which
// load 8 bytes of each table into d24 (kUVToRB) and d25 (kUVToG).
// Values 0..3 multiply (U - 128) and values 4..7 multiply (V - 128); the
// products are later shifted right by 6, so the factors are scaled by 64.
// NOTE(review): vec8 is declared outside this file - confirm that an array of
// 8 vec8 starts with the 8 consecutive signed bytes that the loads expect.
static const vec8 kUVToRB[8] = { 127, 127, 127, 127, 102, 102, 102, 102 };
static const vec8 kUVToG[8] = { -25, -25, -25, -25, -52, -52, -52, -52 };
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
// Converts one row of Y, U and V samples to 4-byte pixels with NEON.
// Every pass of the loop reads 8 Y bytes, 4 U bytes and 4 V bytes and stores
// 32 bytes to rgb_buf in the byte order B, G, R, 255.
// The loop body runs once before width is tested and width drops by 8 per
// pass, so a width that is not a positive multiple of 8 makes it convert more
// pixels than requested.
// NOTE(review): bhi is an unsigned test; a negative width would keep the loop
// running - confirm callers never pass one.
void FastConvertYUVToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"  // d24 = 8 bytes of kUVToRB
"vld1.u8 {d25}, [%6] \n"  // d25 = 8 bytes of kUVToG
"vmov.u8 d26, #128 \n"  // bias that YUVTORGB XORs into U and V
"vmov.u16 q14, #74 \n"  // Y multiplier used by YUVTORGB
"vmov.u16 q15, #16 \n"  // Y offset subtracted by YUVTORGB
"1: \n"
YUVTORGB  // leaves B in d20, R in d22 and G in d16
"vmov.u8 d21, d16 \n"  // place G between B and R
"vmov.u8 d23, #255 \n"  // constant fourth byte
"vst4.u8 {d20, d21, d22, d23}, [%3]! \n"  // interleaved store of 8 pixels
"subs %4, %4, #8 \n"  // 8 pixels done
"bhi 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(kUVToRB), // %5
"r"(kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_NEON)
// Converts one row of Y, U and V samples to 4-byte pixels with NEON.
// Every pass of the loop reads 8 Y bytes, 4 U bytes and 4 V bytes and stores
// 32 bytes to rgb_buf in the byte order 255, R, G, B.
// The loop body runs once before width is tested and width drops by 8 per
// pass, so a width that is not a positive multiple of 8 makes it convert more
// pixels than requested.
// NOTE(review): bhi is an unsigned test; a negative width would keep the loop
// running - confirm callers never pass one.
void FastConvertYUVToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"  // d24 = 8 bytes of kUVToRB
"vld1.u8 {d25}, [%6] \n"  // d25 = 8 bytes of kUVToG
"vmov.u8 d26, #128 \n"  // bias that YUVTORGB XORs into U and V
"vmov.u16 q14, #74 \n"  // Y multiplier used by YUVTORGB
"vmov.u16 q15, #16 \n"  // Y offset subtracted by YUVTORGB
"1: \n"
YUVTORGB  // leaves B in d20, R in d22 and G in d16
"vswp.u8 d20, d22 \n"  // now R in d20 and B in d22
"vmov.u8 d21, d16 \n"  // place G between R and B
"vmov.u8 d19, #255 \n"  // constant leading byte
"vst4.u8 {d19, d20, d21, d22}, [%3]! \n"  // interleaved store of 8 pixels
"subs %4, %4, #8 \n"  // 8 pixels done
"bhi 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(kUVToRB), // %5
"r"(kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif
#if defined(HAS_FASTCONVERTYUVTOABGRROW_NEON)
// Converts one row of Y, U and V samples to 4-byte pixels with NEON.
// Every pass of the loop reads 8 Y bytes, 4 U bytes and 4 V bytes and stores
// 32 bytes to rgb_buf in the byte order R, G, B, 255.
// The loop body runs once before width is tested and width drops by 8 per
// pass, so a width that is not a positive multiple of 8 makes it convert more
// pixels than requested.
// NOTE(review): bhi is an unsigned test; a negative width would keep the loop
// running - confirm callers never pass one.
void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"  // d24 = 8 bytes of kUVToRB
"vld1.u8 {d25}, [%6] \n"  // d25 = 8 bytes of kUVToG
"vmov.u8 d26, #128 \n"  // bias that YUVTORGB XORs into U and V
"vmov.u16 q14, #74 \n"  // Y multiplier used by YUVTORGB
"vmov.u16 q15, #16 \n"  // Y offset subtracted by YUVTORGB
"1: \n"
YUVTORGB  // leaves B in d20, R in d22 and G in d16
"vswp.u8 d20, d22 \n"  // now R in d20 and B in d22
"vmov.u8 d21, d16 \n"  // place G between R and B
"vmov.u8 d23, #255 \n"  // constant fourth byte
"vst4.u8 {d20, d21, d22, d23}, [%3]! \n"  // interleaved store of 8 pixels
"subs %4, %4, #8 \n"  // 8 pixels done
"bhi 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(kUVToRB), // %5
"r"(kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "unit_test.h"
#include <stdlib.h>
#include <time.h>
#include "libyuv/rotate.h"
#include "libyuv/planar_functions.h"
#include "libyuv/cpu_id.h"
namespace libyuv {
// Runs I420ToARGB _benchmark_iterations times on a zero-filled frame while the
// CPU flags are masked to kCpuInitialized, then restores the mask with -1.
// Presumably this forces the plain C row functions, as the _C suffix suggests.
// Nothing about the output is checked; the test exists for its run time.
TEST_F(libyuvTest, BenchmarkI420ToARGB_C) {
  const int frame_width = _benchmark_width;
  const int frame_height = _benchmark_height;
  const int chroma_stride = frame_width >> 1;
  const int argb_stride = frame_width << 2;

  align_buffer_16(src_y, (frame_width * frame_height));
  align_buffer_16(src_u, ((frame_width * frame_height) >> 2));
  align_buffer_16(src_v, ((frame_width * frame_height) >> 2));
  align_buffer_16(dst_argb, (argb_stride * frame_height));

  MaskCpuFlags(kCpuInitialized);

  for (int pass = 0; pass < _benchmark_iterations; ++pass) {
    I420ToARGB(src_y, frame_width,
               src_u, chroma_stride,
               src_v, chroma_stride,
               dst_argb, argb_stride,
               frame_width, frame_height);
  }

  MaskCpuFlags(-1);

  EXPECT_EQ(0, 0);

  free_aligned_buffer_16(src_y)
  free_aligned_buffer_16(src_u)
  free_aligned_buffer_16(src_v)
  free_aligned_buffer_16(dst_argb)
}
// Runs I420ToARGB _benchmark_iterations times on a zero-filled frame without
// touching the CPU flag mask, so whatever row functions the library selects
// by default are the ones being timed. Nothing about the output is checked.
TEST_F(libyuvTest, BenchmarkI420ToARGB_OPT) {
  const int frame_width = _benchmark_width;
  const int frame_height = _benchmark_height;
  const int chroma_stride = frame_width >> 1;
  const int argb_stride = frame_width << 2;

  align_buffer_16(src_y, (frame_width * frame_height));
  align_buffer_16(src_u, ((frame_width * frame_height) >> 2));
  align_buffer_16(src_v, ((frame_width * frame_height) >> 2));
  align_buffer_16(dst_argb, (argb_stride * frame_height));

  for (int pass = 0; pass < _benchmark_iterations; ++pass) {
    I420ToARGB(src_y, frame_width,
               src_u, chroma_stride,
               src_v, chroma_stride,
               dst_argb, argb_stride,
               frame_width, frame_height);
  }

  free_aligned_buffer_16(src_y)
  free_aligned_buffer_16(src_u)
  free_aligned_buffer_16(src_v)
  free_aligned_buffer_16(dst_argb)
}
// Defines the test I420To<FMT>_CvsOPT. It fills a 1280x720 I420 frame with
// random bytes, converts it with I420To<FMT> once while the CPU flags are
// masked to kCpuInitialized and once with the mask reset to -1, and expects
// no output byte to differ by more than 2 between the two results.
//
// Every plane is indexed with its own row stride (src_width for Y,
// src_width / 2 for U and V, src_width * 4 for the output), so the whole of
// each input plane is randomised and the whole of both outputs is compared.
// Size arguments are fully parenthesised so they stay intact however
// align_buffer_16 expands them.
#define TESTI420TO(FMT)                                                        \
TEST_F (libyuvTest, I420To##FMT##_CvsOPT) {                                    \
  const int src_width = 1280;                                                  \
  const int src_height = 720;                                                  \
  const int uv_width = src_width >> 1;                                         \
  const int uv_height = src_height >> 1;                                       \
  const int dst_stride = src_width << 2;                                       \
  align_buffer_16(src_y, (src_width * src_height));                            \
  align_buffer_16(src_u, ((src_width * src_height) >> 2));                     \
  align_buffer_16(src_v, ((src_width * src_height) >> 2));                     \
  align_buffer_16(dst_rgb_c, (dst_stride * src_height));                       \
  align_buffer_16(dst_rgb_opt, (dst_stride * src_height));                     \
  srandom(time(NULL));                                                         \
  for (int i = 0; i < src_height; ++i) {                                       \
    for (int j = 0; j < src_width; ++j) {                                      \
      src_y[(i * src_width) + j] = (random() & 0xff);                          \
    }                                                                          \
  }                                                                            \
  for (int i = 0; i < uv_height; ++i) {                                        \
    for (int j = 0; j < uv_width; ++j) {                                       \
      src_u[(i * uv_width) + j] = (random() & 0xff);                           \
      src_v[(i * uv_width) + j] = (random() & 0xff);                           \
    }                                                                          \
  }                                                                            \
  MaskCpuFlags(kCpuInitialized);                                               \
  I420To##FMT(src_y, src_width,                                                \
              src_u, uv_width,                                                 \
              src_v, uv_width,                                                 \
              dst_rgb_c, dst_stride,                                           \
              src_width, src_height);                                          \
  MaskCpuFlags(-1);                                                            \
  I420To##FMT(src_y, src_width,                                                \
              src_u, uv_width,                                                 \
              src_v, uv_width,                                                 \
              dst_rgb_opt, dst_stride,                                         \
              src_width, src_height);                                          \
  int err = 0;                                                                 \
  for (int i = 0; i < src_height; ++i) {                                       \
    for (int j = 0; j < dst_stride; ++j) {                                     \
      int diff = (int)(dst_rgb_c[(i * dst_stride) + j]) -                      \
                 (int)(dst_rgb_opt[(i * dst_stride) + j]);                     \
      if (abs(diff) > 2) {                                                     \
        err++;                                                                 \
      }                                                                        \
    }                                                                          \
  }                                                                            \
  EXPECT_EQ(err, 0);                                                           \
  free_aligned_buffer_16(src_y)                                                \
  free_aligned_buffer_16(src_u)                                                \
  free_aligned_buffer_16(src_v)                                                \
  free_aligned_buffer_16(dst_rgb_c)                                            \
  free_aligned_buffer_16(dst_rgb_opt)                                          \
}
// Instantiate the C-versus-optimized comparison test once per output format:
// I420ToARGB, I420ToBGRA and I420ToABGR.
TESTI420TO(ARGB)
TESTI420TO(BGRA)
TESTI420TO(ABGR)
}
......@@ -16,7 +16,7 @@
// Declares two locals: `var##_mem`, the pointer returned by calloc for
// `size` + 15 zeroed bytes, and `var`, that address rounded up to the next
// multiple of 16. `size` is parenthesised so that an argument containing a
// lower-precedence operator (for example `n >> 2`) is evaluated before the
// 15 bytes of padding are added. The buffer is allocated exactly once; the
// calloc result is not checked for failure.
#define align_buffer_16(var, size)                                             \
  uint8 *var;                                                                  \
  uint8 *var##_mem;                                                            \
  var##_mem = reinterpret_cast<uint8*>(calloc((size)+15, sizeof(uint8)));      \
  var = reinterpret_cast<uint8*>                                               \
        ((reinterpret_cast<intptr_t>(var##_mem) + 15) & (~0x0f));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment