Commit 4807dea4 authored by fbarchard@google.com

NV12ToARGBRow_NEON and SetRow8_NEON

BUG=115
TEST=./libyuv_unittest --gtest_filter=*NV12ToARGB*
Review URL: https://webrtc-codereview.appspot.com/869006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@395 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 82069e7a
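The diff below adds NEON row functions for NV12/NV21-to-ARGB conversion and a NEON row setter. For orientation, here is a minimal sketch of how a caller might drive the new row function over a frame (hypothetical helper, not part of this commit; the signature and the 8-pixels-per-iteration loop come from the diff itself):

```cpp
// Sketch: convert an NV12 frame by calling the new row function once per
// row. Assumes width is a multiple of 8, since the NEON loop consumes
// 8 pixels per iteration and has no scalar tail. NV12 is a full-size Y
// plane followed by one half-height plane of interleaved UV pairs.
void NV12ToARGBFrame(const uint8* src_y, int src_stride_y,
                     const uint8* src_uv, int src_stride_uv,
                     uint8* dst_argb, int dst_stride_argb,
                     int width, int height) {
  for (int y = 0; y < height; ++y) {
    NV12ToARGBRow_NEON(src_y, src_uv, dst_argb, width);
    src_y += src_stride_y;
    if (y & 1) {
      src_uv += src_stride_uv;  // chroma advances every other row (4:2:0)
    }
    dst_argb += dst_stride_argb;
  }
}
```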
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 394
+Version: 395
 License: BSD
 License File: LICENSE
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 394
+#define LIBYUV_VERSION 395
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -18,12 +18,29 @@ extern "C" {
 // This module is for GCC Neon
 #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
-// TODO(fbarchard): Make a fetch macro so different subsamples can be done.
-// TODO(fbarchard): Rework register usage to produce RGB in d21 - d23.
-#define YUV422TORGB \
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
     "vld1.u8 {d0}, [%0]! \n" \
     "vld1.u32 {d2[0]}, [%1]! \n" \
-    "vld1.u32 {d2[1]}, [%2]! \n" \
+    "vld1.u32 {d2[1]}, [%2]! \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+    "vld1.u8 {d0}, [%0]! \n" \
+    "vld1.u8 {d2}, [%1]! \n" \
+    "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+    "vuzp.u8 d2, d3 \n" \
+    "vtrn.u32 d2, d3 \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+    "vld1.u8 {d0}, [%0]! \n" \
+    "vld1.u8 {d2}, [%1]! \n" \
+    "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+    "vuzp.u8 d3, d2 \n" \
+    "vtrn.u32 d2, d3 \n"
+
+#define YUV422TORGB \
     "veor.u8 d2, d26 \n"/* subtract 128 from u and v */\
     "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
     "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
@@ -47,6 +64,7 @@ extern "C" {
     "vtrn.u8 d20, d21 \n" \
     "vtrn.u8 d22, d23 \n" \
     "vtrn.u8 d16, d17 \n" \
+    "vmov.u8 d21, d16 \n"
 #if defined(HAS_I422TOARGBROW_NEON) || defined(HAS_I422TOBGRAROW_NEON) || \
     defined(HAS_I422TOABGRROW_NEON) || defined(HAS_I422TORGBAROW_NEON)
@@ -70,11 +88,11 @@ void I422ToARGBRow_NEON(const uint8* y_buf,
     "vmov.u16 q15, #16 \n"
     ".p2align 2 \n"
   "1: \n"
+    READYUV422
     YUV422TORGB
-    "vmov.u8 d21, d16 \n"
+    "subs %4, %4, #8 \n"
     "vmov.u8 d23, #255 \n"
     "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
-    "subs %4, %4, #8 \n"
     "bgt 1b \n"
     : "+r"(y_buf),  // %0
       "+r"(u_buf),  // %1
@@ -83,8 +101,8 @@ void I422ToARGBRow_NEON(const uint8* y_buf,
       "+r"(width)     // %4
     : "r"(&kUVToRB),  // %5
       "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
-      "q10", "q11", "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
+      "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_I422TOARGBROW_NEON
@@ -103,12 +121,12 @@ void I422ToBGRARow_NEON(const uint8* y_buf,
     "vmov.u16 q15, #16 \n"
     ".p2align 2 \n"
   "1: \n"
+    READYUV422
     YUV422TORGB
+    "subs %4, %4, #8 \n"
     "vswp.u8 d20, d22 \n"
-    "vmov.u8 d21, d16 \n"
     "vmov.u8 d19, #255 \n"
     "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
-    "subs %4, %4, #8 \n"
     "bgt 1b \n"
     : "+r"(y_buf),  // %0
       "+r"(u_buf),  // %1
@@ -137,12 +155,12 @@ void I422ToABGRRow_NEON(const uint8* y_buf,
     "vmov.u16 q15, #16 \n"
     ".p2align 2 \n"
   "1: \n"
+    READYUV422
     YUV422TORGB
+    "subs %4, %4, #8 \n"
     "vswp.u8 d20, d22 \n"
-    "vmov.u8 d21, d16 \n"
     "vmov.u8 d23, #255 \n"
     "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
-    "subs %4, %4, #8 \n"
     "bgt 1b \n"
     : "+r"(y_buf),  // %0
       "+r"(u_buf),  // %1
@@ -171,11 +189,11 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
     "vmov.u16 q15, #16 \n"
     ".p2align 2 \n"
   "1: \n"
+    READYUV422
     YUV422TORGB
-    "vmov.u8 d21, d16 \n"
+    "subs %4, %4, #8 \n"
     "vmov.u8 d19, #255 \n"
     "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
-    "subs %4, %4, #8 \n"
     "bgt 1b \n"
     : "+r"(y_buf),  // %0
       "+r"(u_buf),  // %1
@@ -190,6 +208,68 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
 }
 #endif  // HAS_I422TORGBAROW_NEON
+
+#ifdef HAS_NV12TOARGBROW_NEON
+void NV12ToARGBRow_NEON(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) {
+  asm volatile (
+    "vld1.u8 {d24}, [%4] \n"
+    "vld1.u8 {d25}, [%5] \n"
+    "vmov.u8 d26, #128 \n"
+    "vmov.u16 q14, #74 \n"
+    "vmov.u16 q15, #16 \n"
+    ".p2align 2 \n"
+  "1: \n"
+    READNV12
+    YUV422TORGB
+    "subs %3, %3, #8 \n"
+    "vmov.u8 d23, #255 \n"
+    "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+    "bgt 1b \n"
+    : "+r"(y_buf),    // %0
+      "+r"(uv_buf),   // %1
+      "+r"(rgb_buf),  // %2
+      "+r"(width)     // %3
+    : "r"(&kUVToRB),  // %4
+      "r"(&kUVToG)    // %5
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
+      "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_NV12TOARGBROW_NEON
+
+#ifdef HAS_NV21TOARGBROW_NEON
+void NV21ToARGBRow_NEON(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) {
+  asm volatile (
+    "vld1.u8 {d24}, [%4] \n"
+    "vld1.u8 {d25}, [%5] \n"
+    "vmov.u8 d26, #128 \n"
+    "vmov.u16 q14, #74 \n"
+    "vmov.u16 q15, #16 \n"
+    ".p2align 2 \n"
+  "1: \n"
+    READNV21
+    YUV422TORGB
+    "subs %3, %3, #8 \n"
+    "vmov.u8 d23, #255 \n"
+    "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+    "bgt 1b \n"
+    : "+r"(y_buf),    // %0
+      "+r"(uv_buf),   // %1
+      "+r"(rgb_buf),  // %2
+      "+r"(width)     // %3
+    : "r"(&kUVToRB),  // %4
+      "r"(&kUVToG)    // %5
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
+      "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_NV21TOARGBROW_NEON
+
 #ifdef HAS_SPLITUV_NEON
 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
 // Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
@@ -198,9 +278,9 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
     ".p2align 2 \n"
   "1: \n"
     "vld2.u8 {q0, q1}, [%0]! \n"  // load 16 pairs of UV
+    "subs %3, %3, #16 \n"  // 16 processed per loop
     "vst1.u8 {q0}, [%1]! \n"  // store U
     "vst1.u8 {q1}, [%2]! \n"  // Store V
-    "subs %3, %3, #16 \n"  // 16 processed per loop
     "bgt 1b \n"
     : "+r"(src_uv),  // %0
       "+r"(dst_u),   // %1
@@ -218,10 +298,9 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
   asm volatile (
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
-    "vldm %0!,{q0, q1, q2, q3} \n"  // load 64
-    "vstm %1!,{q0, q1, q2, q3} \n"  // store 64
+    "vldm %0!, {q0, q1, q2, q3} \n"  // load 64
     "subs %2, %2, #64 \n"  // 64 processed per loop
+    "vstm %1!, {q0, q1, q2, q3} \n"  // store 64
     "bgt 1b \n"
     : "+r"(src),  // %0
       "+r"(dst),  // %1
@@ -232,6 +311,32 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
 }
 #endif  // HAS_COPYROW_NEON
+
+#ifdef HAS_SETROW_NEON
+// SetRow8 writes 'count' bytes using a 32 bit value repeated
+void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
+  asm volatile (  // NOLINT
+    "vdup.u32 q0, %2 \n"  // duplicate 4 ints
+  "1: \n"
+    "subs %1, %1, #16 \n"  // 16 bytes per loop
+    "vst1.u32 {q0}, [%0]! \n"  // store
+    "bgt 1b \n"
+    : "+r"(dst),   // %0
+      "+r"(count)  // %1
+    : "r"(v32)     // %2
+    : "q0", "memory", "cc");
+}
+
+// TODO(fbarchard): Make fully assembler
+// SetRow32 writes 'count' words using a 32 bit value repeated
+void SetRows32_NEON(uint8* dst, uint32 v32, int width,
+                    int dst_stride, int height) {
+  for (int y = 0; y < height; ++y) {
+    SetRow8_NEON(dst, v32, width << 2);
+    dst += dst_stride;
+  }
+}
+#endif  // HAS_SETROW_NEON
+
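Note that SetRow8_NEON stores a full q register (16 bytes) per iteration with no scalar tail, so count is expected to be a multiple of 16; SetRows32_NEON passes width << 2 because it fills 4-byte words. A scalar equivalent, as a sketch only:

```cpp
#include <string.h>

// Fill 'count' bytes by repeating a 32-bit pattern, like SetRow8_NEON.
// This sketch assumes count is a multiple of 4; the NEON loop above
// needs a multiple of 16.
static void SetRow8_C(uint8* dst, uint32 v32, int count) {
  for (int i = 0; i < count; i += 4) {
    memcpy(dst + i, &v32, 4);  // repeat the 4-byte pattern
  }
}
```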
 #ifdef HAS_MIRRORROW_NEON
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
   asm volatile (
@@ -264,10 +369,10 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
     ".p2align 2 \n"
   "1: \n"
     "vld1.8 {q0}, [%0]! \n"  // src += 16
+    "subs %2, #16 \n"
     "vrev64.8 q0, q0 \n"
     "vst1.8 {d1}, [%1]! \n"
     "vst1.8 {d0}, [%1], r3 \n"  // dst -= 16
-    "subs %2, #16 \n"
     "bge 1b \n"
     // add 16 back to the counter. if the result is 0 there is no
@@ -287,9 +392,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
     // http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
   "3: \n"
     "vld2.8 {d0[0], d1[0]}, [%0]! \n"  // src += 2
+    "subs %2, #2 \n"
     "vst1.8 {d1[0]}, [%1]! \n"
     "vst1.8 {d0[0]}, [%1], r3 \n"  // dst -= 2
-    "subs %2, #2 \n"
     "bge 3b \n"
     "adds %2, #2 \n"
@@ -335,10 +440,10 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
     ".p2align 2 \n"
   "1: \n"
     "vld2.8 {d0, d1}, [%0]! \n"  // src += 16
+    "subs %3, #8 \n"
     "vrev64.8 q0, q0 \n"
     "vst1.8 {d0}, [%1], r12 \n"  // dst_a -= 8
     "vst1.8 {d1}, [%2], r12 \n"  // dst_b -= 8
-    "subs %3, #8 \n"
     "bge 1b \n"
     // add 8 back to the counter. if the result is 0 there is no
@@ -353,9 +458,9 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
     "sub %2, #1 \n"
   "3: \n"
     "vld2.8 {d0[0], d1[0]}, [%0]! \n"  // src += 2
+    "subs %3, %3, #1 \n"
     "vst1.8 {d0[0]}, [%1], r12 \n"  // dst_a -= 1
     "vst1.8 {d1[0]}, [%2], r12 \n"  // dst_b -= 1
-    "subs %3, %3, #1 \n"
     "bgt 3b \n"
   "4: \n"
     : "+r"(src),  // %0
@@ -373,12 +478,11 @@ void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
   asm volatile (
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of BGRA.
+    "subs %2, %2, #8 \n"  // 8 processed per loop.
     "vswp.u8 d1, d2 \n"  // swap G, R
     "vswp.u8 d0, d3 \n"  // swap B, A
     "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
-    "subs %2, %2, #8 \n"  // 8 processed per loop.
     "bgt 1b \n"
     : "+r"(src_bgra),  // %0
       "+r"(dst_argb),  // %1
@@ -394,11 +498,10 @@ void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
   asm volatile (
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of ABGR.
+    "subs %2, %2, #8 \n"  // 8 processed per loop.
     "vswp.u8 d0, d2 \n"  // swap R, B
     "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
-    "subs %2, %2, #8 \n"  // 8 processed per loop.
     "bgt 1b \n"
     : "+r"(src_abgr),  // %0
       "+r"(dst_argb),  // %1
@@ -414,11 +517,10 @@ void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
   asm volatile (
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld1.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of RGBA.
+    "subs %2, %2, #8 \n"  // 8 processed per loop.
     "vmov.u8 d4, d0 \n"  // move A after RGB
     "vst4.8 {d1, d2, d3, d4}, [%1]! \n"  // store 8 pixels of ARGB.
-    "subs %2, %2, #8 \n"  // 8 processed per loop.
     "bgt 1b \n"
     : "+r"(src_rgba),  // %0
       "+r"(dst_argb),  // %1
@@ -435,10 +537,9 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
     "vmov.u8 d4, #255 \n"  // Alpha
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RGB24.
-    "vst4.8 {d1, d2, d3, d4}, [%1]! \n"  // store 8 pixels of ARGB.
     "subs %2, %2, #8 \n"  // 8 processed per loop.
+    "vst4.8 {d1, d2, d3, d4}, [%1]! \n"  // store 8 pixels of ARGB.
     "bgt 1b \n"
     : "+r"(src_rgb24),  // %0
       "+r"(dst_argb),   // %1
@@ -455,11 +556,10 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
     "vmov.u8 d4, #255 \n"  // Alpha
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.
+    "subs %2, %2, #8 \n"  // 8 processed per loop.
     "vswp.u8 d1, d3 \n"  // swap R, B
     "vst4.8 {d1, d2, d3, d4}, [%1]! \n"  // store 8 pixels of ARGB.
-    "subs %2, %2, #8 \n"  // 8 processed per loop.
     "bgt 1b \n"
     : "+r"(src_raw),   // %0
       "+r"(dst_argb),  // %1
@@ -475,11 +575,10 @@ void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
   asm volatile (
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld4.8 {d1, d2, d3, d4}, [%0]! \n"  // load 8 pixels of ARGB.
+    "subs %2, %2, #8 \n"  // 8 processed per loop.
     "vmov.u8 d0, d4 \n"  // move A before RGB.
     "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of RGBA.
-    "subs %2, %2, #8 \n"  // 8 processed per loop.
     "bgt 1b \n"
     : "+r"(src_argb),  // %0
       "+r"(dst_rgba),  // %1
@@ -495,10 +594,9 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
   asm volatile (
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld4.8 {d1, d2, d3, d4}, [%0]! \n"  // load 8 pixels of ARGB.
-    "vst3.8 {d1, d2, d3}, [%1]! \n"  // store 8 pixels of RGB24.
     "subs %2, %2, #8 \n"  // 8 processed per loop.
+    "vst3.8 {d1, d2, d3}, [%1]! \n"  // store 8 pixels of RGB24.
     "bgt 1b \n"
     : "+r"(src_argb),   // %0
       "+r"(dst_rgb24),  // %1
@@ -514,11 +612,10 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
   asm volatile (
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld4.8 {d1, d2, d3, d4}, [%0]! \n"  // load 8 pixels of ARGB.
+    "subs %2, %2, #8 \n"  // 8 processed per loop.
     "vswp.u8 d1, d3 \n"  // swap R, B
     "vst3.8 {d1, d2, d3}, [%1]! \n"  // store 8 pixels of RAW.
-    "subs %2, %2, #8 \n"  // 8 processed per loop.
     "bgt 1b \n"
     : "+r"(src_argb),  // %0
       "+r"(dst_raw),   // %1
@@ -534,10 +631,9 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
   asm volatile (
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld2.u8 {q0, q1}, [%0]! \n"  // load 16 pixels of YUY2.
-    "vst1.u8 {q0}, [%1]! \n"  // store 16 pixels of Y.
     "subs %2, %2, #16 \n"  // 16 processed per loop.
+    "vst1.u8 {q0}, [%1]! \n"  // store 16 pixels of Y.
     "bgt 1b \n"
     : "+r"(src_yuy2),  // %0
       "+r"(dst_y),     // %1
@@ -553,10 +649,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
   asm volatile (
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld2.u8 {q0, q1}, [%0]! \n"  // load 16 pixels of UYVY.
-    "vst1.u8 {q1}, [%1]! \n"  // store 16 pixels of Y.
     "subs %2, %2, #16 \n"  // 16 processed per loop.
+    "vst1.u8 {q1}, [%1]! \n"  // store 16 pixels of Y.
     "bgt 1b \n"
     : "+r"(src_uyvy),  // %0
       "+r"(dst_y),     // %1
@@ -567,20 +662,16 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
 }
 #endif  // HAS_UYVYTOYROW_NEON
-#endif  // HAS_UYVYTOYROW_NEON
 
 #ifdef HAS_YUY2TOYROW_NEON
 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                          int pix) {
   asm volatile (
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of YUY2.
+    "subs %3, %3, #16 \n"  // 16 pixels = 8 UVs.
     "vst1.u8 {d1}, [%1]! \n"  // store 8 U.
     "vst1.u8 {d3}, [%2]! \n"  // store 8 V.
-    "subs %3, %3, #16 \n"  // 16 pixels = 8 UVs.
     "bgt 1b \n"
     : "+r"(src_yuy2),  // %0
       "+r"(dst_u),     // %1
@@ -598,11 +689,10 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
   asm volatile (
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of UYVY.
+    "subs %3, %3, #16 \n"  // 16 pixels = 8 UVs.
     "vst1.u8 {d0}, [%1]! \n"  // store 8 U.
     "vst1.u8 {d2}, [%2]! \n"  // store 8 V.
-    "subs %3, %3, #16 \n"  // 16 pixels = 8 UVs.
     "bgt 1b \n"
     : "+r"(src_uyvy),  // %0
       "+r"(dst_u),     // %1
@@ -621,14 +711,13 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
     "adds %1, %0, %1 \n"  // stride + src_yuy2
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of YUY2.
+    "subs %4, %4, #16 \n"  // 16 pixels = 8 UVs.
     "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load next row YUY2.
     "vrhadd.u8 d1, d1, d5 \n"  // average rows of U
     "vrhadd.u8 d3, d3, d7 \n"  // average rows of V
     "vst1.u8 {d1}, [%2]! \n"  // store 8 U.
     "vst1.u8 {d3}, [%3]! \n"  // store 8 V.
-    "subs %4, %4, #16 \n"  // 16 pixels = 8 UVs.
     "bgt 1b \n"
     : "+r"(src_yuy2),     // %0
       "+r"(stride_yuy2),  // %1
@@ -648,14 +737,13 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
     "adds %1, %0, %1 \n"  // stride + src_uyvy
     ".p2align 2 \n"
   "1: \n"
-    "pld [%0, #192] \n"  // preload
     "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of UYVY.
+    "subs %4, %4, #16 \n"  // 16 pixels = 8 UVs.
     "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load next row UYVY.
     "vrhadd.u8 d0, d0, d4 \n"  // average rows of U
     "vrhadd.u8 d2, d2, d6 \n"  // average rows of V
     "vst1.u8 {d0}, [%2]! \n"  // store 8 U.
     "vst1.u8 {d2}, [%3]! \n"  // store 8 V.
-    "subs %4, %4, #16 \n"  // 16 pixels = 8 UVs.
     "bgt 1b \n"
     : "+r"(src_uyvy),     // %0
       "+r"(stride_uyvy),  // %1
@@ -666,6 +754,8 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
     : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
   );
 }
+#endif  // HAS_UYVYTOYROW_NEON
+
 #endif  // __ARM_NEON__
 
 #ifdef __cplusplus
@@ -13,6 +13,7 @@
 #include "libyuv/convert_argb.h"
 #include "libyuv/convert_from.h"
+#include "libyuv/compare.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/format_conversion.h"
 #include "libyuv/planar_functions.h"
@@ -256,6 +257,8 @@ TESTATOPLANAR(ARGB, 4, I422, 2, 1)
 // TODO(fbarchard): Implement and test 411 and 444
 TESTATOPLANAR(YUY2, 2, I420, 2, 2)
 TESTATOPLANAR(UYVY, 2, I420, 2, 2)
+TESTATOPLANAR(YUY2, 2, I422, 2, 1)
+TESTATOPLANAR(UYVY, 2, I422, 2, 1)
 TESTATOPLANAR(V210, 16 / 6, I420, 2, 2)
 TESTATOPLANAR(I400, 1, I420, 2, 2)
 TESTATOPLANAR(BayerBGGR, 1, I420, 2, 2)
@@ -302,6 +305,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##N##_OptVsC) { \
 TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, , +) \
 TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, Invert, -)
+TESTATOB(I400, 1, 1, I400, 1)
 TESTATOB(ARGB, 4, 4, ARGB, 4)
 TESTATOB(ARGB, 4, 4, BGRA, 4)
 TESTATOB(ARGB, 4, 4, ABGR, 4)
@@ -984,4 +988,18 @@ TEST_F(libyuvTest, TestAffine) {
 #endif
 }
 
+TEST_F(libyuvTest, Test565) {
+  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+  SIMD_ALIGNED(uint8 pixels565[256][2]);
+
+  for (int i = 0; i < 256; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      orig_pixels[i][j] = i;
+    }
+  }
+  ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
+  uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
+  EXPECT_EQ(610919429u, checksum);
+}
+
 }  // namespace libyuv
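Test565 fills 256 gray ARGB pixels (B = G = R = i), packs them to RGB565, and compares a golden DJB2 hash via the newly included compare.h. For reference, the 5-6-5 packing this exercises is presumably the common layout with blue in the low bits (hedged sketch; libyuv's C row implementation is the authoritative version):

```cpp
// Common RGB565 packing: top 5 bits of blue, top 6 of green, top 5 of red.
static uint16 PackRGB565(uint8 b, uint8 g, uint8 r) {
  return static_cast<uint16>((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}
```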