Commit 8f04ca5b authored by ashok.bhat@gmail.com

Row AArch64 Neon implementation - Part 5

BUG=319
TESTED=libyuv_unittest
R=fbarchard@chromium.org, fbarchard@google.com

Change-Id: Ia76096088ddd771388f01dd86110089db2faedfc
Signed-off-by: Ashok Bhat <ashok.bhat@arm.com>

Review URL: https://webrtc-codereview.appspot.com/21189004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1055 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent cb8be2fb
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1054 Version: 1055
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -301,8 +301,8 @@ extern "C" { ...@@ -301,8 +301,8 @@ extern "C" {
// #define HAS_ARGBTORGB565ROW_NEON // #define HAS_ARGBTORGB565ROW_NEON
// #define HAS_ARGBTOARGB1555ROW_NEON // #define HAS_ARGBTOARGB1555ROW_NEON
// #define HAS_ARGBTOARGB4444ROW_NEON // #define HAS_ARGBTOARGB4444ROW_NEON
// #define HAS_ARGBTOYROW_NEON #define HAS_ARGBTOYROW_NEON
// #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYJROW_NEON
// #define HAS_ARGBTOUV444ROW_NEON // #define HAS_ARGBTOUV444ROW_NEON
// #define HAS_ARGBTOUV422ROW_NEON // #define HAS_ARGBTOUV422ROW_NEON
// #define HAS_ARGBTOUV411ROW_NEON // #define HAS_ARGBTOUV411ROW_NEON
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1054 #define LIBYUV_VERSION 1055
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -711,13 +711,15 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, ...@@ -711,13 +711,15 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON; ARGBToUVRow = ARGBToUVRow_NEON;
} }
} }
}
#endif #endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
......
...@@ -60,6 +60,13 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, ...@@ -60,6 +60,13 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
} }
} }
} }
#elif defined(HAS_ARGBTOUV444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToUV444Row = ARGBToUV444Row_NEON;
}
}
#endif #endif
#if defined(HAS_ARGBTOYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
...@@ -76,10 +83,8 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, ...@@ -76,10 +83,8 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
#elif defined(HAS_ARGBTOYROW_NEON) #elif defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) { if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToYRow = ARGBToYRow_Any_NEON; ARGBToYRow = ARGBToYRow_Any_NEON;
ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
ARGBToUV444Row = ARGBToUV444Row_NEON;
} }
} }
#endif #endif
...@@ -134,6 +139,13 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, ...@@ -134,6 +139,13 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
} }
} }
} }
#elif defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
#endif #endif
#if defined(HAS_ARGBTOYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3)
...@@ -153,12 +165,6 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, ...@@ -153,12 +165,6 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
} }
#endif #endif
...@@ -228,13 +234,15 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, ...@@ -228,13 +234,15 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 32) { }
#endif
#if defined(HAS_ARGBTOUV411ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 32) {
ARGBToUV411Row = ARGBToUV411Row_Any_NEON; ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
ARGBToUV411Row = ARGBToUV411Row_NEON; ARGBToUV411Row = ARGBToUV411Row_NEON;
} }
} }
}
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
...@@ -296,13 +304,15 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, ...@@ -296,13 +304,15 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON; ARGBToUVRow = ARGBToUVRow_NEON;
} }
} }
}
#endif #endif
#if defined(HAS_MERGEUVROW_SSE2) #if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
...@@ -399,13 +409,15 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, ...@@ -399,13 +409,15 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON; ARGBToUVRow = ARGBToUVRow_NEON;
} }
} }
}
#endif #endif
#if defined(HAS_MERGEUVROW_SSE2) #if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
...@@ -493,6 +505,13 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, ...@@ -493,6 +505,13 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
} }
} }
} }
#elif defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
#endif #endif
#if defined(HAS_ARGBTOYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
...@@ -510,12 +529,6 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, ...@@ -510,12 +529,6 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
} }
#endif #endif
...@@ -594,6 +607,13 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, ...@@ -594,6 +607,13 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
} }
} }
} }
#elif defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
#endif #endif
#if defined(HAS_ARGBTOYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
...@@ -611,12 +631,6 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, ...@@ -611,12 +631,6 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
} }
#endif #endif
...@@ -1022,13 +1036,15 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, ...@@ -1022,13 +1036,15 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON; ARGBToYJRow = ARGBToYJRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_ARGBTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUVJRow = ARGBToUVJRow_Any_NEON; ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_NEON; ARGBToUVJRow = ARGBToUVJRow_NEON;
} }
} }
}
#endif #endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
......
...@@ -332,13 +332,15 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer, ...@@ -332,13 +332,15 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON; ARGBToUVRow = ARGBToUVRow_NEON;
} }
} }
}
#endif #endif
switch (src_fourcc_bayer) { switch (src_fourcc_bayer) {
......
...@@ -1580,28 +1580,28 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, ...@@ -1580,28 +1580,28 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
#ifdef HAS_ARGBTOYROW_NEON #ifdef HAS_ARGBTOYROW_NEON
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d24, #13 \n" // B * 0.1016 coefficient "movi v4.8b, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"vmlal.u8 q2, d1, d25 \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"vmlal.u8 q2, d2, d26 \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "q0", "q1", "q2", "q12", "q13" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
); );
} }
#endif // HAS_ARGBTOYROW_NEON #endif // HAS_ARGBTOYROW_NEON
...@@ -1609,26 +1609,26 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1609,26 +1609,26 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
#ifdef HAS_ARGBTOYJROW_NEON #ifdef HAS_ARGBTOYJROW_NEON
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "movi v4.8b, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "movi v5.8b, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient "movi v6.8b, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"vmlal.u8 q2, d1, d25 \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"vmlal.u8 q2, d2, d26 \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "q0", "q1", "q2", "q12", "q13" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
); );
} }
#endif // HAS_ARGBTOYJROW_NEON #endif // HAS_ARGBTOYJROW_NEON
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment