Commit 2df5743b authored by ashok.bhat@gmail.com

Row AArch64 Neon implementation - Part 6

BUG=319
TESTED=libyuv_unittest
R=fbarchard@google.com

Change-Id: I5d93eb184ba873d5e7637a3b5a830be39a967c6f
Signed-off-by: Ashok Bhat <ashok.bhat@arm.com>

Review URL: https://webrtc-codereview.appspot.com/15239004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1069 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 4d5c3f34
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1067
Version: 1069
License: BSD
License File: LICENSE
......
......@@ -319,19 +319,19 @@ extern "C" {
// #define HAS_RGB565TOYROW_NEON
// #define HAS_ARGB1555TOYROW_NEON
// #define HAS_ARGB4444TOYROW_NEON
// #define HAS_BGRATOYROW_NEON
// #define HAS_ABGRTOYROW_NEON
// #define HAS_RGBATOYROW_NEON
// #define HAS_RGB24TOYROW_NEON
// #define HAS_RAWTOYROW_NEON
// #define HAS_INTERPOLATEROW_NEON
// #define HAS_ARGBBLENDROW_NEON
// #define HAS_ARGBATTENUATEROW_NEON
// #define HAS_ARGBQUANTIZEROW_NEON
// #define HAS_ARGBSHADEROW_NEON
// #define HAS_ARGBGRAYROW_NEON
// #define HAS_ARGBSEPIAROW_NEON
// #define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_ABGRTOYROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_RGB24TOYROW_NEON
#define HAS_RAWTOYROW_NEON
#define HAS_INTERPOLATEROW_NEON
#define HAS_ARGBBLENDROW_NEON
#define HAS_ARGBATTENUATEROW_NEON
#define HAS_ARGBQUANTIZEROW_NEON
#define HAS_ARGBSHADEROW_NEON
#define HAS_ARGBGRAYROW_NEON
#define HAS_ARGBSEPIAROW_NEON
#define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_ARGBMULTIPLYROW_NEON
#define HAS_ARGBADDROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1067
#define LIBYUV_VERSION 1069
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -782,13 +782,15 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
if (IS_ALIGNED(width, 8)) {
BGRAToYRow = BGRAToYRow_NEON;
}
if (width >= 16) {
}
#endif
#if defined(HAS_BGRATOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
BGRAToUVRow = BGRAToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
BGRAToUVRow = BGRAToUVRow_NEON;
}
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
......@@ -851,13 +853,15 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
if (IS_ALIGNED(width, 8)) {
ABGRToYRow = ABGRToYRow_NEON;
}
if (width >= 16) {
}
#endif
#if defined(HAS_ABGRTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ABGRToUVRow = ABGRToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_NEON;
}
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
......@@ -920,13 +924,15 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
if (IS_ALIGNED(width, 8)) {
RGBAToYRow = RGBAToYRow_NEON;
}
if (width >= 16) {
}
#endif
#if defined(HAS_RGBATOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
RGBAToUVRow = RGBAToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGBAToUVRow = RGBAToUVRow_NEON;
}
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
......@@ -983,15 +989,16 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
if (IS_ALIGNED(width, 8)) {
RGB24ToYRow = RGB24ToYRow_NEON;
}
if (width >= 16) {
}
#endif
#if defined(HAS_RGB24TOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGB24ToUVRow = RGB24ToUVRow_NEON;
}
}
}
#else // HAS_RGB24TOYROW_NEON
#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
......@@ -1019,7 +1026,6 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RGB24TOYROW_NEON
{
#if !defined(HAS_RGB24TOYROW_NEON)
......@@ -1100,15 +1106,16 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
if (IS_ALIGNED(width, 8)) {
RAWToYRow = RAWToYRow_NEON;
}
if (width >= 16) {
}
#endif
#if defined(HAS_RAWTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
RAWToUVRow = RAWToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RAWToUVRow = RAWToUVRow_NEON;
}
}
}
#else // HAS_RAWTOYROW_NEON
#endif
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
......@@ -1136,7 +1143,6 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RAWTOYROW_NEON
{
// Allocate 2 rows of ARGB.
......
......@@ -245,14 +245,32 @@ YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
#endif
#ifdef HAS_ARGBTOYROW_NEON
YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_ARGBTOYJROW_NEON
YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_BGRATOYROW_NEON
YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_ABGRTOYROW_NEON
YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_RGBATOYROW_NEON
YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_RGB24TOYROW_NEON
YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
#endif
#ifdef HAS_RAWTOYROW_NEON
YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
#endif
#ifdef HAS_RGB565TOYROW_NEON
YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
#endif
#ifdef HAS_ARGB1555TOYROW_NEON
YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
#endif
#ifdef HAS_ARGB4444TOYROW_NEON
YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
#endif
#ifdef HAS_YUY2TOYROW_NEON
......@@ -342,14 +360,32 @@ UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGBTOUVROW_NEON
UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
#endif
#ifdef HAS_ARGBTOUVJROW_NEON
UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_NEON
UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_NEON
UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_NEON
UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
#endif
#ifdef HAS_RGB24TOUVROW_NEON
UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_NEON
UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
#endif
#ifdef HAS_RGB565TOUVROW_NEON
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGB1555TOUVROW_NEON
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGB4444TOUVROW_NEON
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
#endif
#ifdef HAS_YUY2TOUVROW_NEON
......
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
* Copyright 2014 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
......@@ -2466,28 +2466,28 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
#ifdef HAS_BGRATOYROW_NEON
// Computes one 8-bit Y (luma) value per pixel for a row of 4-byte BGRA pixels.
// Per pixel: Y = saturate8(round((33*R + 65*G + 13*B) >> 7) + 16), where R, G, B
// are bytes 1, 2, 3 of each pixel (byte 0 is loaded but not used).
// Handles 8 pixels per iteration; the loop body runs before `pix` is tested, so
// this assumes pix is a positive multiple of 8 — TODO confirm against callers.
// NOTE(review): ARMv7 lines (vmov/vld4/vmull/...) and AArch64 lines
// (movi/ld4/umull/...) appear side by side — presumably the removed/added sides
// of a diff view.
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"movi v4.8b, #33 \n" // R * 0.2578 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // R
"vmlal.u8 q8, d2, d5 \n" // G
"vmlal.u8 q8, d3, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
"umull v16.8h, v1.8b, v4.8b \n" // R
"umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // B
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n" // Y += 16, saturating
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_BGRATOYROW_NEON
......@@ -2495,28 +2495,28 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
#ifdef HAS_ABGRTOYROW_NEON
// Computes one 8-bit Y (luma) value per pixel for a row of 4-byte ABGR pixels.
// Per pixel: Y = saturate8(round((33*R + 65*G + 13*B) >> 7) + 16), where R, G, B
// are bytes 0, 1, 2 of each pixel (byte 3 is loaded but not used).
// Handles 8 pixels per iteration; the loop body runs before `pix` is tested, so
// this assumes pix is a positive multiple of 8 — TODO confirm against callers.
// NOTE(review): ARMv7 lines (vmov/vld4/vmull/...) and AArch64 lines
// (movi/ld4/umull/...) appear side by side — presumably the removed/added sides
// of a diff view.
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"movi v4.8b, #33 \n" // R * 0.2578 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ABGR.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // R
"vmlal.u8 q8, d1, d5 \n" // G
"vmlal.u8 q8, d2, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
"umull v16.8h, v0.8b, v4.8b \n" // R
"umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // B
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n" // Y += 16, saturating
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_ABGRTOYROW_NEON
......@@ -2524,28 +2524,28 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
#ifdef HAS_RGBATOYROW_NEON
// Computes one 8-bit Y (luma) value per pixel for a row of 4-byte RGBA pixels.
// Per pixel: Y = saturate8(round((13*B + 65*G + 33*R) >> 7) + 16), where B, G, R
// are bytes 1, 2, 3 of each pixel (byte 0 is loaded but not used).
// Handles 8 pixels per iteration; the loop body runs before `pix` is tested, so
// this assumes pix is a positive multiple of 8 — TODO confirm against callers.
// NOTE(review): ARMv7 lines (vmov/vld4/vmull/...) and AArch64 lines
// (movi/ld4/umull/...) appear side by side — presumably the removed/added sides
// of a diff view.
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of RGBA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // B
"vmlal.u8 q8, d2, d5 \n" // G
"vmlal.u8 q8, d3, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
"umull v16.8h, v1.8b, v4.8b \n" // B
"umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // R
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n" // Y += 16, saturating
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_RGBATOYROW_NEON
......@@ -2553,28 +2553,28 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
#ifdef HAS_RGB24TOYROW_NEON
// Computes one 8-bit Y (luma) value per pixel for a row of 3-byte RGB24 pixels.
// Per pixel: Y = saturate8(round((13*B + 65*G + 33*R) >> 7) + 16), where B, G, R
// are bytes 0, 1, 2 of each pixel.
// Handles 8 pixels (24 source bytes) per iteration; the loop body runs before
// `pix` is tested, so this assumes pix is a positive multiple of 8 — TODO
// confirm against callers.
// NOTE(review): ARMv7 lines (vmov/vld3/vmull/...) and AArch64 lines
// (movi/ld3/umull/...) appear side by side — presumably the removed/added sides
// of a diff view.
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
"ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B
"vmlal.u8 q8, d1, d5 \n" // G
"vmlal.u8 q8, d2, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
"umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n" // Y += 16, saturating
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_RGB24TOYROW_NEON
......@@ -2582,28 +2582,28 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
#ifdef HAS_RAWTOYROW_NEON
// Computes one 8-bit Y (luma) value per pixel for a row of 3-byte RAW pixels.
// Per pixel: Y = saturate8(round((33*R + 65*G + 13*B) >> 7) + 16), where R, G, B
// are bytes 0, 1, 2 of each pixel (the first byte is multiplied by the
// coefficient labelled R, the third by the one labelled B).
// Handles 8 pixels (24 source bytes) per iteration; the loop body runs before
// `pix` is tested, so this assumes pix is a positive multiple of 8 — TODO
// confirm against callers.
// NOTE(review): ARMv7 lines (vmov/vld3/vmull/...) and AArch64 lines
// (movi/ld3/umull/...) appear side by side — presumably the removed/added sides
// of a diff view.
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"movi v4.8b, #33 \n" // R * 0.2578 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
"ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // R
"vmlal.u8 q8, d1, d5 \n" // G
"vmlal.u8 q8, d2, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
"umull v16.8h, v0.8b, v4.8b \n" // R
"umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // B
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n" // Y += 16, saturating
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_RAWTOYROW_NEON
......@@ -2613,10 +2613,12 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
// Blends two adjacent source rows into dst_ptr, 16 bytes per loop iteration.
//   dst_ptr            destination row.
//   src_ptr            first source row; the second row is src_ptr + src_stride.
//   dst_width          number of bytes to produce; every loop body runs before
//                      the count is tested, so this assumes a positive multiple
//                      of 16 — TODO confirm against callers.
//   source_y_fraction  weight of the second row out of 256.
// General case: dst = round((row0 * (256 - f) + row1 * f) >> 8).
// Fast paths: f == 0 copies row0; f == 64 / 128 / 192 use one or two rounding
// halving adds instead of multiplies.
// NOTE(review): ARMv7 lines (vld1/vmull/vrhadd/...) and AArch64 lines
// (ld1/umull/urhadd/...) appear side by side — presumably the removed/added
// sides of a diff view.
// NOTE(review): %3 and %4 are `int` operands referenced without the `w`
// modifier, so cmp/subs act on the full 64-bit register — confirm the upper
// bits are well-defined or switch to %w3 / %w4.
void InterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
asm volatile (
"cmp %4, #0 \n"
"beq 100f \n"
"add %2, %1 \n"
"cmp %4, #64 \n"
"beq 75f \n"
"cmp %4, #128 \n"
......@@ -2624,85 +2626,85 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"cmp %4, #192 \n"
"beq 25f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
"dup v5.16b, %w4 \n" // y1_fraction: weight of second row
"dup v4.16b, %w5 \n" // y0_fraction: weight of first row
// General purpose row blend.
"1: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"umull v2.8h, v0.8b, v4.8b \n"
"umull2 v3.8h, v0.16b, v4.16b \n"
"umlal v2.8h, v1.8b, v5.8b \n"
"umlal2 v3.8h, v1.16b, v5.16b \n"
"rshrn v0.8b, v2.8h, #8 \n"
"rshrn2 v0.16b, v3.8h, #8 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"st1 {v0.16b}, [%0], #16 \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"st1 {v0.16b}, [%0], #16 \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"st1 {v0.16b}, [%0], #16 \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
"ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n"
"ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"st1 {v0.16b}, [%0], #16 \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
"ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"st1 {v0.16b}, [%0], #16 \n"
"bgt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(src_ptr1), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
"+r"(y1_fraction), // %4
"+r"(y0_fraction) // %5
:
: "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
// v2 is written by umull/umlal in the general blend, so it must be listed too.
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"
);
}
#endif // HAS_INTERPOLATEROW_NEON
......@@ -2712,54 +2714,58 @@ void InterpolateRow_NEON(uint8* dst_ptr,
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
"subs %3, #8 \n"
"subs %3, %3, #8 \n"
"blt 89f \n"
// Blend 8 pixels.
"8: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB0.
MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
"ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 pixels of ARGB1.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d4, d3 \n" // db * a
"vmull.u8 q11, d5, d3 \n" // dg * a
"vmull.u8 q12, d6, d3 \n" // dr * a
"vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
"vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
"vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
"vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
"vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
"vqadd.u8 q0, q0, q2 \n" // + sbg
"vqadd.u8 d2, d2, d6 \n" // + sr
"vmov.u8 d3, #255 \n" // a = 255
"umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
"umull v18.8h, v6.8b, v3.8b \n" // dr * a
"uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
"uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
"uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
"uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
"uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
"uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
"uqadd v0.8b, v0.8b, v4.8b \n" // + sb
"uqadd v1.8b, v1.8b, v5.8b \n" // + sg
"uqadd v2.8b, v2.8b, v6.8b \n" // + sr
"movi v3.8b, #255 \n" // a = 255
MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB.
"bge 8b \n"
"89: \n"
"adds %3, #8-1 \n"
"adds %3, %3, #8-1 \n"
"blt 99f \n"
// Blend 1 pixels.
"1: \n"
MEMACCESS(0)
"vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
"ld4 {v0.b-v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
MEMACCESS(1)
"vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
"ld4 {v4.b-v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop.
"vmull.u8 q10, d4, d3 \n" // db * a
"vmull.u8 q11, d5, d3 \n" // dg * a
"vmull.u8 q12, d6, d3 \n" // dr * a
"vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
"vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
"vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
"vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
"vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
"vqadd.u8 q0, q0, q2 \n" // + sbg
"vqadd.u8 d2, d2, d6 \n" // + sr
"vmov.u8 d3, #255 \n" // a = 255
"umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
"umull v18.8h, v6.8b, v3.8b \n" // dr * a
"uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
"uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
"uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
"uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
"uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
"uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
"uqadd v0.8b, v0.8b, v4.8b \n" // + sb
"uqadd v1.8b, v1.8b, v5.8b \n" // + sg
"uqadd v2.8b, v2.8b, v6.8b \n" // + sr
"movi v3.8b, #255 \n" // a = 255
MEMACCESS(2)
"vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
"st4 {v0.b-v3.b}[0], [%2], #4 \n" // store 1 pixel.
"bge 1b \n"
"99: \n"
......@@ -2769,7 +2775,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"+r"(dst_argb), // %2
"+r"(width) // %3
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18"
);
}
#endif // HAS_ARGBBLENDROW_NEON
......@@ -2781,22 +2788,22 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Attenuate 8 pixels.
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d0, d3 \n" // b * a
"vmull.u8 q11, d1, d3 \n" // g * a
"vmull.u8 q12, d2, d3 \n" // r * a
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
"umull v4.8h, v0.8b, v3.8b \n" // b * a
"umull v5.8h, v1.8b, v3.8b \n" // g * a
"umull v6.8h, v2.8b, v3.8b \n" // r * a
"uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
"uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q10", "q11", "q12"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
#endif // HAS_ARGBATTENUATEROW_NEON
......@@ -2807,41 +2814,41 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Quantizes the first three bytes of every 4-byte pixel in dst_argb, in place.
// For each of those channels v (widened to 16 bits):
//   v = saturate8(sqdmulh(v, scale >> 1) * interval_size + interval_offset)
// sqdmulh keeps the high half of the doubled product, so the first step is
// roughly (v * scale) >> 16. The fourth byte of each pixel is loaded and stored
// back without modification.
// Handles 8 pixels per iteration; the loop body runs before `width` is tested,
// so this assumes width is a positive multiple of 8 — TODO confirm.
// NOTE(review): ARMv7 lines (vdup/vld4/vqdmulh/...) and AArch64 lines
// (dup/ld4/sqdmulh/...) appear side by side — presumably the removed/added
// sides of a diff view.
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
asm volatile (
"vdup.u16 q8, %2 \n"
"vshr.u16 q8, q8, #1 \n" // scale >>= 1
"vdup.u16 q9, %3 \n" // interval multiply.
"vdup.u16 q10, %4 \n" // interval add
"dup v4.8h, %w2 \n"
"ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
"dup v5.8h, %w3 \n" // interval multiply.
"dup v6.8h, %w4 \n" // interval add
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
"ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop.
"vmovl.u8 q0, d0 \n" // b (0 .. 255)
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q2, d4 \n"
"vqdmulh.s16 q0, q0, q8 \n" // b * scale
"vqdmulh.s16 q1, q1, q8 \n" // g
"vqdmulh.s16 q2, q2, q8 \n" // r
"vmul.u16 q0, q0, q9 \n" // b * interval_size
"vmul.u16 q1, q1, q9 \n" // g
"vmul.u16 q2, q2, q9 \n" // r
"vadd.u16 q0, q0, q10 \n" // b + interval_offset
"vadd.u16 q1, q1, q10 \n" // g
"vadd.u16 q2, q2, q10 \n" // r
"vqmovn.u16 d0, q0 \n"
"vqmovn.u16 d2, q1 \n"
"vqmovn.u16 d4, q2 \n"
MEMACCESS(0)
"vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"uxtl v1.8h, v1.8b \n"
"uxtl v2.8h, v2.8b \n"
"sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
"sqdmulh v1.8h, v1.8h, v4.8h \n" // g
"sqdmulh v2.8h, v2.8h, v4.8h \n" // r
"mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
"mul v1.8h, v1.8h, v5.8h \n" // g
"mul v2.8h, v2.8h, v5.8h \n" // r
"add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
"add v1.8h, v1.8h, v6.8h \n" // g
"add v2.8h, v2.8h, v6.8h \n" // r
"uqxtn v0.8b, v0.8h \n" // saturating narrow back to 8 bits
"uqxtn v1.8b, v1.8h \n"
"uqxtn v2.8b, v2.8h \n"
MEMACCESS(0)
"st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
"r"(interval_size), // %3
"r"(interval_offset) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
#endif // HAS_ARGBQUANTIZEROW_NEON
......@@ -2853,36 +2860,36 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
// Scales each of the four channels of every pixel in src_argb by the matching
// byte of `value` and writes the result to dst_argb.
// Setup: `value` is broadcast, each of its bytes is doubled into a 16-bit lane
// (zip with itself), then halved; the loop multiplies each widened channel by
// its lane with a rounding doubling high-half multiply and narrows back to
// 8 bits with saturation.
// Handles 8 pixels per iteration; the loop body runs before `width` is tested,
// so this assumes width is a positive multiple of 8 — TODO confirm.
// NOTE(review): ARMv7 lines (vdup/vzip/vld4/...) and AArch64 lines
// (dup/zip1/ld4/...) appear side by side — presumably the removed/added sides
// of a diff view.
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
asm volatile (
"vdup.u32 q0, %3 \n" // duplicate scale value.
"vzip.u8 d0, d1 \n" // d0 aarrggbb.
"vshr.u16 q0, q0, #1 \n" // scale / 2.
"dup v0.4s, %w3 \n" // duplicate scale value.
"zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
"ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
"ld4 {v4.8b-v7.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q10, d20 \n" // b (0 .. 255)
"vmovl.u8 q11, d22 \n"
"vmovl.u8 q12, d24 \n"
"vmovl.u8 q13, d26 \n"
"vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
"vqrdmulh.s16 q11, q11, d0[1] \n" // g
"vqrdmulh.s16 q12, q12, d0[2] \n" // r
"vqrdmulh.s16 q13, q13, d0[3] \n" // a
"vqmovn.u16 d20, q10 \n"
"vqmovn.u16 d22, q11 \n"
"vqmovn.u16 d24, q12 \n"
"vqmovn.u16 d26, q13 \n"
MEMACCESS(1)
"vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"uxtl v5.8h, v5.8b \n"
"uxtl v6.8h, v6.8b \n"
"uxtl v7.8h, v7.8b \n"
"sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
"sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
"sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
"sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
"uqxtn v4.8b, v4.8h \n" // saturating narrow back to 8 bits
"uqxtn v5.8b, v5.8h \n"
"uqxtn v6.8b, v6.8h \n"
"uqxtn v7.8b, v7.8h \n"
MEMACCESS(1)
"st4 {v4.8b-v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(value) // %3
: "cc", "memory", "q0", "q10", "q11", "q12", "q13"
: "cc", "memory", "v0", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBSHADEROW_NEON
......@@ -2893,28 +2900,28 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
#ifdef HAS_ARGBGRAYROW_NEON
// Converts a row of 4-byte ARGB pixels to gray and writes it to dst_argb.
// Per pixel: gray = saturate8(round((15*B + 75*G + 38*R) >> 7)), where B, G, R
// are bytes 0, 1, 2; gray is written to all three of those bytes and byte 3 is
// stored back as loaded.
// Handles 8 pixels per iteration; the loop body runs before `width` is tested,
// so this assumes width is a positive multiple of 8 — TODO confirm.
// NOTE(review): ARMv7 lines (vmov/vld4/vmull/...) and AArch64 lines
// (movi/ld4/umull/...) appear side by side — presumably the removed/added sides
// of a diff view.
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
"movi v24.8b, #15 \n" // B * 0.11400 coefficient
"movi v25.8b, #75 \n" // G * 0.58700 coefficient
"movi v26.8b, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
"vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
"vmov d1, d0 \n" // G
"vmov d2, d0 \n" // R
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G
"umlal v4.8h, v2.8b, v26.8b \n" // R
"sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
"mov v1.8b, v0.8b \n" // G
"mov v2.8b, v0.8b \n" // R
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
"st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q12", "q13"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
);
}
#endif // HAS_ARGBGRAYROW_NEON
......@@ -2927,40 +2934,40 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBSEPIAROW_NEON
// Applies a sepia tone to a row of 4-byte ARGB pixels, in place in dst_argb.
// With B, G, R = bytes 0, 1, 2 of each pixel:
//   B' = saturate8((17*B + 68*G + 35*R) >> 7)
//   G' = saturate8((22*B + 88*G + 45*R) >> 7)
//   R' = saturate8((24*B + 98*G + 50*R) >> 7)
// Byte 3 is stored back as loaded. The load has no post-increment; the pointer
// advances on the store.
// Handles 8 pixels per iteration; the loop body runs before `width` is tested,
// so this assumes width is a positive multiple of 8 — TODO confirm.
// NOTE(review): ARMv7 lines (vmov/vld4/vmull/...) and AArch64 lines
// (movi/ld4/umull/...) appear side by side — presumably the removed/added sides
// of a diff view.
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
asm volatile (
"vmov.u8 d20, #17 \n" // BB coefficient
"vmov.u8 d21, #68 \n" // BG coefficient
"vmov.u8 d22, #35 \n" // BR coefficient
"vmov.u8 d24, #22 \n" // GB coefficient
"vmov.u8 d25, #88 \n" // GG coefficient
"vmov.u8 d26, #45 \n" // GR coefficient
"vmov.u8 d28, #24 \n" // RB coefficient
"vmov.u8 d29, #98 \n" // RG coefficient
"vmov.u8 d30, #50 \n" // RR coefficient
"movi v20.8b, #17 \n" // BB coefficient
"movi v21.8b, #68 \n" // BG coefficient
"movi v22.8b, #35 \n" // BR coefficient
"movi v24.8b, #22 \n" // GB coefficient
"movi v25.8b, #88 \n" // GG coefficient
"movi v26.8b, #45 \n" // GR coefficient
"movi v28.8b, #24 \n" // RB coefficient
"movi v29.8b, #98 \n" // RG coefficient
"movi v30.8b, #50 \n" // RR coefficient
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
"ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d20 \n" // B to Sepia B
"vmlal.u8 q2, d1, d21 \n" // G
"vmlal.u8 q2, d2, d22 \n" // R
"vmull.u8 q3, d0, d24 \n" // B to Sepia G
"vmlal.u8 q3, d1, d25 \n" // G
"vmlal.u8 q3, d2, d26 \n" // R
"vmull.u8 q8, d0, d28 \n" // B to Sepia R
"vmlal.u8 q8, d1, d29 \n" // G
"vmlal.u8 q8, d2, d30 \n" // R
"vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
"vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
"vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
MEMACCESS(0)
"vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
"umlal v4.8h, v1.8b, v21.8b \n" // G
"umlal v4.8h, v2.8b, v22.8b \n" // R
"umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
"umlal v5.8h, v1.8b, v25.8b \n" // G
"umlal v5.8h, v2.8b, v26.8b \n" // R
"umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
"umlal v6.8h, v1.8b, v29.8b \n" // G
"umlal v6.8h, v2.8b, v30.8b \n" // R
"uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
"uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
"uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
MEMACCESS(0)
"st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
: "cc", "memory", "q0", "q1", "q2", "q3",
"q10", "q11", "q12", "q13", "q14", "q15"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
);
}
#endif // HAS_ARGBSEPIAROW_NEON
......@@ -2973,60 +2980,60 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) {
asm volatile (
MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
"ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
"sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
"sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
"ld4 {v16.8b-v19.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
"vmovl.u8 q9, d18 \n" // g
"vmovl.u8 q10, d20 \n" // r
"vmovl.u8 q15, d22 \n" // a
"vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
"vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
"vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
"vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
"vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
"vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
"vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
"vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
"vqadd.s16 q12, q12, q4 \n" // Accumulate B
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
"vqadd.s16 q15, q15, q7 \n" // Accumulate A
"vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
"vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
"vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
"vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
"vqadd.s16 q12, q12, q4 \n" // Accumulate B
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
"vqadd.s16 q15, q15, q7 \n" // Accumulate A
"vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B
"vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G
"vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R
"vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A
"vqadd.s16 q12, q12, q4 \n" // Accumulate B
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
"vqadd.s16 q15, q15, q7 \n" // Accumulate A
"vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
"vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
"vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
"vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
MEMACCESS(1)
"vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"uxtl v17.8h, v17.8b \n" // g
"uxtl v18.8h, v18.8b \n" // r
"uxtl v19.8h, v19.8b \n" // a
"mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
"mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
"mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
"mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
"mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
"mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
"mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
"mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
"sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
"sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
"sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
"sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
"mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
"mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
"mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
"mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
"sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
"sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
"sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
"sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
"mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
"mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
"mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
"mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
"sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
"sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
"sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
"sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
"sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
"sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
MEMACCESS(1)
"st4 {v16.8b-v19.8b}, [%1], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(matrix_argb) // %3
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
"v18", "v19", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ARGBCOLORMATRIXROW_NEON
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment