Commit 2df5743b authored by ashok.bhat@gmail.com

Row AArch64 Neon implementation - Part 6

BUG=319
TESTED=libyuv_unittest
R=fbarchard@google.com

Change-Id: I5d93eb184ba873d5e7637a3b5a830be39a967c6f
Signed-off-by: Ashok Bhat <ashok.bhat@arm.com>

Review URL: https://webrtc-codereview.appspot.com/15239004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1069 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 4d5c3f34
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1067 Version: 1069
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -319,19 +319,19 @@ extern "C" { ...@@ -319,19 +319,19 @@ extern "C" {
// #define HAS_RGB565TOYROW_NEON // #define HAS_RGB565TOYROW_NEON
// #define HAS_ARGB1555TOYROW_NEON // #define HAS_ARGB1555TOYROW_NEON
// #define HAS_ARGB4444TOYROW_NEON // #define HAS_ARGB4444TOYROW_NEON
// #define HAS_BGRATOYROW_NEON #define HAS_BGRATOYROW_NEON
// #define HAS_ABGRTOYROW_NEON #define HAS_ABGRTOYROW_NEON
// #define HAS_RGBATOYROW_NEON #define HAS_RGBATOYROW_NEON
// #define HAS_RGB24TOYROW_NEON #define HAS_RGB24TOYROW_NEON
// #define HAS_RAWTOYROW_NEON #define HAS_RAWTOYROW_NEON
// #define HAS_INTERPOLATEROW_NEON #define HAS_INTERPOLATEROW_NEON
// #define HAS_ARGBBLENDROW_NEON #define HAS_ARGBBLENDROW_NEON
// #define HAS_ARGBATTENUATEROW_NEON #define HAS_ARGBATTENUATEROW_NEON
// #define HAS_ARGBQUANTIZEROW_NEON #define HAS_ARGBQUANTIZEROW_NEON
// #define HAS_ARGBSHADEROW_NEON #define HAS_ARGBSHADEROW_NEON
// #define HAS_ARGBGRAYROW_NEON #define HAS_ARGBGRAYROW_NEON
// #define HAS_ARGBSEPIAROW_NEON #define HAS_ARGBSEPIAROW_NEON
// #define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_ARGBMULTIPLYROW_NEON #define HAS_ARGBMULTIPLYROW_NEON
#define HAS_ARGBADDROW_NEON #define HAS_ARGBADDROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON #define HAS_ARGBSUBTRACTROW_NEON
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1067 #define LIBYUV_VERSION 1069
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -782,13 +782,15 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, ...@@ -782,13 +782,15 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
BGRAToYRow = BGRAToYRow_NEON; BGRAToYRow = BGRAToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_BGRATOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
BGRAToUVRow = BGRAToUVRow_Any_NEON; BGRAToUVRow = BGRAToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
BGRAToUVRow = BGRAToUVRow_NEON; BGRAToUVRow = BGRAToUVRow_NEON;
} }
} }
}
#endif #endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
...@@ -851,13 +853,15 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, ...@@ -851,13 +853,15 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ABGRToYRow = ABGRToYRow_NEON; ABGRToYRow = ABGRToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_ABGRTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ABGRToUVRow = ABGRToUVRow_Any_NEON; ABGRToUVRow = ABGRToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_NEON; ABGRToUVRow = ABGRToUVRow_NEON;
} }
} }
}
#endif #endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
...@@ -920,13 +924,15 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, ...@@ -920,13 +924,15 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
RGBAToYRow = RGBAToYRow_NEON; RGBAToYRow = RGBAToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_RGBATOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
RGBAToUVRow = RGBAToUVRow_Any_NEON; RGBAToUVRow = RGBAToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
RGBAToUVRow = RGBAToUVRow_NEON; RGBAToUVRow = RGBAToUVRow_NEON;
} }
} }
}
#endif #endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
...@@ -983,15 +989,16 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, ...@@ -983,15 +989,16 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
RGB24ToYRow = RGB24ToYRow_NEON; RGB24ToYRow = RGB24ToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_RGB24TOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON; RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
RGB24ToUVRow = RGB24ToUVRow_NEON; RGB24ToUVRow = RGB24ToUVRow_NEON;
} }
} }
} #endif
#else // HAS_RGB24TOYROW_NEON
#if defined(HAS_RGB24TOARGBROW_SSSE3) #if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
...@@ -1019,7 +1026,6 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, ...@@ -1019,7 +1026,6 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
} }
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RGB24TOYROW_NEON
{ {
#if !defined(HAS_RGB24TOYROW_NEON) #if !defined(HAS_RGB24TOYROW_NEON)
...@@ -1100,15 +1106,16 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, ...@@ -1100,15 +1106,16 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
RAWToYRow = RAWToYRow_NEON; RAWToYRow = RAWToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_RAWTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
RAWToUVRow = RAWToUVRow_Any_NEON; RAWToUVRow = RAWToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
RAWToUVRow = RAWToUVRow_NEON; RAWToUVRow = RAWToUVRow_NEON;
} }
} }
} #endif
#else // HAS_RAWTOYROW_NEON
#if defined(HAS_RAWTOARGBROW_SSSE3) #if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3; RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
...@@ -1136,7 +1143,6 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, ...@@ -1136,7 +1143,6 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
} }
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RAWTOYROW_NEON
{ {
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
......
...@@ -245,14 +245,32 @@ YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16) ...@@ -245,14 +245,32 @@ YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
#endif #endif
#ifdef HAS_ARGBTOYROW_NEON #ifdef HAS_ARGBTOYROW_NEON
YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8) YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_ARGBTOYJROW_NEON
YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8) YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_BGRATOYROW_NEON
YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8) YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_ABGRTOYROW_NEON
YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8) YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_RGBATOYROW_NEON
YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8) YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_RGB24TOYROW_NEON
YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8) YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
#endif
#ifdef HAS_RAWTOYROW_NEON
YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8) YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
#endif
#ifdef HAS_RGB565TOYROW_NEON
YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8) YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
#endif
#ifdef HAS_ARGB1555TOYROW_NEON
YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8) YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
#endif
#ifdef HAS_ARGB4444TOYROW_NEON
YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8) YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
#endif #endif
#ifdef HAS_YUY2TOYROW_NEON #ifdef HAS_YUY2TOYROW_NEON
...@@ -342,14 +360,32 @@ UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15) ...@@ -342,14 +360,32 @@ UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
#endif #endif
#ifdef HAS_ARGBTOUVROW_NEON #ifdef HAS_ARGBTOUVROW_NEON
UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15) UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
#endif
#ifdef HAS_ARGBTOUVJROW_NEON
UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15) UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_NEON
UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15) UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_NEON
UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15) UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_NEON
UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15) UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
#endif
#ifdef HAS_RGB24TOUVROW_NEON
UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15) UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_NEON
UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15) UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
#endif
#ifdef HAS_RGB565TOUVROW_NEON
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15) UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGB1555TOUVROW_NEON
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15) UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGB4444TOUVROW_NEON
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15) UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
#endif #endif
#ifdef HAS_YUY2TOUVROW_NEON #ifdef HAS_YUY2TOUVROW_NEON
......
/* /*
* Copyright 2011 The LibYuv Project Authors. All rights reserved. * Copyright 2014 The LibYuv Project Authors. All rights reserved.
* *
* Use of this source code is governed by a BSD-style license * Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source * that can be found in the LICENSE file in the root of the source
...@@ -2466,28 +2466,28 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { ...@@ -2466,28 +2466,28 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
#ifdef HAS_BGRATOYROW_NEON #ifdef HAS_BGRATOYROW_NEON
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient "movi v4.8b, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "movi v6.8b, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // R "umull v16.8h, v1.8b, v4.8b \n" // R
"vmlal.u8 q8, d2, d5 \n" // G "umlal v16.8h, v2.8b, v5.8b \n" // G
"vmlal.u8 q8, d3, d6 \n" // B "umlal v16.8h, v3.8b, v6.8b \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_bgra), // %0 : "+r"(src_bgra), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
); );
} }
#endif // HAS_BGRATOYROW_NEON #endif // HAS_BGRATOYROW_NEON
...@@ -2495,28 +2495,28 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { ...@@ -2495,28 +2495,28 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
#ifdef HAS_ABGRTOYROW_NEON #ifdef HAS_ABGRTOYROW_NEON
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient "movi v4.8b, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "movi v6.8b, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ABGR.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // R "umull v16.8h, v0.8b, v4.8b \n" // R
"vmlal.u8 q8, d1, d5 \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"vmlal.u8 q8, d2, d6 \n" // B "umlal v16.8h, v2.8b, v6.8b \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_abgr), // %0 : "+r"(src_abgr), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
); );
} }
#endif // HAS_ABGRTOYROW_NEON #endif // HAS_ABGRTOYROW_NEON
...@@ -2524,28 +2524,28 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { ...@@ -2524,28 +2524,28 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
#ifdef HAS_RGBATOYROW_NEON #ifdef HAS_RGBATOYROW_NEON
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d4, #13 \n" // B * 0.1016 coefficient "movi v4.8b, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of RGBA.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // B "umull v16.8h, v1.8b, v4.8b \n" // B
"vmlal.u8 q8, d2, d5 \n" // G "umlal v16.8h, v2.8b, v5.8b \n" // G
"vmlal.u8 q8, d3, d6 \n" // R "umlal v16.8h, v3.8b, v6.8b \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_rgba), // %0 : "+r"(src_rgba), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
); );
} }
#endif // HAS_RGBATOYROW_NEON #endif // HAS_RGBATOYROW_NEON
...@@ -2553,28 +2553,28 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { ...@@ -2553,28 +2553,28 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
#ifdef HAS_RGB24TOYROW_NEON #ifdef HAS_RGB24TOYROW_NEON
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d4, #13 \n" // B * 0.1016 coefficient "movi v4.8b, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
"vmlal.u8 q8, d1, d5 \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"vmlal.u8 q8, d2, d6 \n" // R "umlal v16.8h, v2.8b, v6.8b \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
); );
} }
#endif // HAS_RGB24TOYROW_NEON #endif // HAS_RGB24TOYROW_NEON
...@@ -2582,28 +2582,28 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { ...@@ -2582,28 +2582,28 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
#ifdef HAS_RAWTOYROW_NEON #ifdef HAS_RAWTOYROW_NEON
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient "movi v4.8b, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "movi v6.8b, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
"vmlal.u8 q8, d1, d5 \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"vmlal.u8 q8, d2, d6 \n" // R "umlal v16.8h, v2.8b, v6.8b \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
); );
} }
#endif // HAS_RAWTOYROW_NEON #endif // HAS_RAWTOYROW_NEON
...@@ -2613,10 +2613,12 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { ...@@ -2613,10 +2613,12 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
void InterpolateRow_NEON(uint8* dst_ptr, void InterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride, const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) { int dst_width, int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
asm volatile ( asm volatile (
"cmp %4, #0 \n" "cmp %4, #0 \n"
"beq 100f \n" "beq 100f \n"
"add %2, %1 \n"
"cmp %4, #64 \n" "cmp %4, #64 \n"
"beq 75f \n" "beq 75f \n"
"cmp %4, #128 \n" "cmp %4, #128 \n"
...@@ -2624,85 +2626,85 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2624,85 +2626,85 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"cmp %4, #192 \n" "cmp %4, #192 \n"
"beq 25f \n" "beq 25f \n"
"vdup.8 d5, %4 \n" "dup v5.16b, %w4 \n"
"rsb %4, #256 \n" "dup v4.16b, %w5 \n"
"vdup.8 d4, %4 \n"
// General purpose row blend. // General purpose row blend.
"1: \n" "1: \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n" "umull v2.8h, v0.8b, v4.8b \n"
"vmull.u8 q14, d1, d4 \n" "umull2 v3.8h, v0.16b, v4.16b \n"
"vmlal.u8 q13, d2, d5 \n" "umlal v2.8h, v1.8b, v5.8b \n"
"vmlal.u8 q14, d3, d5 \n" "umlal2 v3.8h, v1.16b, v5.16b \n"
"vrshrn.u16 d0, q13, #8 \n" "rshrn v0.8b, v2.8h, #8 \n"
"vrshrn.u16 d1, q14, #8 \n" "rshrn2 v0.16b, v3.8h, #8 \n"
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 1b \n" "bgt 1b \n"
"b 99f \n" "b 99f \n"
// Blend 25 / 75. // Blend 25 / 75.
"25: \n" "25: \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"vrhadd.u8 q0, q1 \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 25b \n" "bgt 25b \n"
"b 99f \n" "b 99f \n"
// Blend 50 / 50. // Blend 50 / 50.
"50: \n" "50: \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 50b \n" "bgt 50b \n"
"b 99f \n" "b 99f \n"
// Blend 75 / 25. // Blend 75 / 25.
"75: \n" "75: \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n" "ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"vrhadd.u8 q0, q1 \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 75b \n" "bgt 75b \n"
"b 99f \n" "b 99f \n"
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
"100: \n" "100: \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" "ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" "st1 {v0.16b}, [%0], #16 \n"
"bgt 100b \n" "bgt 100b \n"
"99: \n" "99: \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
"+r"(src_stride), // %2 "+r"(src_ptr1), // %2
"+r"(dst_width), // %3 "+r"(dst_width), // %3
"+r"(source_y_fraction) // %4 "+r"(y1_fraction), // %4
"+r"(y0_fraction) // %5
: :
: "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
); );
} }
#endif // HAS_INTERPOLATEROW_NEON #endif // HAS_INTERPOLATEROW_NEON
...@@ -2712,54 +2714,58 @@ void InterpolateRow_NEON(uint8* dst_ptr, ...@@ -2712,54 +2714,58 @@ void InterpolateRow_NEON(uint8* dst_ptr,
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"subs %3, #8 \n" "subs %3, %3, #8 \n"
"blt 89f \n" "blt 89f \n"
// Blend 8 pixels. // Blend 8 pixels.
"8: \n" "8: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB0.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 pixels of ARGB1.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d4, d3 \n" // db * a "umull v16.8h, v4.8b, v3.8b \n" // db * a
"vmull.u8 q11, d5, d3 \n" // dg * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a
"vmull.u8 q12, d6, d3 \n" // dr * a "umull v18.8h, v6.8b, v3.8b \n" // dr * a
"vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
"vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
"vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
"vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
"vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
"vqadd.u8 q0, q0, q2 \n" // + sbg "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
"vqadd.u8 d2, d2, d6 \n" // + sr "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
"vmov.u8 d3, #255 \n" // a = 255 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
"uqadd v2.8b, v2.8b, v6.8b \n" // + sr
"movi v3.8b, #255 \n" // a = 255
MEMACCESS(2) MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB.
"bge 8b \n" "bge 8b \n"
"89: \n" "89: \n"
"adds %3, #8-1 \n" "adds %3, %3, #8-1 \n"
"blt 99f \n" "blt 99f \n"
// Blend 1 pixels. // Blend 1 pixels.
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. "ld4 {v0.b-v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. "ld4 {v4.b-v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop. "subs %3, %3, #1 \n" // 1 processed per loop.
"vmull.u8 q10, d4, d3 \n" // db * a "umull v16.8h, v4.8b, v3.8b \n" // db * a
"vmull.u8 q11, d5, d3 \n" // dg * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a
"vmull.u8 q12, d6, d3 \n" // dr * a "umull v18.8h, v6.8b, v3.8b \n" // dr * a
"vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
"vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
"vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
"vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
"vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
"vqadd.u8 q0, q0, q2 \n" // + sbg "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
"vqadd.u8 d2, d2, d6 \n" // + sr "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
"vmov.u8 d3, #255 \n" // a = 255 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
"uqadd v2.8b, v2.8b, v6.8b \n" // + sr
"movi v3.8b, #255 \n" // a = 255
MEMACCESS(2) MEMACCESS(2)
"vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. "st4 {v0.b-v3.b}[0], [%2], #4 \n" // store 1 pixel.
"bge 1b \n" "bge 1b \n"
"99: \n" "99: \n"
...@@ -2769,7 +2775,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2769,7 +2775,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(width) // %3 "+r"(width) // %3
: :
: "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18"
); );
} }
#endif // HAS_ARGBBLENDROW_NEON #endif // HAS_ARGBBLENDROW_NEON
...@@ -2781,22 +2788,22 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2781,22 +2788,22 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Attenuate 8 pixels. // Attenuate 8 pixels.
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d0, d3 \n" // b * a "umull v4.8h, v0.8b, v3.8b \n" // b * a
"vmull.u8 q11, d1, d3 \n" // g * a "umull v5.8h, v1.8b, v3.8b \n" // g * a
"vmull.u8 q12, d2, d3 \n" // r * a "umull v6.8h, v2.8b, v3.8b \n" // r * a
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
: :
: "cc", "memory", "q0", "q1", "q10", "q11", "q12" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
); );
} }
#endif // HAS_ARGBATTENUATEROW_NEON #endif // HAS_ARGBATTENUATEROW_NEON
...@@ -2807,41 +2814,41 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2807,41 +2814,41 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Quantizes the first three byte planes (B, G, R) of ARGB pixels in place.
// Each channel value v is widened to 16 bits, multiplied by (scale >> 1) with
// a saturating doubling high-half multiply (roughly (v * scale) >> 16), then
// multiplied by interval_size, offset by interval_offset and saturated back
// to 8 bits. The fourth (alpha) plane is loaded and stored back unmodified.
// scale, interval_size and interval_offset are broadcast into 16-bit lanes.
// Eight pixels are handled per iteration and the loop body runs before the
// width test, so the pixel count is effectively rounded up to a multiple of 8.
// Each line shows the earlier ARMv7-style NEON text followed by its AArch64
// replacement.
// NOTE(review): width is an int bound with "+r" but is referenced as %1, not
// %w1, in the AArch64 column; presumably that selects the 64-bit register
// view - confirm the upper 32 bits are well defined for the subs/bgt test.
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) { int interval_offset, int width) {
asm volatile ( asm volatile (
"vdup.u16 q8, %2 \n" "dup v4.8h, %w2 \n"
"vshr.u16 q8, q8, #1 \n" // scale >>= 1 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
"vdup.u16 q9, %3 \n" // interval multiply. "dup v5.8h, %w3 \n" // interval multiply.
"vdup.u16 q10, %4 \n" // interval add "dup v6.8h, %w4 \n" // interval add
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
// No post-increment on this load: the same 32 bytes are written back by the
// store at the end of the iteration, which advances the pointer.
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. "ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop. "subs %1, %1, #8 \n" // 8 processed per loop.
"vmovl.u8 q0, d0 \n" // b (0 .. 255) "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"vmovl.u8 q1, d2 \n" "uxtl v1.8h, v1.8b \n"
"vmovl.u8 q2, d4 \n" "uxtl v2.8h, v2.8b \n"
"vqdmulh.s16 q0, q0, q8 \n" // b * scale "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
"vqdmulh.s16 q1, q1, q8 \n" // g "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
"vqdmulh.s16 q2, q2, q8 \n" // r "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
"vmul.u16 q0, q0, q9 \n" // b * interval_size "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
"vmul.u16 q1, q1, q9 \n" // g "mul v1.8h, v1.8h, v5.8h \n" // g
"vmul.u16 q2, q2, q9 \n" // r "mul v2.8h, v2.8h, v5.8h \n" // r
"vadd.u16 q0, q0, q10 \n" // b + interval_offset "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
"vadd.u16 q1, q1, q10 \n" // g "add v1.8h, v1.8h, v6.8h \n" // g
"vadd.u16 q2, q2, q10 \n" // r "add v2.8h, v2.8h, v6.8h \n" // r
// Saturating narrow of the three processed planes back to 8 bits.
"vqmovn.u16 d0, q0 \n" "uqxtn v0.8b, v0.8h \n"
"vqmovn.u16 d2, q1 \n" "uqxtn v1.8b, v1.8h \n"
"vqmovn.u16 d4, q2 \n" "uqxtn v2.8b, v2.8h \n"
MEMACCESS(0) MEMACCESS(0)
"vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. "st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(width) // %1 "+r"(width) // %1
: "r"(scale), // %2 : "r"(scale), // %2
"r"(interval_size), // %3 "r"(interval_size), // %3
"r"(interval_offset) // %4 "r"(interval_offset) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
); );
} }
#endif // HAS_ARGBQUANTIZEROW_NEON #endif // HAS_ARGBQUANTIZEROW_NEON
...@@ -2853,36 +2860,36 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, ...@@ -2853,36 +2860,36 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
// Scales each channel of ARGB pixels by the matching byte of value, reading
// from src_argb and writing to dst_argb.
// value is broadcast, each of its four bytes is duplicated into a 16-bit lane
// (byte * 0x0101) and halved; every widened channel is then multiplied by its
// lane with a saturating rounding doubling high-half multiply and narrowed
// back to 8 bits with saturation. The lowest byte of value scales the first
// (B) plane and the highest byte scales the fourth (A) plane.
// Eight pixels are handled per iteration and the loop body runs before the
// width test, so the pixel count is effectively rounded up to a multiple of 8.
// Each line shows the earlier ARMv7-style NEON text followed by its AArch64
// replacement.
// NOTE(review): width is an int bound with "+r" but is referenced as %2, not
// %w2, in the AArch64 column; presumably that selects the 64-bit register
// view - confirm the upper 32 bits are well defined for the subs/bgt test.
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) { uint32 value) {
asm volatile ( asm volatile (
"vdup.u32 q0, %3 \n" // duplicate scale value. "dup v0.4s, %w3 \n" // duplicate scale value.
"vzip.u8 d0, d1 \n" // d0 aarrggbb. "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
"vshr.u16 q0, q0, #1 \n" // scale / 2. "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. "ld4 {v4.8b-v7.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q10, d20 \n" // b (0 .. 255) "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"vmovl.u8 q11, d22 \n" "uxtl v5.8h, v5.8b \n"
"vmovl.u8 q12, d24 \n" "uxtl v6.8h, v6.8b \n"
"vmovl.u8 q13, d26 \n" "uxtl v7.8h, v7.8b \n"
// The doubling multiply compensates for the halved scale computed above.
"vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
"vqrdmulh.s16 q11, q11, d0[1] \n" // g "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
"vqrdmulh.s16 q12, q12, d0[2] \n" // r "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
"vqrdmulh.s16 q13, q13, d0[3] \n" // a "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
"vqmovn.u16 d20, q10 \n" "uqxtn v4.8b, v4.8h \n"
"vqmovn.u16 d22, q11 \n" "uqxtn v5.8b, v5.8h \n"
"vqmovn.u16 d24, q12 \n" "uqxtn v6.8b, v6.8h \n"
"vqmovn.u16 d26, q13 \n" "uqxtn v7.8b, v7.8h \n"
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. "st4 {v4.8b-v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
: "r"(value) // %3 : "r"(value) // %3
: "cc", "memory", "q0", "q10", "q11", "q12", "q13" : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
); );
} }
#endif // HAS_ARGBSHADEROW_NEON #endif // HAS_ARGBSHADEROW_NEON
...@@ -2893,28 +2900,28 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -2893,28 +2900,28 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
#ifdef HAS_ARGBGRAYROW_NEON #ifdef HAS_ARGBGRAYROW_NEON
// Converts ARGB pixels from src_argb to gray ARGB pixels in dst_argb.
// For each pixel: gray = (15 * B + 75 * G + 38 * R + 64) >> 7, saturated to
// 8 bits; the weights sum to 128. The gray value is written to the B, G and R
// planes, and the fourth (alpha) plane is stored exactly as it was loaded.
// Eight pixels are handled per iteration and the loop body runs before the
// width test, so the pixel count is effectively rounded up to a multiple of 8.
// Each line shows the earlier ARMv7-style NEON text followed by its AArch64
// replacement.
// NOTE(review): width is an int bound with "+r" but is referenced as %2, not
// %w2, in the AArch64 column; presumably that selects the 64-bit register
// view - confirm the upper 32 bits are well defined for the subs/bgt test.
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "movi v24.8b, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "movi v25.8b, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient "movi v26.8b, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "umull v4.8h, v0.8b, v24.8b \n" // B
"vmlal.u8 q2, d1, d25 \n" // G "umlal v4.8h, v1.8b, v25.8b \n" // G
"vmlal.u8 q2, d2, d26 \n" // R "umlal v4.8h, v2.8b, v26.8b \n" // R
// Rounding narrowing shift; the sum is at most 255 * 128, so it fits in 15 bits.
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
"vmov d1, d0 \n" // G "mov v1.8b, v0.8b \n" // G
"vmov d2, d0 \n" // R "mov v2.8b, v0.8b \n" // R
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
: :
: "cc", "memory", "q0", "q1", "q2", "q12", "q13" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
); );
} }
#endif // HAS_ARGBGRAYROW_NEON #endif // HAS_ARGBGRAYROW_NEON
...@@ -2927,40 +2934,40 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -2927,40 +2934,40 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBSEPIAROW_NEON #ifdef HAS_ARGBSEPIAROW_NEON
// Applies a sepia tone to ARGB pixels in place.
// For each pixel, using the original B, G and R values:
//   B' = (17 * B + 68 * G + 35 * R) >> 7
//   G' = (22 * B + 88 * G + 45 * R) >> 7
//   R' = (24 * B + 98 * G + 50 * R) >> 7
// Each result is saturated to 8 bits (no rounding); the fourth (alpha) plane
// is stored exactly as it was loaded.
// Eight pixels are handled per iteration and the loop body runs before the
// width test, so the pixel count is effectively rounded up to a multiple of 8.
// Each line shows the earlier ARMv7-style NEON text followed by its AArch64
// replacement.
// NOTE(review): width is an int bound with "+r" but is referenced as %1, not
// %w1, in the AArch64 column; presumably that selects the 64-bit register
// view - confirm the upper 32 bits are well defined for the subs/bgt test.
// NOTE(review): "v7" is listed as clobbered in the AArch64 column although no
// instruction here writes it - harmless, but likely unnecessary.
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"vmov.u8 d20, #17 \n" // BB coefficient "movi v20.8b, #17 \n" // BB coefficient
"vmov.u8 d21, #68 \n" // BG coefficient "movi v21.8b, #68 \n" // BG coefficient
"vmov.u8 d22, #35 \n" // BR coefficient "movi v22.8b, #35 \n" // BR coefficient
"vmov.u8 d24, #22 \n" // GB coefficient "movi v24.8b, #22 \n" // GB coefficient
"vmov.u8 d25, #88 \n" // GG coefficient "movi v25.8b, #88 \n" // GG coefficient
"vmov.u8 d26, #45 \n" // GR coefficient "movi v26.8b, #45 \n" // GR coefficient
"vmov.u8 d28, #24 \n" // RB coefficient "movi v28.8b, #24 \n" // RB coefficient
"vmov.u8 d29, #98 \n" // RG coefficient "movi v29.8b, #98 \n" // RG coefficient
"vmov.u8 d30, #50 \n" // RR coefficient "movi v30.8b, #50 \n" // RR coefficient
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
// No post-increment on this load: the same 32 bytes are written back by the
// store at the end of the iteration, which advances the pointer.
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop. "subs %1, %1, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d20 \n" // B to Sepia B "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
"vmlal.u8 q2, d1, d21 \n" // G "umlal v4.8h, v1.8b, v21.8b \n" // G
"vmlal.u8 q2, d2, d22 \n" // R "umlal v4.8h, v2.8b, v22.8b \n" // R
"vmull.u8 q3, d0, d24 \n" // B to Sepia G "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
"vmlal.u8 q3, d1, d25 \n" // G "umlal v5.8h, v1.8b, v25.8b \n" // G
"vmlal.u8 q3, d2, d26 \n" // R "umlal v5.8h, v2.8b, v26.8b \n" // R
"vmull.u8 q8, d0, d28 \n" // B to Sepia R "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
"vmlal.u8 q8, d1, d29 \n" // G "umlal v6.8h, v1.8b, v29.8b \n" // G
"vmlal.u8 q8, d2, d30 \n" // R "umlal v6.8h, v2.8b, v30.8b \n" // R
"vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
"vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
"vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
MEMACCESS(0) MEMACCESS(0)
"vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(width) // %1 "+r"(width) // %1
: :
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"q10", "q11", "q12", "q13", "q14", "q15" "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
); );
} }
#endif // HAS_ARGBSEPIAROW_NEON #endif // HAS_ARGBSEPIAROW_NEON
// Multiplies ARGB pixels from src_argb by a 4x4 signed matrix and writes the
// result to dst_argb.
// Sixteen signed 8-bit coefficients m[0..15] are read from matrix_argb and
// sign-extended to 16 bits. For each pixel:
//   B' = (B * m[0]  + G * m[1]  + R * m[2]  + A * m[3])  >> 6
//   G' = (B * m[4]  + G * m[5]  + R * m[6]  + A * m[7])  >> 6
//   R' = (B * m[8]  + G * m[9]  + R * m[10] + A * m[11]) >> 6
//   A' = (B * m[12] + G * m[13] + R * m[14] + A * m[15]) >> 6
// Products are 16-bit multiplies, partial sums use saturating signed adds and
// the final shift narrows with unsigned saturation to 8 bits.
// Eight pixels are handled per iteration and the loop body runs before the
// width test, so the pixel count is effectively rounded up to a multiple of 8.
// Each line shows the earlier ARMv7-style NEON text followed by its AArch64
// replacement.
// NOTE(review): width is an int bound with "+r" but is referenced as %2, not
// %w2, in the AArch64 column; presumably that selects the 64-bit register
// view - confirm the upper 32 bits are well defined for the subs/bgt test.
// NOTE(review): in the ARMv7 column q15 appears to hold both the widened alpha
// input and the A accumulator, so the alpha input looks overwritten before the
// "A * Matrix" multiplies; the AArch64 column keeps them apart (v19 / v25).
...@@ -2973,60 +2980,60 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, ...@@ -2973,60 +2980,60 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) { const int8* matrix_argb, int width) {
asm volatile ( asm volatile (
MEMACCESS(3) MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // load 16 matrix coefficients. "ld1 {v2.16b}, [%3] \n" // load 16 matrix coefficients.
"vmovl.s8 q0, d4 \n" // B,G coefficients s16. "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. "ld4 {v16.8b-v19.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"vmovl.u8 q9, d18 \n" // g "uxtl v17.8h, v17.8b \n" // g
"vmovl.u8 q10, d20 \n" // r "uxtl v18.8h, v18.8b \n" // r
"vmovl.u8 q15, d22 \n" // a "uxtl v19.8h, v19.8b \n" // a
"vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
"vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
"vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
"vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
"vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
"vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
"vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
"vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
"vqadd.s16 q12, q12, q4 \n" // Accumulate B "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
"vqadd.s16 q13, q13, q5 \n" // Accumulate G "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
"vqadd.s16 q15, q15, q7 \n" // Accumulate A "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
"vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
"vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
"vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
"vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
"vqadd.s16 q12, q12, q4 \n" // Accumulate B "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
"vqadd.s16 q13, q13, q5 \n" // Accumulate G "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
"vqadd.s16 q15, q15, q7 \n" // Accumulate A "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
"vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
"vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
"vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
"vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
"vqadd.s16 q12, q12, q4 \n" // Accumulate B "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
"vqadd.s16 q13, q13, q5 \n" // Accumulate G "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
"vqadd.s16 q15, q15, q7 \n" // Accumulate A "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
"vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
"vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
"vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. "st4 {v16.8b-v19.8b}, [%1], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
: "r"(matrix_argb) // %3 : "r"(matrix_argb) // %3
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
"q10", "q11", "q12", "q13", "q14", "q15" "v18", "v19", "v22", "v23", "v24", "v25"
); );
} }
#endif // HAS_ARGBCOLORMATRIXROW_NEON #endif // HAS_ARGBCOLORMATRIXROW_NEON
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment