Commit f7d9b9fb authored by fbarchard@google.com

Change vector range notation to a list of registers for clang compatibility. Break compare into two NEON files for consistency with the other neon64 files.
BUG=357
TESTED=local ios build
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/30379004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1085 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent a62a97f1
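For context, the diff below replaces GNU-style vector range notation in the AArch64 inline assembly with explicit register lists, which clang's integrated assembler accepts. A minimal sketch of the two spellings (illustrative only, not part of this commit; the function name is made up):

// Illustrative sketch -- not from this commit. Shows the range notation that
// the clang integrated assembler rejected versus the explicit register list
// used throughout the diff below.
#include <stdint.h>

void CopyRow32_sketch(const uint8_t* src, uint8_t* dst) {
#if defined(__aarch64__)
  asm volatile (
    // Range notation (GNU as only):
    //   "ld1 {v0.8b-v3.8b}, [%0], #32              \n"
    // Explicit list (accepted by both GNU as and clang):
    "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32      \n"
    "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32      \n"
    : "+r"(src), "+r"(dst)
    :
    : "memory", "v0", "v1", "v2", "v3");
#else
  for (int i = 0; i < 32; ++i) {  // portable fallback for non-AArch64 builds
    dst[i] = src[i];
  }
#endif
}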
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1083
Version: 1084
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1083
#define LIBYUV_VERSION 1084
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -73,6 +73,7 @@
'sources': [
# sources.
'source/compare_neon.cc',
'source/compare_neon64.cc',
'source/rotate_neon.cc',
'source/rotate_neon64.cc',
'source/row_neon.cc',
......
......@@ -57,45 +57,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
"subs %2, %2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
......
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
"subs %2, %2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
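The new compare_neon64.cc above carries the AArch64 SumSquareError_NEON that this commit removes from compare_neon.cc. As a reference for what the assembly computes, here is a hedged scalar sketch (not part of the commit; libyuv's own C fallback lives elsewhere), assuming the uint8/uint32 typedefs from libyuv/basic_types.h:

// Scalar reference sketch: sum of squared byte differences over `count` bytes.
// The NEON version above processes 16 bytes per iteration and accumulates
// partial sums in v16-v19 before the final horizontal add (addv).
static uint32 SumSquareError_C_sketch(const uint8* src_a, const uint8* src_b,
                                      int count) {
  uint32 sse = 0u;
  int i;
  for (i = 0; i < count; ++i) {
    int diff = (int)src_a[i] - (int)src_b[i];
    sse += (uint32)(diff * diff);
  }
  return sse;
}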
......@@ -825,7 +825,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store U
......@@ -855,7 +855,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(2)
"st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"b.gt 1b \n"
:
"+r"(src_u), // %0
......@@ -875,10 +875,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
MEMACCESS(1)
"st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
......@@ -1010,10 +1010,10 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
"st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
......@@ -1031,12 +1031,12 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop.
"mov v3.8b, v1.8b \n" // move g
"mov v4.8b, v0.8b \n" // move r
MEMACCESS(1)
"st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
......@@ -1170,10 +1170,10 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
"st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
......@@ -1190,12 +1190,12 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop.
"mov v4.8b, v2.8b \n" // mov g
"mov v5.8b, v1.8b \n" // mov b
MEMACCESS(1)
"st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
"st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
......@@ -1212,7 +1212,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
......@@ -1232,7 +1232,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
......@@ -1253,7 +1253,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
......@@ -1277,7 +1277,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
......@@ -1302,10 +1302,10 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row YUY2.
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
MEMACCESS(2)
......@@ -1332,10 +1332,10 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row UYVY.
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
MEMACCESS(2)
......@@ -1388,7 +1388,7 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
"mov v2.s[0], %w3 \n" // selector
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
"ld1 {v0.16b,v1.16b}, [%0], 32 \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
"tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
"tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
......@@ -1412,7 +1412,7 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
......@@ -1467,7 +1467,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
"st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
......@@ -1489,7 +1489,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
"mov v3.8b, v2.8b \n"
MEMACCESS(1)
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
......@@ -1497,7 +1497,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
"st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
......@@ -1586,7 +1586,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
......@@ -1614,7 +1614,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
......@@ -1646,7 +1646,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlsl v4.8h, v1.8b, v25.8b \n" // G
......@@ -1691,7 +1691,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.16b-v3.16b}, [%0], #64 \n" // load 16 ARGB pixels.
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB pixels.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
......@@ -1741,12 +1741,12 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.16b-v3.16b}, [%0], #64 \n" // load 16 ARGB pixels.
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB pixels.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(0)
"ld4 {v4.16b-v7.16b}, [%0], #64 \n" // load next 16 ARGB pixels.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16 ARGB pixels.
"uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
......@@ -2474,7 +2474,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of BGRA.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // R
"umlal v16.8h, v2.8b, v5.8b \n" // G
......@@ -2503,7 +2503,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ABGR.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of ABGR.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // R
"umlal v16.8h, v1.8b, v5.8b \n" // G
......@@ -2532,7 +2532,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of RGBA.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of RGBA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // B
"umlal v16.8h, v2.8b, v5.8b \n" // G
......@@ -2561,7 +2561,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G
......@@ -2590,7 +2590,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RAW.
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G
......@@ -2720,9 +2720,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// Blend 8 pixels.
"8: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB0.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB0.
MEMACCESS(1)
"ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 pixels of ARGB1.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 pixels of ARGB1.
"subs %3, %3, #8 \n" // 8 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
......@@ -2738,7 +2738,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"uqadd v2.8b, v2.8b, v6.8b \n" // + sr
"movi v3.8b, #255 \n" // a = 255
MEMACCESS(2)
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB.
"b.ge 8b \n"
"89: \n"
......@@ -2748,9 +2748,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// Blend 1 pixels.
"1: \n"
MEMACCESS(0)
"ld4 {v0.b-v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
MEMACCESS(1)
"ld4 {v4.b-v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
......@@ -2766,7 +2766,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"uqadd v2.8b, v2.8b, v6.8b \n" // + sr
"movi v3.8b, #255 \n" // a = 255
MEMACCESS(2)
"st4 {v0.b-v3.b}[0], [%2], #4 \n" // store 1 pixel.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}[0], [%2], #4 \n" // store 1 pixel.
"b.ge 1b \n"
"99: \n"
......@@ -2789,7 +2789,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Attenuate 8 pixels.
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a
"umull v5.8h, v1.8b, v3.8b \n" // g * a
......@@ -2798,7 +2798,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
MEMACCESS(1)
"st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -2824,7 +2824,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 pixels of ARGB.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"uxtl v1.8h, v1.8b \n"
......@@ -2842,7 +2842,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
"uqxtn v1.8b, v1.8h \n"
"uqxtn v2.8b, v2.8h \n"
MEMACCESS(0)
"st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB.
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
......@@ -2869,7 +2869,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v4.8b-v7.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"uxtl v5.8h, v5.8b \n"
......@@ -2884,7 +2884,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
"uqxtn v6.8b, v6.8h \n"
"uqxtn v7.8b, v7.8h \n"
MEMACCESS(1)
"st4 {v4.8b-v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -2907,7 +2907,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G
......@@ -2916,7 +2916,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"mov v1.8b, v0.8b \n" // G
"mov v2.8b, v0.8b \n" // R
MEMACCESS(1)
"st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 ARGB pixels.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -2947,7 +2947,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 ARGB pixels.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
"umlal v4.8h, v1.8b, v21.8b \n" // G
......@@ -2962,7 +2962,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
"uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
MEMACCESS(0)
"st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 ARGB pixels.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels.
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
......@@ -2988,7 +2988,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v16.8b-v19.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"uxtl v17.8h, v17.8b \n" // g
......@@ -3027,7 +3027,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
MEMACCESS(1)
"st4 {v16.8b-v19.8b}, [%1], #32 \n" // store 8 ARGB pixels.
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB pixels.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
......@@ -3049,9 +3049,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
"ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G
......@@ -3062,7 +3062,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
MEMACCESS(2)
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"b.gt 1b \n"
: "+r"(src_argb0), // %0
......@@ -3084,16 +3084,16 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
"ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2)
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"b.gt 1b \n"
: "+r"(src_argb0), // %0
......@@ -3115,16 +3115,16 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
"ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2)
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"b.gt 1b \n"
: "+r"(src_argb0), // %0
......@@ -3159,7 +3159,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"mov v1.8b, v0.8b \n"
"mov v2.8b, v0.8b \n"
MEMACCESS(2)
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
......@@ -3218,7 +3218,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"subs %3, %3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2)
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
......
......@@ -28,7 +28,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n"
// load even pixels into v0, odd into v1
MEMACCESS(0)
"ld2 {v0.16b, v1.16b}, [%0], #32 \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %2, %2, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
......@@ -51,7 +51,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %1, %1, %0 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
MEMACCESS(1)
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
......@@ -80,7 +80,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n"
......@@ -142,11 +142,11 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #24 \n"
"mov v2.8b, v3.8b \n" // order v0, v1, v2
MEMACCESS(1)
"st3 {v0.8b-v2.8b}, [%1], #24 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -166,9 +166,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"add %3, %3, %0 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b-v7.8b}, [%3], #32 \n" // src line 1
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %2, %2, #24 \n"
// filter src line 0 with src line 1
......@@ -205,7 +205,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"uqrshrn v2.8b, v16.8h, #2 \n"
MEMACCESS(1)
"st3 {v0.8b-v2.8b}, [%1], #24 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
......@@ -228,9 +228,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"add %3, %3, %0 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b-v7.8b}, [%3], #32 \n" // src line 1
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %2, %2, #24 \n"
// average src line 0 with src line 1
"urhadd v0.8b, v0.8b, v4.8b \n"
......@@ -252,7 +252,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"uqrshrn v2.8b, v4.8h, #2 \n"
MEMACCESS(1)
"st3 {v0.8b-v2.8b}, [%1], #24 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -285,9 +285,9 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
"ld1 {v3.16b}, [%3] \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %2, %2, #12 \n"
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n"
MEMACCESS(1)
......@@ -325,11 +325,11 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b-v7.8b}, [%3], #32 \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
MEMACCESS(4)
"ld4 {v16.8b-v19.8b}, [%4], #32 \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%4], #32 \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
......@@ -451,9 +451,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b-v7.8b}, [%3], #32 \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
......@@ -673,14 +673,14 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %1, %1, %0 \n"
"1: \n"
MEMACCESS (0)
"ld4 {v0.16b - v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
MEMACCESS (1)
"ld4 {v16.16b - v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
......@@ -690,7 +690,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n"
MEMACCESS (2)
"st4 {v0.8b - v3.8b}, [%2], #32 \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
"b.gt 1b \n"
: "+r" (src_ptr), // %0
"+r" (src_stride), // %1
......