Commit f7d9b9fb authored by fbarchard@google.com

change vector range notation to a list of registers for clang compatibility.…

Change vector range notation to a list of registers for clang compatibility. Break compare into 2 NEON files for consistency with other neon64 files.
BUG=357
TESTED=local ios build
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/30379004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1085 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent a62a97f1
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1083
Version: 1084
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1083
#define LIBYUV_VERSION 1084
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -73,6 +73,7 @@
'sources': [
# sources.
'source/compare_neon.cc',
'source/compare_neon64.cc',
'source/rotate_neon.cc',
'source/rotate_neon64.cc',
'source/row_neon.cc',
......
......@@ -57,45 +57,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Sums the squared byte differences between src_a and src_b using AArch64
// NEON and returns the 32-bit total.
//
// The loop loads 16 bytes from each source before the remaining count is
// tested, so one 16-byte group is always read and count is effectively rounded
// up to a multiple of 16.
// NOTE(review): callers presumably pass a positive multiple of 16 -- confirm.
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
// Zero the four accumulators (v16..v19), each holding 4 x 32-bit partial sums.
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"  // 16 bytes of src_a, pointer post-incremented
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"  // 16 bytes of src_b, pointer post-incremented
"subs %2, %2, #16 \n"  // 16 bytes consumed; sets the flags used by b.gt
"usubl v2.8h, v0.8b, v1.8b \n"  // differences of bytes 0..7, widened to 16 bits
"usubl2 v3.8h, v0.16b, v1.16b \n"  // differences of bytes 8..15, widened to 16 bits
// Signed multiply-accumulate: each 16-bit difference is squared and added to a
// 32-bit lane, so negative differences contribute their positive square.
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"  // loop while the remaining count is still positive
// Fold the four accumulators together, then sum the four lanes into s0.
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"  // move the 32-bit total into the output register
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
// NOTE(review): the asm reads memory through %0 and %1, but no "memory"
// clobber (or memory input operand) is listed here -- confirm whether one
// should be added.
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
......
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Sums the squared byte differences between src_a and src_b using AArch64
// NEON and returns the 32-bit total.
//
// The loop loads 16 bytes from each source before the remaining count is
// tested, so one 16-byte group is always read and count is effectively rounded
// up to a multiple of 16.
// NOTE(review): callers presumably pass a positive multiple of 16 -- confirm.
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
  // Plain local: the value is produced once by the asm output operand, so it
  // does not need to be volatile.
  uint32 sse;
  asm volatile (
    // Zero the four accumulators (v16..v19), each holding 4 x 32-bit sums.
    "eor v16.16b, v16.16b, v16.16b \n"
    "eor v18.16b, v18.16b, v18.16b \n"
    "eor v17.16b, v17.16b, v17.16b \n"
    "eor v19.16b, v19.16b, v19.16b \n"
    ".p2align 2 \n"
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // 16 bytes of src_a, post-incremented
    MEMACCESS(1)
    "ld1 {v1.16b}, [%1], #16 \n"  // 16 bytes of src_b, post-incremented
    "subs %2, %2, #16 \n"  // 16 bytes consumed; sets flags for b.gt
    "usubl v2.8h, v0.8b, v1.8b \n"  // diffs of bytes 0..7 as 16-bit lanes
    "usubl2 v3.8h, v0.16b, v1.16b \n"  // diffs of bytes 8..15 as 16-bit lanes
    // Signed multiply-accumulate: each 16-bit difference is squared and added
    // to a 32-bit lane, so negative differences contribute a positive square.
    "smlal v16.4s, v2.4h, v2.4h \n"
    "smlal v17.4s, v3.4h, v3.4h \n"
    "smlal2 v18.4s, v2.8h, v2.8h \n"
    "smlal2 v19.4s, v3.8h, v3.8h \n"
    "b.gt 1b \n"  // loop while the remaining count is still positive
    // Fold the four accumulators together, then sum the four lanes into s0.
    "add v16.4s, v16.4s, v17.4s \n"
    "add v18.4s, v18.4s, v19.4s \n"
    "add v19.4s, v16.4s, v18.4s \n"
    "addv s0, v19.4s \n"
    "fmov %w3, s0 \n"  // move the 32-bit total into the output register
  : "+r"(src_a),  // %0
    "+r"(src_b),  // %1
    "+r"(count),  // %2
    "=r"(sse)     // %3
  :
  // "memory" is required because the asm reads the buffers behind %0 and %1,
  // which are passed only as register operands; without it the compiler is
  // not told that this statement depends on memory contents.
  : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
  return sse;
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
This diff is collapsed.
......@@ -28,7 +28,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n"
// load even pixels into v0, odd into v1
MEMACCESS(0)
"ld2 {v0.16b, v1.16b}, [%0], #32 \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %2, %2, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
......@@ -51,7 +51,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %1, %1, %0 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
MEMACCESS(1)
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
......@@ -80,7 +80,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n"
......@@ -142,11 +142,11 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #24 \n"
"mov v2.8b, v3.8b \n" // order v0, v1, v2
MEMACCESS(1)
"st3 {v0.8b-v2.8b}, [%1], #24 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -166,9 +166,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"add %3, %3, %0 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b-v7.8b}, [%3], #32 \n" // src line 1
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %2, %2, #24 \n"
// filter src line 0 with src line 1
......@@ -205,7 +205,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"uqrshrn v2.8b, v16.8h, #2 \n"
MEMACCESS(1)
"st3 {v0.8b-v2.8b}, [%1], #24 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
......@@ -228,9 +228,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"add %3, %3, %0 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b-v7.8b}, [%3], #32 \n" // src line 1
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %2, %2, #24 \n"
// average src line 0 with src line 1
"urhadd v0.8b, v0.8b, v4.8b \n"
......@@ -252,7 +252,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"uqrshrn v2.8b, v4.8h, #2 \n"
MEMACCESS(1)
"st3 {v0.8b-v2.8b}, [%1], #24 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
......@@ -285,9 +285,9 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
"ld1 {v3.16b}, [%3] \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %2, %2, #12 \n"
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n"
MEMACCESS(1)
......@@ -325,11 +325,11 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b-v7.8b}, [%3], #32 \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
MEMACCESS(4)
"ld4 {v16.8b-v19.8b}, [%4], #32 \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%4], #32 \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
......@@ -451,9 +451,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
MEMACCESS(0)
"ld4 {v0.8b-v3.8b}, [%0], #32 \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b-v7.8b}, [%3], #32 \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
......@@ -673,14 +673,14 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %1, %1, %0 \n"
"1: \n"
MEMACCESS (0)
"ld4 {v0.16b - v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
MEMACCESS (1)
"ld4 {v16.16b - v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
......@@ -690,7 +690,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n"
MEMACCESS (2)
"st4 {v0.8b - v3.8b}, [%2], #32 \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
"b.gt 1b \n"
: "+r" (src_ptr), // %0
"+r" (src_stride), // %1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment