Commit aec76f2e authored by fbarchard@google.com

Add stride to pointer in C and pass it as a register to inline asm.

BUG=357
TESTED=clang on ios
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/29489004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1086 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f7d9b9fb
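In outline, the change computes the second-row pointer in C (pointer plus stride) and hands it to the inline asm as its own register operand, instead of passing the stride and doing the sign-extending add inside the asm block, which the targeted clang/iOS arm64 build apparently mishandled. A minimal sketch of the pattern, with hypothetical names (not libyuv's actual kernels), assuming an AArch64 GCC-style toolchain:

#include <stdint.h>

// Interleave 8-byte chunks of two adjacent rows; n must be a positive
// multiple of 8. The row-2 pointer is formed in C, so no
// "add %x1, %x0, %w1, sxtw" is needed inside the asm block.
void copy_two_rows(const uint8_t* src, int stride, uint8_t* dst, int n) {
  const uint8_t* src2 = src + stride;  // stride folded into a pointer in C
  __asm__ volatile(
    "1:                                        \n"
    "ld1 {v0.8b}, [%0], #8                     \n"  // 8 bytes from row 1
    "ld1 {v1.8b}, [%1], #8                     \n"  // 8 bytes from row 2
    "subs %w3, %w3, #8                         \n"
    "st1 {v0.8b, v1.8b}, [%2], #16             \n"
    "b.gt 1b                                   \n"
    : "+r"(src), "+r"(src2), "+r"(dst), "+r"(n)
    :
    : "v0", "v1", "memory", "cc");
}

The pre-change code made the stride operand do double duty (integer in, row pointer out); giving the derived pointer its own "+r" operand sidesteps that.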
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1084
+Version: 1086
License: BSD
License File: LICENSE
......
......@@ -58,6 +58,13 @@ extern "C" {
#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
#define LIBYUV_DISABLE_NEON
#endif
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
+#define LIBYUV_DISABLE_NEON
+#endif // clang >= 3.5
+#endif // __clang__
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1084
+#define LIBYUV_VERSION 1086
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -25,7 +25,7 @@
'conditions': [
['(target_arch == "armv7" or target_arch == "armv7s" or \
(target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
-and target_subarch != 64 and (arm_neon == 1 or arm_neon_optional == 1)',
+and (arm_neon == 1 or arm_neon_optional == 1)',
{
'build_neon': 1,
}],
......@@ -47,11 +47,6 @@
'-mfpu=vfpv3-d16',
],
'conditions': [
-['target_arch != "arm64"', {
-'cflags': [
-'-mfpu=neon',
-],
-}],
# Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
['use_lto == 1', {
'cflags!': [
......@@ -60,6 +55,9 @@
],
}],
],
+'cflags': [
+'-mfpu=neon',
+],
'include_dirs': [
'include',
'.',
......@@ -93,11 +91,6 @@
# Allows libyuv.a redistributable library without external dependencies.
'standalone_static_library': 1,
'conditions': [
-['OS == "ios" and target_subarch == 64', {
-'defines': [
-'LIBYUV_DISABLE_NEON'
-],
-}],
['OS != "ios" and libyuv_disable_jpeg != 1', {
'defines': [
'HAVE_JPEG'
......@@ -126,15 +119,6 @@
'dependencies': [
'libyuv_neon',
],
-'conditions': [
-# TODO LIBYUV_NEON is temporary disabled. When all arm64 port has
-# been done, enable it.
-['target_arch !="arm64"', {
-'defines': [
-'LIBYUV_NEON',
-]
-}],
-],
}],
# MemorySanitizer does not support assembly code yet.
# http://crbug.com/344505
......@@ -151,6 +135,7 @@
# 'LIBYUV_DISABLE_MIPS',
# Enable the following macro to build libyuv as a shared library (dll).
# 'LIBYUV_USING_SHARED_LIBRARY',
+# TODO(fbarchard): Make these into gyp defines.
],
'include_dirs': [
'include',
......
......@@ -15,7 +15,8 @@
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && \
-defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
+(defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h> // For _xgetbv()
#endif
......
......@@ -1297,8 +1297,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
#ifdef HAS_YUY2TOUVROW_NEON
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
+const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile (
-"add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
......@@ -1314,7 +1314,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
-"+r"(stride_yuy2), // %1
+"+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
......@@ -1327,8 +1327,8 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
#ifdef HAS_UYVYTOUVROW_NEON
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
+const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile (
-"add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
......@@ -1344,7 +1344,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
-"+r"(stride_uyvy), // %1
+"+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
......@@ -1357,9 +1357,9 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
#ifdef HAS_HALFROW_NEON
void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
+const uint8* src_uvb = src_uv + src_uv_stride;
asm volatile (
// change the stride to row 2 pointer
-"add %x1, %x0, %w1, sxtw \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
......@@ -1371,7 +1371,7 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
"st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n"
: "+r"(src_uv), // %0
-"+r"(src_uv_stride), // %1
+"+r"(src_uvb), // %1
"+r"(dst_uv), // %2
"+r"(pix) // %3
:
......@@ -1682,11 +1682,11 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
-"movi v20.8h, #112 / 2 \n" // UB / VR 0.875 coefficient
-"movi v21.8h, #74 / 2 \n" // UG -0.5781 coefficient
-"movi v22.8h, #38 / 2 \n" // UR -0.2969 coefficient
-"movi v23.8h, #18 / 2 \n" // VB -0.1406 coefficient
-"movi v24.8h, #94 / 2 \n" // VG -0.7344 coefficient
+"movi v20.8h, #56 \n" // UB / VR 0.875 / 2 coefficient
+"movi v21.8h, #37 \n" // UG -0.5781 / 2 coefficient
+"movi v22.8h, #19 \n" // UR -0.2969 / 2 coefficient
+"movi v23.8h, #9 \n" // VB -0.1406 / 2 coefficient
+"movi v24.8h, #47 \n" // VG -0.7344 / 2 coefficient
"movi v25.16b, #0x80 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -1732,11 +1732,11 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
-"movi v20.8h, #112 / 2 \n" // UB / VR 0.875 coefficient
-"movi v21.8h, #74 / 2 \n" // UG -0.5781 coefficient
-"movi v22.8h, #38 / 2 \n" // UR -0.2969 coefficient
-"movi v23.8h, #18 / 2 \n" // VB -0.1406 coefficient
-"movi v24.8h, #94 / 2 \n" // VG -0.7344 coefficient
+"movi v20.8h, #56 \n" // UB / VR 0.875 / 2 coefficient
+"movi v21.8h, #37 \n" // UG -0.5781 / 2 coefficient
+"movi v22.8h, #19 \n" // UR -0.2969 / 2 coefficient
+"movi v23.8h, #9 \n" // VB -0.1406 / 2 coefficient
+"movi v24.8h, #47 \n" // VG -0.7344 / 2 coefficient
"movi v25.16b, #0x80 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -1800,16 +1800,18 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
#ifdef HAS_ARGBTOUVROW_NEON
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -1908,11 +1910,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_bgra
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -1959,11 +1961,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_abgr
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2010,11 +2012,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgba
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2061,11 +2063,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgb24
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2112,11 +2114,11 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_raw
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2164,11 +2166,11 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2236,11 +2238,11 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2308,11 +2310,11 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
-"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
-"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
-"vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
-"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
-"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+"vmov.s16 q10, #56 \n" // UB / VR 0.875 coefficient
+"vmov.s16 q11, #37 \n" // UG -0.5781 coefficient
+"vmov.s16 q12, #19 \n" // UR -0.2969 coefficient
+"vmov.s16 q13, #9 \n" // VB -0.1406 coefficient
+"vmov.s16 q14, #47 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
......@@ -2748,9 +2750,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// Blend 1 pixels.
"1: \n"
MEMACCESS(0)
-"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
+"ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
MEMACCESS(1)
-"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
+"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
......@@ -2766,7 +2768,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"uqadd v2.8b, v2.8b, v6.8b \n" // + sr
"movi v3.8b, #255 \n" // a = 255
MEMACCESS(2)
-"st4 {v0.8b,v1.8b,v2.8b,v3.8b}[0], [%2], #4 \n" // store 1 pixel.
+"st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
"b.ge 1b \n"
"99: \n"
......
......@@ -21,7 +21,8 @@ extern "C" {
#endif
// This module is for Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+(defined(_M_IX86) || defined(_M_X64))
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
......
......@@ -105,12 +105,12 @@ asm volatile (
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
MEMACCESS(3)
-"ld1 {v1.16b}, [%3], #16 \n"
+"ld1 {v1.16b}, [%2], #16 \n"
MEMACCESS(4)
-"ld1 {v2.16b}, [%4], #16 \n"
+"ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5)
-"ld1 {v3.16b}, [%5], #16 \n"
-"subs %2, %2, #4 \n"
+"ld1 {v3.16b}, [%4], #16 \n"
+"subs %5, %5, #4 \n"
"uaddlp v0.8h, v0.16b \n"
"uadalp v0.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n"
......@@ -122,10 +122,10 @@ asm volatile (
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
-"+r"(dst_width), // %2
-"+r"(src_ptr1), // %3
-"+r"(src_ptr2), // %4
-"+r"(src_ptr3) // %5
+"+r"(src_ptr1), // %2
+"+r"(src_ptr2), // %3
+"+r"(src_ptr3), // %4
+"+r"(dst_width) // %5
:
: "v0", "v1", "v2", "v3", "memory", "cc"
);
......@@ -144,7 +144,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #24 \n"
-"mov v2.8b, v3.8b \n" // order v0, v1, v2
+"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
......@@ -309,6 +309,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2;
+ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(5)
......@@ -317,7 +318,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"ld1 {v30.16b}, [%6] \n"
MEMACCESS(7)
"ld1 {v31.8h}, [%7] \n"
-"add %3, %3, %0 \n"
+"add %2, %2, %0 \n"
"1: \n"
// 00 40 01 41 02 42 03 43
......@@ -327,10 +328,10 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
-"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
+"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4)
-"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%4], #32 \n"
-"subs %2, %2, #12 \n"
+"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+"subs %4, %4, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
......@@ -420,9 +421,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
-"+r"(dst_width), // %2
-"+r"(src_stride), // %3
-"+r"(src_ptr1) // %4
+"+r"(tmp_src_stride), // %2
+"+r"(src_ptr1), // %3
+"+r"(dst_width) // %4
: "r"(&kMult38_Div6), // %5
"r"(&kShuf38_2), // %6
"r"(&kMult38_Div9) // %7
......@@ -438,12 +439,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
+// TODO(fbarchard): use src_stride directly for clang 3.5+.
+ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(4)
"ld1 {v30.8h}, [%4] \n"
MEMACCESS(5)
"ld1 {v31.16b}, [%5] \n"
-"add %3, %3, %0 \n"
+"add %2, %2, %0 \n"
"1: \n"
// 00 40 01 41 02 42 03 43
......@@ -454,7 +457,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
-"subs %2, %2, #12 \n"
+"subs %3, %3, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
......@@ -528,12 +531,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n"
-: "+r"(src_ptr), // %0
-"+r"(dst_ptr), // %1
-"+r"(dst_width), // %2
-"+r"(src_stride) // %3
-: "r"(&kMult38_Div6), // %4
-"r"(&kShuf38_2) // %5
+: "+r"(src_ptr), // %0
+"+r"(dst_ptr), // %1
+"+r"(tmp_src_stride), // %2
+"+r"(dst_width) // %3
+: "r"(&kMult38_Div6), // %4
+"r"(&kShuf38_2) // %5
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
"v18", "v19", "v30", "v31", "memory", "cc"
);
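The tmp_src_stride copy above follows the same idea as the row kernels: the asm rewrites its %2 operand into a row-2 pointer, so the function copies the stride into a scratch variable rather than clobbering the parameter (the TODO notes clang 3.5+ could take src_stride directly). The operand reshuffle in these hunks likewise gives every pointer the asm advances its own early slot, with the loop count moved to the end. A minimal sketch of the copy idiom, with hypothetical names, AArch64 assumed:

#include <stddef.h>
#include <stdint.h>

// Average two rows into one; dst_width must be a positive multiple of 8.
void average_rows(const uint8_t* src, ptrdiff_t src_stride,
                  uint8_t* dst, int dst_width) {
  ptrdiff_t tmp_src_stride = src_stride;  // scratch copy; see TODO above
  __asm__ volatile(
    "add %2, %2, %0                            \n"  // %2 = src + stride
    "1:                                        \n"
    "ld1 {v0.8b}, [%0], #8                     \n"  // row 1
    "ld1 {v1.8b}, [%2], #8                     \n"  // row 2
    "urhadd v0.8b, v0.8b, v1.8b                \n"  // rounding average
    "subs %w3, %w3, #8                         \n"
    "st1 {v0.8b}, [%1], #8                     \n"
    "b.gt 1b                                   \n"
    : "+r"(src), "+r"(dst), "+r"(tmp_src_stride), "+r"(dst_width)
    :
    : "v0", "v1", "memory", "cc");
}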
......