Commit aec76f2e authored by fbarchard@google.com

Add stride to pointer in C and pass it as a register to the inline assembly.

BUG=357
TESTED=clang on ios
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/29489004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1086 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f7d9b9fb
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1084 Version: 1086
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -58,6 +58,13 @@ extern "C" { ...@@ -58,6 +58,13 @@ extern "C" {
#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37 #if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
#define LIBYUV_DISABLE_NEON #define LIBYUV_DISABLE_NEON
#endif #endif
// clang >= 3.5.0 required for Arm64.
#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
#define LIBYUV_DISABLE_NEON
#endif // clang >= 3.5
#endif // __clang__
// The following are available on all x86 platforms: // The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \ #if !defined(LIBYUV_DISABLE_X86) && \
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1084 #define LIBYUV_VERSION 1086
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
'conditions': [ 'conditions': [
['(target_arch == "armv7" or target_arch == "armv7s" or \ ['(target_arch == "armv7" or target_arch == "armv7s" or \
(target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\ (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
and target_subarch != 64 and (arm_neon == 1 or arm_neon_optional == 1)', and (arm_neon == 1 or arm_neon_optional == 1)',
{ {
'build_neon': 1, 'build_neon': 1,
}], }],
...@@ -47,11 +47,6 @@ ...@@ -47,11 +47,6 @@
'-mfpu=vfpv3-d16', '-mfpu=vfpv3-d16',
], ],
'conditions': [ 'conditions': [
['target_arch != "arm64"', {
'cflags': [
'-mfpu=neon',
],
}],
# Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug. # Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
['use_lto == 1', { ['use_lto == 1', {
'cflags!': [ 'cflags!': [
...@@ -60,6 +55,9 @@ ...@@ -60,6 +55,9 @@
], ],
}], }],
], ],
'cflags': [
'-mfpu=neon',
],
'include_dirs': [ 'include_dirs': [
'include', 'include',
'.', '.',
...@@ -93,11 +91,6 @@ ...@@ -93,11 +91,6 @@
# Allows libyuv.a redistributable library without external dependencies. # Allows libyuv.a redistributable library without external dependencies.
'standalone_static_library': 1, 'standalone_static_library': 1,
'conditions': [ 'conditions': [
['OS == "ios" and target_subarch == 64', {
'defines': [
'LIBYUV_DISABLE_NEON'
],
}],
['OS != "ios" and libyuv_disable_jpeg != 1', { ['OS != "ios" and libyuv_disable_jpeg != 1', {
'defines': [ 'defines': [
'HAVE_JPEG' 'HAVE_JPEG'
...@@ -126,15 +119,6 @@ ...@@ -126,15 +119,6 @@
'dependencies': [ 'dependencies': [
'libyuv_neon', 'libyuv_neon',
], ],
'conditions': [
# TODO LIBYUV_NEON is temporary disabled. When all arm64 port has
# been done, enable it.
['target_arch !="arm64"', {
'defines': [
'LIBYUV_NEON',
]
}],
],
}], }],
# MemorySanitizer does not support assembly code yet. # MemorySanitizer does not support assembly code yet.
# http://crbug.com/344505 # http://crbug.com/344505
...@@ -151,6 +135,7 @@ ...@@ -151,6 +135,7 @@
# 'LIBYUV_DISABLE_MIPS', # 'LIBYUV_DISABLE_MIPS',
# Enable the following macro to build libyuv as a shared library (dll). # Enable the following macro to build libyuv as a shared library (dll).
# 'LIBYUV_USING_SHARED_LIBRARY', # 'LIBYUV_USING_SHARED_LIBRARY',
# TODO(fbarchard): Make these into gyp defines.
], ],
'include_dirs': [ 'include_dirs': [
'include', 'include',
......
...@@ -15,7 +15,8 @@ ...@@ -15,7 +15,8 @@
#endif #endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \ #if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && \ !defined(__native_client__) && \
defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
(defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h> // For _xgetbv() #include <immintrin.h> // For _xgetbv()
#endif #endif
......
This diff is collapsed.
...@@ -21,7 +21,8 @@ extern "C" { ...@@ -21,7 +21,8 @@ extern "C" {
#endif #endif
// This module is for Visual C. // This module is for Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
(defined(_M_IX86) || defined(_M_X64))
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ #define YG 74 /* (int8)(1.164 * 64 + 0.5) */
......
...@@ -105,12 +105,12 @@ asm volatile ( ...@@ -105,12 +105,12 @@ asm volatile (
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
MEMACCESS(3) MEMACCESS(3)
"ld1 {v1.16b}, [%3], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
MEMACCESS(4) MEMACCESS(4)
"ld1 {v2.16b}, [%4], #16 \n" "ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5) MEMACCESS(5)
"ld1 {v3.16b}, [%5], #16 \n" "ld1 {v3.16b}, [%4], #16 \n"
"subs %2, %2, #4 \n" "subs %5, %5, #4 \n"
"uaddlp v0.8h, v0.16b \n" "uaddlp v0.8h, v0.16b \n"
"uadalp v0.8h, v1.16b \n" "uadalp v0.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n" "uadalp v0.8h, v2.16b \n"
...@@ -122,10 +122,10 @@ asm volatile ( ...@@ -122,10 +122,10 @@ asm volatile (
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(src_ptr1), // %2
"+r"(src_ptr1), // %3 "+r"(src_ptr2), // %3
"+r"(src_ptr2), // %4 "+r"(src_ptr3), // %4
"+r"(src_ptr3) // %5 "+r"(dst_width) // %5
: :
: "v0", "v1", "v2", "v3", "memory", "cc" : "v0", "v1", "v2", "v3", "memory", "cc"
); );
...@@ -144,7 +144,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ...@@ -144,7 +144,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #24 \n" "subs %2, %2, #24 \n"
"mov v2.8b, v3.8b \n" // order v0, v1, v2 "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1) MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
...@@ -309,6 +309,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -309,6 +309,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2; const uint8* src_ptr1 = src_ptr + src_stride * 2;
ptrdiff_t tmp_src_stride = src_stride;
asm volatile ( asm volatile (
MEMACCESS(5) MEMACCESS(5)
...@@ -317,7 +318,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -317,7 +318,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"ld1 {v30.16b}, [%6] \n" "ld1 {v30.16b}, [%6] \n"
MEMACCESS(7) MEMACCESS(7)
"ld1 {v31.8h}, [%7] \n" "ld1 {v31.8h}, [%7] \n"
"add %3, %3, %0 \n" "add %2, %2, %0 \n"
"1: \n" "1: \n"
// 00 40 01 41 02 42 03 43 // 00 40 01 41 02 42 03 43
...@@ -327,10 +328,10 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -327,10 +328,10 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3) MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4) MEMACCESS(4)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%4], #32 \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"subs %2, %2, #12 \n" "subs %4, %4, #12 \n"
// Shuffle the input data around to get align the data // Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
...@@ -420,9 +421,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -420,9 +421,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(tmp_src_stride), // %2
"+r"(src_stride), // %3 "+r"(src_ptr1), // %3
"+r"(src_ptr1) // %4 "+r"(dst_width) // %4
: "r"(&kMult38_Div6), // %5 : "r"(&kMult38_Div6), // %5
"r"(&kShuf38_2), // %6 "r"(&kShuf38_2), // %6
"r"(&kMult38_Div9) // %7 "r"(&kMult38_Div9) // %7
...@@ -438,12 +439,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ...@@ -438,12 +439,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
// TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride;
asm volatile ( asm volatile (
MEMACCESS(4) MEMACCESS(4)
"ld1 {v30.8h}, [%4] \n" "ld1 {v30.8h}, [%4] \n"
MEMACCESS(5) MEMACCESS(5)
"ld1 {v31.16b}, [%5] \n" "ld1 {v31.16b}, [%5] \n"
"add %3, %3, %0 \n" "add %2, %2, %0 \n"
"1: \n" "1: \n"
// 00 40 01 41 02 42 03 43 // 00 40 01 41 02 42 03 43
...@@ -454,7 +457,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -454,7 +457,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3) MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
"subs %2, %2, #12 \n" "subs %3, %3, #12 \n"
// Shuffle the input data around to get align the data // Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
...@@ -528,12 +531,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ...@@ -528,12 +531,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
MEMACCESS(1) MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n" "st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(tmp_src_stride), // %2
"+r"(src_stride) // %3 "+r"(dst_width) // %3
: "r"(&kMult38_Div6), // %4 : "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5 "r"(&kShuf38_2) // %5
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
"v18", "v19", "v30", "v31", "memory", "cc" "v18", "v19", "v30", "v31", "memory", "cc"
); );
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment