Commit aec76f2e authored by fbarchard@google.com's avatar fbarchard@google.com

add stride to pointer in C and pass as register to inline.

BUG=357
TESTED=clang on ios
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/29489004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1086 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f7d9b9fb
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1084
Version: 1086
License: BSD
License File: LICENSE
......
......@@ -58,6 +58,13 @@ extern "C" {
#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
#define LIBYUV_DISABLE_NEON
#endif
// clang >= 3.5.0 required for Arm64.
#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
#define LIBYUV_DISABLE_NEON
#endif // clang >= 3.5
#endif // __clang__
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1084
#define LIBYUV_VERSION 1086
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -25,7 +25,7 @@
'conditions': [
['(target_arch == "armv7" or target_arch == "armv7s" or \
(target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
and target_subarch != 64 and (arm_neon == 1 or arm_neon_optional == 1)',
and (arm_neon == 1 or arm_neon_optional == 1)',
{
'build_neon': 1,
}],
......@@ -47,11 +47,6 @@
'-mfpu=vfpv3-d16',
],
'conditions': [
['target_arch != "arm64"', {
'cflags': [
'-mfpu=neon',
],
}],
# Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
['use_lto == 1', {
'cflags!': [
......@@ -60,6 +55,9 @@
],
}],
],
'cflags': [
'-mfpu=neon',
],
'include_dirs': [
'include',
'.',
......@@ -93,11 +91,6 @@
# Allows libyuv.a redistributable library without external dependencies.
'standalone_static_library': 1,
'conditions': [
['OS == "ios" and target_subarch == 64', {
'defines': [
'LIBYUV_DISABLE_NEON'
],
}],
['OS != "ios" and libyuv_disable_jpeg != 1', {
'defines': [
'HAVE_JPEG'
......@@ -126,15 +119,6 @@
'dependencies': [
'libyuv_neon',
],
'conditions': [
# TODO LIBYUV_NEON is temporary disabled. When all arm64 port has
# been done, enable it.
['target_arch !="arm64"', {
'defines': [
'LIBYUV_NEON',
]
}],
],
}],
# MemorySanitizer does not support assembly code yet.
# http://crbug.com/344505
......@@ -151,6 +135,7 @@
# 'LIBYUV_DISABLE_MIPS',
# Enable the following macro to build libyuv as a shared library (dll).
# 'LIBYUV_USING_SHARED_LIBRARY',
# TODO(fbarchard): Make these into gyp defines.
],
'include_dirs': [
'include',
......
......@@ -15,7 +15,8 @@
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && \
defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
(defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h> // For _xgetbv()
#endif
......
This diff is collapsed.
......@@ -21,7 +21,8 @@ extern "C" {
#endif
// This module is for Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
(defined(_M_IX86) || defined(_M_X64))
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
......
......@@ -105,12 +105,12 @@ asm volatile (
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
MEMACCESS(3)
"ld1 {v1.16b}, [%3], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n"
MEMACCESS(4)
"ld1 {v2.16b}, [%4], #16 \n"
"ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5)
"ld1 {v3.16b}, [%5], #16 \n"
"subs %2, %2, #4 \n"
"ld1 {v3.16b}, [%4], #16 \n"
"subs %5, %5, #4 \n"
"uaddlp v0.8h, v0.16b \n"
"uadalp v0.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n"
......@@ -122,10 +122,10 @@ asm volatile (
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_ptr1), // %3
"+r"(src_ptr2), // %4
"+r"(src_ptr3) // %5
"+r"(src_ptr1), // %2
"+r"(src_ptr2), // %3
"+r"(src_ptr3), // %4
"+r"(dst_width) // %5
:
: "v0", "v1", "v2", "v3", "memory", "cc"
);
......@@ -144,7 +144,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #24 \n"
"mov v2.8b, v3.8b \n" // order v0, v1, v2
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
......@@ -309,6 +309,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2;
ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(5)
......@@ -317,7 +318,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"ld1 {v30.16b}, [%6] \n"
MEMACCESS(7)
"ld1 {v31.8h}, [%7] \n"
"add %3, %3, %0 \n"
"add %2, %2, %0 \n"
"1: \n"
// 00 40 01 41 02 42 03 43
......@@ -327,10 +328,10 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%4], #32 \n"
"subs %2, %2, #12 \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"subs %4, %4, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
......@@ -420,9 +421,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride), // %3
"+r"(src_ptr1) // %4
"+r"(tmp_src_stride), // %2
"+r"(src_ptr1), // %3
"+r"(dst_width) // %4
: "r"(&kMult38_Div6), // %5
"r"(&kShuf38_2), // %6
"r"(&kMult38_Div9) // %7
......@@ -438,12 +439,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
// TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(4)
"ld1 {v30.8h}, [%4] \n"
MEMACCESS(5)
"ld1 {v31.16b}, [%5] \n"
"add %3, %3, %0 \n"
"add %2, %2, %0 \n"
"1: \n"
// 00 40 01 41 02 42 03 43
......@@ -454,7 +457,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
"subs %2, %2, #12 \n"
"subs %3, %3, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
......@@ -528,12 +531,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
"+r"(dst_width) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
"v18", "v19", "v30", "v31", "memory", "cc"
);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment