Commit 5bf29b59 authored by fbarchard@google.com's avatar fbarchard@google.com

p2align all loops, copy stride to local for scale, and copy last byte in bilinear more efficiently

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/547007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@255 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f906ae13
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 254 Version: 255
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 254 #define LIBYUV_VERSION 255
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -163,6 +163,7 @@ static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { ...@@ -163,6 +163,7 @@ static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"movd %2,%%xmm0 \n" "movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n" "pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n" "movdqa %4,%%xmm6 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movdqu (%0),%%xmm1 \n" "movdqu (%0),%%xmm1 \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
...@@ -331,7 +332,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, ...@@ -331,7 +332,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
"pxor %%xmm0,%%xmm0 \n" "pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n" "pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n" "sub %0,%1 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm1 \n" "movdqa (%0),%%xmm1 \n"
"movdqa (%0,%1,1),%%xmm2 \n" "movdqa (%0,%1,1),%%xmm2 \n"
......
...@@ -74,6 +74,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -74,6 +74,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
mov ecx, [esp + 4 + 16] // pix mov ecx, [esp + 4 + 16] // pix
sub edi, eax sub edi, eax
align 16
convertloop: convertloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
pavgb xmm0, [eax + edx] pavgb xmm0, [eax + edx]
...@@ -92,6 +93,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -92,6 +93,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) { uint8* dst_uv, int pix) {
asm volatile ( asm volatile (
"sub %0,%1 \n" "sub %0,%1 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"pavgb (%0,%3),%%xmm0 \n" "pavgb (%0,%3),%%xmm0 \n"
...@@ -467,6 +469,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, ...@@ -467,6 +469,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8 psrlw xmm5, 8
align 16
convertloop: convertloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
movdqa xmm1, [eax + 16] movdqa xmm1, [eax + 16]
...@@ -506,6 +509,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, ...@@ -506,6 +509,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
asm volatile ( asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n" "movdqa 0x10(%0),%%xmm1 \n"
......
...@@ -291,6 +291,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y, ...@@ -291,6 +291,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
uint8* dst_frame, int width) { uint8* dst_frame, int width) {
asm volatile ( asm volatile (
"sub %1,%2 \n" "sub %1,%2 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movq (%1),%%xmm2 \n" "movq (%1),%%xmm2 \n"
"movq (%1,%2,1),%%xmm3 \n" "movq (%1,%2,1),%%xmm3 \n"
...@@ -326,6 +327,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y, ...@@ -326,6 +327,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
uint8* dst_frame, int width) { uint8* dst_frame, int width) {
asm volatile ( asm volatile (
"sub %1,%2 \n" "sub %1,%2 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movq (%1),%%xmm2 \n" "movq (%1),%%xmm2 \n"
"movq (%1,%2,1),%%xmm3 \n" "movq (%1,%2,1),%%xmm3 \n"
......
...@@ -57,6 +57,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, ...@@ -57,6 +57,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
asm volatile ( asm volatile (
"movd %3,%%xmm5 \n" "movd %3,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
......
...@@ -288,6 +288,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ...@@ -288,6 +288,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
asm volatile ( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
".p2align 4 \n"
"1: \n" "1: \n"
"movq (%0),%%xmm0 \n" "movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n" "movq (%0,%3),%%xmm1 \n"
...@@ -499,6 +500,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, ...@@ -499,6 +500,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
asm volatile ( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3),%%xmm1 \n" "movdqa (%0,%3),%%xmm1 \n"
...@@ -639,6 +641,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -639,6 +641,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
asm volatile ( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"movdqa (%0,%4),%%xmm1 \n" "movdqa (%0,%4),%%xmm1 \n"
......
...@@ -32,6 +32,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, ...@@ -32,6 +32,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"sub %4, #8 \n" "sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane // handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n" "1: \n"
"mov r9, %0 \n" "mov r9, %0 \n"
...@@ -198,6 +199,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, ...@@ -198,6 +199,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"sub %6, #8 \n" "sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane // handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n" "1: \n"
"mov r9, %0 \n" "mov r9, %0 \n"
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment