Commit 5bf29b59 authored by fbarchard@google.com

p2align all loops, copy stride to local for scale, and copy last byte in bilinear more efficiently

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/547007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@255 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f906ae13
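Background for the diffs below: in GCC/Clang inline assembly, ".p2align 4" pads the instruction stream so that the following "1:" loop label starts on a 16-byte (2^4) boundary, and "align 16" does the same inside Visual C++ __asm blocks; an aligned loop entry generally decodes and branches more predictably. The following sketch is not part of this commit. It is a minimal, hypothetical row-copy routine (CopyRow16_Sketch is an invented name) that only shows where the directive sits relative to the loop label, assuming an x86/x86_64 target, GCC or Clang, and a positive byte count that is a multiple of 16.

#include <stdint.h>

// Hypothetical example, not libyuv code: copy "count" bytes, 16 at a time.
// ".p2align 4" emits padding so the "1:" loop entry lands on a 16-byte
// boundary; movdqu is used so src and dst need not themselves be aligned.
static void CopyRow16_Sketch(const uint8_t* src, uint8_t* dst, int count) {
  asm volatile (
    ".p2align 4                   \n"
    "1:                           \n"
    "movdqu (%0),%%xmm0           \n"
    "lea 0x10(%0),%0              \n"
    "movdqu %%xmm0,(%1)           \n"
    "lea 0x10(%1),%1              \n"
    "sub $0x10,%2                 \n"
    "jg 1b                        \n"
    : "+r"(src), "+r"(dst), "+r"(count)
    :
    : "memory", "cc", "xmm0"
  );
}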
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 254
+Version: 255
License: BSD
License File: LICENSE
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 254
+#define LIBYUV_VERSION 255
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -163,6 +163,7 @@ static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"
+".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm1 \n"
"lea 0x10(%0),%0 \n"
@@ -331,7 +332,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n"
+".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm1 \n"
"movdqa (%0,%1,1),%%xmm2 \n"
@@ -74,6 +74,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
mov ecx, [esp + 4 + 16] // pix
sub edi, eax
+align 16
convertloop:
movdqa xmm0, [eax]
pavgb xmm0, [eax + edx]
@@ -92,6 +93,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
asm volatile (
"sub %0,%1 \n"
+".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pavgb (%0,%3),%%xmm0 \n"
@@ -467,6 +469,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
+align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -506,6 +509,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
+".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@@ -291,6 +291,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
uint8* dst_frame, int width) {
asm volatile (
"sub %1,%2 \n"
+".p2align 4 \n"
"1: \n"
"movq (%1),%%xmm2 \n"
"movq (%1,%2,1),%%xmm3 \n"
@@ -326,6 +327,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
uint8* dst_frame, int width) {
asm volatile (
"sub %1,%2 \n"
+".p2align 4 \n"
"1: \n"
"movq (%1),%%xmm2 \n"
"movq (%1,%2,1),%%xmm3 \n"
@@ -57,6 +57,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
asm volatile (
"movd %3,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
+".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
@@ -288,6 +288,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
+".p2align 4 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
@@ -499,6 +500,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
+".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3),%%xmm1 \n"
@@ -639,6 +641,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
+".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%4),%%xmm1 \n"
@@ -32,6 +32,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
+".p2align 4 \n"
"1: \n"
"mov r9, %0 \n"
@@ -198,6 +199,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
+".p2align 4 \n"
"1: \n"
"mov r9, %0 \n"