Commit 1702ec78 authored by fbarchard@google.com

use movdqu on 2nd source for blend

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/479001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@235 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent d2f4413d
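Background for the diff below: movdqa requires its 16-byte memory operand to be aligned and faults otherwise, while movdqu accepts any address, which is why the loads from the second source switch instructions. The snippet that follows is an illustrative intrinsics sketch of that difference only; it is not part of the commit, and the helper names are hypothetical.

#include <emmintrin.h>  // SSE2 intrinsics
#include <stdint.h>

// Illustration only; not from the commit.
// _mm_load_si128 maps to movdqa and faults if p is not 16-byte aligned.
// _mm_loadu_si128 maps to movdqu and works for any alignment.
static inline __m128i load4_aligned(const uint8_t* p) {
  return _mm_load_si128((const __m128i*)p);    // movdqa
}
static inline __m128i load4_any(const uint8_t* p) {
  return _mm_loadu_si128((const __m128i*)p);   // movdqu
}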
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 234
+Version: 235
 License: BSD
 License File: LICENSE
...
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 234
+#define LIBYUV_VERSION 235
 #endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -2048,14 +2048,14 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   "movdqu (%0),%%xmm3 \n"  // first 4 pixels
   "movdqa %%xmm3,%%xmm0 \n"
   "pxor %%xmm4,%%xmm3 \n"
-  "movdqa (%1),%%xmm2 \n"
+  "movdqu (%1),%%xmm2 \n"
   "psrlw $0x8,%%xmm3 \n"
   "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
   "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
   "pand %%xmm6,%%xmm2 \n"
   "paddw %%xmm7,%%xmm3 \n"
   "pmullw %%xmm3,%%xmm2 \n"
-  "movdqa (%1),%%xmm1 \n"
+  "movdqu (%1),%%xmm1 \n"
   "psrlw $0x8,%%xmm1 \n"
   "por %%xmm4,%%xmm0 \n"
   "pmullw %%xmm3,%%xmm1 \n"
@@ -2070,14 +2070,14 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   "jle 9f \n"
   "movdqa %%xmm3,%%xmm0 \n"  // next 4 pixels
   "pxor %%xmm4,%%xmm3 \n"
-  "movdqa 0x10(%1),%%xmm2 \n"
+  "movdqu 0x10(%1),%%xmm2 \n"
   "psrlw $0x8,%%xmm3 \n"
   "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
   "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
   "pand %%xmm6,%%xmm2 \n"
   "paddw %%xmm7,%%xmm3 \n"
   "pmullw %%xmm3,%%xmm2 \n"
-  "movdqa 0x10(%1),%%xmm1 \n"
+  "movdqu 0x10(%1),%%xmm1 \n"
   "lea 0x20(%1),%1 \n"
   "psrlw $0x8,%%xmm1 \n"
   "por %%xmm4,%%xmm0 \n"
...
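For reference, the loop above (and its Windows counterpart below) performs a source-over blend in which the source is treated as already premultiplied by its alpha and the output alpha is forced to 255. A minimal one-pixel scalar sketch of that math, inferred from the instruction comments and not part of the commit:

#include <stdint.h>

// Sketch of what the SIMD loop appears to compute; not from the commit.
// Bytes are in B, G, R, A order; f = 256 - src alpha, as built by the
// pxor/psrlw/paddw sequence above.
static void blend_pixel(const uint8_t src[4], const uint8_t dst[4],
                        uint8_t out[4]) {
  unsigned f = 256 - src[3];               // 256 - alpha
  for (int i = 0; i < 3; ++i) {            // B, G, R channels
    unsigned v = src[i] + ((dst[i] * f) >> 8);
    out[i] = v > 255 ? 255 : (uint8_t)v;   // paddusb saturates
  }
  out[3] = 255;                            // "por xmm0, xmm4" sets alpha to 255
}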
@@ -2075,11 +2075,13 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
 #endif  // HAS_YUY2TOYROW_SSE2
 #ifdef HAS_ARGBBLENDROW_SSE2
-// Blend 8 pixels at a time
-// Destination aligned to 16 bytes, multiple of 4 pixels
+// Blend 8 pixels at a time.
+// src_argb0 unaligned.
+// src_argb1 and dst_argb aligned to 16 bytes.
+// width must be multiple of 4 pixels.
 __declspec(naked) __declspec(align(16))
 void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
   __asm {
     push esi
     mov eax, [esp + 4 + 4]  // src_argb0
@@ -2100,14 +2102,14 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     movdqu xmm3, [eax]
     movdqa xmm0, xmm3  // src argb
     pxor xmm3, xmm4  // ~alpha
-    movdqa xmm2, [esi]  // _r_b
+    movdqu xmm2, [esi]  // _r_b
     psrlw xmm3, 8  // alpha
     pshufhw xmm3, xmm3,0F5h  // 8 alpha words
     pshuflw xmm3, xmm3,0F5h
     pand xmm2, xmm6  // _r_b
     paddw xmm3, xmm7  // 256 - alpha
     pmullw xmm2, xmm3  // _r_b * alpha
-    movdqa xmm1, [esi]  // _a_g
+    movdqu xmm1, [esi]  // _a_g
     psrlw xmm1, 8  // _a_g
     por xmm0, xmm4  // set alpha to 255
     pmullw xmm1, xmm3  // _a_g * alpha
@@ -2123,14 +2125,14 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     movdqa xmm0, xmm3  // src argb
     pxor xmm3, xmm4  // ~alpha
-    movdqa xmm2, [esi + 16]  // _r_b
+    movdqu xmm2, [esi + 16]  // _r_b
     psrlw xmm3, 8  // alpha
     pshufhw xmm3, xmm3,0F5h  // 8 alpha words
     pshuflw xmm3, xmm3,0F5h
     pand xmm2, xmm6  // _r_b
     paddw xmm3, xmm7  // 256 - alpha
     pmullw xmm2, xmm3  // _r_b * alpha
-    movdqa xmm1, [esi + 16]  // _a_g
+    movdqu xmm1, [esi + 16]  // _a_g
     lea esi, [esi + 32]
     psrlw xmm1, 8  // _a_g
     por xmm0, xmm4  // set alpha to 255
@@ -2150,7 +2152,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   }
 }
-// Blend 1 pixel at a time, unaligned
+// Blend 1 pixel at a time, unaligned.
 __declspec(naked) __declspec(align(16))
 void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width) {
@@ -2247,7 +2249,7 @@ static const uvec8 kShuffleAlpha = {
 // with..
 //   pshufb xmm3, kShuffleAlpha  // alpha
-// Destination aligned to 16 bytes, multiple of 4 pixels
+// Destination aligned to 16 bytes, multiple of 4 pixels.
 __declspec(naked) __declspec(align(16))
 void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                                 uint8* dst_argb, int width) {
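The comment above notes that the SSSE3 variant replaces the SSE2 psrlw/pshufhw/pshuflw alpha-broadcast with a single pshufb against kShuffleAlpha, gathering each pixel's alpha byte into a 16-bit lane in one instruction. A hedged intrinsics sketch of that trick follows; the mask bytes shown are the usual pattern for this shuffle and are not copied from the commit.

#include <tmmintrin.h>  // SSSE3 intrinsics
#include <stdint.h>

// Illustrative mask only; the real kShuffleAlpha constant lives in the source.
// 0x80 in a pshufb control byte writes zero; indices 3/7/11/15 pick each
// pixel's alpha byte (ARGB stored as B,G,R,A), giving 8 alpha words.
static const uint8_t kAlphaShuffle[16] = {
  3, 0x80, 3, 0x80, 7, 0x80, 7, 0x80, 11, 0x80, 11, 0x80, 15, 0x80, 15, 0x80
};

static inline __m128i alpha_words(__m128i four_argb_pixels) {
  __m128i mask = _mm_loadu_si128((const __m128i*)kAlphaShuffle);
  return _mm_shuffle_epi8(four_argb_pixels, mask);  // pshufb
}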
@@ -2272,11 +2274,11 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     movdqa xmm0, xmm3  // src argb
     pxor xmm3, xmm4  // ~alpha
     pshufb xmm3, kShuffleAlpha  // alpha
-    movdqa xmm2, [esi]  // _r_b
+    movdqu xmm2, [esi]  // _r_b
     pand xmm2, xmm6  // _r_b
     paddw xmm3, xmm7  // 256 - alpha
     pmullw xmm2, xmm3  // _r_b * alpha
-    movdqa xmm1, [esi]  // _a_g
+    movdqu xmm1, [esi]  // _a_g
     psrlw xmm1, 8  // _a_g
     por xmm0, xmm4  // set alpha to 255
     pmullw xmm1, xmm3  // _a_g * alpha
@@ -2292,12 +2294,12 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     movdqa xmm0, xmm3  // src argb
     pxor xmm3, xmm4  // ~alpha
-    movdqa xmm2, [esi + 16]  // _r_b
+    movdqu xmm2, [esi + 16]  // _r_b
     pshufb xmm3, kShuffleAlpha  // alpha
     pand xmm2, xmm6  // _r_b
     paddw xmm3, xmm7  // 256 - alpha
     pmullw xmm2, xmm3  // _r_b * alpha
-    movdqa xmm1, [esi + 16]  // _a_g
+    movdqu xmm1, [esi + 16]  // _a_g
     lea esi, [esi + 32]
     psrlw xmm1, 8  // _a_g
     por xmm0, xmm4  // set alpha to 255
@@ -2331,7 +2333,7 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     dst_argb += count * 4;
     width -= count;
   }
-  // Do multiple of 4 pixels
+  // Do multiple of 4 pixels.
   if (width & ~3) {
     ARGBBlendRow_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3);
   }
...
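The _Any_ wrapper in the last hunk shows the usual dispatch shape for these row functions: a few leading pixels are handled one at a time until the destination is aligned, then the aligned SIMD function takes the largest multiple of 4. A rough sketch of that structure, with hypothetical blend1()/blend4() helpers standing in for the real row functions and the leading/trailing handling simplified:

#include <stdint.h>

// blend1 handles 'width' pixels one at a time with no alignment requirement;
// blend4 requires a 16-byte aligned dst and width that is a multiple of 4.
// Both names are assumptions for this sketch, not libyuv's.
void blend_any(const uint8_t* src0, const uint8_t* src1,
               uint8_t* dst, int width,
               void (*blend1)(const uint8_t*, const uint8_t*, uint8_t*, int),
               void (*blend4)(const uint8_t*, const uint8_t*, uint8_t*, int)) {
  // Leading pixels: count how many single pixels align dst to 16 bytes.
  int count = 0;
  while (count < width && (((uintptr_t)(dst + count * 4) & 15) != 0)) {
    ++count;
  }
  if (count > 0) {
    blend1(src0, src1, dst, count);
    src0 += count * 4; src1 += count * 4; dst += count * 4; width -= count;
  }
  // Bulk: largest multiple of 4 pixels, as in "width & ~3" above.
  if (width & ~3) {
    int done = width & ~3;
    blend4(src0, src1, dst, done);
    src0 += done * 4; src1 += done * 4; dst += done * 4; width &= 3;
  }
  // Trailing 1-3 pixels.
  if (width > 0) {
    blend1(src0, src1, dst, width);
  }
}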