Commit 446f91d0 authored by fbarchard@google.com

Use vbroadcastf128 to copy an m128 to a ymm register, duplicating the value into the high and low 128 bits. This lets the AVX2 code share the 16-byte constants already used by the SSSE3 versions instead of keeping separate 256-bit copies.
BUG=none
TEST=avx2 unittests still pass.
R=mflodman@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/2324004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@803 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 0d19fc5e
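
For context, the patch replaces a two-instruction load-then-duplicate sequence with a single 128-bit broadcast. A minimal intrinsics sketch of the before/after (illustrative only, not the libyuv source; kCoeff is a hypothetical stand-in for a shared constant such as kARGBToY):

#include <immintrin.h>
#include <stdint.h>

/* Stand-in for a 16-byte constant shared with the SSSE3 code. */
static const int8_t kCoeff[16] = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

/* Before: 128-bit load, then vpermq 0x44 copies the low two qwords
 * into the high lane. */
static __m256i LoadDupOld(void) {
  __m128i lo = _mm_loadu_si128((const __m128i*)kCoeff);      /* vmovdqu */
  return _mm256_permute4x64_epi64(_mm256_castsi128_si256(lo),
                                  0x44);                     /* vpermq  */
}

/* After: one broadcast loads the 16 bytes and duplicates them into both
 * 128-bit lanes; the commit uses the float-domain form, vbroadcastf128. */
static __m256i LoadDupNew(void) {
  return _mm256_castps_si256(
      _mm256_broadcast_ps((const __m128*)kCoeff));           /* vbroadcastf128 */
}

Because the broadcast reads only 16 bytes from memory, the 32-byte *_AVX copies of each constant become unnecessary, which is what the deletions below are about.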
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 802
+Version: 803
 License: BSD
 License File: LICENSE
......
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 802
+#define LIBYUV_VERSION 803
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -5880,14 +5880,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                             uint8* dst_argb, const float* poly,
                             int width) {
   asm volatile (
-    "vmovdqu    "MEMACCESS(3)",%%xmm4           \n"
-    "vmovdqu    "MEMACCESS2(0x10,3)",%%xmm5     \n"
-    "vmovdqu    "MEMACCESS2(0x20,3)",%%xmm6     \n"
-    "vmovdqu    "MEMACCESS2(0x30,3)",%%xmm7     \n"
-    "vpermq     $0x44,%%ymm4,%%ymm4             \n"
-    "vpermq     $0x44,%%ymm5,%%ymm5             \n"
-    "vpermq     $0x44,%%ymm6,%%ymm6             \n"
-    "vpermq     $0x44,%%ymm7,%%ymm7             \n"
+    "vbroadcastf128 "MEMACCESS(3)",%%ymm4       \n"
+    "vbroadcastf128 "MEMACCESS2(0x10,3)",%%ymm5 \n"
+    "vbroadcastf128 "MEMACCESS2(0x20,3)",%%ymm6 \n"
+    "vbroadcastf128 "MEMACCESS2(0x30,3)",%%ymm7 \n"

     // 2 pixel loop.
     ".p2align  4                                \n"
......
@@ -30,16 +30,6 @@ static const vec8 kARGBToYJ = {
   15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
 };
-static const lvec8 kARGBToY_AVX = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0,
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
-static const lvec8 kARGBToYJ_AVX = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0,
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
 static const vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };
@@ -48,12 +38,6 @@ static const vec8 kARGBToUJ = {
   127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
 };
-// TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version.
-static const lvec8 kARGBToU_AVX = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0,
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
 static const vec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };
@@ -62,13 +46,8 @@ static const vec8 kARGBToVJ = {
   -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
 };
-static const lvec8 kARGBToV_AVX = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
-};
 // vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kShufARGBToY_AVX = {
+static const lvec32 kPermdARGBToY_AVX = {
   0, 4, 1, 5, 2, 6, 3, 7
 };
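
The rename to kPermdARGBToY_AVX matches the instruction that consumes it: it is a vpermd index vector, needed because vphaddw and vpackuswb operate within each 128-bit lane, leaving the packed dwords lane-interleaved. A hedged sketch of that fixup step (the function name is mine, not libyuv's):

#include <immintrin.h>

static const int kPermdARGBToY[8] = {0, 4, 1, 5, 2, 6, 3, 7};

/* Reorder the 8 dwords of a per-lane packed result back into linear
 * pixel order. */
static __m256i FixupAfterPack(__m256i packed) {
  __m256i idx = _mm256_loadu_si256((const __m256i*)kPermdARGBToY);
  return _mm256_permutevar8x32_epi32(packed, idx);           /* vpermd */
}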
@@ -124,16 +103,6 @@ static const uvec8 kAddY16 = {
 static const vec16 kAddYJ64 = {
   64, 64, 64, 64, 64, 64, 64, 64
 };
-static const lvec16 kAddYJ64_AVX = {
-  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-};
-static const ulvec8 kAddY16_AVX = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
 static const uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
@@ -144,13 +113,6 @@ static const uvec16 kAddUVJ128 = {
   0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
 };
-static const ulvec8 kAddUV128_AVX = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
 // Shuffle table for converting RGB24 to ARGB.
 static const uvec8 kShuffleMaskRGB24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
@@ -737,9 +699,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
-    vmovdqa    ymm6, kShufARGBToY_AVX
-    vmovdqa    ymm5, kAddY16_AVX
-    vmovdqa    ymm4, kARGBToY_AVX
+    vbroadcastf128 ymm4, kARGBToY
+    vbroadcastf128 ymm5, kAddY16
+    vmovdqa    ymm6, kPermdARGBToY_AVX

     align      16
 convertloop:
@@ -777,9 +739,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
-    vmovdqa    ymm4, kARGBToYJ_AVX
-    vmovdqa    ymm5, kAddYJ64_AVX
-    vmovdqa    ymm6, kShufARGBToY_AVX
+    vbroadcastf128 ymm4, kARGBToYJ
+    vbroadcastf128 ymm5, kAddYJ64
+    vmovdqa    ymm6, kPermdARGBToY_AVX

     align      16
 convertloop:
@@ -1229,9 +1191,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // pix
-    vmovdqa    ymm7, kARGBToU_AVX
-    vmovdqa    ymm6, kARGBToV_AVX
-    vmovdqa    ymm5, kAddUV128_AVX
+    vbroadcastf128 ymm5, kAddUV128
+    vbroadcastf128 ymm6, kARGBToV
+    vbroadcastf128 ymm7, kARGBToU
     sub        edi, edx             // stride from u to v

     align      16
@@ -6640,8 +6602,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     mov        eax, [esp + 4]    // src_argb
     mov        edx, [esp + 8]    // dst_bayer
     mov        ecx, [esp + 12]   // shuffler
-    vmovdqa    xmm5, [ecx]
-    vpermq     ymm5, ymm5, 0x44  // same shuffle in high as low.
+    vbroadcastf128 ymm5, [ecx]   // same shuffle in high as low.
    mov        ecx, [esp + 16]   // pix

    align      16
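
The shuffle row can use a plain broadcast because 256-bit vpshufb shuffles each 128-bit lane independently, so the caller's 16-byte shuffler only has to be present in both lanes. A hedged sketch of the idea (names are illustrative; the commit itself uses the f128 form of the broadcast):

#include <immintrin.h>
#include <stdint.h>

/* Broadcast a 16-byte shuffle mask and apply it per lane. */
static __m256i ShuffleTwoLanes(__m256i pixels, const uint8_t shuffler[16]) {
  __m256i mask = _mm256_broadcastsi128_si256(              /* vbroadcasti128 */
      _mm_loadu_si128((const __m128i*)shuffler));
  return _mm256_shuffle_epi8(pixels, mask);                /* vpshufb, per lane */
}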
@@ -6825,18 +6786,13 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                             uint8* dst_argb, const float* poly,
                             int width) {
   __asm {
-    mov        eax, [esp + 12]   /* poly */
-    vmovdqu    xmm4, [eax]       // C0
-    vmovdqu    xmm5, [eax + 16]  // C1
-    vmovdqu    xmm6, [eax + 32]  // C2
-    vmovdqu    xmm7, [eax + 48]  // C3
-    vpermq     ymm4, ymm4, 0x44  // dup low qwords to high qwords
-    vpermq     ymm5, ymm5, 0x44
-    vpermq     ymm6, ymm6, 0x44
-    vpermq     ymm7, ymm7, 0x44
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]  /* poly */
+    vbroadcastf128 ymm4, [ecx]       // C0
+    vbroadcastf128 ymm5, [ecx + 16]  // C1
+    vbroadcastf128 ymm6, [ecx + 32]  // C2
+    vbroadcastf128 ymm7, [ecx + 48]  // C3
     mov        ecx, [esp + 16]  /* width */

     // 2 pixel loop.
......
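
For reference, the broadcast C0..C3 vectors feed a per-channel cubic, value = C0 + C1*v + C2*v^2 + C3*v^3, clamped to 0..255. A scalar sketch under the assumption that poly holds four floats per coefficient, one per channel (the AVX2 loop evaluates this in float for 2 pixels at a time):

#include <stdint.h>

/* Hypothetical scalar model of one channel of the polynomial row. */
static uint8_t PolynomialChannel(uint8_t v, const float* poly, int ch) {
  float f = (float)v;
  float f2 = f * f;
  float r = poly[ch] + poly[ch + 4] * f + poly[ch + 8] * f2 +
            poly[ch + 12] * f2 * f;
  if (r < 0.f) r = 0.f;        /* clamp to the uint8 range */
  if (r > 255.f) r = 255.f;
  return (uint8_t)r;
}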