Commit caf6e247 authored by fbarchard@google.com

remove vpermq from ARGBToUV

BUG=none
TEST=convert_test
Review URL: https://webrtc-codereview.appspot.com/1107005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@575 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent d8b73cac
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 574 Version: 575
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 574 #define LIBYUV_VERSION 575
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -49,11 +49,17 @@ static const lvec8 kARGBToV_AVX = { ...@@ -49,11 +49,17 @@ static const lvec8 kARGBToV_AVX = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
}; };
// Unshuffle for vphaddw + vpackuswb vpermd. // vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kShufARGBToY_AVX = { static const lvec32 kShufARGBToY_AVX = {
0, 4, 1, 5, 2, 6, 3, 7 0, 4, 1, 5, 2, 6, 3, 7
}; };
// vpshufb control bytes for vphaddw + vpackuswb results packed to shorts.
// The table has 32 entries: the same 16 byte indices are listed twice, once
// for each 16-byte half. Within a half, the byte pairs are selected in the
// order (0,1) (8,9) (2,3) (10,11) (4,5) (12,13) (6,7) (14,15), i.e. the
// 16-bit words 0-3 are interleaved with words 4-7: 0, 4, 1, 5, 2, 6, 3, 7.
// Used as the vpshufb operand in ARGBToUVRow_AVX2.
static const lvec8 kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
};
// Constants for BGRA. // Constants for BGRA.
static const vec8 kBGRAToY = { static const vec8 kBGRAToY = {
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
...@@ -785,11 +791,11 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -785,11 +791,11 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4 vpmaddubsw ymm3, ymm3, ymm4
lea eax, [eax + 128] lea eax, [eax + 128]
vphaddw ymm0, ymm0, ymm1 vphaddw ymm0, ymm0, ymm1 // mutates.
vphaddw ymm2, ymm2, ymm3 vphaddw ymm2, ymm2, ymm3
vpsrlw ymm0, ymm0, 7 vpsrlw ymm0, ymm0, 7
vpsrlw ymm2, ymm2, 7 vpsrlw ymm2, ymm2, 7
vpackuswb ymm0, ymm0, ymm2 vpackuswb ymm0, ymm0, ymm2 // mutates.
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
vpaddb ymm0, ymm0, ymm5 vpaddb ymm0, ymm0, ymm5
sub ecx, 32 sub ecx, 32
...@@ -1125,40 +1131,37 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, ...@@ -1125,40 +1131,37 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
align 16 align 16
convertloop: convertloop:
/* step 1 - subsample 32x2 argb pixels to 16x1 */ /* step 1 - subsample 32x2 argb pixels to 16x1 */
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + 64] vmovdqu ymm2, [eax + 64]
vmovdqu ymm3, [eax + 96] vmovdqu ymm3, [eax + 96]
vpavgb ymm0, ymm0, [eax + esi] vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32] vpavgb ymm1, ymm1, [eax + esi + 32]
vpavgb ymm2, ymm2, [eax + esi + 64] vpavgb ymm2, ymm2, [eax + esi + 64]
vpavgb ymm3, ymm3, [eax + esi + 96] vpavgb ymm3, ymm3, [eax + esi + 96]
lea eax, [eax + 128] lea eax, [eax + 128]
vshufps ymm4, ymm0, ymm1, 0x88 vshufps ymm4, ymm0, ymm1, 0x88
vshufps ymm0, ymm0, ymm1, 0xdd vshufps ymm0, ymm0, ymm1, 0xdd
vpavgb ymm0, ymm0, ymm4 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove. vshufps ymm4, ymm2, ymm3, 0x88
vshufps ymm4, ymm2, ymm3, 0x88 vshufps ymm2, ymm2, ymm3, 0xdd
vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
vpavgb ymm2, ymm2, ymm4
vpermq ymm2, ymm2, 0xd8 // TODO(fbarchard): Remove.
// step 2 - convert to U and V // step 2 - convert to U and V
// from here down is very similar to Y code except // from here down is very similar to Y code except
// instead of 32 different pixels, it's 16 pixels of U and 16 of V // instead of 32 different pixels, it's 16 pixels of U and 16 of V
vpmaddubsw ymm1, ymm0, ymm7 // U vpmaddubsw ymm1, ymm0, ymm7 // U
vpmaddubsw ymm3, ymm2, ymm7 vpmaddubsw ymm3, ymm2, ymm7
vpmaddubsw ymm0, ymm0, ymm6 // V vpmaddubsw ymm0, ymm0, ymm6 // V
vpmaddubsw ymm2, ymm2, ymm6 vpmaddubsw ymm2, ymm2, ymm6
vphaddw ymm1, ymm1, ymm3 vphaddw ymm1, ymm1, ymm3 // mutates
vpermq ymm1, ymm1, 0xd8 // TODO(fbarchard): Remove. vphaddw ymm0, ymm0, ymm2
vphaddw ymm0, ymm0, ymm2 vpsraw ymm1, ymm1, 8
vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove. vpsraw ymm0, ymm0, 8
vpsraw ymm1, ymm1, 8 vpacksswb ymm0, ymm1, ymm0 // mutates
vpsraw ymm0, ymm0, 8 vpermq ymm0, ymm0, 0xd8 // For vpacksswb
vpacksswb ymm0, ymm1, ymm0 vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw
vpermq ymm0, ymm0, 0xd8 vpaddb ymm0, ymm0, ymm5 // -> unsigned
vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values // step 3 - store 16 U and 16 V values
sub ecx, 32 sub ecx, 32
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment