Commit 4ec55a21 authored by fbarchard@google.com

Use macros to simplify I422ToARGB for AVX code.

BUG=269
TESTED=local build with Visual C
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/24079004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1133 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent a063a66d
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1132
+Version: 1133
License: BSD
License File: LICENSE
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1132
+#define LIBYUV_VERSION 1133
#endif  // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -977,16 +977,16 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
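
For reference, the "step 1" loads above pair each row with the row below (eax + esi) and combine them with pavgb. A minimal scalar sketch of that byte-wise rounding average follows; the function name is illustrative, not from libyuv:

#include <stdint.h>

// Average 64 bytes (16 ARGB pixels) of two adjacent rows the way pavgb
// does per byte: (a + b + 1) >> 1, i.e. average rounded up.
static void AverageRows16ARGB(const uint8_t* row0, const uint8_t* row1,
                              uint8_t* out /* 64 bytes */) {
  for (int i = 0; i < 64; ++i) {
    out[i] = (uint8_t)((row0[i] + row1[i] + 1) >> 1);
  }
}
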
@@ -1048,16 +1048,16 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
@@ -1304,16 +1304,16 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
@@ -1375,16 +1375,16 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
@@ -1446,16 +1446,16 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
@@ -1529,6 +1529,43 @@ static const lvec16 kUVBiasR_AVX = {
  BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
};
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 __asm {                                                \
+    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
+    __asm lea        esi,  [esi + 8]                                            \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                      \
+    __asm vpermq     ymm0, ymm0, 0xd8                                           \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */         \
+  }
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2 __asm {                                                   \
+    /* Step 1: Find 8 UV contributions to 16 R,G,B values */                    \
+    __asm vpmaddubsw ymm2, ymm0, kUVToR_AVX       /* scale R UV */              \
+    __asm vpmaddubsw ymm1, ymm0, kUVToG_AVX       /* scale G UV */              \
+    __asm vpmaddubsw ymm0, ymm0, kUVToB_AVX       /* scale B UV */              \
+    __asm vpsubw     ymm2, ymm2, kUVBiasR_AVX     /* unbias back to signed */   \
+    __asm vpsubw     ymm1, ymm1, kUVBiasG_AVX                                   \
+    __asm vpsubw     ymm0, ymm0, kUVBiasB_AVX                                   \
+    /* Step 2: Find Y contribution to 16 R,G,B values */                        \
+    __asm vmovdqu    xmm3, [eax]                  /* NOLINT */                  \
+    __asm lea        eax, [eax + 16]                                            \
+    __asm vpermq     ymm3, ymm3, 0xd8                                           \
+    __asm vpunpcklbw ymm3, ymm3, ymm4                                           \
+    __asm vpsubsw    ymm3, ymm3, kYSub16_AVX                                    \
+    __asm vpmullw    ymm3, ymm3, kYToRgb_AVX                                    \
+    __asm vpaddsw    ymm0, ymm0, ymm3             /* B += Y */                  \
+    __asm vpaddsw    ymm1, ymm1, ymm3             /* G += Y */                  \
+    __asm vpaddsw    ymm2, ymm2, ymm3             /* R += Y */                  \
+    __asm vpsraw     ymm0, ymm0, 6                                              \
+    __asm vpsraw     ymm1, ymm1, 6                                              \
+    __asm vpsraw     ymm2, ymm2, 6                                              \
+    __asm vpackuswb  ymm0, ymm0, ymm0             /* B */                       \
+    __asm vpackuswb  ymm1, ymm1, ymm1             /* G */                       \
+    __asm vpackuswb  ymm2, ymm2, ymm2             /* R */                       \
+  }
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16))
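
As a rough guide to what READYUV422_AVX2 and YUVTORGB_AVX2 compute per pixel: each U,V pair is shared by two Y samples (the vpunpcklwd duplication), and the conversion is 6-bit fixed-point BT.601 with the 128 bias folded into the kUVBias*_AVX constants. A hedged scalar sketch follows; the coefficients below are the usual studio-range approximations and are assumptions for illustration, not read out of the kUVTo*/kYToRgb tables:

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// One pixel of YUV -> RGB in 6-bit fixed point.
// Assumed coefficients: 1.164*64 ~= 74, 2.018*64 ~= 129, 0.391*64 ~= 25,
// 0.813*64 ~= 52, 1.596*64 ~= 102.
static void YuvPixel(uint8_t y, uint8_t u, uint8_t v,
                     uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (y - 16) * 74;                                       // Y contribution
  *b = Clamp255((y1 + (u - 128) * 129) >> 6);                   // B = Y + U term
  *g = Clamp255((y1 - (u - 128) * 25 - (v - 128) * 52) >> 6);   // G = Y - U - V terms
  *r = Clamp255((y1 + (v - 128) * 102) >> 6);                   // R = Y + V term
}
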
@@ -1551,35 +1588,8 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
    align      4
 convertloop:
-    vmovq      xmm0, qword ptr [esi]          //  U
-    vmovq      xmm1, qword ptr [esi + edi]    //  V
-    lea        esi,  [esi + 8]
-    vpunpcklbw ymm0, ymm0, ymm1               // UV
-    vpermq     ymm0, ymm0, 0xd8
-    vpunpcklwd ymm0, ymm0, ymm0               // UVUV
-    vpmaddubsw ymm2, ymm0, kUVToR_AVX         // scale R UV
-    vpmaddubsw ymm1, ymm0, kUVToG_AVX         // scale G UV
-    vpmaddubsw ymm0, ymm0, kUVToB_AVX         // scale B UV
-    vpsubw     ymm2, ymm2, kUVBiasR_AVX       // unbias back to signed
-    vpsubw     ymm1, ymm1, kUVBiasG_AVX
-    vpsubw     ymm0, ymm0, kUVBiasB_AVX
-    // Step 2: Find Y contribution to 16 R,G,B values
-    vmovdqu    xmm3, [eax]                    // NOLINT
-    lea        eax, [eax + 16]
-    vpermq     ymm3, ymm3, 0xd8
-    vpunpcklbw ymm3, ymm3, ymm4
-    vpsubsw    ymm3, ymm3, kYSub16_AVX
-    vpmullw    ymm3, ymm3, kYToRgb_AVX
-    vpaddsw    ymm0, ymm0, ymm3               // B += Y
-    vpaddsw    ymm1, ymm1, ymm3               // G += Y
-    vpaddsw    ymm2, ymm2, ymm3               // R += Y
-    vpsraw     ymm0, ymm0, 6
-    vpsraw     ymm1, ymm1, 6
-    vpsraw     ymm2, ymm2, 6
-    vpackuswb  ymm0, ymm0, ymm0               // B
-    vpackuswb  ymm1, ymm1, ymm1               // G
-    vpackuswb  ymm2, ymm2, ymm2               // R
+    READYUV422_AVX2
+    YUVTORGB_AVX2

    // Step 3: Weave into ARGB
    vpunpcklbw ymm0, ymm0, ymm1               // BG
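
The "Step 3: Weave" that follows the macros interleaves the packed B, G, R results with an opaque alpha. In scalar terms it amounts to the sketch below; libyuv's ARGB is little-endian, i.e. bytes B,G,R,A in memory, and the function name is illustrative only:

#include <stdint.h>

// Interleave planar B, G, R bytes into packed ARGB (memory order B,G,R,A).
static void WeaveARGB(const uint8_t* b, const uint8_t* g, const uint8_t* r,
                      uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[4 * x + 0] = b[x];
    dst_argb[4 * x + 1] = g[x];
    dst_argb[4 * x + 2] = r[x];
    dst_argb[4 * x + 3] = 0xff;  // opaque alpha
  }
}
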
@@ -1624,35 +1634,8 @@ void I422ToBGRARow_AVX2(const uint8* y_buf,
    align      4
 convertloop:
-    vmovq      xmm0, qword ptr [esi]          //  U
-    vmovq      xmm1, qword ptr [esi + edi]    //  V
-    lea        esi,  [esi + 8]
-    vpunpcklbw ymm0, ymm0, ymm1               // UV
-    vpermq     ymm0, ymm0, 0xd8
-    vpunpcklwd ymm0, ymm0, ymm0               // UVUV
-    vpmaddubsw ymm2, ymm0, kUVToR_AVX         // scale R UV
-    vpmaddubsw ymm1, ymm0, kUVToG_AVX         // scale G UV
-    vpmaddubsw ymm0, ymm0, kUVToB_AVX         // scale B UV
-    vpsubw     ymm2, ymm2, kUVBiasR_AVX       // unbias back to signed
-    vpsubw     ymm1, ymm1, kUVBiasG_AVX
-    vpsubw     ymm0, ymm0, kUVBiasB_AVX
-    // Step 2: Find Y contribution to 16 R,G,B values
-    vmovdqu    xmm3, [eax]                    // NOLINT
-    lea        eax, [eax + 16]
-    vpermq     ymm3, ymm3, 0xd8
-    vpunpcklbw ymm3, ymm3, ymm4
-    vpsubsw    ymm3, ymm3, kYSub16_AVX
-    vpmullw    ymm3, ymm3, kYToRgb_AVX
-    vpaddsw    ymm0, ymm0, ymm3               // B += Y
-    vpaddsw    ymm1, ymm1, ymm3               // G += Y
-    vpaddsw    ymm2, ymm2, ymm3               // R += Y
-    vpsraw     ymm0, ymm0, 6
-    vpsraw     ymm1, ymm1, 6
-    vpsraw     ymm2, ymm2, 6
-    vpackuswb  ymm0, ymm0, ymm0               // B
-    vpackuswb  ymm1, ymm1, ymm1               // G
-    vpackuswb  ymm2, ymm2, ymm2               // R
+    READYUV422_AVX2
+    YUVTORGB_AVX2

    // Step 3: Weave into BGRA
    vpunpcklbw ymm1, ymm1, ymm0               // GB