Commit 827de16b authored by fbarchard@google.com's avatar fbarchard@google.com

I422ToRGB24Row_SSSE3 in 1 pass. Internally converts to ARGB then packs down to RGB.

BUG=116
TEST=libyuv unittest
Review URL: https://webrtc-codereview.appspot.com/863013

git-svn-id: http://libyuv.googlecode.com/svn/trunk@399 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 6b5a8eff
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 398 Version: 399
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -100,6 +100,8 @@ extern "C" { ...@@ -100,6 +100,8 @@ extern "C" {
#define HAS_RGBATOARGBROW_SSSE3 #define HAS_RGBATOARGBROW_SSSE3
#define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3 #define HAS_RGBATOYROW_SSSE3
#define HAS_I422TORGB24ROW_SSSE3
#define HAS_I422TORAWROW_SSSE3
#endif #endif
// The following are disabled when SSSE3 is available: // The following are disabled when SSSE3 is available:
...@@ -436,6 +438,19 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, ...@@ -436,6 +438,19 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
uint8* rgba_buf, uint8* rgba_buf,
int width); int width);
// RGB24/RAW are unaligned: these row functions place no 16-byte alignment
// requirement on the destination pointer, so there is no separate
// _Unaligned_ variant for them.
// Each converts one row of I422 (y_buf, u_buf, v_buf planes) into packed
// 3-byte pixels written to rgb_buf; width is the pixel count.
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRAWRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -528,6 +543,19 @@ void I422ToRGBARow_Any_SSSE3(const uint8* y_buf, ...@@ -528,6 +543,19 @@ void I422ToRGBARow_Any_SSSE3(const uint8* y_buf,
uint8* rgba_buf, uint8* rgba_buf,
int width); int width);
// RGB24/RAW are unaligned: no destination alignment is required.
// The _Any_ variants are the ones selected when width >= 8 but width is not
// a multiple of 8.
// NOTE(review): presumably they run the SSSE3 row on the multiple-of-8 part
// and the C row on the remainder - confirm against the YANY macro.
void I422ToRGB24Row_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRAWRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void YToARGBRow_SSE2(const uint8* y_buf, void YToARGBRow_SSE2(const uint8* y_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 398 #define LIBYUV_VERSION 399
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -928,10 +928,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, ...@@ -928,10 +928,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3; I422ToRGB24Row = I422ToRGB24Row_SSSE3;
if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
I422ToRGB24Row = I422ToRGB24Row_SSSE3;
}
} }
} }
#endif #endif
...@@ -982,10 +979,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, ...@@ -982,10 +979,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRAWRow = I422ToRAWRow_Any_SSSE3; I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3; I422ToRAWRow = I422ToRAWRow_SSSE3;
if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
I422ToRAWRow = I422ToRAWRow_SSSE3;
}
} }
} }
#endif #endif
......
...@@ -1023,9 +1023,9 @@ YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1) ...@@ -1023,9 +1023,9 @@ YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1) YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
#endif #endif
#ifdef HAS_I422TORGB24ROW_SSSE3 #ifdef HAS_I422TORGB24ROW_SSSE3
YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \ // I422ToRGB24Row_SSSE3 is unaligned.
I422ToRGB24Row_C, 1) YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1) YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1)
#endif #endif
#ifdef HAS_I422TORGBAROW_SSSE3 #ifdef HAS_I422TORGBAROW_SSSE3
YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1) YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
......
...@@ -122,6 +122,16 @@ static const uvec8 kShuffleMaskARGBToRAW = { ...@@ -122,6 +122,16 @@ static const uvec8 kShuffleMaskARGBToRAW = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
}; };
// Shuffle table used by I422ToRGB24Row_SSSE3 on the first 4 of its 8 pixels.
// Keeps bytes 0, 1, 2 of every 4-byte pixel and drops byte 3, giving 12 packed
// bytes that are deliberately split: the first 8 go to lanes 0-7 and the
// remaining 4 go to lanes 12-15.
// Lanes 8-11 are 128u; pshufb writes zero for any index with the high bit set.
// The split lets the low 8 bytes be stored with movq, while palignr by 12
// moves lanes 12-15 in front of the 12 bytes packed from the next 4 pixels.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};
// Shuffle table used by I422ToRAWRow_SSSE3 on the first 4 of its 8 pixels.
// Same byte selection as kShuffleMaskARGBToRAW (bytes 2, 1, 0 of every 4-byte
// pixel, i.e. the three kept bytes in reversed order, byte 3 dropped), but
// with the 12 packed bytes split: the first 8 go to lanes 0-7 and the
// remaining 4 go to lanes 12-15.
// Lanes 8-11 are 128u; pshufb writes zero for any index with the high bit set.
// The split lets the low 8 bytes be stored with movq, while palignr by 12
// moves lanes 12-15 in front of the 12 bytes packed from the next 4 pixels.
static const uvec8 kShuffleMaskARGBToRAW_0 = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm { __asm {
...@@ -1654,6 +1664,100 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1654,6 +1664,100 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
} }
} }
// Converts one row of I422 (separate Y, U and V planes) to packed 3-byte
// RGB24, 8 pixels per loop iteration.
// Pixels are first woven into 4-byte B,G,R,R groups in registers and then
// packed down to 3 bytes each with pshufb; 8 pixels yield 24 output bytes.
// The destination does NOT need to be 16-byte aligned: the stores are movq
// and movdqu.
// The loop subtracts 8 from width and repeats while the result is > 0, and
// every iteration stores a full 24 bytes, so width is expected to be a
// positive multiple of 8.
__declspec(naked) __declspec(align(16))
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb24_buf,
int width) {
__asm {
push esi
push edi
// Arguments are read past the two saved registers (8 bytes) and the
// 4-byte return address.
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb24
mov ecx, [esp + 8 + 20] // width
// edi becomes the V pointer expressed as an offset from U.
// NOTE(review): presumably READYUV422 reads V as [esi + edi] - confirm.
sub edi, esi
// xmm4 = 0. Not referenced directly below.
// NOTE(review): presumably consumed by READYUV422 / YUVTORGB - confirm.
pxor xmm4, xmm4
movdqa xmm5, kShuffleMaskARGBToRGB24_0 // mask for first 4 pixels (8 + 4 split)
movdqa xmm6, kShuffleMaskARGBToRGB24 // mask for next 4 pixels
align 16
convertloop:
// READYUV422 / YUVTORGB are macros defined elsewhere in this file.
// NOTE(review): the weave below assumes they leave 8 B values in xmm0,
// 8 G in xmm1 and 8 R in xmm2, and advance the source pointers - confirm.
READYUV422
YUVTORGB
// Step 3: Weave into 4-byte BGRR pixels. R is duplicated only to pad each
// pixel to 4 bytes; the first-half mask (xmm5) drops byte 3 of every pixel.
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm2 // RR
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRR first 4 pixels
punpckhwd xmm1, xmm2 // BGRR next 4 pixels
pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
pshufb xmm1, xmm6 // Pack into first 12 bytes.
palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
movq qword ptr [edx], xmm0 // First 8 bytes
movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
lea edx, [edx + 24] // advance destination by 8 pixels * 3 bytes
sub ecx, 8
jg convertloop
pop edi
pop esi
ret
}
}
// Converts one row of I422 (separate Y, U and V planes) to packed 3-byte
// RAW, 8 pixels per loop iteration.
// Pixels are first woven into 4-byte B,G,R,R groups in registers and then
// packed down to 3 bytes each with pshufb; the RAW masks pick bytes 2, 1, 0
// of each pixel, so the three kept bytes are stored in reversed order
// compared with the woven layout. 8 pixels yield 24 output bytes.
// The destination does NOT need to be 16-byte aligned: the stores are movq
// and movdqu.
// The loop subtracts 8 from width and repeats while the result is > 0, and
// every iteration stores a full 24 bytes, so width is expected to be a
// positive multiple of 8.
__declspec(naked) __declspec(align(16))
void I422ToRAWRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* raw_buf,
int width) {
__asm {
push esi
push edi
// Arguments are read past the two saved registers (8 bytes) and the
// 4-byte return address.
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // raw
mov ecx, [esp + 8 + 20] // width
// edi becomes the V pointer expressed as an offset from U.
// NOTE(review): presumably READYUV422 reads V as [esi + edi] - confirm.
sub edi, esi
// xmm4 = 0. Not referenced directly below.
// NOTE(review): presumably consumed by READYUV422 / YUVTORGB - confirm.
pxor xmm4, xmm4
movdqa xmm5, kShuffleMaskARGBToRAW_0 // mask for first 4 pixels (8 + 4 split)
movdqa xmm6, kShuffleMaskARGBToRAW // mask for next 4 pixels (12 packed bytes)
align 16
convertloop:
// READYUV422 / YUVTORGB are macros defined elsewhere in this file.
// NOTE(review): the weave below assumes they leave 8 B values in xmm0,
// 8 G in xmm1 and 8 R in xmm2, and advance the source pointers - confirm.
READYUV422
YUVTORGB
// Step 3: Weave into 4-byte BGRR pixels. R is duplicated only to pad each
// pixel to 4 bytes; both shuffle masks drop byte 3 of every pixel.
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm2 // RR
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRR first 4 pixels
punpckhwd xmm1, xmm2 // BGRR next 4 pixels
pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
pshufb xmm1, xmm6 // Pack into first 12 bytes.
palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
movq qword ptr [edx], xmm0 // First 8 bytes
movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RAW pixels.
lea edx, [edx + 24] // advance destination by 8 pixels * 3 bytes
sub ecx, 8
jg convertloop
pop edi
pop esi
ret
}
}
// 8 pixels, dest aligned 16. // 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment