Commit 827de16b authored by fbarchard@google.com's avatar fbarchard@google.com

I422ToRGB24Row_SSSE3 in 1 pass. Internally converts to ARGB then packs down to RGB.

BUG=116
TEST=libyuv unittest
Review URL: https://webrtc-codereview.appspot.com/863013

git-svn-id: http://libyuv.googlecode.com/svn/trunk@399 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 6b5a8eff
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 398
Version: 399
License: BSD
License File: LICENSE
......
......@@ -100,6 +100,8 @@ extern "C" {
#define HAS_RGBATOARGBROW_SSSE3
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
#define HAS_I422TORGB24ROW_SSSE3
#define HAS_I422TORAWROW_SSSE3
#endif
// The following are disabled when SSSE3 is available:
......@@ -436,6 +438,19 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
uint8* rgba_buf,
int width);
// RGB24/RAW are unaligned.
// Converts one row of I422 (separate Y, U and V planes) to packed RGB24.
// The implementation consumes 8 pixels and stores 24 bytes to rgb_buf per
// loop iteration using unaligned stores.
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
// Converts one row of I422 (separate Y, U and V planes) to packed RAW, which
// has the 3 bytes of each pixel in the opposite order to RGB24.
// The implementation consumes 8 pixels and stores 24 bytes to rgb_buf per
// loop iteration using unaligned stores.
void I422ToRAWRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
......@@ -528,6 +543,19 @@ void I422ToRGBARow_Any_SSSE3(const uint8* y_buf,
uint8* rgba_buf,
int width);
// RGB24/RAW are unaligned.
// "Any" wrapper generated by YANY from I422ToRGB24Row_SSSE3 and
// I422ToRGB24Row_C.
// NOTE(review): presumably accepts widths that are not a multiple of 8 and
// hands the remainder to the C version - confirm against the YANY macro.
void I422ToRGB24Row_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
// "Any" wrapper generated by YANY from I422ToRAWRow_SSSE3 and I422ToRAWRow_C.
// NOTE(review): presumably accepts widths that are not a multiple of 8 and
// hands the remainder to the C version - confirm against the YANY macro.
void I422ToRAWRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void YToARGBRow_SSE2(const uint8* y_buf,
uint8* argb_buf,
int width);
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 398
#define LIBYUV_VERSION 399
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -928,12 +928,9 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3;
if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
I422ToRGB24Row = I422ToRGB24Row_SSSE3;
}
}
}
#endif
for (int y = 0; y < height; ++y) {
......@@ -982,12 +979,9 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
I422ToRAWRow = I422ToRAWRow_SSSE3;
}
}
}
#endif
for (int y = 0; y < height; ++y) {
......
......@@ -1023,9 +1023,9 @@ YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
#endif
#ifdef HAS_I422TORGB24ROW_SSSE3
YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \
I422ToRGB24Row_C, 1)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1)
// I422ToRGB24Row_SSSE3 is unaligned.
YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1)
#endif
#ifdef HAS_I422TORGBAROW_SSSE3
YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
......
......@@ -122,6 +122,16 @@ static const uvec8 kShuffleMaskARGBToRAW = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting ARGB to RGB24 for I420ToRGB24, used on the
// first 4 of 8 pixels. Drops every 4th source byte (3, 7, 11, 15).
// The first 8 packed bytes land in the low 8 bytes of the register and the
// next 4 packed bytes land in the top 4 bytes; the 128u entries make pshufb
// write zero to the 4 bytes in between.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};
// Shuffle table for converting ARGB to RAW for I420ToRAW, used on the first
// 4 of 8 pixels. Drops every 4th source byte (3, 7, 11, 15) and reverses the
// order of the 3 remaining bytes of each pixel.
// The first 8 packed bytes land in the low 8 bytes of the register and the
// next 4 packed bytes land in the top 4 bytes; the 128u entries make pshufb
// write zero to the 4 bytes in between.
static const uvec8 kShuffleMaskARGBToRAW_0 = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm {
......@@ -1654,6 +1664,100 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
}
}
// Converts one row of I422 to RGB24, 8 pixels per loop iteration.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 pixels
// (24 bytes). The pixels are first woven as 4-byte BGRR and then packed down
// to 3 bytes each with pshufb/palignr.
// The destination is written with movq/movdqu, so it does not need to be
// aligned. The loop body runs before the remaining width is tested and
// always stores a full 24 bytes.
__declspec(naked) __declspec(align(16))
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb24_buf,
int width) {
__asm {
push esi
push edi
// Naked function: arguments are addressed relative to esp. The "+ 8" in
// each offset skips the esi/edi values pushed above.
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb24
mov ecx, [esp + 8 + 20] // width
sub edi, esi // edi = v_buf - u_buf, i.e. V as an offset from U.
// NOTE(review): xmm4 is zeroed here but not referenced again in this
// function; presumably READYUV422/YUVTORGB (defined elsewhere) use it -
// confirm.
pxor xmm4, xmm4
// movdqa needs its memory operand 16-byte aligned.
movdqa xmm5, kShuffleMaskARGBToRGB24_0 // Shuffle for pixels 0..3.
movdqa xmm6, kShuffleMaskARGBToRGB24 // Shuffle for pixels 4..7.
align 16
convertloop:
// NOTE(review): the two macros below are defined elsewhere; judging by the
// weave that follows they leave 8 B, G and R bytes in the low halves of
// xmm0, xmm1 and xmm2 - confirm.
READYUV422
YUVTORGB
// Step 3: Weave into BGRR
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm2 // RR
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRR first 4 pixels
punpckhwd xmm1, xmm2 // BGRR next 4 pixels
pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
pshufb xmm1, xmm6 // Pack into first 12 bytes.
palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
movq qword ptr [edx], xmm0 // First 8 bytes
movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
lea edx, [edx + 24] // Advance destination by 8 pixels * 3 bytes.
sub ecx, 8 // 8 pixels done.
jg convertloop // Repeat while pixels remain.
pop edi
pop esi
ret
}
}
// Converts one row of I422 to RAW, 8 pixels per loop iteration.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW pixels
// (24 bytes). The pixels are first woven as 4-byte BGRR and then packed down
// to 3 bytes each, in reversed byte order, with pshufb/palignr.
// The destination is written with movq/movdqu, so it does not need to be
// aligned. The loop body runs before the remaining width is tested and
// always stores a full 24 bytes.
__declspec(naked) __declspec(align(16))
void I422ToRAWRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* raw_buf,
int width) {
__asm {
push esi
push edi
// Naked function: arguments are addressed relative to esp. The "+ 8" in
// each offset skips the esi/edi values pushed above.
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // raw
mov ecx, [esp + 8 + 20] // width
sub edi, esi // edi = v_buf - u_buf, i.e. V as an offset from U.
// NOTE(review): xmm4 is zeroed here but not referenced again in this
// function; presumably READYUV422/YUVTORGB (defined elsewhere) use it -
// confirm.
pxor xmm4, xmm4
// movdqa needs its memory operand 16-byte aligned.
movdqa xmm5, kShuffleMaskARGBToRAW_0 // Shuffle for pixels 0..3.
movdqa xmm6, kShuffleMaskARGBToRAW // Shuffle for pixels 4..7.
align 16
convertloop:
// NOTE(review): the two macros below are defined elsewhere; judging by the
// weave that follows they leave 8 B, G and R bytes in the low halves of
// xmm0, xmm1 and xmm2 - confirm.
READYUV422
YUVTORGB
// Step 3: Weave into BGRR
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm2 // RR
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRR first 4 pixels
punpckhwd xmm1, xmm2 // BGRR next 4 pixels
pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
pshufb xmm1, xmm6 // Pack into first 12 bytes.
palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
movq qword ptr [edx], xmm0 // First 8 bytes
movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RAW pixels.
lea edx, [edx + 24] // Advance destination by 8 pixels * 3 bytes.
sub ecx, 8 // 8 pixels done.
jg convertloop // Repeat while pixels remain.
pop edi
pop esi
ret
}
}
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment