Commit 00b69a2f authored by fbarchard@google.com's avatar fbarchard@google.com

I400ToARGB_Neon optimized

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/935010

git-svn-id: http://libyuv.googlecode.com/svn/trunk@465 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f3144676
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 464
Version: 465
License: BSD
License File: LICENSE
......
......@@ -144,6 +144,7 @@ extern "C" {
#define HAS_ABGRTOARGBROW_NEON
#define HAS_ARGBTOBAYERROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORGBAROW_NEON
#define HAS_BGRATOARGBROW_NEON
......@@ -450,31 +451,31 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* argb_buf,
int width);
void I422ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* argb_buf,
int width);
void I411ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void NV12ToARGBRow_C(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void NV21ToRGB565Row_C(const uint8* y_buf,
const uint8* vu_buf,
uint8* argb_buf,
......@@ -483,24 +484,20 @@ void NV12ToRGB565Row_C(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void NV21ToARGBRow_C(const uint8* y_buf,
const uint8* vu_buf,
uint8* argb_buf,
int width);
void I422ToBGRARow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* bgra_buf,
int width);
void I422ToABGRRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* abgr_buf,
int width);
void I422ToRGBARow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
......@@ -531,7 +528,6 @@ void I422ToRGB565Row_C(const uint8* y_buf,
const uint8* v_buf,
uint8* dst_rgb565,
int width);
void YToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf,
int width);
......@@ -541,51 +537,42 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* v_buf,
uint8* argb_buf,
int width);
void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* argb_buf,
int width);
void I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* vu_buf,
uint8* argb_buf,
int width);
void NV12ToRGB565Row_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void NV21ToRGB565Row_SSSE3(const uint8* y_buf,
const uint8* vu_buf,
uint8* argb_buf,
int width);
void I422ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* bgra_buf,
int width);
void I422ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* abgr_buf,
int width);
void I422ToRGBARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
......@@ -606,14 +593,12 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
// RGB24/RAW are unaligned.
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRAWRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
......@@ -719,20 +704,17 @@ void I422ToRGB565Row_Any_SSSE3(const uint8* y_buf,
const uint8* v_buf,
uint8* rgba_buf,
int width);
// RGB24/RAW are unaligned.
void I422ToRGB24Row_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRAWRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void YToARGBRow_SSE2(const uint8* y_buf,
uint8* argb_buf,
int width);
......@@ -847,24 +829,21 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 464
#define LIBYUV_VERSION 465
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -248,13 +248,23 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
I400ToARGBRow_C;
#if defined(HAS_I400TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(src_y, 8) && IS_ALIGNED(src_stride_y, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I400ToARGBRow = I400ToARGBRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
I400ToARGBRow = I400ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I400ToARGBRow = I400ToARGBRow_SSE2;
}
}
}
#elif defined(HAS_I400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I400ToARGBRow = I400ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_NEON;
}
}
#endif
for (int y = 0; y < height; ++y) {
I400ToARGBRow(src_y, dst_argb, width);
src_y += src_stride_y;
......
......@@ -116,6 +116,7 @@ NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
// SSSE3 RGB24 is multiple of 16 pixels, aligned source and destination.
// SSE2 RGB565 is multiple of 4 pixels, ARGB must be aligned to 16 bytes.
// NEON RGB24 is multiple of 8 pixels, unaligned source and destination.
// I400 To ARGB does multiple of 8 pixels with SIMD and remainder with C.
#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
void NAMEANY(const uint8* argb_buf, \
uint8* rgb_buf, \
......@@ -136,6 +137,8 @@ RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
3, 4, 2)
RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
3, 4, 2)
RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
7, 1, 4)
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
......@@ -146,6 +149,8 @@ RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
7, 4, 2)
RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
7, 4, 2)
RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
7, 1, 4)
#endif
#undef RGBANY
......
......@@ -24,6 +24,11 @@ extern "C" {
"vld1.u32 {d2[0]}, [%1]! \n" \
"vld1.u32 {d2[1]}, [%2]! \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
"vld1.u8 {d0}, [%0]! \n" \
"vmov.u8 d2, #128 \n"
// Read 8 Y and 4 UV from NV12
#define READNV12 \
"vld1.u8 {d0}, [%0]! \n" \
......@@ -411,6 +416,58 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
}
#endif // HAS_I422TOARGB4444ROW_NEON
#ifdef HAS_YTOARGBROW_NEON
void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%3] \n"
"vld1.u8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
".p2align 2 \n"
"1: \n"
READYUV400
YUV422TORGB
"subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(&kUVToRB), // %3
"r"(&kUVToG) // %4
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_YTOARGBROW_NEON
#ifdef HAS_I400TOARGBROW_NEON
void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
".p2align 2 \n"
"vmov.u8 d23, #255 \n"
"1: \n"
"vld1.u8 {d20}, [%0]! \n"
"vmov d21, d20 \n"
"vmov d22, d20 \n"
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "d20", "d21", "d22", "d23"
);
}
#endif // HAS_I400TOARGBROW_NEON
#ifdef HAS_NV12TOARGBROW_NEON
void NV12ToARGBRow_NEON(const uint8* src_y,
const uint8* src_uv,
......
......@@ -171,6 +171,37 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
);
}
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
);
}
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
asm volatile (
"movdqa %3,%%xmm5 \n"
......
......@@ -131,6 +131,7 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm {
......@@ -159,6 +160,35 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
}
}
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_y
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
align 16
convertloop:
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
punpcklbw xmm0, xmm0
movdqa xmm1, xmm0
punpcklwd xmm0, xmm0
punpckhwd xmm1, xmm1
por xmm0, xmm5
por xmm1, xmm5
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
ret
}
}
__declspec(naked) __declspec(align(16))
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
__asm {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment