Commit 00b69a2f authored by fbarchard@google.com

I400ToARGB_Neon optimized

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/935010

git-svn-id: http://libyuv.googlecode.com/svn/trunk@465 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f3144676
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 464 Version: 465
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -144,6 +144,7 @@ extern "C" { ...@@ -144,6 +144,7 @@ extern "C" {
#define HAS_ABGRTOARGBROW_NEON #define HAS_ABGRTOARGBROW_NEON
#define HAS_ARGBTOBAYERROW_NEON #define HAS_ARGBTOBAYERROW_NEON
#define HAS_ARGBTORAWROW_NEON #define HAS_ARGBTORAWROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORGBAROW_NEON #define HAS_ARGBTORGBAROW_NEON
#define HAS_BGRATOARGBROW_NEON #define HAS_BGRATOARGBROW_NEON
...@@ -450,31 +451,31 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); ...@@ -450,31 +451,31 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I444ToARGBRow_C(const uint8* y_buf, void I444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
void I422ToARGBRow_C(const uint8* y_buf, void I422ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
void I411ToARGBRow_C(const uint8* y_buf, void I411ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void NV12ToARGBRow_C(const uint8* y_buf, void NV12ToARGBRow_C(const uint8* y_buf,
const uint8* uv_buf, const uint8* uv_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
void NV21ToRGB565Row_C(const uint8* y_buf, void NV21ToRGB565Row_C(const uint8* y_buf,
const uint8* vu_buf, const uint8* vu_buf,
uint8* argb_buf, uint8* argb_buf,
...@@ -483,24 +484,20 @@ void NV12ToRGB565Row_C(const uint8* y_buf, ...@@ -483,24 +484,20 @@ void NV12ToRGB565Row_C(const uint8* y_buf,
const uint8* uv_buf, const uint8* uv_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
void NV21ToARGBRow_C(const uint8* y_buf, void NV21ToARGBRow_C(const uint8* y_buf,
const uint8* vu_buf, const uint8* vu_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
void I422ToBGRARow_C(const uint8* y_buf, void I422ToBGRARow_C(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* bgra_buf, uint8* bgra_buf,
int width); int width);
void I422ToABGRRow_C(const uint8* y_buf, void I422ToABGRRow_C(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* abgr_buf, uint8* abgr_buf,
int width); int width);
void I422ToRGBARow_C(const uint8* y_buf, void I422ToRGBARow_C(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -531,7 +528,6 @@ void I422ToRGB565Row_C(const uint8* y_buf, ...@@ -531,7 +528,6 @@ void I422ToRGB565Row_C(const uint8* y_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_rgb565, uint8* dst_rgb565,
int width); int width);
void YToARGBRow_C(const uint8* y_buf, void YToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
...@@ -541,51 +537,42 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -541,51 +537,42 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
void I422ToARGBRow_SSSE3(const uint8* y_buf, void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
void I411ToARGBRow_SSSE3(const uint8* y_buf, void I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void NV12ToARGBRow_SSSE3(const uint8* y_buf, void NV12ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf, const uint8* uv_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
void NV21ToARGBRow_SSSE3(const uint8* y_buf, void NV21ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* vu_buf, const uint8* vu_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
void NV12ToRGB565Row_SSSE3(const uint8* y_buf, void NV12ToRGB565Row_SSSE3(const uint8* y_buf,
const uint8* uv_buf, const uint8* uv_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
void NV21ToRGB565Row_SSSE3(const uint8* y_buf, void NV21ToRGB565Row_SSSE3(const uint8* y_buf,
const uint8* vu_buf, const uint8* vu_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
void I422ToBGRARow_SSSE3(const uint8* y_buf, void I422ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* bgra_buf, uint8* bgra_buf,
int width); int width);
void I422ToABGRRow_SSSE3(const uint8* y_buf, void I422ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* abgr_buf, uint8* abgr_buf,
int width); int width);
void I422ToRGBARow_SSSE3(const uint8* y_buf, void I422ToRGBARow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -606,14 +593,12 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, ...@@ -606,14 +593,12 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
// RGB24/RAW are unaligned. // RGB24/RAW are unaligned.
void I422ToRGB24Row_SSSE3(const uint8* y_buf, void I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I422ToRAWRow_SSSE3(const uint8* y_buf, void I422ToRAWRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -719,20 +704,17 @@ void I422ToRGB565Row_Any_SSSE3(const uint8* y_buf, ...@@ -719,20 +704,17 @@ void I422ToRGB565Row_Any_SSSE3(const uint8* y_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgba_buf, uint8* rgba_buf,
int width); int width);
// RGB24/RAW are unaligned. // RGB24/RAW are unaligned.
void I422ToRGB24Row_Any_SSSE3(const uint8* y_buf, void I422ToRGB24Row_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I422ToRAWRow_Any_SSSE3(const uint8* y_buf, void I422ToRAWRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void YToARGBRow_SSE2(const uint8* y_buf, void YToARGBRow_SSE2(const uint8* y_buf,
uint8* argb_buf, uint8* argb_buf,
int width); int width);
...@@ -847,19 +829,16 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, ...@@ -847,19 +829,16 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 464 #define LIBYUV_VERSION 465
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -248,13 +248,23 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, ...@@ -248,13 +248,23 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
I400ToARGBRow_C; I400ToARGBRow_C;
#if defined(HAS_I400TOARGBROW_SSE2) #if defined(HAS_I400TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8) && if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
IS_ALIGNED(src_y, 8) && IS_ALIGNED(src_stride_y, 8) && I400ToARGBRow = I400ToARGBRow_Any_SSE2;
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I400ToARGBRow = I400ToARGBRow_SSE2; I400ToARGBRow = I400ToARGBRow_SSE2;
} }
}
}
#elif defined(HAS_I400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I400ToARGBRow = I400ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_NEON;
}
}
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
I400ToARGBRow(src_y, dst_argb, width); I400ToARGBRow(src_y, dst_argb, width);
src_y += src_stride_y; src_y += src_stride_y;
......
...@@ -116,6 +116,7 @@ NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2) ...@@ -116,6 +116,7 @@ NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
// SSSE3 RGB24 is multiple of 16 pixels, aligned source and destination. // SSSE3 RGB24 is multiple of 16 pixels, aligned source and destination.
// SSE2 RGB565 is multiple of 4 pixels, ARGB must be aligned to 16 bytes. // SSE2 RGB565 is multiple of 4 pixels, ARGB must be aligned to 16 bytes.
// NEON RGB24 is multiple of 8 pixels, unaligned source and destination. // NEON RGB24 is multiple of 8 pixels, unaligned source and destination.
// I400 To ARGB does multiple of 8 pixels with SIMD and remainder with C.
#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \ #define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
void NAMEANY(const uint8* argb_buf, \ void NAMEANY(const uint8* argb_buf, \
uint8* rgb_buf, \ uint8* rgb_buf, \
...@@ -136,6 +137,8 @@ RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C, ...@@ -136,6 +137,8 @@ RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
3, 4, 2) 3, 4, 2)
RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
3, 4, 2) 3, 4, 2)
// I400ToARGBRow_Any_SSE2: any-width wrapper generated by RGBANY, pairing the
// unaligned SSE2 row function with I400ToARGBRow_C for the remainder pixels.
// The trailing arguments 7, 1, 4 are RGBANY's MASK, SBPP and BPP parameters.
// NOTE(review): presumably MASK 7 = 8-pixel SIMD granularity, SBPP 1 = one
// source byte per pixel, BPP 4 = four destination bytes per pixel -- confirm
// against the RGBANY macro body.
RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
7, 1, 4)
#endif #endif
#if defined(HAS_ARGBTORGB24ROW_NEON) #if defined(HAS_ARGBTORGB24ROW_NEON)
RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3) RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
...@@ -146,6 +149,8 @@ RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C, ...@@ -146,6 +149,8 @@ RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
7, 4, 2) 7, 4, 2)
RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C, RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
7, 4, 2) 7, 4, 2)
// I400ToARGBRow_Any_NEON: any-width wrapper generated by RGBANY, pairing the
// NEON row function with I400ToARGBRow_C for the remainder pixels.
// The trailing arguments 7, 1, 4 are RGBANY's MASK, SBPP and BPP parameters.
// NOTE(review): presumably MASK 7 = 8-pixel SIMD granularity, SBPP 1 = one
// source byte per pixel, BPP 4 = four destination bytes per pixel -- confirm
// against the RGBANY macro body.
RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
7, 1, 4)
#endif #endif
#undef RGBANY #undef RGBANY
......
...@@ -24,6 +24,11 @@ extern "C" { ...@@ -24,6 +24,11 @@ extern "C" {
"vld1.u32 {d2[0]}, [%1]! \n" \ "vld1.u32 {d2[0]}, [%1]! \n" \
"vld1.u32 {d2[1]}, [%2]! \n" "vld1.u32 {d2[1]}, [%2]! \n"
// Read 8 Y bytes into d0 (post-incrementing the %0 pointer) and fill every
// byte of d2 with 128, i.e. 4 U and 4 V all set to 128. Lets a Y-only source
// use the same d0/d2 register layout that READYUV422 produces.
// Asm fragment: expects asm operand %0 to be the source Y pointer.
#define READYUV400 \
"vld1.u8 {d0}, [%0]! \n" \
"vmov.u8 d2, #128 \n"
// Read 8 Y and 4 UV from NV12 // Read 8 Y and 4 UV from NV12
#define READNV12 \ #define READNV12 \
"vld1.u8 {d0}, [%0]! \n" \ "vld1.u8 {d0}, [%0]! \n" \
...@@ -411,6 +416,58 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ...@@ -411,6 +416,58 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
} }
#endif // HAS_I422TOARGB4444ROW_NEON #endif // HAS_I422TOARGB4444ROW_NEON
#ifdef HAS_YTOARGBROW_NEON
// Converts a row of Y samples to 4-byte ARGB pixels by running the shared
// YUV422TORGB conversion macro with U and V forced to 128 (READYUV400).
//   src_y    - source Y bytes; 8 are read per loop iteration.
//   dst_argb - destination; 32 bytes (8 pixels x 4 bytes) are written per
//              iteration.
//   width    - pixel count. The loop body runs before the count is tested and
//              subtracts 8 on each pass, so a width that is not a positive
//              multiple of 8 is effectively rounded up to the next multiple
//              of 8 (callers must size the buffers accordingly).
// NOTE(review): d24, d25, d26, q14 and q15 are set up here but only consumed
// inside YUV422TORGB, which is defined outside this block -- confirm their
// roles against that macro.
void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
"vld1.u8 {d24}, [%3] \n"  // d24 = 8 bytes loaded from kUVToRB
"vld1.u8 {d25}, [%4] \n"  // d25 = 8 bytes loaded from kUVToG
"vmov.u8 d26, #128 \n"  // d26 = 128 in every byte
"vmov.u16 q14, #74 \n"  // q14 = 74 in every 16-bit lane
"vmov.u16 q15, #16 \n"  // q15 = 16 in every 16-bit lane
".p2align 2 \n"  // align the loop label to 4 bytes
"1: \n"
READYUV400  // d0 = 8 Y, d2 = 128 (U/V), src_y += 8
YUV422TORGB  // presumably leaves the color channels in d20..d22
"subs %2, %2, #8 \n"  // width -= 8, sets flags for bgt below
"vmov.u8 d23, #255 \n"  // fourth byte of every pixel = 255
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"  // interleaved store of 8 pixels, dst += 32
"bgt 1b \n"  // loop while pixels remain (width > 0)
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(&kUVToRB), // %3
"r"(&kUVToG) // %4
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_YTOARGBROW_NEON
#ifdef HAS_I400TOARGBROW_NEON
// Expands a row of 8-bit gray (I400 / Y) samples to 4-byte pixels: each source
// byte is written three times followed by a 255 byte.
//   src_y    - source bytes; 8 are read per loop iteration.
//   dst_argb - destination; 32 bytes (8 pixels x 4 bytes) are written per
//              iteration.
//   width    - pixel count. The loop body runs before the count is tested and
//              subtracts 8 on each pass, so a width that is not a positive
//              multiple of 8 is effectively rounded up to the next multiple
//              of 8.
// Uses only d20-d23, all of which are declared as clobbered.
void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
".p2align 2 \n"
"vmov.u8 d23, #255 \n"  // fourth byte of every pixel = 255; loop-invariant
"1: \n"
"vld1.u8 {d20}, [%0]! \n"  // d20 = 8 gray samples, src_y += 8
"vmov d21, d20 \n"  // duplicate gray into the second channel
"vmov d22, d20 \n"  // duplicate gray into the third channel
"subs %2, %2, #8 \n"  // width -= 8, sets flags for bgt below
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"  // interleaved store of 8 pixels, dst += 32
"bgt 1b \n"  // loop while pixels remain (width > 0)
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "d20", "d21", "d22", "d23"
);
}
#endif // HAS_I400TOARGBROW_NEON
#ifdef HAS_NV12TOARGBROW_NEON #ifdef HAS_NV12TOARGBROW_NEON
void NV12ToARGBRow_NEON(const uint8* src_y, void NV12ToARGBRow_NEON(const uint8* src_y,
const uint8* src_uv, const uint8* src_uv,
......
...@@ -171,6 +171,37 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { ...@@ -171,6 +171,37 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
); );
} }
// Expands a row of 8-bit gray (I400 / Y) samples to 4-byte pixels: each source
// byte is replicated into the three low bytes of a dword and the top byte is
// forced to 0xff. Stores use movdqu, so dst_argb need not be 16-byte aligned.
//   src_y    - source bytes; 8 are read per loop iteration.
//   dst_argb - destination; 32 bytes (8 pixels x 4 bytes) are written per
//              iteration.
//   pix      - pixel count. The loop body runs before the count is tested and
//              subtracts 8 on each pass, so a count that is not a positive
//              multiple of 8 is effectively rounded up to the next multiple
//              of 8.
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones
"pslld $0x18,%%xmm5 \n"  // xmm5 = 0xff000000 in every dword
".p2align 4 \n"  // align the loop label to 16 bytes
"1: \n"
"movq (%0),%%xmm0 \n"  // load 8 gray bytes into the low half of xmm0
"lea 0x8(%0),%0 \n"  // src_y += 8 (lea leaves flags untouched)
"punpcklbw %%xmm0,%%xmm0 \n"  // each byte Y -> word YY (8 words)
"movdqa %%xmm0,%%xmm1 \n"  // keep a copy for the upper 4 pixels
"punpcklwd %%xmm0,%%xmm0 \n"  // low 4 words -> dwords YYYY (pixels 0-3)
"punpckhwd %%xmm1,%%xmm1 \n"  // high 4 words -> dwords YYYY (pixels 4-7)
"por %%xmm5,%%xmm0 \n"  // force top byte of each pixel to 0xff
"por %%xmm5,%%xmm1 \n"
"movdqu %%xmm0,(%1) \n"  // unaligned store of pixels 0-3
"movdqu %%xmm1,0x10(%1) \n"  // unaligned store of pixels 4-7
"lea 0x20(%1),%1 \n"  // dst_argb += 32
"sub $0x8,%2 \n"  // pix -= 8, sets flags for jg below
"jg 1b \n"  // loop while pixels remain (pix > 0)
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
// xmm registers are only listed as clobbers when __SSE2__ is defined.
, "xmm0", "xmm1", "xmm5"
#endif
);
}
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"movdqa %3,%%xmm5 \n" "movdqa %3,%%xmm5 \n"
......
...@@ -131,6 +131,7 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = { ...@@ -131,6 +131,7 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
}; };
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm { __asm {
...@@ -159,6 +160,35 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { ...@@ -159,6 +160,35 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
} }
} }
// Expands a row of 8-bit gray (I400 / Y) samples to 4-byte pixels: each source
// byte is replicated into the three low bytes of a dword and the top byte is
// forced to 0xff. Stores use movdqu, so dst_argb need not be 16-byte aligned.
// Naked function: arguments are read directly from the stack and the code
// issues its own ret. Uses eax, edx, ecx, xmm0, xmm1 and xmm5.
//   src_y    - source bytes; 8 are read per loop iteration.
//   dst_argb - destination; 32 bytes (8 pixels x 4 bytes) are written per
//              iteration.
//   pix      - pixel count. The loop body runs before the count is tested and
//              subtracts 8 on each pass, so a count that is not a positive
//              multiple of 8 is effectively rounded up to the next multiple
//              of 8.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_y
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
align 16
convertloop:
movq xmm0, qword ptr [eax] // load 8 gray bytes into the low half of xmm0
lea eax, [eax + 8] // src_y += 8 (lea leaves flags untouched)
punpcklbw xmm0, xmm0 // each byte Y -> word YY (8 words)
movdqa xmm1, xmm0 // keep a copy for the upper 4 pixels
punpcklwd xmm0, xmm0 // low 4 words -> dwords YYYY (pixels 0-3)
punpckhwd xmm1, xmm1 // high 4 words -> dwords YYYY (pixels 4-7)
por xmm0, xmm5 // force top byte of each pixel to 0xff
por xmm1, xmm5
movdqu [edx], xmm0 // unaligned store of pixels 0-3
movdqu [edx + 16], xmm1 // unaligned store of pixels 4-7
lea edx, [edx + 32] // dst_argb += 32
sub ecx, 8 // pix -= 8, sets flags for jg below
jg convertloop // loop while pixels remain (pix > 0)
ret
}
}
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
__asm { __asm {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment