Commit 9245317e authored by fbarchard@google.com

ARGBToRGB565Dither SSE2 port.

BUG=407
TESTED=ARGBToRGB565Dither unittest
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/41039004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1308 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 274c9bce
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1306 Version: 1307
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -198,6 +198,7 @@ extern "C" { ...@@ -198,6 +198,7 @@ extern "C" {
#define HAS_I422TORGB565ROW_AVX2 #define HAS_I422TORGB565ROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2
#define HAS_ARGBTORGB565DITHERROW_SSE2
#endif #endif
// The following are available on all x86 platforms, but // The following are available on all x86 platforms, but
...@@ -905,6 +906,9 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); ...@@ -905,6 +906,9 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
// Converts one row of ARGB pixels to RGB565 with ordered dithering (SSE2).
// dither8x8 points at the 8 dither bytes to use for this row; they are added
// to the pixel bytes with unsigned saturation before the channels are
// truncated.  The routine consumes 8 pixels per loop iteration, so pix is
// expected to be a positive multiple of 8.
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
const uint8* dither8x8, int pix);
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
...@@ -1375,6 +1379,9 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); ...@@ -1375,6 +1379,9 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
// Any-width variant of ARGBToRGB565DitherRow_SSE2.  The leading multiple-of-8
// pixels are converted by the SSE2 row function and the remaining (pix & 7)
// pixels by ARGBToRGB565DitherRow_C, both using the same dither8x8 row.
void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
const uint8* dither8x8, int pix);
void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1306 #define LIBYUV_VERSION 1307
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -804,15 +804,16 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, ...@@ -804,15 +804,16 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
static const uint8 kDither8x8[64] = { // Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
0, 128, 32, 160, 8, 136, 40, 168, static const uint8 kDither565_8x8[64] = {
192, 64, 224, 96, 200, 72, 232, 104, 0 >> 5, 128 >> 5, 32 >> 5, 160 >> 5, 8 >> 5, 136 >> 5, 40 >> 5, 168 >> 5,
48, 176, 16, 144, 56, 184, 24, 152, 192 >> 5, 64 >> 5, 224 >> 5, 96 >> 5, 200 >> 5, 72 >> 5, 232 >> 5, 104 >> 5,
240, 112, 208, 80, 248, 120, 216, 88, 48 >> 5, 176 >> 5, 16 >> 5, 144 >> 5, 56 >> 5, 184 >> 5, 24 >> 5, 152 >> 5,
12, 140, 44, 172, 4, 132, 36, 164, 240 >> 5, 112 >> 5, 208 >> 5, 80 >> 5, 248 >> 5, 120 >> 5, 216 >> 5, 88 >> 5,
204, 76, 236, 108, 196, 68, 228, 100, 12 >> 5, 140 >> 5, 44 >> 5, 172 >> 5, 4 >> 5, 132 >> 5, 36 >> 5, 164 >> 5,
60, 188, 28, 156, 52, 180, 20, 148, 204 >> 5, 76 >> 5, 236 >> 5, 108 >> 5, 196 >> 5, 68 >> 5, 228 >> 5, 100 >> 5,
252, 124, 220, 92, 244, 116, 212, 84, 60 >> 5, 188 >> 5, 28 >> 5, 156 >> 5, 52 >> 5, 180 >> 5, 20 >> 5, 148 >> 5,
252 >> 5, 124 >> 5, 220 >> 5, 92 >> 5, 244 >> 5, 116 >> 5, 212 >> 5, 84 >> 5,
}; };
// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes). // Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
...@@ -832,9 +833,16 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, ...@@ -832,9 +833,16 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb; src_stride_argb = -src_stride_argb;
} }
if (!dither8x8) { if (!dither8x8) {
dither8x8 = kDither8x8; dither8x8 = kDither565_8x8;
}
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
}
} }
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
ARGBToRGB565DitherRow(src_argb, dst_rgb565, ARGBToRGB565DitherRow(src_argb, dst_rgb565,
dither8x8 + ((y & 7) << 3), width); dither8x8 + ((y & 7) << 3), width);
......
...@@ -225,6 +225,22 @@ RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, 2, 4, 7) ...@@ -225,6 +225,22 @@ RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, 2, 4, 7)
#endif #endif
#undef RGBANY #undef RGBANY
// Defines NAMEANY, a dithered ARGB-to-RGB row wrapper that accepts any width.
//   ARGBTORGB_SIMD  row function that handles the leading (width & ~MASK)
//                   pixels; only called when that count is greater than zero.
//   ARGBTORGB_C     row function that handles the trailing (width & MASK)
//                   pixels; always called, possibly with a width of 0.
//   SBPP / BPP      source / destination bytes per pixel, used to advance the
//                   pointers past the portion handled by the SIMD function.
//   MASK            SIMD step minus one (7 for 8 pixels per iteration).
// Both calls receive the same dither8x8 pointer.  Macro arguments are
// parenthesized so non-literal arguments expand with the intended precedence.
#define RGBDANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK)         \
    void NAMEANY(const uint8* src, uint8* dst,                                 \
                 const uint8* dither8x8, int width) {                          \
      int n = width & ~(MASK);                                                 \
      if (n > 0) {                                                             \
        ARGBTORGB_SIMD(src, dst, dither8x8, n);                                \
      }                                                                        \
      ARGBTORGB_C(src + n * (SBPP), dst + n * (BPP), dither8x8,                \
                  width & (MASK));                                             \
    }

#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
RGBDANY(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
        ARGBToRGB565DitherRow_C, 4, 2, 7)
#endif
#undef RGBDANY
// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst. // ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst.
#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \ #define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src, uint8* dst, uint32 selector, int width) { \ void NAMEANY(const uint8* src, uint8* dst, uint32 selector, int width) { \
......
...@@ -585,6 +585,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -585,6 +585,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
} }
} }
// 4 pixels
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
...@@ -622,6 +623,70 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -622,6 +623,70 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
} }
} }
// Converts ARGB to RGB565 with dithering, 8 pixels per loop iteration.
// The 8 dither bytes at dither8 are each replicated across the 4 bytes of the
// corresponding pixel and added with unsigned saturation before the channels
// are truncated to 5/6/5 bits.
// The loop body always executes at least once and steps by 8, so pix is
// expected to be a positive multiple of 8.
// Naked function: arguments are read directly from the stack relative to esp.
__declspec(naked) __declspec(align(16))
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
const uint8* dither8, int pix) {
__asm {
mov eax, [esp + 12] // dither8
movq xmm6, qword ptr [eax] // fetch 8 dither values
punpcklbw xmm6, xmm6 // each dither byte twice: d0 d0 d1 d1 ... d7 d7
movdqa xmm7, xmm6
punpcklwd xmm6, xmm6 // d0..d3, each repeated 4 times (pixels 0-3)
punpckhwd xmm7, xmm7 // d4..d7, each repeated 4 times (pixels 4-7)
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 16] // pix
pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
psrld xmm3, 27
pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
psrld xmm4, 26
pslld xmm4, 5
pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
pslld xmm5, 11
convertloop:
// First 4 pixels of this iteration, dithered with xmm6.
movdqu xmm0, [eax] // fetch 4 pixels of argb
paddusb xmm0, xmm6 // add dither to every byte, saturating at 255
movdqa xmm1, xmm0 // B
movdqa xmm2, xmm0 // G
pslld xmm0, 8 // R: move R into the top byte, shifting A out
psrld xmm1, 3 // B: top 5 bits of B into bits 0-4
psrld xmm2, 5 // G: top 6 bits of G into bits 5-10
psrad xmm0, 16 // R: top 5 bits of R into bits 11-15, sign extended
pand xmm1, xmm3 // B
pand xmm2, xmm4 // G
pand xmm0, xmm5 // R, keeping the sign extension in the upper bits
por xmm1, xmm2 // BG
por xmm0, xmm1 // BGR
packssdw xmm0, xmm0 // signed pack to 16 bits; sign extension keeps bit 15 intact
movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
// Next 4 pixels of this iteration, dithered with xmm7.
movdqu xmm0, [eax + 16] // fetch 4 pixels of argb
paddusb xmm0, xmm7 // add dither to every byte, saturating at 255
movdqa xmm1, xmm0 // B
movdqa xmm2, xmm0 // G
pslld xmm0, 8 // R: move R into the top byte, shifting A out
psrld xmm1, 3 // B: top 5 bits of B into bits 0-4
psrld xmm2, 5 // G: top 6 bits of G into bits 5-10
psrad xmm0, 16 // R: top 5 bits of R into bits 11-15, sign extended
pand xmm1, xmm3 // B
pand xmm2, xmm4 // G
pand xmm0, xmm5 // R, keeping the sign extension in the upper bits
por xmm1, xmm2 // BG
por xmm0, xmm1 // BGR
packssdw xmm0, xmm0 // signed pack to 16 bits; sign extension keeps bit 15 intact
movq qword ptr [edx + 8], xmm0 // store 4 pixels of RGB565
lea eax, [eax + 32] // advance source by 8 ARGB pixels (32 bytes)
lea edx, [edx + 16] // advance destination by 8 RGB565 pixels (16 bytes)
sub ecx, 8
jg convertloop // loop while pixels remain
ret
}
}
// TODO(fbarchard): Improve sign extension/packing. // TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
...@@ -1646,8 +1711,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1646,8 +1711,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm vpermq ymm2, ymm2, 0xd8 \ __asm vpermq ymm2, ymm2, 0xd8 \
__asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
__asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
__asm vmovdqu [edx], ymm1 \ __asm vmovdqu 0[edx], ymm1 \
__asm vmovdqu [edx + 32], ymm0 \ __asm vmovdqu 32[edx], ymm0 \
__asm lea edx, [edx + 64] \ __asm lea edx, [edx + 64] \
} }
...@@ -1959,8 +2024,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -1959,8 +2024,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm movdqa xmm1, xmm0 \ __asm movdqa xmm1, xmm0 \
__asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
__asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
__asm movdqu [edx], xmm0 \ __asm movdqu 0[edx], xmm0 \
__asm movdqu [edx + 16], xmm1 \ __asm movdqu 16[edx], xmm1 \
__asm lea edx, [edx + 32] \ __asm lea edx, [edx + 32] \
} }
...@@ -1973,8 +2038,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -1973,8 +2038,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm movdqa xmm0, xmm5 \ __asm movdqa xmm0, xmm5 \
__asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
__asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
__asm movdqu [edx], xmm5 \ __asm movdqu 0[edx], xmm5 \
__asm movdqu [edx + 16], xmm0 \ __asm movdqu 16[edx], xmm0 \
__asm lea edx, [edx + 32] \ __asm lea edx, [edx + 32] \
} }
...@@ -1986,8 +2051,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -1986,8 +2051,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm movdqa xmm1, xmm2 \ __asm movdqa xmm1, xmm2 \
__asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \
__asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \
__asm movdqu [edx], xmm2 \ __asm movdqu 0[edx], xmm2 \
__asm movdqu [edx + 16], xmm1 \ __asm movdqu 16[edx], xmm1 \
__asm lea edx, [edx + 32] \ __asm lea edx, [edx + 32] \
} }
...@@ -2000,8 +2065,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2000,8 +2065,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm movdqa xmm0, xmm5 \ __asm movdqa xmm0, xmm5 \
__asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
__asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
__asm movdqu [edx], xmm5 \ __asm movdqu 0[edx], xmm5 \
__asm movdqu [edx + 16], xmm0 \ __asm movdqu 16[edx], xmm0 \
__asm lea edx, [edx + 32] \ __asm lea edx, [edx + 32] \
} }
...@@ -2017,8 +2082,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2017,8 +2082,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
__asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
__asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
__asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
__asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
__asm lea edx, [edx + 24] \ __asm lea edx, [edx + 24] \
} }
...@@ -2034,8 +2099,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2034,8 +2099,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
__asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
__asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
__asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
__asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
__asm lea edx, [edx + 24] \ __asm lea edx, [edx + 24] \
} }
...@@ -2071,7 +2136,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2071,7 +2136,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm por xmm3, xmm2 /* BG */ \ __asm por xmm3, xmm2 /* BG */ \
__asm por xmm1, xmm3 /* BGR */ \ __asm por xmm1, xmm3 /* BGR */ \
__asm packssdw xmm0, xmm1 \ __asm packssdw xmm0, xmm1 \
__asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \ __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
__asm lea edx, [edx + 16] \ __asm lea edx, [edx + 16] \
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment