Commit cc89e3a7 authored by Frank Barchard's avatar Frank Barchard

port ARGB to 565 dithering SSE2 code to GCC.

Previously the assembly code was only available to Windows.
This CL ports the SSE2 code to GCC syntax.

When running a profiler on all the unittests, this function
was the slowest of all functions that still ran in C code.
   3.71%  libyuv_unittest  libyuv_unittest      [.] ARGBToRGB565DitherRow_C

Was
ARGBToRGB565Dither_Opt (2894 ms)
Now
ARGBToRGB565Dither_Opt (432 ms)

TBR=harryjin@google.com
BUG=libyuv:492

Review URL: https://codereview.chromium.org/1397673002 .
parent 3e38762d
......@@ -91,6 +91,7 @@ extern "C" {
#define HAS_ARGBTOARGB4444ROW_SSE2
#define HAS_ARGBTORAWROW_SSSE3
#define HAS_ARGBTORGB24ROW_SSSE3
#define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565ROW_SSE2
#define HAS_ARGBTOUV422ROW_SSSE3
#define HAS_ARGBTOUV444ROW_SSSE3
......@@ -102,8 +103,12 @@ extern "C" {
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
#define HAS_H422TOABGRROW_SSSE3
#define HAS_H422TOARGBROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#define HAS_I411TOARGBROW_SSSE3
#define HAS_I422ALPHATOABGRROW_SSSE3
#define HAS_I422ALPHATOARGBROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
......@@ -115,11 +120,11 @@ extern "C" {
#define HAS_I422TORGBAROW_SSSE3
#define HAS_I422TOUYVYROW_SSE2
#define HAS_I422TOYUY2ROW_SSE2
#define HAS_I444TOABGRROW_SSSE3
#define HAS_I444TOARGBROW_SSSE3
#define HAS_J400TOARGBROW_SSE2
#define HAS_J422TOARGBROW_SSSE3
#define HAS_J422TOABGRROW_SSSE3
#define HAS_H422TOARGBROW_SSSE3
#define HAS_H422TOABGRROW_SSSE3
#define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORROW_UV_SSSE3
......@@ -145,10 +150,6 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2
#define HAS_I444TOARGBROW_SSSE3
#define HAS_I444TOABGRROW_SSSE3
#define HAS_I422ALPHATOARGBROW_SSSE3
#define HAS_I422ALPHATOABGRROW_SSSE3
// Effects:
#define HAS_ARGBADDROW_SSE2
......@@ -184,10 +185,10 @@ extern "C" {
// The following are also available on x64 Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
(!defined(__clang__) || defined(__SSSE3__))
#define HAS_I422TOARGBROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422ALPHATOARGBROW_SSSE3
#define HAS_I422ALPHATOABGRROW_SSSE3
#define HAS_I422ALPHATOARGBROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
#endif
// The following are available for AVX2 Visual C and clangcl 32 bit:
......@@ -199,17 +200,16 @@ extern "C" {
#define HAS_ARGBTOARGB1555ROW_AVX2
#define HAS_ARGBTOARGB4444ROW_AVX2
#define HAS_ARGBTORGB565DITHERROW_AVX2
#define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565ROW_AVX2
#define HAS_I411TOARGBROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
#define HAS_I422TORGB565ROW_AVX2
#define HAS_I444TOARGBROW_AVX2
#define HAS_I444TOABGRROW_AVX2
#define HAS_I444TOARGBROW_AVX2
#define HAS_J400TOARGBROW_AVX2
#define HAS_RGB565TOARGBROW_AVX2
#define HAS_NV12TORGB565ROW_AVX2
#define HAS_RGB565TOARGBROW_AVX2
#endif
// The following are available on all x86 platforms, but
......@@ -226,7 +226,11 @@ extern "C" {
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
#define HAS_COPYROW_AVX
#define HAS_H422TOABGRROW_AVX2
#define HAS_H422TOARGBROW_AVX2
#define HAS_I400TOARGBROW_AVX2
#define HAS_I422ALPHATOABGRROW_AVX2
#define HAS_I422ALPHATOARGBROW_AVX2
#define HAS_I422TOABGRROW_AVX2
#define HAS_I422TOARGBROW_AVX2
#define HAS_I422TOBGRAROW_AVX2
......@@ -234,12 +238,12 @@ extern "C" {
#define HAS_I422TORGB24ROW_AVX2
#define HAS_I422TORGBAROW_AVX2
#define HAS_INTERPOLATEROW_AVX2
#define HAS_J422TOARGBROW_AVX2
#define HAS_J422TOABGRROW_AVX2
#define HAS_H422TOARGBROW_AVX2
#define HAS_H422TOABGRROW_AVX2
#define HAS_J422TOARGBROW_AVX2
#define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
#define HAS_NV21TOARGBROW_AVX2
#define HAS_SPLITUVROW_AVX2
#define HAS_UYVYTOARGBROW_AVX2
#define HAS_UYVYTOUV422ROW_AVX2
......@@ -249,10 +253,6 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
#define HAS_NV21TOARGBROW_AVX2
#define HAS_I422ALPHATOARGBROW_AVX2
#define HAS_I422ALPHATOABGRROW_AVX2
// Effects:
#define HAS_ARGBADDROW_AVX2
......@@ -273,10 +273,12 @@ extern "C" {
#define HAS_ARGB4444TOARGBROW_NEON
#define HAS_ARGB4444TOUVROW_NEON
#define HAS_ARGB4444TOYROW_NEON
#define HAS_ARGBSETROW_NEON
#define HAS_ARGBTOARGB1555ROW_NEON
#define HAS_ARGBTOARGB4444ROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORGB565DITHERROW_NEON
#define HAS_ARGBTORGB565ROW_NEON
#define HAS_ARGBTOUV411ROW_NEON
#define HAS_ARGBTOUV422ROW_NEON
......@@ -288,19 +290,12 @@ extern "C" {
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_COPYROW_NEON
#define HAS_J400TOARGBROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_I411TOARGBROW_NEON
#define HAS_I422TOARGBROW_NEON
#define HAS_I422TOABGRROW_NEON
#define HAS_I422TOARGB1555ROW_NEON
#define HAS_I422TOARGB4444ROW_NEON
// TODO(fbarchard): Implement aarch64 neon version
#ifndef __aarch64__
#define HAS_J422TOARGBROW_NEON
#define HAS_J422TOABGRROW_NEON
#define HAS_H422TOARGBROW_NEON
#define HAS_H422TOABGRROW_NEON
#endif
#define HAS_I422TOARGBROW_NEON
#define HAS_I422TOBGRAROW_NEON
#define HAS_I422TORAWROW_NEON
#define HAS_I422TORGB24ROW_NEON
......@@ -309,6 +304,7 @@ extern "C" {
#define HAS_I422TOUYVYROW_NEON
#define HAS_I422TOYUY2ROW_NEON
#define HAS_I444TOARGBROW_NEON
#define HAS_J400TOARGBROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
......@@ -327,29 +323,28 @@ extern "C" {
#define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_ARGBSETROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_YUY2TOARGBROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
#define HAS_ARGBTORGB565DITHERROW_NEON
// Effects:
#define HAS_ARGBADDROW_NEON
#define HAS_ARGBATTENUATEROW_NEON
#define HAS_ARGBBLENDROW_NEON
#define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_ARGBGRAYROW_NEON
#define HAS_ARGBMIRRORROW_NEON
#define HAS_ARGBMULTIPLYROW_NEON
#define HAS_ARGBQUANTIZEROW_NEON
#define HAS_ARGBSEPIAROW_NEON
#define HAS_ARGBSHADEROW_NEON
#define HAS_ARGBSHUFFLEROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON
#define HAS_INTERPOLATEROW_NEON
#define HAS_SOBELROW_NEON
......@@ -357,8 +352,6 @@ extern "C" {
#define HAS_SOBELXROW_NEON
#define HAS_SOBELXYROW_NEON
#define HAS_SOBELYROW_NEON
#define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_ARGBSHUFFLEROW_NEON
#endif
// The following are available on Mips platforms:
......@@ -457,9 +450,9 @@ struct YuvConstants {
#define KYTORGB 192
#endif
extern struct YuvConstants kYuvConstants;
extern struct YuvConstants kYuvJConstants;
extern struct YuvConstants kYuvHConstants;
extern struct YuvConstants kYuvConstants; // BT.601
extern struct YuvConstants kYuvJConstants; // JPeg color space
extern struct YuvConstants kYuvHConstants; // BT.709
#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
#define OMITFP
......
......@@ -526,6 +526,52 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
);
}
void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
const uint32 dither4, int pix) {
asm volatile (
"movd %3,%%xmm6 \n"
"punpcklbw %%xmm6,%%xmm6 \n"
"movdqa %%xmm6,%%xmm7 \n"
"punpcklwd %%xmm6,%%xmm6 \n"
"punpckhwd %%xmm7,%%xmm7 \n"
"pcmpeqb %%xmm3,%%xmm3 \n"
"psrld $0x1b,%%xmm3 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrld $0x1a,%%xmm4 \n"
"pslld $0x5,%%xmm4 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"paddusb %%xmm6,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pslld $0x8,%%xmm0 \n"
"psrld $0x3,%%xmm1 \n"
"psrld $0x5,%%xmm2 \n"
"psrad $0x10,%%xmm0 \n"
"pand %%xmm3,%%xmm1 \n"
"pand %%xmm4,%%xmm2 \n"
"pand %%xmm5,%%xmm0 \n"
"por %%xmm2,%%xmm1 \n"
"por %%xmm1,%%xmm0 \n"
"packssdw %%xmm0,%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(pix) // %2
: "m"(dither4) // %3
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
......
......@@ -833,7 +833,6 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
// 4 pixels
__declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
......@@ -871,7 +870,6 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
// 8 pixels
__declspec(naked)
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment