Commit febc26a2 authored by Frank Barchard

win64 version of I422AlphaToARGB.

Was
I420AlphaToARGB_Premult (8861 ms)
I420AlphaToARGB_Opt (7119 ms)
Now
I420AlphaToABGR_Premult (2840 ms)
I420AlphaToARGB_Opt (484 ms)

C function switched to 1 step.
Was
I420AlphaToARGB_Premult (8862 ms)
I420AlphaToABGR_Opt (6718 ms)

Now
I420AlphaToARGB_Premult (8706 ms)
I420AlphaToARGB_Opt (6541 ms)

R=harryjin@google.com
BUG=libyuv:496, libyuv:473

Review URL: https://codereview.chromium.org/1359183003 .
parent 9a0e12f5
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1494 Version: 1495
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -187,6 +187,8 @@ extern "C" { ...@@ -187,6 +187,8 @@ extern "C" {
(!defined(__clang__) || defined(__SSSE3__)) (!defined(__clang__) || defined(__SSSE3__))
#define HAS_I422TOARGBROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3
#define HAS_I422ALPHATOARGBROW_SSSE3
#define HAS_I422ALPHATOABGRROW_SSSE3
#endif #endif
// The following are available for AVX2 Visual C and clangcl 32 bit: // The following are available for AVX2 Visual C and clangcl 32 bit:
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1494 #define LIBYUV_VERSION 1495
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -1353,6 +1353,88 @@ void I422ToARGBRow_C(const uint8* src_y, ...@@ -1353,6 +1353,88 @@ void I422ToARGBRow_C(const uint8* src_y,
} }
} }
// Converts one row of I422 (planar Y with half-width U and V) plus a separate
// alpha plane into 4-byte pixels in a single pass. YuvPixel fills bytes 0..2
// of each output pixel and the matching alpha sample is copied into byte 3.
// The meaning of bytes 0..2 is defined by YuvPixel, which is not shown here
// (presumably B, G, R for ARGB - confirm against YuvPixel).
void I422AlphaToARGBRow_C(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          const uint8* src_a,
                          uint8* rgb_buf,
                          struct YuvConstants* yuvconstants,
                          int width) {
  int remaining;
  // Two pixels per pass: both pixels of a pair use the same U and V sample.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel(src_y[0], *src_u, *src_v,
             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
    rgb_buf[3] = src_a[0];
    YuvPixel(src_y[1], *src_u, *src_v,
             &rgb_buf[4], &rgb_buf[5], &rgb_buf[6], yuvconstants);
    rgb_buf[7] = src_a[1];
    src_y += 2;
    src_a += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two 4-byte pixels written.
  }
  // An odd width leaves one final pixel without a partner.
  if (width & 1) {
    YuvPixel(src_y[0], *src_u, *src_v,
             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
    rgb_buf[3] = src_a[0];
  }
}
// Converts one row of I422 (planar Y with half-width U and V) into 4-byte
// pixels with a constant alpha of 255. YuvPixel's three outputs are directed
// to bytes 2, 1, 0 of each pixel (reversed relative to the ARGB row function)
// and byte 3 is set to 255.
void I422ToABGRRow_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,
                     uint8* rgb_buf,
                     struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  // Two pixels per pass: both pixels of a pair use the same U and V sample.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel(src_y[0], *src_u, *src_v,
             &rgb_buf[2], &rgb_buf[1], &rgb_buf[0], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_y[1], *src_u, *src_v,
             &rgb_buf[6], &rgb_buf[5], &rgb_buf[4], yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two 4-byte pixels written.
  }
  // An odd width leaves one final pixel without a partner.
  if (width & 1) {
    YuvPixel(src_y[0], *src_u, *src_v,
             &rgb_buf[2], &rgb_buf[1], &rgb_buf[0], yuvconstants);
    rgb_buf[3] = 255;
  }
}
// Converts one row of I422 (planar Y with half-width U and V) plus a separate
// alpha plane into 4-byte pixels in a single pass. YuvPixel's three outputs
// are directed to bytes 2, 1, 0 of each pixel (reversed relative to the ARGB
// variant) and the matching alpha sample is copied into byte 3.
void I422AlphaToABGRRow_C(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          const uint8* src_a,
                          uint8* rgb_buf,
                          struct YuvConstants* yuvconstants,
                          int width) {
  int remaining;
  // Two pixels per pass: both pixels of a pair use the same U and V sample.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel(src_y[0], *src_u, *src_v,
             &rgb_buf[2], &rgb_buf[1], &rgb_buf[0], yuvconstants);
    rgb_buf[3] = src_a[0];
    YuvPixel(src_y[1], *src_u, *src_v,
             &rgb_buf[6], &rgb_buf[5], &rgb_buf[4], yuvconstants);
    rgb_buf[7] = src_a[1];
    src_y += 2;
    src_a += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two 4-byte pixels written.
  }
  // An odd width leaves one final pixel without a partner.
  if (width & 1) {
    YuvPixel(src_y[0], *src_u, *src_v,
             &rgb_buf[2], &rgb_buf[1], &rgb_buf[0], yuvconstants);
    rgb_buf[3] = src_a[0];
  }
}
void I422ToRGB24Row_C(const uint8* src_y, void I422ToRGB24Row_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -1687,32 +1769,6 @@ void I422ToBGRARow_C(const uint8* src_y, ...@@ -1687,32 +1769,6 @@ void I422ToBGRARow_C(const uint8* src_y,
} }
} }
// Converts one row of I422 (planar Y with half-width U and V) into 4-byte
// pixels with a constant alpha of 255. YuvPixel's three outputs are directed
// to bytes 2, 1, 0 of each pixel and byte 3 is set to 255.
void I422ToABGRRow_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,
                     uint8* rgb_buf,
                     struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  // Two pixels per pass: both pixels of a pair use the same U and V sample.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel(src_y[0], *src_u, *src_v,
             &rgb_buf[2], &rgb_buf[1], &rgb_buf[0], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_y[1], *src_u, *src_v,
             &rgb_buf[6], &rgb_buf[5], &rgb_buf[4], yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two 4-byte pixels written.
  }
  // An odd width leaves one final pixel without a partner.
  if (width & 1) {
    YuvPixel(src_y[0], *src_u, *src_v,
             &rgb_buf[2], &rgb_buf[1], &rgb_buf[0], yuvconstants);
    rgb_buf[3] = 255;
  }
}
void I422ToRGBARow_C(const uint8* src_y, void I422ToRGBARow_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -2412,29 +2468,6 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y, ...@@ -2412,29 +2468,6 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y,
} }
#endif #endif
// Converts one row of I422 plus a separate alpha plane to ARGB in two passes:
// first the YUV row is converted by I422ToARGBRow_C, then the alpha plane is
// written into the destination by ARGBCopyYToAlphaRow_C.
//   src_y, src_u, src_v: source planes for the row.
//   src_a:               source alpha plane for the row.
//   dst_argb:            destination row.
//   yuvconstants:        conversion constants, forwarded to I422ToARGBRow_C.
//   width:               number of pixels in the row.
void I422AlphaToARGBRow_C(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          const uint8* src_a,
                          uint8* dst_argb,
                          struct YuvConstants* yuvconstants,
                          int width) {
  // Forward the caller's constants; previously the parameter was ignored and
  // the global kYuvConstants was always used.
  I422ToARGBRow_C(src_y, src_u, src_v, dst_argb, yuvconstants, width);
  ARGBCopyYToAlphaRow_C(src_a, dst_argb, width);
}
// Converts one row of I422 plus a separate alpha plane to ABGR in two passes:
// first the YUV row is converted by I422ToABGRRow_C, then the alpha plane is
// written into the destination by ARGBCopyYToAlphaRow_C.
//   src_y, src_u, src_v: source planes for the row.
//   src_a:               source alpha plane for the row.
//   dst_abgr:            destination row.
//   yuvconstants:        conversion constants, forwarded to I422ToABGRRow_C.
//   width:               number of pixels in the row.
void I422AlphaToABGRRow_C(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          const uint8* src_a,
                          uint8* dst_abgr,
                          struct YuvConstants* yuvconstants,
                          int width) {
  // Forward the caller's constants; previously the parameter was ignored and
  // the global kYuvConstants was always used.
  I422ToABGRRow_C(src_y, src_u, src_v, dst_abgr, yuvconstants, width);
  ARGBCopyYToAlphaRow_C(src_a, dst_abgr, width);
}
#if defined(HAS_I422TOARGB1555ROW_SSSE3) #if defined(HAS_I422TOARGB1555ROW_SSSE3)
void I422ToARGB1555Row_SSSE3(const uint8* src_y, void I422ToARGB1555Row_SSSE3(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
......
...@@ -30,6 +30,17 @@ extern "C" { ...@@ -30,6 +30,17 @@ extern "C" {
// Read 4 UV from 422, upsample to 8 UV. // Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \ #define READYUV422 \
xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
u_buf += 4; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
y_buf += 8;
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
...@@ -38,6 +49,8 @@ extern "C" { ...@@ -38,6 +49,8 @@ extern "C" {
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
y_buf += 8; \ y_buf += 8; \
xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
a_buf += 8;
// Convert 8 pixels: 8 UV and 8 Y. // Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(yuvconstants) \ #define YUVTORGB(yuvconstants) \
...@@ -78,9 +91,9 @@ extern "C" { ...@@ -78,9 +91,9 @@ extern "C" {
xmm1 = _mm_loadu_si128(&xmm2); \ xmm1 = _mm_loadu_si128(&xmm2); \
xmm2 = _mm_unpacklo_epi16(xmm2, xmm0); \ xmm2 = _mm_unpacklo_epi16(xmm2, xmm0); \
xmm1 = _mm_unpackhi_epi16(xmm1, xmm0); \ xmm1 = _mm_unpackhi_epi16(xmm1, xmm0); \
_mm_storeu_si128((__m128i *)dst_argb, xmm2); \ _mm_storeu_si128((__m128i *)dst_abgr, xmm2); \
_mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ _mm_storeu_si128((__m128i *)(dst_abgr + 16), xmm1); \
dst_argb += 32; dst_abgr += 32;
#if defined(HAS_I422TOARGBROW_SSSE3) #if defined(HAS_I422TOARGBROW_SSSE3)
...@@ -106,7 +119,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -106,7 +119,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
void I422ToABGRRow_SSSE3(const uint8* y_buf, void I422ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_argb, uint8* dst_abgr,
struct YuvConstants* yuvconstants, struct YuvConstants* yuvconstants,
int width) { int width) {
__m128i xmm0, xmm1, xmm2, xmm4; __m128i xmm0, xmm1, xmm2, xmm4;
...@@ -120,6 +133,45 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -120,6 +133,45 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
} }
} }
#endif #endif
#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
// Intrinsics version of the I422 + alpha plane to ARGB row conversion.
// Each loop iteration handles 8 pixels: READYUVA422 loads the samples,
// YUVTORGB converts them and STOREARGB writes the result.
// NOTE(review): the loop runs while width > 0 and subtracts 8 each time, so a
// width that is not a multiple of 8 still executes a full 8-pixel iteration;
// presumably callers only pass multiples of 8 or use a remainder wrapper -
// confirm against the callers.
void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
const uint8* a_buf,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width) {
// These locals are referenced by name inside the macros below, so they must
// keep exactly these names.
__m128i xmm0, xmm1, xmm2, xmm4, xmm5;
// Byte distance from the U plane to the V plane; READYUVA422 reads V through
// u_buf + offset instead of using v_buf directly.
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
while (width > 0) {
// Loads U/V into xmm0, Y into xmm4 and 8 alpha bytes into xmm5; advances
// y_buf and a_buf by 8.
READYUVA422
YUVTORGB(yuvconstants)
// STOREARGB is not fully visible here; presumably it writes 8 pixels to
// dst_argb and advances the pointer - confirm against the macro.
STOREARGB
width -= 8;
}
}
#endif
#if defined(HAS_I422ALPHATOABGRROW_SSSE3)
// Intrinsics version of the I422 + alpha plane to ABGR row conversion.
// Each loop iteration handles 8 pixels: READYUVA422 loads the samples,
// YUVTORGB converts them and STOREABGR writes the result.
// NOTE(review): the loop runs while width > 0 and subtracts 8 each time, so a
// width that is not a multiple of 8 still executes a full 8-pixel iteration;
// presumably callers only pass multiples of 8 or use a remainder wrapper -
// confirm against the callers.
void I422AlphaToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
const uint8* a_buf,
uint8* dst_abgr,
struct YuvConstants* yuvconstants,
int width) {
// These locals are referenced by name inside the macros below, so they must
// keep exactly these names.
__m128i xmm0, xmm1, xmm2, xmm4, xmm5;
// Byte distance from the U plane to the V plane; READYUVA422 reads V through
// u_buf + offset instead of using v_buf directly.
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
while (width > 0) {
// Loads U/V into xmm0, Y into xmm4 and 8 alpha bytes into xmm5; advances
// y_buf and a_buf by 8.
READYUVA422
YUVTORGB(yuvconstants)
// Presumably writes 8 pixels (two 16-byte stores) to dst_abgr and advances
// it by 32 bytes - confirm against the STOREABGR macro definition.
STOREABGR
width -= 8;
}
}
#endif
// 32 bit // 32 bit
#else // defined(_M_X64) #else // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYROW_SSSE3
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment