Commit 7060e0d8 authored by Frank Barchard's avatar Frank Barchard

I420ToABGRMatrix functions with J420ToABGR wrapper.

Allows direct conversion from JPEG to ABGR for Android.

BUG=libyuv:488
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/55719004 .
parent fbc3d595
...@@ -149,6 +149,22 @@ int J422ToARGB(const uint8* src_y, int src_stride_y, ...@@ -149,6 +149,22 @@ int J422ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height); int width, int height);
// Convert J420 to ABGR.
LIBYUV_API
int J420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert J422 to ABGR.
LIBYUV_API
int J422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// BGRA little endian (argb in memory) to ARGB. // BGRA little endian (argb in memory) to ARGB.
LIBYUV_API LIBYUV_API
int BGRAToARGB(const uint8* src_frame, int src_stride_frame, int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
......
...@@ -89,6 +89,7 @@ extern "C" { ...@@ -89,6 +89,7 @@ extern "C" {
#define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3
#define HAS_I422TOARGBMATRIXROW_SSSE3 #define HAS_I422TOARGBMATRIXROW_SSSE3
#define HAS_I422TOABGRMATRIXROW_SSSE3
#define HAS_I422TOBGRAROW_SSSE3 #define HAS_I422TOBGRAROW_SSSE3
#define HAS_I422TORAWROW_SSSE3 #define HAS_I422TORAWROW_SSSE3
#define HAS_I422TORGB24ROW_SSSE3 #define HAS_I422TORGB24ROW_SSSE3
...@@ -99,6 +100,7 @@ extern "C" { ...@@ -99,6 +100,7 @@ extern "C" {
#define HAS_I444TOARGBROW_SSSE3 #define HAS_I444TOARGBROW_SSSE3
#define HAS_J400TOARGBROW_SSE2 #define HAS_J400TOARGBROW_SSE2
#define HAS_J422TOARGBROW_SSSE3 #define HAS_J422TOARGBROW_SSSE3
#define HAS_J422TOABGRROW_SSSE3
#define HAS_MERGEUVROW_SSE2 #define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSE2 #define HAS_MIRRORROW_SSE2
#define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORROW_SSSE3
...@@ -162,7 +164,9 @@ extern "C" { ...@@ -162,7 +164,9 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ #if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
(!defined(__clang__) || defined(__SSSE3__)) (!defined(__clang__) || defined(__SSSE3__))
#define HAS_I422TOARGBROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGBMATRIXROW_SSSE3 #define HAS_I422TOARGBMATRIXROW_SSSE3
#define HAS_I422TOABGRMATRIXROW_SSSE3
#endif #endif
// GCC >= 4.7.0 required for AVX2. // GCC >= 4.7.0 required for AVX2.
...@@ -226,12 +230,14 @@ extern "C" { ...@@ -226,12 +230,14 @@ extern "C" {
#define HAS_I422TOABGRROW_AVX2 #define HAS_I422TOABGRROW_AVX2
#define HAS_I422TOARGBROW_AVX2 #define HAS_I422TOARGBROW_AVX2
#define HAS_I422TOARGBMATRIXROW_AVX2 #define HAS_I422TOARGBMATRIXROW_AVX2
#define HAS_I422TOABGRMATRIXROW_AVX2
#define HAS_I422TOBGRAROW_AVX2 #define HAS_I422TOBGRAROW_AVX2
#define HAS_I422TORAWROW_AVX2 #define HAS_I422TORAWROW_AVX2
#define HAS_I422TORGB24ROW_AVX2 #define HAS_I422TORGB24ROW_AVX2
#define HAS_I422TORGBAROW_AVX2 #define HAS_I422TORGBAROW_AVX2
#define HAS_INTERPOLATEROW_AVX2 #define HAS_INTERPOLATEROW_AVX2
#define HAS_J422TOARGBROW_AVX2 #define HAS_J422TOARGBROW_AVX2
#define HAS_J422TOABGRROW_AVX2
#define HAS_MERGEUVROW_AVX2 #define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2 #define HAS_MIRRORROW_AVX2
#define HAS_SPLITUVROW_AVX2 #define HAS_SPLITUVROW_AVX2
...@@ -294,7 +300,8 @@ extern "C" { ...@@ -294,7 +300,8 @@ extern "C" {
#define HAS_I422TOARGB4444ROW_NEON #define HAS_I422TOARGB4444ROW_NEON
#define HAS_I422TOARGBROW_NEON #define HAS_I422TOARGBROW_NEON
// TODO(fbarchard): Implement NEON version // TODO(fbarchard): Implement NEON version
#define HAS_I422TOARGBMATRIXROW_NEON // #define HAS_I422TOARGBMATRIXROW_NEON
// #define HAS_I422TOABGRMATRIXROW_NEON
#define HAS_I422TOBGRAROW_NEON #define HAS_I422TOBGRAROW_NEON
#define HAS_I422TORAWROW_NEON #define HAS_I422TORAWROW_NEON
#define HAS_I422TORGB24ROW_NEON #define HAS_I422TORGB24ROW_NEON
...@@ -532,6 +539,12 @@ void I422ToARGBMatrixRow_NEON(const uint8* src_y, ...@@ -532,6 +539,12 @@ void I422ToARGBMatrixRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
struct YuvConstants* YuvConstants, struct YuvConstants* YuvConstants,
int width); int width);
void I422ToABGRMatrixRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width);
void I411ToARGBRow_NEON(const uint8* src_y, void I411ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -991,6 +1004,12 @@ void I422ToARGBMatrixRow_C(const uint8* src_y, ...@@ -991,6 +1004,12 @@ void I422ToARGBMatrixRow_C(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
struct YuvConstants* YuvConstants, struct YuvConstants* YuvConstants,
int width); int width);
void I422ToABGRMatrixRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width);
void I411ToARGBRow_C(const uint8* src_y, void I411ToARGBRow_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -1023,6 +1042,11 @@ void J422ToARGBRow_C(const uint8* src_y, ...@@ -1023,6 +1042,11 @@ void J422ToARGBRow_C(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void J422ToABGRRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void I422ToBGRARow_C(const uint8* src_y, void I422ToBGRARow_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -1074,6 +1098,12 @@ void I422ToARGBMatrixRow_AVX2(const uint8* src_y, ...@@ -1074,6 +1098,12 @@ void I422ToARGBMatrixRow_AVX2(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
struct YuvConstants* YuvConstants, struct YuvConstants* YuvConstants,
int width); int width);
void I422ToABGRMatrixRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width);
void I422ToBGRARow_AVX2(const uint8* src_y, void I422ToBGRARow_AVX2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -1110,6 +1140,12 @@ void I422ToARGBMatrixRow_SSSE3(const uint8* src_y, ...@@ -1110,6 +1140,12 @@ void I422ToARGBMatrixRow_SSSE3(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
struct YuvConstants* YuvConstants, struct YuvConstants* YuvConstants,
int width); int width);
void I422ToABGRMatrixRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width);
void I411ToARGBRow_SSSE3(const uint8* src_y, void I411ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -1169,11 +1205,21 @@ void J422ToARGBRow_SSSE3(const uint8* src_y, ...@@ -1169,11 +1205,21 @@ void J422ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void J422ToABGRRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void J422ToARGBRow_AVX2(const uint8* src_y, void J422ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void J422ToABGRRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void I422ToBGRARow_SSSE3(const uint8* src_y, void I422ToBGRARow_SSSE3(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -1333,11 +1379,21 @@ void J422ToARGBRow_Any_SSSE3(const uint8* src_y, ...@@ -1333,11 +1379,21 @@ void J422ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void J422ToABGRRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void J422ToARGBRow_Any_AVX2(const uint8* src_y, void J422ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void J422ToABGRRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void I422ToBGRARow_Any_SSSE3(const uint8* src_y, void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
......
...@@ -1398,6 +1398,152 @@ int J422ToARGB(const uint8* src_y, int src_stride_y, ...@@ -1398,6 +1398,152 @@ int J422ToARGB(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Convert J420 to ABGR.
//
// J420 is planar YUV with 2x2-subsampled U/V planes using the JPEG ("J")
// colorspace constants; output ABGR is 4 bytes per pixel.
// Returns 0 on success, -1 on invalid arguments.
// A negative height flips the output vertically.
LIBYUV_API
int J420ToABGR(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_abgr, int dst_stride_abgr,
               int width, int height) {
  int y;
  // Row converter: starts as the portable C version and is upgraded below
  // when a SIMD variant is compiled in and the CPU supports it.
  void (*J422ToABGRRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width) = J422ToABGRRow_C;
  if (!src_y || !src_u || !src_v || !dst_abgr ||
      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image: start at the last output row
  // and walk the destination backwards.
  if (height < 0) {
    height = -height;
    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
    dst_stride_abgr = -dst_stride_abgr;
  }
  // SIMD dispatch. The _Any_ variants handle widths that are not a multiple
  // of the vector width; the aligned-width variants are used when possible.
#if defined(HAS_J422TOABGRROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    J422ToABGRRow = J422ToABGRRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
      J422ToABGRRow = J422ToABGRRow_SSSE3;
    }
  }
#endif
#if defined(HAS_J422TOABGRROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    J422ToABGRRow = J422ToABGRRow_Any_AVX2;
    if (IS_ALIGNED(width, 16)) {
      J422ToABGRRow = J422ToABGRRow_AVX2;
    }
  }
#endif
#if defined(HAS_J422TOABGRROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    J422ToABGRRow = J422ToABGRRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      J422ToABGRRow = J422ToABGRRow_NEON;
    }
  }
#endif
#if defined(HAS_J422TOABGRROW_MIPS_DSPR2)
  // MIPS DSPR2 path requires aligned pointers and strides in addition to
  // an aligned width.
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
      IS_ALIGNED(dst_abgr, 4) && IS_ALIGNED(dst_stride_abgr, 4)) {
    J422ToABGRRow = J422ToABGRRow_MIPS_DSPR2;
  }
#endif
  for (y = 0; y < height; ++y) {
    J422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
    dst_abgr += dst_stride_abgr;
    src_y += src_stride_y;
    // 420: U/V planes are vertically subsampled, so advance them only
    // every other Y row.
    if (y & 1) {
      src_u += src_stride_u;
      src_v += src_stride_v;
    }
  }
  return 0;
}
// Convert J422 to ABGR.
//
// J422 is planar YUV with U/V subsampled horizontally only (2x1) using the
// JPEG ("J") colorspace constants; output ABGR is 4 bytes per pixel.
// Returns 0 on success, -1 on invalid arguments.
// A negative height flips the output vertically.
LIBYUV_API
int J422ToABGR(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_abgr, int dst_stride_abgr,
               int width, int height) {
  int y;
  // Row converter: starts as the portable C version and is upgraded below
  // when a SIMD variant is compiled in and the CPU supports it.
  void (*J422ToABGRRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width) = J422ToABGRRow_C;
  if (!src_y || !src_u || !src_v ||
      !dst_abgr ||
      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image: start at the last output row
  // and walk the destination backwards.
  if (height < 0) {
    height = -height;
    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
    dst_stride_abgr = -dst_stride_abgr;
  }
  // Coalesce rows: when every plane is stored contiguously (stride equals
  // the packed row size), the whole image can be processed as one long row,
  // reducing per-row call overhead.
  if (src_stride_y == width &&
      src_stride_u * 2 == width &&
      src_stride_v * 2 == width &&
      dst_stride_abgr == width * 4) {
    width *= height;
    height = 1;
    src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
  }
  // SIMD dispatch. The _Any_ variants handle widths that are not a multiple
  // of the vector width; the aligned-width variants are used when possible.
#if defined(HAS_J422TOABGRROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    J422ToABGRRow = J422ToABGRRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
      J422ToABGRRow = J422ToABGRRow_SSSE3;
    }
  }
#endif
#if defined(HAS_J422TOABGRROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    J422ToABGRRow = J422ToABGRRow_Any_AVX2;
    if (IS_ALIGNED(width, 16)) {
      J422ToABGRRow = J422ToABGRRow_AVX2;
    }
  }
#endif
#if defined(HAS_J422TOABGRROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    J422ToABGRRow = J422ToABGRRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      J422ToABGRRow = J422ToABGRRow_NEON;
    }
  }
#endif
#if defined(HAS_J422TOABGRROW_MIPS_DSPR2)
  // MIPS DSPR2 path requires aligned pointers and strides in addition to
  // an aligned width.
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
      IS_ALIGNED(dst_abgr, 4) && IS_ALIGNED(dst_stride_abgr, 4)) {
    J422ToABGRRow = J422ToABGRRow_MIPS_DSPR2;
  }
#endif
  // 422: U/V advance every row (no vertical subsampling).
  for (y = 0; y < height; ++y) {
    J422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
    dst_abgr += dst_stride_abgr;
    src_y += src_stride_y;
    src_u += src_stride_u;
    src_v += src_stride_v;
  }
  return 0;
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -43,6 +43,7 @@ extern "C" { ...@@ -43,6 +43,7 @@ extern "C" {
#ifdef HAS_I422TOARGBROW_SSSE3 #ifdef HAS_I422TOARGBROW_SSSE3
ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
ANY31(J422ToABGRRow_Any_SSSE3, J422ToABGRRow_SSSE3, 1, 0, 4, 7)
#endif #endif
#ifdef HAS_I444TOARGBROW_SSSE3 #ifdef HAS_I444TOARGBROW_SSSE3
ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
...@@ -70,6 +71,9 @@ ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7) ...@@ -70,6 +71,9 @@ ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7)
#ifdef HAS_J422TOARGBROW_AVX2 #ifdef HAS_J422TOARGBROW_AVX2
ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15) ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif #endif
#ifdef HAS_J422TOABGRROW_AVX2
ANY31(J422ToABGRRow_Any_AVX2, J422ToABGRRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TOARGBROW_AVX2 #ifdef HAS_I422TOARGBROW_AVX2
ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif #endif
......
...@@ -1179,6 +1179,31 @@ void J422ToARGBRow_C(const uint8* src_y, ...@@ -1179,6 +1179,31 @@ void J422ToARGBRow_C(const uint8* src_y,
} }
} }
// C reference row converter: one row of 422 YUV (JPEG "J" constants) to
// ABGR. ABGR memory layout per pixel: R at byte 0, G at 1, B at 2, A at 3.
void J422ToABGRRow_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,
                     uint8* rgb_buf,
                     int width) {
  int x;
  // Each U/V sample is shared by a pair of Y samples (2x1 subsampling).
  for (x = 0; x + 1 < width; x += 2) {
    YuvJPixel(src_y[0], src_u[0], src_v[0],
              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
    rgb_buf[3] = 255;  // Opaque alpha.
    YuvJPixel(src_y[1], src_u[0], src_v[0],
              rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
    rgb_buf[7] = 255;  // Opaque alpha.
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  // Odd width: emit the trailing pixel using the final U/V sample.
  if (width & 1) {
    YuvJPixel(src_y[0], src_u[0], src_v[0],
              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
    rgb_buf[3] = 255;
  }
}
void I422ToRGB24Row_C(const uint8* src_y, void I422ToRGB24Row_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -2156,49 +2181,33 @@ void I422ToUYVYRow_C(const uint8* src_y, ...@@ -2156,49 +2181,33 @@ void I422ToUYVYRow_C(const uint8* src_y,
} }
} }
#if defined(HAS_I422TOARGBMATRIXROW_SSSE3)
extern struct YuvConstants kYuvConstants; extern struct YuvConstants kYuvConstants;
extern struct YuvConstants kYuvJConstants; extern struct YuvConstants kYuvJConstants;
// JPeg color space version of I422ToARGB #define ANYYUV(NAMEANY, ANY_SIMD, YUVCONSTANTS) \
void J422ToARGBRow_SSSE3(const uint8* y_buf, void NAMEANY(const uint8* y_buf, \
const uint8* u_buf, const uint8* u_buf, \
const uint8* v_buf, const uint8* v_buf, \
uint8* dst_argb, uint8* dst_argb, \
int width) { int width) { \
I422ToARGBMatrixRow_SSSE3(y_buf, u_buf, v_buf, dst_argb, ANY_SIMD(y_buf, u_buf, v_buf, dst_argb, &YUVCONSTANTS, width); \
&kYuvJConstants, width); }
}
#ifdef HAS_I422TOARGBMATRIXROW_SSSE3
void I422ToARGBRow_SSSE3(const uint8* y_buf, ANYYUV(I422ToARGBRow_SSSE3, I422ToARGBMatrixRow_SSSE3, kYuvConstants)
const uint8* u_buf, ANYYUV(J422ToARGBRow_SSSE3, I422ToARGBMatrixRow_SSSE3, kYuvJConstants)
const uint8* v_buf, #endif
uint8* dst_argb, #ifdef HAS_I422TOARGBMATRIXROW_AVX2
int width) { ANYYUV(I422ToARGBRow_AVX2, I422ToARGBMatrixRow_AVX2, kYuvConstants)
I422ToARGBMatrixRow_SSSE3(y_buf, u_buf, v_buf, dst_argb, ANYYUV(J422ToARGBRow_AVX2, I422ToARGBMatrixRow_AVX2, kYuvJConstants)
&kYuvConstants, width); #endif
} #ifdef HAS_I422TOABGRMATRIXROW_SSSE3
ANYYUV(I422ToABGRRow_SSSE3, I422ToABGRMatrixRow_SSSE3, kYuvConstants)
#if defined(HAS_I422TOARGBMATRIXROW_AVX2) ANYYUV(J422ToABGRRow_SSSE3, I422ToABGRMatrixRow_SSSE3, kYuvJConstants)
// JPeg color space version of I422ToARGB
void J422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
I422ToARGBMatrixRow_AVX2(y_buf, u_buf, v_buf, dst_argb,
&kYuvJConstants, width);
}
void I422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
I422ToARGBMatrixRow_AVX2(y_buf, u_buf, v_buf, dst_argb,
&kYuvConstants, width);
}
#endif #endif
#ifdef HAS_I422TOABGRMATRIXROW_AVX2
ANYYUV(I422ToABGRRow_AVX2, I422ToABGRMatrixRow_AVX2, kYuvConstants)
ANYYUV(J422ToABGRRow_AVX2, I422ToABGRMatrixRow_AVX2, kYuvJConstants)
#endif #endif
// Maximum temporary width for wrappers to process at a time, in pixels. // Maximum temporary width for wrappers to process at a time, in pixels.
......
...@@ -1669,7 +1669,7 @@ void OMITFP I422ToARGBMatrixRow_SSSE3(const uint8* y_buf, ...@@ -1669,7 +1669,7 @@ void OMITFP I422ToARGBMatrixRow_SSSE3(const uint8* y_buf,
[v_buf]"+r"(v_buf), // %[v_buf] [v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb] [dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(YuvConstants) // %[YuvConstants] : [kYuvConstants]"r"(YuvConstants) // %[kYuvConstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
); );
...@@ -1773,11 +1773,12 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -1773,11 +1773,12 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
); );
} }
void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, void OMITFP I422ToABGRMatrixRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_abgr, uint8* dst_abgr,
int width) { struct YuvConstants* YuvConstants,
int width) {
asm volatile ( asm volatile (
"sub %[u_buf],%[v_buf] \n" "sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
...@@ -1793,7 +1794,7 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -1793,7 +1794,7 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
[v_buf]"+r"(v_buf), // %[v_buf] [v_buf]"+r"(v_buf), // %[v_buf]
[dst_abgr]"+r"(dst_abgr), // %[dst_abgr] [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : [kYuvConstants]"r"(YuvConstants) // %[kYuvConstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
); );
...@@ -1940,7 +1941,7 @@ void OMITFP I422ToARGBMatrixRow_AVX2(const uint8* y_buf, ...@@ -1940,7 +1941,7 @@ void OMITFP I422ToARGBMatrixRow_AVX2(const uint8* y_buf,
[v_buf]"+r"(v_buf), // %[v_buf] [v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb] [dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(YuvConstants) // %[YuvConstants] : [kYuvConstants]"r"(YuvConstants) // %[kYuvConstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
); );
...@@ -1950,11 +1951,12 @@ void OMITFP I422ToARGBMatrixRow_AVX2(const uint8* y_buf, ...@@ -1950,11 +1951,12 @@ void OMITFP I422ToARGBMatrixRow_AVX2(const uint8* y_buf,
#if defined(HAS_I422TOABGRROW_AVX2) #if defined(HAS_I422TOABGRROW_AVX2)
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, void OMITFP I422ToABGRMatrixRow_AVX2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_argb, uint8* dst_argb,
int width) { struct YuvConstants* YuvConstants,
int width) {
asm volatile ( asm volatile (
"sub %[u_buf],%[v_buf] \n" "sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
...@@ -1981,7 +1983,7 @@ void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -1981,7 +1983,7 @@ void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
[v_buf]"+r"(v_buf), // %[v_buf] [v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb] [dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : [kYuvConstants]"r"(YuvConstants) // %[kYuvConstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
); );
......
...@@ -145,6 +145,62 @@ YuvConstants SIMD_ALIGNED(kYuvJConstants) = { ...@@ -145,6 +145,62 @@ YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
// 64 bit // 64 bit
#if defined(_M_X64) #if defined(_M_X64)
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
u_buf += 4;
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) \
xmm1 = _mm_loadu_si128(&xmm0); \
xmm2 = _mm_loadu_si128(&xmm0); \
xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)YuvConstants->kUVToB); \
xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)YuvConstants->kUVToG); \
xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)YuvConstants->kUVToR); \
xmm0 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasB, xmm0); \
xmm1 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasG, xmm1); \
xmm2 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasR, xmm2); \
xmm3 = _mm_loadl_epi64((__m128i*)y_buf); \
y_buf += 8; \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm3); \
xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)YuvConstants->kYToRgb); \
xmm0 = _mm_adds_epi16(xmm0, xmm3); \
xmm1 = _mm_adds_epi16(xmm1, xmm3); \
xmm2 = _mm_adds_epi16(xmm2, xmm3); \
xmm0 = _mm_srai_epi16(xmm0, 6); \
xmm1 = _mm_srai_epi16(xmm1, 6); \
xmm2 = _mm_srai_epi16(xmm2, 6); \
xmm0 = _mm_packus_epi16(xmm0, xmm0); \
xmm1 = _mm_packus_epi16(xmm1, xmm1); \
xmm2 = _mm_packus_epi16(xmm2, xmm2);
// Store 8 ARGB values.
#define STOREARGB \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
xmm1 = _mm_loadu_si128(&xmm0); \
xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
_mm_storeu_si128((__m128i *)dst_argb, xmm0); \
_mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \
dst_argb += 32;
// Store 8 ABGR values.
#define STOREABGR \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm5); \
xmm1 = _mm_loadu_si128(&xmm2); \
xmm2 = _mm_unpacklo_epi16(xmm2, xmm0); \
xmm1 = _mm_unpackhi_epi16(xmm1, xmm0); \
_mm_storeu_si128((__m128i *)dst_argb, xmm2); \
_mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \
dst_argb += 32;
#if defined(HAS_I422TOARGBMATRIXROW_SSSE3) #if defined(HAS_I422TOARGBMATRIXROW_SSSE3)
void I422ToARGBMatrixRow_SSSE3(const uint8* y_buf, void I422ToARGBMatrixRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
...@@ -155,44 +211,29 @@ void I422ToARGBMatrixRow_SSSE3(const uint8* y_buf, ...@@ -155,44 +211,29 @@ void I422ToARGBMatrixRow_SSSE3(const uint8* y_buf,
__m128i xmm0, xmm1, xmm2, xmm3; __m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1); const __m128i xmm5 = _mm_set1_epi8(-1);
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
while (width > 0) { while (width > 0) {
xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); READYUV422
xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); YUVTORGB(YuvConstants)
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); STOREARGB
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); width -= 8;
xmm1 = _mm_loadu_si128(&xmm0); }
xmm2 = _mm_loadu_si128(&xmm0); }
xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)YuvConstants->kUVToB); #endif
xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)YuvConstants->kUVToG);
xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)YuvConstants->kUVToR);
xmm0 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasB, xmm0);
xmm1 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasG, xmm1);
xmm2 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasR, xmm2);
xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)YuvConstants->kYToRgb);
xmm0 = _mm_adds_epi16(xmm0, xmm3);
xmm1 = _mm_adds_epi16(xmm1, xmm3);
xmm2 = _mm_adds_epi16(xmm2, xmm3);
xmm0 = _mm_srai_epi16(xmm0, 6);
xmm1 = _mm_srai_epi16(xmm1, 6);
xmm2 = _mm_srai_epi16(xmm2, 6);
xmm0 = _mm_packus_epi16(xmm0, xmm0);
xmm1 = _mm_packus_epi16(xmm1, xmm1);
xmm2 = _mm_packus_epi16(xmm2, xmm2);
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
xmm1 = _mm_loadu_si128(&xmm0);
xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
_mm_storeu_si128((__m128i *)dst_argb, xmm0);
_mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
y_buf += 8; #if defined(HAS_I422TOABGRMATRIXROW_SSSE3)
u_buf += 4; void I422ToABGRMatrixRow_SSSE3(const uint8* y_buf,
dst_argb += 32; const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width) {
__m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1);
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
while (width > 0) {
READYUV422
YUVTORGB(YuvConstants)
STOREABGR
width -= 8; width -= 8;
} }
} }
...@@ -1962,7 +2003,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1962,7 +2003,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
// Convert 16 pixels: 16 UV and 16 Y. // Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) __asm { \ #define YUVTORGB_AVX2(YuvConstants) __asm { \
/* Step 1: Find 8 UV contributions to 16 R,G,B values */ \
__asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
__asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
__asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
...@@ -1991,7 +2031,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1991,7 +2031,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
// Store 16 ARGB values. // Store 16 ARGB values.
#define STOREARGB_AVX2 __asm { \ #define STOREARGB_AVX2 __asm { \
/* Step 3: Weave into ARGB */ \
__asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
__asm vpermq ymm0, ymm0, 0xd8 \ __asm vpermq ymm0, ymm0, 0xd8 \
__asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
...@@ -2003,6 +2042,45 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -2003,6 +2042,45 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm lea edx, [edx + 64] \ __asm lea edx, [edx + 64] \
} }
// Store 16 ABGR values.
#define STOREBGRA_AVX2 __asm { \
__asm vpunpcklbw ymm1, ymm1, ymm0 /* GB */ \
__asm vpermq ymm1, ymm1, 0xd8 \
__asm vpunpcklbw ymm2, ymm5, ymm2 /* AR */ \
__asm vpermq ymm2, ymm2, 0xd8 \
__asm vpunpcklwd ymm0, ymm2, ymm1 /* ARGB first 8 pixels */ \
__asm vpunpckhwd ymm2, ymm2, ymm1 /* ARGB next 8 pixels */ \
__asm vmovdqu [edx], ymm0 \
__asm vmovdqu [edx + 32], ymm2 \
__asm lea edx, [edx + 64] \
}
// Store 16 RGBA values.
#define STORERGBA_AVX2 __asm { \
__asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
__asm vpermq ymm1, ymm1, 0xd8 \
__asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
__asm vpermq ymm2, ymm2, 0xd8 \
__asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
__asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
__asm vmovdqu [edx], ymm0 \
__asm vmovdqu [edx + 32], ymm1 \
__asm lea edx, [edx + 64] \
}
// Store 16 ABGR values.
#define STOREABGR_AVX2 __asm { \
__asm vpunpcklbw ymm1, ymm2, ymm1 /* RG */ \
__asm vpermq ymm1, ymm1, 0xd8 \
__asm vpunpcklbw ymm2, ymm0, ymm5 /* BA */ \
__asm vpermq ymm2, ymm2, 0xd8 \
__asm vpunpcklwd ymm0, ymm1, ymm2 /* RGBA first 8 pixels */ \
__asm vpunpckhwd ymm1, ymm1, ymm2 /* RGBA next 8 pixels */ \
__asm vmovdqu [edx], ymm0 \
__asm vmovdqu [edx + 32], ymm1 \
__asm lea edx, [edx + 64] \
}
#ifdef HAS_I422TOARGBMATRIXROW_AVX2 #ifdef HAS_I422TOARGBMATRIXROW_AVX2
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
...@@ -2201,17 +2279,8 @@ void I422ToBGRARow_AVX2(const uint8* y_buf, ...@@ -2201,17 +2279,8 @@ void I422ToBGRARow_AVX2(const uint8* y_buf,
convertloop: convertloop:
READYUV422_AVX2 READYUV422_AVX2
YUVTORGB_AVX2(kYuvConstants) YUVTORGB_AVX2(kYuvConstants)
STOREBGRA_AVX2
// Step 3: Weave into BGRA
vpunpcklbw ymm1, ymm1, ymm0 // GB
vpermq ymm1, ymm1, 0xd8
vpunpcklbw ymm2, ymm5, ymm2 // AR
vpermq ymm2, ymm2, 0xd8
vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels
vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels
vmovdqu [edx], ymm0
vmovdqu [edx + 32], ymm2
lea edx, [edx + 64]
sub ecx, 16 sub ecx, 16
jg convertloop jg convertloop
...@@ -2226,7 +2295,6 @@ void I422ToBGRARow_AVX2(const uint8* y_buf, ...@@ -2226,7 +2295,6 @@ void I422ToBGRARow_AVX2(const uint8* y_buf,
#ifdef HAS_I422TORGBAROW_AVX2 #ifdef HAS_I422TORGBAROW_AVX2
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked) __declspec(naked)
void I422ToRGBARow_AVX2(const uint8* y_buf, void I422ToRGBARow_AVX2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
...@@ -2247,17 +2315,8 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, ...@@ -2247,17 +2315,8 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
convertloop: convertloop:
READYUV422_AVX2 READYUV422_AVX2
YUVTORGB_AVX2(kYuvConstants) YUVTORGB_AVX2(kYuvConstants)
STORERGBA_AVX2
// Step 3: Weave into RGBA
vpunpcklbw ymm1, ymm1, ymm2 // GR
vpermq ymm1, ymm1, 0xd8
vpunpcklbw ymm2, ymm5, ymm0 // AB
vpermq ymm2, ymm2, 0xd8
vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels
vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels
vmovdqu [edx], ymm0
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
sub ecx, 16 sub ecx, 16
jg convertloop jg convertloop
...@@ -2272,41 +2331,35 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, ...@@ -2272,41 +2331,35 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
#ifdef HAS_I422TOABGRROW_AVX2 #ifdef HAS_I422TOABGRROW_AVX2
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked) __declspec(naked)
void I422ToABGRRow_AVX2(const uint8* y_buf, void I422ToABGRMatrixRow_AVX2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_argb, uint8* dst_argb,
int width) { struct YuvConstants* YuvConstants,
int width) {
__asm { __asm {
push esi push esi
push edi push edi
mov eax, [esp + 8 + 4] // Y push ebp
mov esi, [esp + 8 + 8] // U mov eax, [esp + 12 + 4] // Y
mov edi, [esp + 8 + 12] // V mov esi, [esp + 12 + 8] // U
mov edx, [esp + 8 + 16] // argb mov edi, [esp + 12 + 12] // V
mov ecx, [esp + 8 + 20] // width mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants
mov ecx, [esp + 12 + 20] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop: convertloop:
READYUV422_AVX2 READYUV422_AVX2
YUVTORGB_AVX2(kYuvConstants) YUVTORGB_AVX2(ebp)
STOREABGR_AVX2
// Step 3: Weave into ABGR
vpunpcklbw ymm1, ymm2, ymm1 // RG
vpermq ymm1, ymm1, 0xd8
vpunpcklbw ymm2, ymm0, ymm5 // BA
vpermq ymm2, ymm2, 0xd8
vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels
vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels
vmovdqu [edx], ymm0
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
sub ecx, 16 sub ecx, 16
jg convertloop jg convertloop
pop ebp
pop edi pop edi
pop esi pop esi
vzeroupper vzeroupper
...@@ -2354,7 +2407,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2354,7 +2407,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
// Convert 8 pixels: 8 UV and 8 Y. // Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) __asm { \ #define YUVTORGB(YuvConstants) __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
__asm movdqa xmm1, xmm0 \ __asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \ __asm movdqa xmm2, xmm0 \
__asm movdqa xmm3, xmm0 \ __asm movdqa xmm3, xmm0 \
...@@ -2367,7 +2419,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2367,7 +2419,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
__asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
__asm psubw xmm2, xmm3 \ __asm psubw xmm2, xmm3 \
/* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] \ __asm movq xmm3, qword ptr [eax] \
__asm lea eax, [eax + 8] \ __asm lea eax, [eax + 8] \
__asm punpcklbw xmm3, xmm3 \ __asm punpcklbw xmm3, xmm3 \
...@@ -2385,7 +2436,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2385,7 +2436,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
// Store 8 ARGB values. // Store 8 ARGB values.
#define STOREARGB __asm { \ #define STOREARGB __asm { \
/* Step 3: Weave into ARGB */ \
__asm punpcklbw xmm0, xmm1 /* BG */ \ __asm punpcklbw xmm0, xmm1 /* BG */ \
__asm punpcklbw xmm2, xmm5 /* RA */ \ __asm punpcklbw xmm2, xmm5 /* RA */ \
__asm movdqa xmm1, xmm0 \ __asm movdqa xmm1, xmm0 \
...@@ -2398,7 +2448,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2398,7 +2448,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
// Store 8 BGRA values. // Store 8 BGRA values.
#define STOREBGRA __asm { \ #define STOREBGRA __asm { \
/* Step 3: Weave into BGRA */ \
__asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
__asm punpcklbw xmm1, xmm0 /* GB */ \ __asm punpcklbw xmm1, xmm0 /* GB */ \
__asm punpcklbw xmm5, xmm2 /* AR */ \ __asm punpcklbw xmm5, xmm2 /* AR */ \
...@@ -2412,7 +2461,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2412,7 +2461,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
// Store 8 ABGR values. // Store 8 ABGR values.
#define STOREABGR __asm { \ #define STOREABGR __asm { \
/* Step 3: Weave into ABGR */ \
__asm punpcklbw xmm2, xmm1 /* RG */ \ __asm punpcklbw xmm2, xmm1 /* RG */ \
__asm punpcklbw xmm0, xmm5 /* BA */ \ __asm punpcklbw xmm0, xmm5 /* BA */ \
__asm movdqa xmm1, xmm2 \ __asm movdqa xmm1, xmm2 \
...@@ -2425,7 +2473,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2425,7 +2473,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
// Store 8 RGBA values. // Store 8 RGBA values.
#define STORERGBA __asm { \ #define STORERGBA __asm { \
/* Step 3: Weave into RGBA */ \
__asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
__asm punpcklbw xmm1, xmm2 /* GR */ \ __asm punpcklbw xmm1, xmm2 /* GR */ \
__asm punpcklbw xmm5, xmm0 /* AB */ \ __asm punpcklbw xmm5, xmm0 /* AB */ \
...@@ -2439,13 +2486,13 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2439,13 +2486,13 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
// Store 8 RGB24 values. // Store 8 RGB24 values.
#define STORERGB24 __asm { \ #define STORERGB24 __asm { \
/* Step 3: Weave into RRGB */ \ /* Weave into RRGB */ \
__asm punpcklbw xmm0, xmm1 /* BG */ \ __asm punpcklbw xmm0, xmm1 /* BG */ \
__asm punpcklbw xmm2, xmm2 /* RR */ \ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \ __asm movdqa xmm1, xmm0 \
__asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
__asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
/* Step 4: RRGB -> RGB24 */ \ /* RRGB -> RGB24 */ \
__asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
__asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
__asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
...@@ -2456,7 +2503,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2456,7 +2503,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
// Store 8 RAW values. // Store 8 RAW values.
#define STORERAW __asm { \ #define STORERAW __asm { \
/* Step 3: Weave into RRGB */ \ /* Weave into RRGB */ \
__asm punpcklbw xmm0, xmm1 /* BG */ \ __asm punpcklbw xmm0, xmm1 /* BG */ \
__asm punpcklbw xmm2, xmm2 /* RR */ \ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \ __asm movdqa xmm1, xmm0 \
...@@ -2473,13 +2520,13 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2473,13 +2520,13 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
// Store 8 RGB565 values. // Store 8 RGB565 values.
#define STORERGB565 __asm { \ #define STORERGB565 __asm { \
/* Step 3: Weave into RRGB */ \ /* Weave into RRGB */ \
__asm punpcklbw xmm0, xmm1 /* BG */ \ __asm punpcklbw xmm0, xmm1 /* BG */ \
__asm punpcklbw xmm2, xmm2 /* RR */ \ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \ __asm movdqa xmm1, xmm0 \
__asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
__asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
/* Step 4: RRGB -> RGB565 */ \ /* RRGB -> RGB565 */ \
__asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
__asm movdqa xmm2, xmm0 /* G */ \ __asm movdqa xmm2, xmm0 /* G */ \
__asm pslld xmm0, 8 /* R */ \ __asm pslld xmm0, 8 /* R */ \
...@@ -2666,7 +2713,6 @@ void I422ToARGBMatrixRow_SSSE3(const uint8* y_buf, ...@@ -2666,7 +2713,6 @@ void I422ToARGBMatrixRow_SSSE3(const uint8* y_buf,
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // YuvConstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -2808,30 +2854,34 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -2808,30 +2854,34 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
} }
__declspec(naked) __declspec(naked)
void I422ToABGRRow_SSSE3(const uint8* y_buf, void I422ToABGRMatrixRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_abgr, uint8* dst_abgr,
int width) { struct YuvConstants* YuvConstants,
int width) {
__asm { __asm {
push esi push esi
push edi push edi
mov eax, [esp + 8 + 4] // Y push ebp
mov esi, [esp + 8 + 8] // U mov eax, [esp + 12 + 4] // Y
mov edi, [esp + 8 + 12] // V mov esi, [esp + 12 + 8] // U
mov edx, [esp + 8 + 16] // abgr mov edi, [esp + 12 + 12] // V
mov ecx, [esp + 8 + 20] // width mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants
mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop: convertloop:
READYUV422 READYUV422
YUVTORGB(kYuvConstants) YUVTORGB(ebp)
STOREABGR STOREABGR
sub ecx, 8 sub ecx, 8
jg convertloop jg convertloop
pop ebp
pop edi pop edi
pop esi pop esi
ret ret
......
...@@ -492,6 +492,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ ...@@ -492,6 +492,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1, 2, ARGB, 4)
...@@ -502,6 +503,7 @@ TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1, 9, ARGB, 4) ...@@ -502,6 +503,7 @@ TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1, 9, ARGB, 4)
TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1, 17, ARGB, 4) TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1, 17, ARGB, 4)
TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1, 2, ARGB, 4)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment