Commit e214fe3f authored by fbarchard@google.com

I411ToARGB doing 2 UV values with 8 Y values

BUG=40
TEST=planar_test
Review URL: https://webrtc-codereview.appspot.com/637005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@277 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 6d6b7709
...@@ -31,6 +31,13 @@ void CopyPlane(const uint8* src_y, int src_stride_y, ...@@ -31,6 +31,13 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y, uint8* dst_y, int dst_stride_y,
int width, int height); int width, int height);
// Convert I420 to I400. (calls CopyPlane ignoring u/v)
// Only the Y plane is copied; the dst_u/dst_v pointers and their strides are
// accepted but not used. Returns 0 on success and -1 when src_y or dst_y is
// null, width <= 0, or height == 0. A negative height inverts the image.
int I420ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// I420 mirror. // I420 mirror.
int I420Mirror(const uint8* src_y, int src_stride_y, int I420Mirror(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
...@@ -62,6 +69,13 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -62,6 +69,13 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height); int width, int height);
// Convert I444 to ARGB.
// Every output row is built from one row of each of the Y, U and V planes;
// all three source pointers advance by their own stride per row.
// A negative height inverts the image (destination rows are written
// bottom-up).
int I444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I422 to ARGB. // Convert I422 to ARGB.
int I422ToARGB(const uint8* src_y, int src_stride_y, int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
...@@ -69,8 +83,8 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, ...@@ -69,8 +83,8 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height); int width, int height);
// Convert I444 to ARGB. // Convert I411 to ARGB.
int I444ToARGB(const uint8* src_y, int src_stride_y, int I411ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v, const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
......
...@@ -660,32 +660,32 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, ...@@ -660,32 +660,32 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*I420ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I420ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I420TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToARGBRow = I420ToARGBRow_Any_NEON; I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
I420ToARGBRow = I420ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
} }
#elif defined(HAS_I420TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I420ToARGBRow = I420ToARGBRow_Any_SSSE3; I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
I420ToARGBRow = I420ToARGBRow_Unaligned_SSSE3; I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I420ToARGBRow = I420ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
} }
} }
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
I420ToARGBRow(src_y, src_u, src_v, dst_argb, width); I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
...@@ -708,32 +708,32 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, ...@@ -708,32 +708,32 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
dst_stride_bgra = -dst_stride_bgra; dst_stride_bgra = -dst_stride_bgra;
} }
void (*I420ToBGRARow)(const uint8* y_buf, void (*I422ToBGRARow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I420ToBGRARow_C; int width) = I422ToBGRARow_C;
#if defined(HAS_I420TOBGRAROW_NEON) #if defined(HAS_I422TOBGRAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToBGRARow = I420ToBGRARow_Any_NEON; I422ToBGRARow = I422ToBGRARow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
I420ToBGRARow = I420ToBGRARow_NEON; I422ToBGRARow = I422ToBGRARow_NEON;
} }
} }
#elif defined(HAS_I420TOBGRAROW_SSSE3) #elif defined(HAS_I422TOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I420ToBGRARow = I420ToBGRARow_Any_SSSE3; I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
I420ToBGRARow = I420ToBGRARow_Unaligned_SSSE3; I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
I420ToBGRARow = I420ToBGRARow_SSSE3; I422ToBGRARow = I422ToBGRARow_SSSE3;
} }
} }
} }
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
I420ToBGRARow(src_y, src_u, src_v, dst_bgra, width); I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
dst_bgra += dst_stride_bgra; dst_bgra += dst_stride_bgra;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
...@@ -756,32 +756,32 @@ int I420ToABGR(const uint8* src_y, int src_stride_y, ...@@ -756,32 +756,32 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
dst_stride_abgr = -dst_stride_abgr; dst_stride_abgr = -dst_stride_abgr;
} }
void (*I420ToABGRRow)(const uint8* y_buf, void (*I422ToABGRRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I420ToABGRRow_C; int width) = I422ToABGRRow_C;
#if defined(HAS_I420TOABGRROW_NEON) #if defined(HAS_I422TOABGRROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToABGRRow = I420ToABGRRow_Any_NEON; I422ToABGRRow = I422ToABGRRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
I420ToABGRRow = I420ToABGRRow_NEON; I422ToABGRRow = I422ToABGRRow_NEON;
} }
} }
#elif defined(HAS_I420TOABGRROW_SSSE3) #elif defined(HAS_I422TOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I420ToABGRRow = I420ToABGRRow_Any_SSSE3; I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
I420ToABGRRow = I420ToABGRRow_Unaligned_SSSE3; I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
I420ToABGRRow = I420ToABGRRow_SSSE3; I422ToABGRRow = I422ToABGRRow_SSSE3;
} }
} }
} }
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
I420ToABGRRow(src_y, src_u, src_v, dst_abgr, width); I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
dst_abgr += dst_stride_abgr; dst_abgr += dst_stride_abgr;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
...@@ -804,18 +804,18 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, ...@@ -804,18 +804,18 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*I420ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I420ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I420TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToARGBRow = I420ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
#elif defined(HAS_I420TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I420ToARGBRow = I420ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
#endif #endif
...@@ -835,7 +835,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, ...@@ -835,7 +835,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
I420ToARGBRow(src_y, src_u, src_v, row, width); I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToRGB24Row(row, dst_argb, width); ARGBToRGB24Row(row, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
...@@ -859,18 +859,18 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, ...@@ -859,18 +859,18 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*I420ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I420ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I420TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToARGBRow = I420ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
#elif defined(HAS_I420TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I420ToARGBRow = I420ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
#endif #endif
...@@ -890,7 +890,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, ...@@ -890,7 +890,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
I420ToARGBRow(src_y, src_u, src_v, row, width); I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToRAWRow(row, dst_argb, width); ARGBToRAWRow(row, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
...@@ -914,18 +914,18 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, ...@@ -914,18 +914,18 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
dst_stride_rgb = -dst_stride_rgb; dst_stride_rgb = -dst_stride_rgb;
} }
void (*I420ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I420ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I420TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToARGBRow = I420ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
#elif defined(HAS_I420TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I420ToARGBRow = I420ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
#endif #endif
...@@ -944,7 +944,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, ...@@ -944,7 +944,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
I420ToARGBRow(src_y, src_u, src_v, row, width); I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToRGB565Row(row, dst_rgb, width); ARGBToRGB565Row(row, dst_rgb, width);
dst_rgb += dst_stride_rgb; dst_rgb += dst_stride_rgb;
src_y += src_stride_y; src_y += src_stride_y;
...@@ -968,18 +968,18 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, ...@@ -968,18 +968,18 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*I420ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I420ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I420TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToARGBRow = I420ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
#elif defined(HAS_I420TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I420ToARGBRow = I420ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
#endif #endif
...@@ -998,7 +998,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, ...@@ -998,7 +998,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
I420ToARGBRow(src_y, src_u, src_v, row, width); I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToARGB1555Row(row, dst_argb, width); ARGBToARGB1555Row(row, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
...@@ -1022,18 +1022,18 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, ...@@ -1022,18 +1022,18 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*I420ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I420ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I420TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToARGBRow = I420ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
#elif defined(HAS_I420TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I420ToARGBRow = I420ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
#endif #endif
...@@ -1052,7 +1052,7 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, ...@@ -1052,7 +1052,7 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
I420ToARGBRow(src_y, src_u, src_v, row, width); I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToARGB4444Row(row, dst_argb, width); ARGBToARGB4444Row(row, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
......
...@@ -446,18 +446,18 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, ...@@ -446,18 +446,18 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
src_stride_u = -src_stride_u; src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v; src_stride_v = -src_stride_v;
} }
void (*I420ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I420ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I420TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToARGBRow = I420ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
#elif defined(HAS_I420TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I420ToARGBRow = I420ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
#endif #endif
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
...@@ -478,7 +478,7 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, ...@@ -478,7 +478,7 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
} }
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
I420ToARGBRow(src_y, src_u, src_v, row, width); I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width); ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
dst_bayer += dst_stride_bayer; dst_bayer += dst_stride_bayer;
src_y += src_stride_y; src_y += src_stride_y;
......
...@@ -51,6 +51,26 @@ void CopyPlane(const uint8* src_y, int src_stride_y, ...@@ -51,6 +51,26 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
} }
} }
// Convert I420 to I400 by copying only the luma (Y) plane.
// The two unnamed pointer/stride pairs are accepted and ignored.
// Returns 0 on success, or -1 when src_y or dst_y is null, width <= 0,
// or height == 0. A negative height inverts the image.
int I420ToI400(const uint8* src_y, int src_stride_y,
               uint8* dst_y, int dst_stride_y,
               uint8*, int,
               uint8*, int,
               int width, int height) {
  // Guard clauses: both Y planes must exist...
  if (!src_y || !dst_y) {
    return -1;
  }
  // ...and the image must have a positive width and a non-zero height.
  if (width <= 0 || height == 0) {
    return -1;
  }
  if (height < 0) {
    // Inverted copy: begin at the last source row and step backwards.
    height = -height;
    src_y += (height - 1) * src_stride_y;
    src_stride_y = -src_stride_y;
  }
  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
  return 0;
}
// Mirror a plane of data // Mirror a plane of data
void MirrorPlane(const uint8* src_y, int src_stride_y, void MirrorPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y, uint8* dst_y, int dst_stride_y,
...@@ -202,6 +222,45 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, ...@@ -202,6 +222,45 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
return 0; return 0;
} }
// Convert I444 to ARGB.
// Every output row is built from one row of each of the Y, U and V planes;
// all three source pointers advance by their own stride per row.
// Returns 0 on success, or -1 when any plane pointer is null, width <= 0,
// or height == 0.
// A negative height inverts the image (destination rows are written
// bottom-up).
int I444ToARGB(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  // Reject unusable arguments up front rather than dereferencing a null
  // plane or handing a non-positive width to the row converter.
  if (!src_y || !src_u || !src_v || !dst_argb ||
      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
  // Default to the C row converter; a faster variant may replace it below.
  void (*I444ToARGBRow)(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width) = I444ToARGBRow_C;
#if defined(HAS_I444TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
    // SSSE3 present and at least 8 pixels per row.
    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
      // Width is a whole multiple of 8.
      I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
        // Destination pointer and stride are both 16-byte aligned.
        I444ToARGBRow = I444ToARGBRow_SSSE3;
      }
    }
  }
#endif
  for (int y = 0; y < height; ++y) {
    I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    src_u += src_stride_u;
    src_v += src_stride_v;
  }
  return 0;
}
// Convert I422 to ARGB. // Convert I422 to ARGB.
int I422ToARGB(const uint8* src_y, int src_stride_y, int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
...@@ -214,30 +273,32 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, ...@@ -214,30 +273,32 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*I420ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I420ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I420TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToARGBRow = I420ToARGBRow_Any_NEON; I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
I420ToARGBRow = I420ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
} }
#elif defined(HAS_I420TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I420ToARGBRow = I420ToARGBRow_Any_SSSE3; I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) && if (IS_ALIGNED(width, 8)) {
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
I420ToARGBRow = I420ToARGBRow_SSSE3; if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
} }
} }
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
I420ToARGBRow(src_y, src_u, src_v, dst_argb, width); I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
src_u += src_stride_u; src_u += src_stride_u;
...@@ -246,8 +307,8 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, ...@@ -246,8 +307,8 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Convert I444 to ARGB. // Convert I411 to ARGB.
int I444ToARGB(const uint8* src_y, int src_stride_y, int I411ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v, const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
...@@ -258,21 +319,25 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, ...@@ -258,21 +319,25 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*I444ToARGBRow)(const uint8* y_buf, void (*I411ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I444ToARGBRow_C; int width) = I411ToARGBRow_C;
#if defined(HAS_I444TOARGBROW_SSSE3) #if defined(HAS_I411TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
IS_ALIGNED(width, 8) && I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { if (IS_ALIGNED(width, 8)) {
I444ToARGBRow = I444ToARGBRow_SSSE3; I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I411ToARGBRow = I411ToARGBRow_SSSE3;
}
}
} }
#endif #endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
I444ToARGBRow(src_y, src_u, src_v, dst_argb, width); I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
src_u += src_stride_u; src_u += src_stride_u;
...@@ -281,6 +346,7 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, ...@@ -281,6 +346,7 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Convert I400 to ARGB. // Convert I400 to ARGB.
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
...@@ -724,24 +790,24 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, ...@@ -724,24 +790,24 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*I420ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* argb_buf, uint8* argb_buf,
int width) = I420ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I420TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToARGBRow = I420ToARGBRow_Any_NEON; I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
I420ToARGBRow = I420ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
} }
#elif defined(HAS_I420TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I420ToARGBRow = I420ToARGBRow_Any_SSSE3; I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) && if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I420ToARGBRow = I420ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
} }
#endif #endif
...@@ -766,7 +832,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, ...@@ -766,7 +832,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth); SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth);
src_uv += src_stride_uv; src_uv += src_stride_uv;
} }
I420ToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width); I422ToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
} }
...@@ -803,24 +869,24 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, ...@@ -803,24 +869,24 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
} }
} }
#endif #endif
void (*I420ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* argb_buf, uint8* argb_buf,
int width) = I420ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I420TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToARGBRow = I420ToARGBRow_Any_NEON; I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
I420ToARGBRow = I420ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
} }
#elif defined(HAS_I420TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I420ToARGBRow = I420ToARGBRow_Any_SSSE3; I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) && if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I420ToARGBRow = I420ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
} }
#endif #endif
...@@ -832,7 +898,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, ...@@ -832,7 +898,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
YUY2ToUVRow(src_yuy2, src_stride_yuy2, rowu, rowv, width); YUY2ToUVRow(src_yuy2, src_stride_yuy2, rowu, rowv, width);
YUY2ToYRow(src_yuy2, rowy, width); YUY2ToYRow(src_yuy2, rowy, width);
I420ToARGBRow(rowy, rowu, rowv, dst_argb, width); I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
src_yuy2 += src_stride_yuy2; src_yuy2 += src_stride_yuy2;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
...@@ -869,24 +935,24 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -869,24 +935,24 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
} }
} }
#endif #endif
void (*I420ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* argb_buf, uint8* argb_buf,
int width) = I420ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I420TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToARGBRow = I420ToARGBRow_Any_NEON; I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
I420ToARGBRow = I420ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
} }
#elif defined(HAS_I420TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I420ToARGBRow = I420ToARGBRow_Any_SSSE3; I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) && if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I420ToARGBRow = I420ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
} }
#endif #endif
...@@ -898,7 +964,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -898,7 +964,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
UYVYToUVRow(src_uyvy, src_stride_uyvy, rowu, rowv, width); UYVYToUVRow(src_uyvy, src_stride_uyvy, rowu, rowv, width);
UYVYToYRow(src_uyvy, rowy, width); UYVYToYRow(src_uyvy, rowy, width);
I420ToARGBRow(rowy, rowu, rowv, dst_argb, width); I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
src_uyvy += src_stride_uyvy; src_uyvy += src_stride_uyvy;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
...@@ -916,18 +982,18 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, ...@@ -916,18 +982,18 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
dst_stride_rgb = -dst_stride_rgb; dst_stride_rgb = -dst_stride_rgb;
} }
void (*I420ToARGBRow)(const uint8* y_buf, void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = I420ToARGBRow_C; int width) = I422ToARGBRow_C;
#if defined(HAS_I420TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I420ToARGBRow = I420ToARGBRow_NEON; I422ToARGBRow = I422ToARGBRow_NEON;
} }
#elif defined(HAS_I420TOARGBROW_SSSE3) #elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I420ToARGBRow = I420ToARGBRow_SSSE3; I422ToARGBRow = I422ToARGBRow_SSSE3;
} }
#endif #endif
SIMD_ALIGNED(uint8 row[kMaxStride]); SIMD_ALIGNED(uint8 row[kMaxStride]);
...@@ -960,7 +1026,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, ...@@ -960,7 +1026,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth); SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth);
src_uv += src_stride_uv; src_uv += src_stride_uv;
} }
I420ToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width); I422ToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width);
ARGBToRGB565Row(row, dst_rgb, width); ARGBToRGB565Row(row, dst_rgb, width);
dst_rgb += dst_stride_rgb; dst_rgb += dst_stride_rgb;
src_y += src_stride_y; src_y += src_stride_y;
......
...@@ -30,7 +30,7 @@ extern "C" { ...@@ -30,7 +30,7 @@ extern "C" {
#define LIBYUV_SSSE3_ONLY #define LIBYUV_SSSE3_ONLY
#endif #endif
// The following are available on all x86 platforms // The following are available on all x86 platforms:
#if !defined(YUV_DISABLE_ASM) && \ #if !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_ABGRTOARGBROW_SSSE3 #define HAS_ABGRTOARGBROW_SSSE3
...@@ -55,10 +55,11 @@ extern "C" { ...@@ -55,10 +55,11 @@ extern "C" {
#define HAS_COPYROW_SSE2 #define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86 #define HAS_COPYROW_X86
#define HAS_I400TOARGBROW_SSE2 #define HAS_I400TOARGBROW_SSE2
#define HAS_I420TOABGRROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3
#define HAS_I420TOARGBROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3
#define HAS_I420TOBGRAROW_SSSE3 #define HAS_I422TOBGRAROW_SSSE3
#define HAS_I444TOARGBROW_SSSE3 #define HAS_I444TOARGBROW_SSSE3
#define HAS_I411TOARGBROW_SSSE3
#define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORROWUV_SSSE3 #define HAS_MIRRORROWUV_SSSE3
#define HAS_ADDROW_SSE2 #define HAS_ADDROW_SSE2
...@@ -75,7 +76,7 @@ extern "C" { ...@@ -75,7 +76,7 @@ extern "C" {
#define HAS_ARGBSEPIAROW_SSSE3 #define HAS_ARGBSEPIAROW_SSSE3
#endif #endif
// The following are available only useful when SSSE3 is unavailable. // The following are disabled when SSSE3 is available:
#if !defined(YUV_DISABLE_ASM) && \ #if !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(LIBYUV_SSSE3_ONLY) !defined(LIBYUV_SSSE3_ONLY)
...@@ -91,9 +92,9 @@ extern "C" { ...@@ -91,9 +92,9 @@ extern "C" {
#define HAS_MIRRORROWUV_NEON #define HAS_MIRRORROWUV_NEON
#define HAS_SPLITUV_NEON #define HAS_SPLITUV_NEON
#define HAS_COPYROW_NEON #define HAS_COPYROW_NEON
#define HAS_I420TOARGBROW_NEON #define HAS_I422TOARGBROW_NEON
#define HAS_I420TOBGRAROW_NEON #define HAS_I422TOBGRAROW_NEON
#define HAS_I420TOABGRROW_NEON #define HAS_I422TOABGRROW_NEON
#endif #endif
#if defined(_MSC_VER) #if defined(_MSC_VER)
...@@ -118,17 +119,17 @@ typedef uint32 __attribute__((vector_size(16))) uvec32; ...@@ -118,17 +119,17 @@ typedef uint32 __attribute__((vector_size(16))) uvec32;
#define OMITFP __attribute__((optimize("omit-frame-pointer"))) #define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif #endif
void I420ToARGBRow_NEON(const uint8* y_buf, void I422ToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I420ToBGRARow_NEON(const uint8* y_buf, void I422ToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I420ToABGRRow_NEON(const uint8* y_buf, void I422ToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
...@@ -219,19 +220,19 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); ...@@ -219,19 +220,19 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void I420ToARGBRow_C(const uint8* y_buf, void I422ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I420ToBGRARow_C(const uint8* y_buf, void I422ToBGRARow_C(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I420ToABGRRow_C(const uint8* y_buf, void I422ToABGRRow_C(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
...@@ -243,54 +244,78 @@ void I444ToARGBRow_C(const uint8* y_buf, ...@@ -243,54 +244,78 @@ void I444ToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
// C row converter for I411 input: each U/V sample is applied to 4 consecutive
// Y samples, and 4 bytes are written to rgb_buf per pixel. Handles any width;
// a trailing group of 1-3 pixels reuses the next U/V sample.
void I411ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void YToARGBRow_C(const uint8* y_buf, void YToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I420ToARGBRow_SSSE3(const uint8* y_buf, void I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width); int width);
void I420ToBGRARow_SSSE3(const uint8* y_buf, void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width); int width);
void I420ToABGRRow_SSSE3(const uint8* y_buf, void I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width); int width);
void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, void I422ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* bgra_buf,
int width); int width);
void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, void I422ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* abgr_buf,
int width); int width);
void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width); int width);
void I444ToARGBRow_SSSE3(const uint8* y_buf, void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width);
// SSSE3 row converter for I411 input that stores its output with unaligned
// (movdqu) writes. Converts 8 pixels (8 Y, 2 U, 2 V) per loop iteration and
// always runs at least one iteration, so width is expected to be a positive
// multiple of 8.
void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* argb_buf,
int width);
void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* bgra_buf,
int width);
void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* abgr_buf,
int width); int width);
void YToARGBRow_SSE2(const uint8* y_buf, void YToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf, uint8* argb_buf,
int width); int width);
// ARGB preattenuated alpha blend. // ARGB preattenuated alpha blend.
...@@ -310,24 +335,37 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, ...@@ -310,24 +335,37 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width); uint8* dst_argb, int width);
// 'Any' functions handle any size and alignment. // 'Any' functions handle any size and alignment.
void I420ToARGBRow_Any_SSSE3(const uint8* y_buf, void I444ToARGBRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I420ToBGRARow_Any_SSSE3(const uint8* y_buf, void I422ToARGBRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I420ToABGRRow_Any_SSSE3(const uint8* y_buf, void I411ToARGBRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I422ToBGRARow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToABGRRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
...@@ -344,19 +382,19 @@ void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -344,19 +382,19 @@ void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
void I420ToARGBRow_Any_NEON(const uint8* y_buf, void I422ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I420ToBGRARow_Any_NEON(const uint8* y_buf, void I422ToBGRARow_Any_NEON(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void I420ToABGRRow_Any_NEON(const uint8* y_buf, void I422ToABGRRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
......
...@@ -359,7 +359,8 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf, ...@@ -359,7 +359,8 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
(255u << ashift); (255u << ashift);
} }
void I420ToARGBRow_C(const uint8* y_buf, // Also used for 420
void I422ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
...@@ -377,7 +378,7 @@ void I420ToARGBRow_C(const uint8* y_buf, ...@@ -377,7 +378,7 @@ void I420ToARGBRow_C(const uint8* y_buf,
} }
} }
void I420ToBGRARow_C(const uint8* y_buf, void I422ToBGRARow_C(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
...@@ -395,7 +396,7 @@ void I420ToBGRARow_C(const uint8* y_buf, ...@@ -395,7 +396,7 @@ void I420ToBGRARow_C(const uint8* y_buf,
} }
} }
void I420ToABGRRow_C(const uint8* y_buf, void I422ToABGRRow_C(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
...@@ -427,6 +428,32 @@ void I444ToARGBRow_C(const uint8* y_buf, ...@@ -427,6 +428,32 @@ void I444ToARGBRow_C(const uint8* y_buf,
} }
} }
// Convert one row of I411 to ARGB. In I411 a single U/V sample is shared by
// 4 horizontally adjacent Y samples; every output pixel occupies 4 bytes.
void I411ToARGBRow_C(const uint8* y_buf,
                     const uint8* u_buf,
                     const uint8* v_buf,
                     uint8* rgb_buf,
                     int width) {
  // Whole groups of 4 pixels: one U/V sample per group.
  int remaining = width;
  while (remaining > 3) {
    for (int i = 0; i < 4; ++i) {
      YuvPixel(y_buf[i], u_buf[0], v_buf[0], rgb_buf + i * 4, 24, 16, 8, 0);
    }
    y_buf += 4;
    ++u_buf;
    ++v_buf;
    rgb_buf += 16;  // 4 pixels * 4 bytes.
    remaining -= 4;
  }
  // Leftover 1-3 pixels all use the next (partial group's) U/V sample.
  const int tail = width & 3;
  for (int i = 0; i < tail; ++i) {
    YuvPixel(y_buf[i], u_buf[0], v_buf[0], rgb_buf + i * 4, 24, 16, 8, 0);
  }
}
void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) { void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {
for (int x = 0; x < width; ++x) { for (int x = 0; x < width; ++x) {
YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0); YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0);
...@@ -686,8 +713,8 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1, ...@@ -686,8 +713,8 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
} }
#endif // HAS_ARGBBLENDROW_SSSE3 #endif // HAS_ARGBBLENDROW_SSSE3
// Wrappers to handle odd sizes/alignments // Wrappers to handle odd width
#define YUVANY(NAMEANY, I420TORGB_SSE, I420TORGB_C) \ #define YANY(NAMEANY, I420TORGB_SSE, I420TORGB_C, UV_SHIFT) \
void NAMEANY(const uint8* y_buf, \ void NAMEANY(const uint8* y_buf, \
const uint8* u_buf, \ const uint8* u_buf, \
const uint8* v_buf, \ const uint8* v_buf, \
...@@ -696,22 +723,24 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1, ...@@ -696,22 +723,24 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
int n = width & ~7; \ int n = width & ~7; \
I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n); \ I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n); \
I420TORGB_C(y_buf + n, \ I420TORGB_C(y_buf + n, \
u_buf + (n >> 1), \ u_buf + (n >> UV_SHIFT), \
v_buf + (n >> 1), \ v_buf + (n >> UV_SHIFT), \
rgb_buf + n * 4, width & 7); \ rgb_buf + n * 4, width & 7); \
} }
#if defined(HAS_I420TOARGBROW_SSSE3) #if defined(HAS_I422TOARGBROW_SSSE3)
YUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_Unaligned_SSSE3, I420ToARGBRow_C) YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0)
YUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_Unaligned_SSSE3, I420ToBGRARow_C) YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1)
YUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_Unaligned_SSSE3, I420ToABGRRow_C) YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2)
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
#endif #endif
#if defined(HAS_I422TOARGBROW_NEON)
// YANY takes 4 arguments; the last is the UV shift used to locate the U/V
// samples for the leftover (width & 7) pixels handed to the C row function.
// I422 has one U/V sample per 2 Y samples, so the shift is 1.
YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
#endif
#undef YUVANY #undef YANY
#define RGBANY(NAMEANY, ARGBTORGB, BPP) \ #define RGBANY(NAMEANY, ARGBTORGB, BPP) \
void NAMEANY(const uint8* argb_buf, \ void NAMEANY(const uint8* argb_buf, \
......
...@@ -18,7 +18,7 @@ extern "C" { ...@@ -18,7 +18,7 @@ extern "C" {
// This module is for GCC Neon // This module is for GCC Neon
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define YUVTORGB \ #define YUV422TORGB \
"vld1.u8 {d0}, [%0]! \n" \ "vld1.u8 {d0}, [%0]! \n" \
"vld1.u32 {d2[0]}, [%1]! \n" \ "vld1.u32 {d2[0]}, [%1]! \n" \
"vld1.u32 {d2[1]}, [%2]! \n" \ "vld1.u32 {d2[1]}, [%2]! \n" \
...@@ -46,17 +46,17 @@ extern "C" { ...@@ -46,17 +46,17 @@ extern "C" {
"vtrn.u8 d22, d23 \n" \ "vtrn.u8 d22, d23 \n" \
"vtrn.u8 d16, d17 \n" \ "vtrn.u8 d16, d17 \n" \
#if defined(HAS_I420TOARGBROW_NEON) || \ #if defined(HAS_I422TOARGBROW_NEON) || \
defined(HAS_I420TOBGRAROW_NEON) || \ defined(HAS_I422TOBGRAROW_NEON) || \
defined(HAS_I420TOABGRROW_NEON) defined(HAS_I422TOABGRROW_NEON)
static const vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, static const vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
0, 0, 0, 0, 0, 0, 0, 0 }; 0, 0, 0, 0, 0, 0, 0, 0 };
static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
0, 0, 0, 0, 0, 0, 0, 0 }; 0, 0, 0, 0, 0, 0, 0, 0 };
#endif #endif
#ifdef HAS_I420TOARGBROW_NEON #ifdef HAS_I422TOARGBROW_NEON
void I420ToARGBRow_NEON(const uint8* y_buf, void I422ToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
...@@ -68,7 +68,7 @@ void I420ToARGBRow_NEON(const uint8* y_buf, ...@@ -68,7 +68,7 @@ void I420ToARGBRow_NEON(const uint8* y_buf,
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
"1: \n" "1: \n"
YUVTORGB YUV422TORGB
"vmov.u8 d21, d16 \n" "vmov.u8 d21, d16 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
"vst4.u8 {d20, d21, d22, d23}, [%3]! \n" "vst4.u8 {d20, d21, d22, d23}, [%3]! \n"
...@@ -85,10 +85,10 @@ YUVTORGB ...@@ -85,10 +85,10 @@ YUVTORGB
"q10", "q11", "q12", "q13", "q14", "q15" "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
#endif #endif // HAS_I422TOARGBROW_NEON
#ifdef HAS_I420TOBGRAROW_NEON #ifdef HAS_I422TOBGRAROW_NEON
void I420ToBGRARow_NEON(const uint8* y_buf, void I422ToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
...@@ -100,7 +100,7 @@ void I420ToBGRARow_NEON(const uint8* y_buf, ...@@ -100,7 +100,7 @@ void I420ToBGRARow_NEON(const uint8* y_buf,
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
"1: \n" "1: \n"
YUVTORGB YUV422TORGB
"vswp.u8 d20, d22 \n" "vswp.u8 d20, d22 \n"
"vmov.u8 d21, d16 \n" "vmov.u8 d21, d16 \n"
"vmov.u8 d19, #255 \n" "vmov.u8 d19, #255 \n"
...@@ -118,10 +118,10 @@ YUVTORGB ...@@ -118,10 +118,10 @@ YUVTORGB
"q10", "q11", "q12", "q13", "q14", "q15" "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
#endif #endif // HAS_I422TOBGRAROW_NEON
#ifdef HAS_I420TOABGRROW_NEON #ifdef HAS_I422TOABGRROW_NEON
void I420ToABGRRow_NEON(const uint8* y_buf, void I422ToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
...@@ -133,7 +133,7 @@ void I420ToABGRRow_NEON(const uint8* y_buf, ...@@ -133,7 +133,7 @@ void I420ToABGRRow_NEON(const uint8* y_buf,
"vmov.u16 q14, #74 \n" "vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n" "vmov.u16 q15, #16 \n"
"1: \n" "1: \n"
YUVTORGB YUV422TORGB
"vswp.u8 d20, d22 \n" "vswp.u8 d20, d22 \n"
"vmov.u8 d21, d16 \n" "vmov.u8 d21, d16 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
...@@ -151,7 +151,7 @@ YUVTORGB ...@@ -151,7 +151,7 @@ YUVTORGB
"q10", "q11", "q12", "q13", "q14", "q15" "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
#endif #endif // HAS_I422TOABGRROW_NEON
#ifdef HAS_SPLITUV_NEON #ifdef HAS_SPLITUV_NEON
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
...@@ -172,7 +172,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { ...@@ -172,7 +172,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
: "memory", "cc", "q0", "q1" // Clobber List : "memory", "cc", "q0", "q1" // Clobber List
); );
} }
#endif #endif // HAS_SPLITUV_NEON
#ifdef HAS_COPYROW_NEON #ifdef HAS_COPYROW_NEON
// Copy multiple of 64 // Copy multiple of 64
...@@ -266,7 +266,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ...@@ -266,7 +266,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
: "memory", "cc", "r3", "q0" : "memory", "cc", "r3", "q0"
); );
} }
#endif #endif // HAS_MIRRORROW_NEON
#ifdef HAS_MIRRORROWUV_NEON #ifdef HAS_MIRRORROWUV_NEON
void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
...@@ -325,7 +325,7 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { ...@@ -325,7 +325,7 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
: "memory", "cc", "r12", "q0" : "memory", "cc", "r12", "q0"
); );
} }
#endif #endif // HAS_MIRRORROWUV_NEON
#endif // __ARM_NEON__ #endif // __ARM_NEON__
......
...@@ -1215,7 +1215,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, ...@@ -1215,7 +1215,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
#endif // HAS_ARGBTOYROW_SSSE3 #endif // HAS_ARGBTOYROW_SSSE3
#ifdef HAS_I420TOARGBROW_SSSE3 #ifdef HAS_I422TOARGBROW_SSSE3
#define UB 127 /* min(127, static_cast<int>(2.018 * 64)) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */ #define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0 #define UR 0
...@@ -1251,8 +1251,37 @@ struct { ...@@ -1251,8 +1251,37 @@ struct {
{ YG, YG, YG, YG, YG, YG, YG, YG } { YG, YG, YG, YG, YG, YG, YG, YG }
}; };
// Convert 8 pixels // Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB \ #define YUV444TORGB \
"movq (%1),%%xmm0 \n" \
"movq (%1,%2,1),%%xmm1 \n" \
"lea 0x8(%1),%1 \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
"pmaddubsw (%5),%%xmm0 \n" \
"pmaddubsw 16(%5),%%xmm1 \n" \
"pmaddubsw 32(%5),%%xmm2 \n" \
"psubw 48(%5),%%xmm0 \n" \
"psubw 64(%5),%%xmm1 \n" \
"psubw 80(%5),%%xmm2 \n" \
"movq (%0),%%xmm3 \n" \
"lea 0x8(%0),%0 \n" \
"punpcklbw %%xmm4,%%xmm3 \n" \
"psubsw 96(%5),%%xmm3 \n" \
"pmullw 112(%5),%%xmm3 \n" \
"paddsw %%xmm3,%%xmm0 \n" \
"paddsw %%xmm3,%%xmm1 \n" \
"paddsw %%xmm3,%%xmm2 \n" \
"psraw $0x6,%%xmm0 \n" \
"psraw $0x6,%%xmm1 \n" \
"psraw $0x6,%%xmm2 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
// Convert 8 pixels: 4 UV and 8 Y
#define YUV422TORGB \
"movd (%1),%%xmm0 \n" \ "movd (%1),%%xmm0 \n" \
"movd (%1,%2,1),%%xmm1 \n" \ "movd (%1,%2,1),%%xmm1 \n" \
"lea 0x4(%1),%1 \n" \ "lea 0x4(%1),%1 \n" \
...@@ -1281,10 +1310,41 @@ struct { ...@@ -1281,10 +1310,41 @@ struct {
"packuswb %%xmm1,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n" "packuswb %%xmm2,%%xmm2 \n"
void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, // Convert 8 pixels: 2 UV and 8 Y
// Inline-asm fragment that converts 8 pixels using 8 Y bytes and 2 U/V pairs.
// Operands: %0 = Y pointer (advanced by 8), %1 = U pointer (advanced by 2),
// %2 = offset added to %1 to address V, %5 = base of the constant table read
// at byte offsets 0..112. xmm4 supplies the high bytes when Y is widened to
// 16 bits, so the caller must have zeroed it.
// Each movd loads 4 bytes of U and of V although only the first 2 of each are
// used: punpcklbw interleaves U/V, then punpcklwd + punpckldq replicate each
// of the first two UV pairs four times.
// Results are left as packed bytes in the low halves of xmm0, xmm1 and xmm2
// (presumably B, G, R since the table starts at kUVToB -- confirm against the
// kYuvConstants layout). xmm3 is used as scratch.
#define YUV411TORGB \
"movd (%1),%%xmm0 \n" \
"movd (%1,%2,1),%%xmm1 \n" \
"lea 0x2(%1),%1 \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \
"punpckldq %%xmm0,%%xmm0 \n" \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
"pmaddubsw (%5),%%xmm0 \n" \
"pmaddubsw 16(%5),%%xmm1 \n" \
"pmaddubsw 32(%5),%%xmm2 \n" \
"psubw 48(%5),%%xmm0 \n" \
"psubw 64(%5),%%xmm1 \n" \
"psubw 80(%5),%%xmm2 \n" \
"movq (%0),%%xmm3 \n" \
"lea 0x8(%0),%0 \n" \
"punpcklbw %%xmm4,%%xmm3 \n" \
"psubsw 96(%5),%%xmm3 \n" \
"pmullw 112(%5),%%xmm3 \n" \
"paddsw %%xmm3,%%xmm0 \n" \
"paddsw %%xmm3,%%xmm1 \n" \
"paddsw %%xmm3,%%xmm2 \n" \
"psraw $0x6,%%xmm0 \n" \
"psraw $0x6,%%xmm1 \n" \
"psraw $0x6,%%xmm2 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width) { int width) {
asm volatile ( asm volatile (
"sub %1,%2 \n" "sub %1,%2 \n"
...@@ -1292,7 +1352,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1292,7 +1352,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n" "pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
YUVTORGB YUV444TORGB
"punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm1 \n"
...@@ -1306,7 +1366,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1306,7 +1366,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
"+r"(rgb_buf), // %3 "+r"(argb_buf), // %3
"+rm"(width) // %4 "+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5 : "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc" : "memory", "cc"
...@@ -1316,10 +1376,10 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1316,10 +1376,10 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
); );
} }
void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width) { int width) {
asm volatile ( asm volatile (
"sub %1,%2 \n" "sub %1,%2 \n"
...@@ -1327,22 +1387,21 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -1327,22 +1387,21 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n" "pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
YUVTORGB YUV422TORGB
"pcmpeqb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm0,%%xmm1 \n" "punpcklbw %%xmm5,%%xmm2 \n"
"punpcklbw %%xmm2,%%xmm5 \n" "movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm2,%%xmm1 \n"
"punpckhwd %%xmm1,%%xmm0 \n" "movdqa %%xmm0,(%3) \n"
"movdqa %%xmm5,(%3) \n" "movdqa %%xmm1,0x10(%3) \n"
"movdqa %%xmm0,0x10(%3) \n"
"lea 0x20(%3),%3 \n" "lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n" "sub $0x8,%4 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
"+r"(rgb_buf), // %3 "+r"(argb_buf), // %3
"+rm"(width) // %4 "+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5 : "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc" : "memory", "cc"
...@@ -1352,10 +1411,10 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -1352,10 +1411,10 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
); );
} }
void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width) { int width) {
asm volatile ( asm volatile (
"sub %1,%2 \n" "sub %1,%2 \n"
...@@ -1363,13 +1422,13 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -1363,13 +1422,13 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n" "pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
YUVTORGB YUV411TORGB
"punpcklbw %%xmm1,%%xmm2 \n" "punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm2,%%xmm1 \n" "movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm0,%%xmm1 \n" "punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm2,(%3) \n" "movdqa %%xmm0,(%3) \n"
"movdqa %%xmm1,0x10(%3) \n" "movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n" "lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n" "sub $0x8,%4 \n"
...@@ -1377,7 +1436,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -1377,7 +1436,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
"+r"(rgb_buf), // %3 "+r"(argb_buf), // %3
"+rm"(width) // %4 "+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5 : "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc" : "memory", "cc"
...@@ -1387,10 +1446,10 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -1387,10 +1446,10 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
); );
} }
void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width) { int width) {
asm volatile ( asm volatile (
"sub %1,%2 \n" "sub %1,%2 \n"
...@@ -1398,7 +1457,7 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1398,7 +1457,7 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n" "pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
YUVTORGB YUV444TORGB
"punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm1 \n"
...@@ -1412,7 +1471,7 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1412,7 +1471,7 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
"+r"(rgb_buf), // %3 "+r"(argb_buf), // %3
"+rm"(width) // %4 "+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5 : "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc" : "memory", "cc"
...@@ -1422,10 +1481,10 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1422,10 +1481,10 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
); );
} }
void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width) { int width) {
asm volatile ( asm volatile (
"sub %1,%2 \n" "sub %1,%2 \n"
...@@ -1433,22 +1492,92 @@ void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1433,22 +1492,92 @@ void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n" "pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
YUVTORGB YUV422TORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqu %%xmm0,(%3) \n"
"movdqu %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(argb_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
// Converts a row of I411 (2 U/V pairs per 8 Y samples) to 4-byte pixels using
// SSSE3, writing the output with unaligned stores.
//   y_buf, u_buf, v_buf: source planes for this row.
//   argb_buf: destination; 32 bytes are written per loop iteration.
//   width: pixel count. The loop is bottom-tested and steps by 8, so it runs
//          at least once and writes a full 8 pixels each pass; callers should
//          pass a positive multiple of 8.
void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* argb_buf,
int width) {
asm volatile (
// Turn %2 into (v_buf - u_buf) so V can be addressed as (%1,%2,1).
"sub %1,%2 \n"
// xmm5 = all ones: the 0xFF byte interleaved into every output pixel.
"pcmpeqb %%xmm5,%%xmm5 \n"
// xmm4 = 0: used by YUV411TORGB to widen Y bytes to 16 bits.
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV411TORGB
// Interleave the three result channels with 0xFF into 8 four-byte pixels.
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
// Unaligned stores: argb_buf need not be 16-byte aligned.
"movdqu %%xmm0,(%3) \n"
"movdqu %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
// 8 pixels consumed per iteration; loop while pixels remain.
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(argb_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* bgra_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV422TORGB
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm5 \n" "punpcklbw %%xmm2,%%xmm5 \n"
"movdqa %%xmm5,%%xmm0 \n" "movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n" "punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n" "punpckhwd %%xmm1,%%xmm0 \n"
"movdqu %%xmm5,(%3) \n" "movdqa %%xmm5,(%3) \n"
"movdqu %%xmm0,0x10(%3) \n" "movdqa %%xmm0,0x10(%3) \n"
"lea 0x20(%3),%3 \n" "lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n" "sub $0x8,%4 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
"+r"(rgb_buf), // %3 "+r"(bgra_buf), // %3
"+rm"(width) // %4 "+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5 : "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc" : "memory", "cc"
...@@ -1458,10 +1587,10 @@ void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1458,10 +1587,10 @@ void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
); );
} }
void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* abgr_buf,
int width) { int width) {
asm volatile ( asm volatile (
"sub %1,%2 \n" "sub %1,%2 \n"
...@@ -1469,21 +1598,21 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1469,21 +1598,21 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n" "pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
YUVTORGB YUV422TORGB
"punpcklbw %%xmm1,%%xmm2 \n" "punpcklbw %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n" "movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n" "punpcklwd %%xmm0,%%xmm2 \n"
"punpckhwd %%xmm0,%%xmm1 \n" "punpckhwd %%xmm0,%%xmm1 \n"
"movdqu %%xmm2,(%3) \n" "movdqa %%xmm2,(%3) \n"
"movdqu %%xmm1,0x10(%3) \n" "movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n" "lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n" "sub $0x8,%4 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
"+r"(rgb_buf), // %3 "+r"(abgr_buf), // %3
"+rm"(width) // %4 "+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5 : "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc" : "memory", "cc"
...@@ -1493,10 +1622,10 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1493,10 +1622,10 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
); );
} }
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* bgra_buf,
int width) { int width) {
asm volatile ( asm volatile (
"sub %1,%2 \n" "sub %1,%2 \n"
...@@ -1504,43 +1633,22 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1504,43 +1633,22 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
"pxor %%xmm4,%%xmm4 \n" "pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n" ".p2align 4 \n"
"1: \n" "1: \n"
"movd (%1),%%xmm0 \n" YUV422TORGB
"movd (%1,%2,1),%%xmm1 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"lea 0x4(%1),%1 \n" "punpcklbw %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm2,%%xmm5 \n"
"movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm5,%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n" "punpcklwd %%xmm1,%%xmm5 \n"
"pmaddubsw (%5),%%xmm0 \n" "punpckhwd %%xmm1,%%xmm0 \n"
"pmaddubsw 16(%5),%%xmm1 \n" "movdqu %%xmm5,(%3) \n"
"pmaddubsw 32(%5),%%xmm2 \n" "movdqu %%xmm0,0x10(%3) \n"
"psubw 48(%5),%%xmm0 \n" "lea 0x20(%3),%3 \n"
"psubw 64(%5),%%xmm1 \n" "sub $0x8,%4 \n"
"psubw 80(%5),%%xmm2 \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
"punpcklbw %%xmm4,%%xmm3 \n"
"psubsw 96(%5),%%xmm3 \n"
"pmullw 112(%5),%%xmm3 \n"
"paddsw %%xmm3,%%xmm0 \n"
"paddsw %%xmm3,%%xmm1 \n"
"paddsw %%xmm3,%%xmm2 \n"
"psraw $0x6,%%xmm0 \n"
"psraw $0x6,%%xmm1 \n"
"psraw $0x6,%%xmm2 \n"
"packuswb %%xmm0,%%xmm0 \n"
"packuswb %%xmm1,%%xmm1 \n"
"packuswb %%xmm2,%%xmm2 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"sub $0x4,%4 \n"
"movdqa %%xmm0,(%3) \n"
"lea 0x10(%3),%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
"+r"(rgb_buf), // %3 "+r"(bgra_buf), // %3
"+rm"(width) // %4 "+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5 : "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc" : "memory", "cc"
...@@ -1549,7 +1657,43 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1549,7 +1657,43 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
#endif #endif
); );
} }
void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* abgr_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUV422TORGB
"punpcklbw %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"movdqu %%xmm2,(%3) \n"
"movdqu %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(abgr_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif #endif
);
}
#endif // HAS_I422TOARGBROW_SSSE3
#ifdef HAS_YTOARGBROW_SSE2 #ifdef HAS_YTOARGBROW_SSE2
void YToARGBRow_SSE2(const uint8* y_buf, void YToARGBRow_SSE2(const uint8* y_buf,
......
...@@ -1200,7 +1200,7 @@ __asm { ...@@ -1200,7 +1200,7 @@ __asm {
} }
} }
#ifdef HAS_I420TOARGBROW_SSSE3 #ifdef HAS_I422TOARGBROW_SSSE3
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */ #define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
...@@ -1235,7 +1235,42 @@ static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; ...@@ -1235,7 +1235,42 @@ static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
#define YUVTORGB __asm { \ // TODO(fbarchard): NV12/NV21 fetch UV and use directly.
// Convert 8 pixels: 8 UV and 8 Y
// MSVC inline-asm fragment shared by the I444 row converters.
// Inputs:  esi = U pointer, edi = byte offset from U to V (callers in this
//          file compute it with "sub edi, esi"), eax = Y pointer.
//          xmm4 is interleaved with the Y bytes; callers in this file clear
//          it with pxor beforehand.
// Effects: esi and eax are each advanced by 8; xmm3 is used as scratch.
// Outputs: low 8 bytes of xmm0 / xmm1 / xmm2 hold B / G / R for 8 pixels.
#define YUV444TORGB __asm { \
/* Step 1: Find 8 UV contributions to 8 R,G,B values */ \
__asm movq xmm0, qword ptr [esi] /* U: 8 bytes */ \
__asm movq xmm1, qword ptr [esi + edi] /* V: 8 bytes */ \
__asm lea esi, [esi + 8] \
__asm punpcklbw xmm0, xmm1 /* UV: one U,V byte pair per pixel */ \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
__asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
__asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
__asm psubw xmm1, kUVBiasG \
__asm psubw xmm2, kUVBiasR \
/* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \
__asm lea eax, [eax + 8] \
__asm punpcklbw xmm3, xmm4 /* interleave Y bytes with xmm4 -> 8 words */ \
__asm psubsw xmm3, kYSub16 \
__asm pmullw xmm3, kYToRgb \
__asm paddsw xmm0, xmm3 /* B += Y */ \
__asm paddsw xmm1, xmm3 /* G += Y */ \
__asm paddsw xmm2, xmm3 /* R += Y */ \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
__asm packuswb xmm0, xmm0 /* B */ \
__asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \
}
// Convert 8 pixels: 4 UV and 8 Y
#define YUV422TORGB __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
__asm movd xmm0, [esi] /* U */ \ __asm movd xmm0, [esi] /* U */ \
__asm movd xmm1, [esi + edi] /* V */ \ __asm movd xmm1, [esi + edi] /* V */ \
...@@ -1267,11 +1302,47 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; ...@@ -1267,11 +1302,47 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
__asm packuswb xmm2, xmm2 /* R */ \ __asm packuswb xmm2, xmm2 /* R */ \
} }
// Convert 8 pixels: 2 UV and 8 Y
// MSVC inline-asm fragment shared by the I411 row converters.
// Inputs:  esi = U pointer, edi = byte offset from U to V (callers in this
//          file compute it with "sub edi, esi"), eax = Y pointer.
//          xmm4 is interleaved with the Y bytes; callers in this file clear
//          it with pxor beforehand.
// Effects: esi is advanced by 2 and eax by 8; xmm3 is used as scratch.
// Outputs: low 8 bytes of xmm0 / xmm1 / xmm2 hold B / G / R for 8 pixels.
// NOTE(review): each movd below loads 4 bytes although only the low 2 are
// consumed per iteration, so the last iteration reads 2 bytes beyond the
// chroma it uses - confirm the U/V rows tolerate that.
#define YUV411TORGB __asm { \
/* Step 1: Find 2 UV contributions to 8 R,G,B values */ \
__asm movd xmm0, [esi] /* U */ \
__asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 2] \
__asm punpcklbw xmm0, xmm1 /* UV */ \
__asm punpcklwd xmm0, xmm0 /* each UV pair x2 (upsample) */ \
__asm punpckldq xmm0, xmm0 /* first two UV pairs x4 each (upsample) */ \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
__asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
__asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
__asm psubw xmm1, kUVBiasG \
__asm psubw xmm2, kUVBiasR \
/* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \
__asm lea eax, [eax + 8] \
__asm punpcklbw xmm3, xmm4 /* interleave Y bytes with xmm4 -> 8 words */ \
__asm psubsw xmm3, kYSub16 \
__asm pmullw xmm3, kYToRgb \
__asm paddsw xmm0, xmm3 /* B += Y */ \
__asm paddsw xmm1, xmm3 /* G += Y */ \
__asm paddsw xmm2, xmm3 /* R += Y */ \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
__asm packuswb xmm0, xmm0 /* B */ \
__asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \
}
// 8 pixels, dest aligned 16.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes)
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void I420ToARGBRow_SSSE3(const uint8* y_buf, void I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width) { int width) {
__asm { __asm {
push esi push esi
...@@ -1279,7 +1350,7 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1279,7 +1350,7 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf,
mov eax, [esp + 8 + 4] // Y mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb mov edx, [esp + 8 + 16] // argb
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -1287,7 +1358,7 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1287,7 +1358,7 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf,
align 16 align 16
convertloop: convertloop:
YUVTORGB YUV444TORGB
// Step 3: Weave into ARGB // Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG punpcklbw xmm0, xmm1 // BG
...@@ -1307,11 +1378,13 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1307,11 +1378,13 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf,
} }
} }
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void I420ToBGRARow_SSSE3(const uint8* y_buf, void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width) { int width) {
__asm { __asm {
push esi push esi
...@@ -1319,24 +1392,67 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -1319,24 +1392,67 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf,
mov eax, [esp + 8 + 4] // Y mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb mov edx, [esp + 8 + 16] // argb
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4 pxor xmm4, xmm4
align 16 align 16
convertloop: convertloop:
YUVTORGB YUV422TORGB
// Step 3: Weave into BGRA // Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
pop edi
pop esi
ret
}
}
// 8 pixels, dest aligned 16.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
// Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16))
void I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* argb_buf,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // argb
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
punpcklbw xmm1, xmm0 // GB pxor xmm4, xmm4
punpcklbw xmm5, xmm2 // AR
movdqa xmm0, xmm5 align 16
punpcklwd xmm5, xmm1 // BGRA first 4 pixels convertloop:
punpckhwd xmm0, xmm1 // BGRA next 4 pixels YUV411TORGB
movdqa [edx], xmm5
movdqa [edx + 16], xmm0 // Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8 sub ecx, 8
jg convertloop jg convertloop
...@@ -1347,11 +1463,13 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -1347,11 +1463,13 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf,
} }
} }
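// 8 pixels, unaligned.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes)
// NOTE(review): the stores in this routine are written with movdqa, which
// faults when the destination is not 16-byte aligned; movdqu looks intended
// for an "unaligned" variant - confirm.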
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void I420ToABGRRow_SSSE3(const uint8* y_buf, void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width) { int width) {
__asm { __asm {
push esi push esi
...@@ -1359,7 +1477,7 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -1359,7 +1477,7 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
mov eax, [esp + 8 + 4] // Y mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb mov edx, [esp + 8 + 16] // argb
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -1367,15 +1485,15 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -1367,15 +1485,15 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
align 16 align 16
convertloop: convertloop:
YUVTORGB YUV444TORGB
// Step 3: Weave into ARGB // Step 3: Weave into ARGB
punpcklbw xmm2, xmm1 // RG punpcklbw xmm0, xmm1 // BG
punpcklbw xmm0, xmm5 // BA punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm2 movdqa xmm1, xmm0
punpcklwd xmm2, xmm0 // RGBA first 4 pixels punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm0 // RGBA next 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm2 movdqa [edx], xmm0
movdqa [edx + 16], xmm1 movdqa [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8 sub ecx, 8
...@@ -1387,11 +1505,13 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -1387,11 +1505,13 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
} }
} }
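// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
// NOTE(review): the new side of this change stores with movdqa, which faults
// when the destination is not 16-byte aligned; movdqu looks intended for an
// "unaligned" variant - confirm.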
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width) { int width) {
__asm { __asm {
push esi push esi
...@@ -1399,7 +1519,7 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1399,7 +1519,7 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
mov eax, [esp + 8 + 4] // Y mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb mov edx, [esp + 8 + 16] // argb
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -1407,7 +1527,7 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1407,7 +1527,7 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
align 16 align 16
convertloop: convertloop:
YUVTORGB YUV422TORGB
// Step 3: Weave into ARGB // Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG punpcklbw xmm0, xmm1 // BG
...@@ -1415,8 +1535,8 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1415,8 +1535,8 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm0 movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqu [edx], xmm0 movdqa [edx], xmm0
movdqu [edx + 16], xmm1 movdqa [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8 sub ecx, 8
jg convertloop jg convertloop
...@@ -1427,11 +1547,14 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1427,11 +1547,14 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
} }
} }
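// 8 pixels, unaligned.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
// Similar to I420 but duplicate UV once more.
// NOTE(review): the stores in this routine are written with movdqa, which
// faults when the destination is not 16-byte aligned; movdqu looks intended
// for an "unaligned" variant - confirm.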
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* argb_buf,
int width) { int width) {
__asm { __asm {
push esi push esi
...@@ -1439,14 +1562,54 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1439,14 +1562,54 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
mov eax, [esp + 8 + 4] // Y mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb mov edx, [esp + 8 + 16] // argb
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4 pxor xmm4, xmm4
align 16 align 16
convertloop: convertloop:
YUVTORGB YUV411TORGB
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
pop edi
pop esi
ret
}
}
__declspec(naked) __declspec(align(16))
void I422ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* bgra_buf,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // bgra
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pxor xmm4, xmm4
align 16
convertloop:
YUV422TORGB
// Step 3: Weave into BGRA // Step 3: Weave into BGRA
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -1455,8 +1618,8 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1455,8 +1618,8 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
movdqa xmm0, xmm5 movdqa xmm0, xmm5
punpcklwd xmm5, xmm1 // BGRA first 4 pixels punpcklwd xmm5, xmm1 // BGRA first 4 pixels
punpckhwd xmm0, xmm1 // BGRA next 4 pixels punpckhwd xmm0, xmm1 // BGRA next 4 pixels
movdqu [edx], xmm5 movdqa [edx], xmm5
movdqu [edx + 16], xmm0 movdqa [edx + 16], xmm0
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8 sub ecx, 8
jg convertloop jg convertloop
...@@ -1468,10 +1631,10 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1468,10 +1631,10 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
} }
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, void I422ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* abgr_buf,
int width) { int width) {
__asm { __asm {
push esi push esi
...@@ -1479,7 +1642,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1479,7 +1642,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
mov eax, [esp + 8 + 4] // Y mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb mov edx, [esp + 8 + 16] // abgr
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -1487,7 +1650,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1487,7 +1650,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
align 16 align 16
convertloop: convertloop:
YUVTORGB YUV422TORGB
// Step 3: Weave into ARGB // Step 3: Weave into ARGB
punpcklbw xmm2, xmm1 // RG punpcklbw xmm2, xmm1 // RG
...@@ -1495,8 +1658,8 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1495,8 +1658,8 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
movdqa xmm1, xmm2 movdqa xmm1, xmm2
punpcklwd xmm2, xmm0 // RGBA first 4 pixels punpcklwd xmm2, xmm0 // RGBA first 4 pixels
punpckhwd xmm1, xmm0 // RGBA next 4 pixels punpckhwd xmm1, xmm0 // RGBA next 4 pixels
movdqu [edx], xmm2 movdqa [edx], xmm2
movdqu [edx + 16], xmm1 movdqa [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8 sub ecx, 8
jg convertloop jg convertloop
...@@ -1508,10 +1671,10 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, ...@@ -1508,10 +1671,10 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
} }
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void I444ToARGBRow_SSSE3(const uint8* y_buf, void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* bgra_buf,
int width) { int width) {
__asm { __asm {
push esi push esi
...@@ -1519,7 +1682,47 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1519,7 +1682,47 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
mov eax, [esp + 8 + 4] // Y mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // rgb mov edx, [esp + 8 + 16] // bgra
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pxor xmm4, xmm4
align 16
convertloop:
YUV422TORGB
// Step 3: Weave into BGRA
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
punpcklbw xmm1, xmm0 // GB
punpcklbw xmm5, xmm2 // AR
movdqa xmm0, xmm5
punpcklwd xmm5, xmm1 // BGRA first 4 pixels
punpckhwd xmm0, xmm1 // BGRA next 4 pixels
movdqu [edx], xmm5
movdqu [edx + 16], xmm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
pop edi
pop esi
ret
}
}
__declspec(naked) __declspec(align(16))
void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* abgr_buf,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // abgr
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -1527,43 +1730,18 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1527,43 +1730,18 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
align 16 align 16
convertloop: convertloop:
// Step 1: Find 4 UV contributions to 4 R,G,B values YUV422TORGB
movd xmm0, [esi] // U
movd xmm1, [esi + edi] // V
lea esi, [esi + 4]
punpcklbw xmm0, xmm1 // UV
movdqa xmm1, xmm0
movdqa xmm2, xmm0
pmaddubsw xmm0, kUVToB // scale B UV
pmaddubsw xmm1, kUVToG // scale G UV
pmaddubsw xmm2, kUVToR // scale R UV
psubw xmm0, kUVBiasB // unbias back to signed
psubw xmm1, kUVBiasG
psubw xmm2, kUVBiasR
// Step 2: Find Y contribution to 4 R,G,B values
movd xmm3, [eax]
lea eax, [eax + 4]
punpcklbw xmm3, xmm4
psubsw xmm3, kYSub16
pmullw xmm3, kYToRgb
paddsw xmm0, xmm3 // B += Y
paddsw xmm1, xmm3 // G += Y
paddsw xmm2, xmm3 // R += Y
psraw xmm0, 6
psraw xmm1, 6
psraw xmm2, 6
packuswb xmm0, xmm0 // B
packuswb xmm1, xmm1 // G
packuswb xmm2, xmm2 // R
// Step 3: Weave into ARGB // Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG punpcklbw xmm2, xmm1 // RG
punpcklbw xmm2, xmm5 // RA punpcklbw xmm0, xmm5 // BA
punpcklwd xmm0, xmm2 // BGRA 4 pixels movdqa xmm1, xmm2
movdqa [edx], xmm0 punpcklwd xmm2, xmm0 // RGBA first 4 pixels
lea edx, [edx + 16] punpckhwd xmm1, xmm0 // RGBA next 4 pixels
sub ecx, 4 movdqu [edx], xmm2
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop jg convertloop
pop edi pop edi
...@@ -1571,7 +1749,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1571,7 +1749,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
ret ret
} }
} }
#endif #endif // HAS_I422TOARGBROW_SSSE3
#ifdef HAS_YTOARGBROW_SSE2 #ifdef HAS_YTOARGBROW_SSE2
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
...@@ -1617,7 +1795,7 @@ void YToARGBRow_SSE2(const uint8* y_buf, ...@@ -1617,7 +1795,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
ret ret
} }
} }
#endif #endif // HAS_YTOARGBROW_SSE2
#endif #endif
#ifdef HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_SSSE3
......
...@@ -25,88 +25,44 @@ ...@@ -25,88 +25,44 @@
namespace libyuv { namespace libyuv {
TEST_F(libyuvTest, BenchmarkI420ToARGB_C) { #define TESTPLANARTOB(FMT_A, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \
align_buffer_16(src_y, benchmark_width_ * benchmark_height_); TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_CvsOPT) { \
align_buffer_16(src_u, (benchmark_width_ * benchmark_height_) >> 2);
align_buffer_16(src_v, (benchmark_width_ * benchmark_height_) >> 2);
align_buffer_16(dst_argb, (benchmark_width_ << 2) * benchmark_height_);
MaskCpuFlags(kCpuInitialized);
for (int i = 0; i < benchmark_iterations_; ++i)
I420ToARGB(src_y, benchmark_width_,
src_u, benchmark_width_ >> 1,
src_v, benchmark_width_ >> 1,
dst_argb, benchmark_width_ << 2,
benchmark_width_, benchmark_height_);
MaskCpuFlags(-1);
EXPECT_EQ(0, 0);
free_aligned_buffer_16(src_y)
free_aligned_buffer_16(src_u)
free_aligned_buffer_16(src_v)
free_aligned_buffer_16(dst_argb)
}
TEST_F(libyuvTest, BenchmarkI420ToARGB_OPT) {
align_buffer_16(src_y, benchmark_width_ * benchmark_height_);
align_buffer_16(src_u, (benchmark_width_ * benchmark_height_) >> 2);
align_buffer_16(src_v, (benchmark_width_ * benchmark_height_) >> 2);
align_buffer_16(dst_argb, (benchmark_width_ << 2) * benchmark_height_);
for (int i = 0; i < benchmark_iterations_; ++i)
I420ToARGB(src_y, benchmark_width_,
src_u, benchmark_width_ >> 1,
src_v, benchmark_width_ >> 1,
dst_argb, benchmark_width_ << 2,
benchmark_width_, benchmark_height_);
free_aligned_buffer_16(src_y)
free_aligned_buffer_16(src_u)
free_aligned_buffer_16(src_v)
free_aligned_buffer_16(dst_argb)
}
#define TESTI420TO(FMT, BPP) \
TEST_F(libyuvTest, I420To##FMT##_CvsOPT) { \
const int src_width = 1280; \ const int src_width = 1280; \
const int src_height = 720; \ const int src_height = 720; \
align_buffer_16(src_y, src_width * src_height); \ align_buffer_16(src_y, src_width * src_height); \
align_buffer_16(src_u, (src_width * src_height) >> 2); \ align_buffer_16(src_u, src_width / SUBSAMP_X * src_height / SUBSAMP_Y); \
align_buffer_16(src_v, (src_width * src_height) >> 2); \ align_buffer_16(src_v, src_width / SUBSAMP_X * src_height / SUBSAMP_Y); \
align_buffer_16(dst_rgb_c, (src_width * BPP) * src_height); \ align_buffer_16(dst_rgb_c, (src_width * BPP_B) * src_height); \
align_buffer_16(dst_rgb_opt, (src_width * BPP) * src_height); \ align_buffer_16(dst_rgb_opt, (src_width * BPP_B) * src_height); \
srandom(time(NULL)); \ srandom(time(NULL)); \
for (int i = 0; i < src_height; ++i) \ for (int i = 0; i < src_height; ++i) \
for (int j = 0; j < src_width; ++j) \ for (int j = 0; j < src_width; ++j) \
src_y[(i * src_width) + j] = (random() & 0xff); \ src_y[(i * src_width) + j] = (random() & 0xff); \
for (int i = 0; i < src_height >> 1; ++i) \ for (int i = 0; i < src_height / SUBSAMP_X; ++i) \
for (int j = 0; j < src_width >> 1; ++j) { \ for (int j = 0; j < src_width / SUBSAMP_Y; ++j) { \
src_u[(i * src_width >> 1) + j] = (random() & 0xff); \ src_u[(i * src_width / SUBSAMP_X) + j] = (random() & 0xff); \
src_v[(i * src_width >> 1) + j] = (random() & 0xff); \ src_v[(i * src_width / SUBSAMP_X) + j] = (random() & 0xff); \
} \ } \
MaskCpuFlags(kCpuInitialized); \ MaskCpuFlags(kCpuInitialized); \
I420To##FMT(src_y, src_width, \ ##FMT_A##To##FMT_B(src_y, src_width, \
src_u, src_width >> 1, \ src_u, src_width / SUBSAMP_X, \
src_v, src_width >> 1, \ src_v, src_width / SUBSAMP_X, \
dst_rgb_c, src_width * BPP, \ dst_rgb_c, src_width * BPP_B, \
src_width, src_height); \ src_width, src_height); \
MaskCpuFlags(-1); \ MaskCpuFlags(-1); \
const int runs = 1000; \ const int runs = 1000; \
for (int i = 0; i < runs; ++i) { \ for (int i = 0; i < runs; ++i) { \
I420To##FMT(src_y, src_width, \ ##FMT_A##To##FMT_B(src_y, src_width, \
src_u, src_width >> 1, \ src_u, src_width / SUBSAMP_X, \
src_v, src_width >> 1, \ src_v, src_width / SUBSAMP_X, \
dst_rgb_opt, src_width * BPP, \ dst_rgb_opt, src_width * BPP_B, \
src_width, src_height); \ src_width, src_height); \
} \ } \
int err = 0; \ int err = 0; \
for (int i = 0; i < src_height; ++i) { \ for (int i = 0; i < src_height; ++i) { \
for (int j = 0; j < src_width * BPP; ++j) { \ for (int j = 0; j < src_width * BPP_B; ++j) { \
int diff = static_cast<int>(dst_rgb_c[i * src_width * BPP + j]) - \ int diff = static_cast<int>(dst_rgb_c[i * src_width * BPP_B + j]) - \
static_cast<int>(dst_rgb_opt[i * src_width * BPP + j]); \ static_cast<int>(dst_rgb_opt[i * src_width * BPP_B + j]); \
if (abs(diff) > 2) \ if (abs(diff) > 2) \
err++; \ err++; \
} \ } \
...@@ -119,14 +75,17 @@ TEST_F(libyuvTest, I420To##FMT##_CvsOPT) { \ ...@@ -119,14 +75,17 @@ TEST_F(libyuvTest, I420To##FMT##_CvsOPT) { \
free_aligned_buffer_16(dst_rgb_opt) \ free_aligned_buffer_16(dst_rgb_opt) \
} }
TESTI420TO(ARGB, 4) TESTPLANARTOB(I420, 2, 2, ARGB, 4)
TESTI420TO(BGRA, 4) TESTPLANARTOB(I420, 2, 2, BGRA, 4)
TESTI420TO(ABGR, 4) TESTPLANARTOB(I420, 2, 2, ABGR, 4)
TESTI420TO(RAW, 3) TESTPLANARTOB(I420, 2, 2, RAW, 3)
TESTI420TO(RGB24, 3) TESTPLANARTOB(I420, 2, 2, RGB24, 3)
TESTI420TO(RGB565, 2) TESTPLANARTOB(I420, 2, 2, RGB565, 2)
TESTI420TO(ARGB1555, 2) TESTPLANARTOB(I420, 2, 2, ARGB1555, 2)
TESTI420TO(ARGB4444, 2) TESTPLANARTOB(I420, 2, 2, ARGB4444, 2)
TESTPLANARTOB(I411, 4, 1, ARGB, 4)
TESTPLANARTOB(I422, 2, 1, ARGB, 4)
TESTPLANARTOB(I444, 1, 1, ARGB, 4)
#define TESTATOB(FMT_A, BPP_A, FMT_B, BPP_B) \ #define TESTATOB(FMT_A, BPP_A, FMT_B, BPP_B) \
TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_CvsOPT) { \ TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_CvsOPT) { \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment