Commit 2d9fe082 authored by fbarchard@google.com

direct conversion from NV12 to ARGB

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/645004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@281 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 7c8e16f8
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 280
Version: 281
License: BSD
License File: LICENSE
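This commit adds direct biplanar-to-RGB paths: NV12ToARGB, NV21ToARGB, M420ToARGB, NV12ToRGB565, and NV21ToRGB565, each with SSSE3 row kernels (aligned, unaligned, and any-width variants) plus C fallbacks. A minimal caller sketch against the new public API; the packed-plane layout, include path, and stride choices here are assumptions for illustration, not part of the commit:

#include "libyuv.h"

// Convert a tightly packed NV12 frame (full-size Y plane followed by a
// half-height plane of interleaved UV pairs) to 32-bit ARGB.
int ConvertNV12Frame(const uint8* nv12, uint8* argb, int width, int height) {
  const uint8* src_y = nv12;                    // width x height luma bytes
  const uint8* src_uv = nv12 + width * height;  // width/2 UV pairs per row = width bytes
  return libyuv::NV12ToARGB(src_y, width,
                            src_uv, width,
                            argb, width * 4,    // 4 bytes per ARGB pixel
                            width, height);
}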
@@ -47,16 +47,33 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert NV12 to ARGB. Also used for NV21.
// Convert NV12 to ARGB.
int NV12ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
// Convert NV12 to RGB565. Also used for NV21.
// Convert NV21 to ARGB.
int NV21ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert M420 to ARGB.
int M420ToARGB(const uint8* src_m420, int src_stride_m420,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert NV12 to RGB565.
int NV12ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_frame, int dst_stride_frame,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// Convert NV21 to RGB565.
int NV21ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// Convert YUY2 to ARGB.
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 280
#define LIBYUV_VERSION 281
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -367,7 +367,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
// easy conversion to I420.
// M420 format description:
// M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
// Chroma is half width / half height. (420)
// src_stride_m420 is row planar. Normally this will be the width in pixels.
// The UV plane is half width, but 2 values, so src_stride_m420 applies to
@@ -839,51 +839,191 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* argb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_NEON;
void (*NV12ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width) = NV12ToARGBRow_C;
#if defined(HAS_NV12TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
}
}
#elif defined(HAS_I422TOARGBROW_SSSE3)
#endif
for (int y = 0; y < height; ++y) {
NV12ToARGBRow(src_y, src_uv, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_uv += src_stride_uv;
}
}
return 0;
}
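The (y & 1) step advances the UV pointer only after odd rows because one row of interleaved UV pairs covers two rows of Y (4:2:0 vertical subsampling). Equivalently, the UV row used for luma row y is (an indexing sketch, not code from the commit):

const uint8* uv_row = src_uv + (y >> 1) * src_stride_uv;  // Y rows 0,1 share UV row 0; rows 2,3 share UV row 1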
// Convert NV21 to ARGB.
int NV21ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*NV21ToARGBRow)(const uint8* y_buf,
const uint8* vu_buf,
uint8* rgb_buf,
int width) = NV21ToARGBRow_C;
#if defined(HAS_NV21TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
NV21ToARGBRow = NV21ToARGBRow_SSSE3;
}
}
}
#endif
int halfwidth = (width + 1) >> 1;
void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
SplitUV_C;
#if defined(HAS_SPLITUV_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitUV = SplitUV_NEON;
for (int y = 0; y < height; ++y) {
NV21ToARGBRow(src_y, src_vu, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_vu += src_stride_vu;
}
}
#elif defined(HAS_SPLITUV_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16)) {
SplitUV = SplitUV_SSE2;
return 0;
}
// Convert M420 to ARGB.
int M420ToARGB(const uint8* src_m420, int src_stride_m420,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*NV12ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width) = NV12ToARGBRow_C;
#if defined(HAS_NV12TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
}
}
#endif
for (int y = 0; y < height - 1; y += 2) {
NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
dst_argb + dst_stride_argb, width);
dst_argb += dst_stride_argb * 2;
src_m420 += src_stride_m420 * 3;
}
if (height & 1) {
NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
}
return 0;
}
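M420 packs each pair of output rows as three consecutive source rows: two rows of Y followed by one row of interleaved UV, which is why the loop above hands src_m420 + src_stride_m420 * 2 to the NV12 row kernel and steps the source by src_stride_m420 * 3. A per-row addressing sketch under that layout (illustrative only, not code from the commit):

// Row group g = y / 2 starts at src_m420 + g * 3 * src_stride_m420.
const uint8* y_row  = src_m420 + (y >> 1) * 3 * src_stride_m420
                               + (y & 1) * src_stride_m420;  // first or second Y row
const uint8* uv_row = src_m420 + (y >> 1) * 3 * src_stride_m420
                               + 2 * src_stride_m420;        // shared UV row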
// Convert NV12 to RGB565.
// TODO(fbarchard): (Re) Optimize for Neon.
int NV12ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
dst_stride_rgb565 = -dst_stride_rgb565;
}
void (*NV12ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width) = NV12ToARGBRow_C;
#if defined(HAS_NV12TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) {
NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
#endif
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB565Row_C;
#if defined(HAS_ARGBTORGB565ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
#endif
SIMD_ALIGNED(uint8 rowuv[kMaxStride * 2]);
for (int y = 0; y < height; ++y) {
if ((y & 1) == 0) {
// Copy a row of UV.
SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth);
NV12ToARGBRow(src_y, src_uv, row, width);
ARGBToRGB565Row(row, dst_rgb565, width);
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
src_uv += src_stride_uv;
}
I422ToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width);
dst_argb += dst_stride_argb;
}
return 0;
}
// Convert NV21 to RGB565.
int NV21ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
dst_stride_rgb565 = -dst_stride_rgb565;
}
void (*NV21ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width) = NV21ToARGBRow_C;
#if defined(HAS_NV21TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) {
NV21ToARGBRow = NV21ToARGBRow_SSSE3;
}
#endif
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB565Row_C;
#if defined(HAS_ARGBTORGB565ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
#endif
for (int y = 0; y < height; ++y) {
NV21ToARGBRow(src_y, src_vu, row, width);
ARGBToRGB565Row(row, dst_rgb565, width);
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
src_vu += src_stride_vu;
}
}
return 0;
}
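Both RGB565 paths are composed from existing kernels: convert one row to ARGB in the SIMD_ALIGNED scratch buffer row, then pack that row down to 16 bits. The pack keeps the top 5/6/5 bits per channel; a scalar sketch of what ARGBToRGB565Row computes per pixel (assuming libyuv's little-endian B,G,R,A byte order for ARGB):

static uint16 PackRGB565(uint8 b, uint8 g, uint8 r) {
  return static_cast<uint16>((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}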
@@ -1020,69 +1160,6 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
return 0;
}
// Convert NV12 to RGB565.
int NV12ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb, int dst_stride_rgb,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
dst_stride_rgb = -dst_stride_rgb;
}
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
#elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
#endif
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB565Row_C;
#if defined(HAS_ARGBTORGB565ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
#endif
int halfwidth = (width + 1) >> 1;
void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
SplitUV_C;
#if defined(HAS_SPLITUV_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitUV = SplitUV_NEON;
}
#elif defined(HAS_SPLITUV_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16)) {
SplitUV = SplitUV_SSE2;
}
#endif
SIMD_ALIGNED(uint8 rowuv[kMaxStride * 2]);
for (int y = 0; y < height; ++y) {
if ((y & 1) == 0) {
// Copy a row of UV.
SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth);
src_uv += src_stride_uv;
}
I422ToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width);
ARGBToRGB565Row(row, dst_rgb, width);
dst_rgb += dst_stride_rgb;
src_y += src_stride_y;
}
return 0;
}
// SetRow8 writes 'count' bytes using a 32 bit value repeated
// SetRow32 writes 'count' words using a 32 bit value repeated
@@ -54,12 +54,14 @@ extern "C" {
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
#define HAS_I422TOBGRAROW_SSSE3
#define HAS_I444TOARGBROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
#define HAS_I411TOARGBROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV21TOARGBROW_SSSE3
#define HAS_I422TOBGRAROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORROWUV_SSSE3
#define HAS_ADDROW_SSE2
@@ -220,34 +222,44 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void I422ToARGBRow_C(const uint8* y_buf,
void I444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
uint8* argb_buf,
int width);
void I422ToBGRARow_C(const uint8* y_buf,
void I422ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
uint8* argb_buf,
int width);
void I422ToABGRRow_C(const uint8* y_buf,
void I411ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I444ToARGBRow_C(const uint8* y_buf,
void NV12ToARGBRow_C(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void NV21ToARGBRow_C(const uint8* y_buf,
const uint8* vu_buf,
uint8* argb_buf,
int width);
void I422ToBGRARow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
uint8* bgra_buf,
int width);
void I411ToARGBRow_C(const uint8* y_buf,
void I422ToABGRRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
uint8* abgr_buf,
int width);
void YToARGBRow_C(const uint8* y_buf,
@@ -269,6 +281,16 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
void I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* vu_buf,
uint8* argb_buf,
int width);
@@ -299,6 +321,16 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* vu_buf,
uint8* argb_buf,
int width);
@@ -314,37 +346,16 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
uint8* abgr_buf,
int width);
void YToARGBRow_SSE2(const uint8* y_buf,
uint8* argb_buf,
int width);
// ARGB preattenuated alpha blend.
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
// 'Any' functions handle any size and alignment.
void I444ToARGBRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
uint8* argb_buf,
int width);
void I422ToARGBRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
uint8* argb_buf,
int width);
void I411ToARGBRow_Any_SSSE3(const uint8* y_buf,
@@ -353,18 +364,47 @@ void I411ToARGBRow_Any_SSSE3(const uint8* y_buf,
uint8* rgb_buf,
int width);
void NV12ToARGBRow_Any_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void NV21ToARGBRow_Any_SSSE3(const uint8* y_buf,
const uint8* vu_buf,
uint8* argb_buf,
int width);
void I422ToBGRARow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
uint8* bgra_buf,
int width);
void I422ToABGRRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
uint8* abgr_buf,
int width);
void YToARGBRow_SSE2(const uint8* y_buf,
uint8* argb_buf,
int width);
// ARGB preattenuated alpha blend.
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -359,6 +359,20 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
(255u << ashift);
}
void I444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width; ++x) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
y_buf += 1;
u_buf += 1;
v_buf += 1;
rgb_buf += 4; // Advance 1 pixel.
}
}
// Also used for 420
void I422ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
@@ -378,79 +392,97 @@ void I422ToARGBRow_C(const uint8* y_buf,
}
}
void I422ToBGRARow_C(const uint8* y_buf,
void I411ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
y_buf += 2;
for (int x = 0; x < width - 3; x += 4) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0);
YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0);
y_buf += 4;
u_buf += 1;
v_buf += 1;
rgb_buf += 16; // Advance 4 pixels.
}
if (width & 2) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
y_buf += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24);
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
}
}
void I422ToABGRRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
void NV12ToARGBRow_C(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
YuvPixel(y_buf[1], uv_buf[0], uv_buf[1], rgb_buf + 4, 24, 16, 8, 0);
y_buf += 2;
u_buf += 1;
v_buf += 1;
uv_buf += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
}
}
void I444ToARGBRow_C(const uint8* y_buf,
void NV21ToARGBRow_C(const uint8* y_buf,
const uint8* vu_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
YuvPixel(y_buf[1], vu_buf[1], vu_buf[0], rgb_buf + 4, 24, 16, 8, 0);
y_buf += 2;
vu_buf += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
}
}
void I422ToBGRARow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width; ++x) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
y_buf += 1;
for (int x = 0; x < width - 1; x += 2) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
y_buf += 2;
u_buf += 1;
v_buf += 1;
rgb_buf += 4; // Advance 1 pixel.
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24);
}
}
void I411ToARGBRow_C(const uint8* y_buf,
void I422ToABGRRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 3; x += 4) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0);
YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0);
y_buf += 4;
for (int x = 0; x < width - 1; x += 2) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
y_buf += 2;
u_buf += 1;
v_buf += 1;
rgb_buf += 16; // Advance 4 pixels.
}
if (width & 2) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
y_buf += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
}
}
@@ -728,10 +760,26 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
rgb_buf + n * 4, width & 7); \
}
// Wrappers to handle odd width
#define Y2NY(NAMEANY, NV12TORGB_SSE, NV12TORGB_C, UV_SHIFT) \
void NAMEANY(const uint8* y_buf, \
const uint8* uv_buf, \
uint8* rgb_buf, \
int width) { \
int n = width & ~7; \
NV12TORGB_SSE(y_buf, uv_buf, rgb_buf, n); \
NV12TORGB_C(y_buf + n, \
uv_buf + (n >> UV_SHIFT), \
rgb_buf + n * 4, width & 7); \
}
#if defined(HAS_I422TOARGBROW_SSSE3)
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0)
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1)
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2)
Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0)
Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
#endif
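Y2NY mirrors YANY for biplanar sources: the SSE kernel handles the largest multiple of 8 pixels and the C kernel finishes the remainder. For reference, Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0) expands to (whitespace added for readability):

void NV12ToARGBRow_Any_SSSE3(const uint8* y_buf,
                             const uint8* uv_buf,
                             uint8* rgb_buf,
                             int width) {
  int n = width & ~7;                          // largest multiple of 8
  NV12ToARGBRow_Unaligned_SSSE3(y_buf, uv_buf, rgb_buf, n);
  NV12ToARGBRow_C(y_buf + n,
                  uv_buf + (n >> 0),           // NV12: one UV byte per pixel on average
                  rgb_buf + n * 4, width & 7); // remaining 0..7 pixels
}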
@@ -1231,14 +1231,17 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
struct {
vec8 kUVToB;
vec8 kUVToG;
vec8 kUVToR;
vec16 kUVBiasB;
vec16 kUVBiasG;
vec16 kUVBiasR;
vec16 kYSub16;
vec16 kYToRgb;
vec8 kUVToB; // 0
vec8 kUVToG; // 16
vec8 kUVToR; // 32
vec16 kUVBiasB; // 48
vec16 kUVBiasG; // 64
vec16 kUVBiasR; // 80
vec16 kYSub16; // 96
vec16 kYToRgb; // 112
vec8 kVUToB; // 128
vec8 kVUToG; // 144
vec8 kVUToR; // 160
} CONST SIMD_ALIGNED(kYuvConstants) = {
{ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
{ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
@@ -1247,48 +1250,83 @@ struct {
{ BG, BG, BG, BG, BG, BG, BG, BG },
{ BR, BR, BR, BR, BR, BR, BR, BR },
{ 16, 16, 16, 16, 16, 16, 16, 16 },
{ YG, YG, YG, YG, YG, YG, YG, YG }
{ YG, YG, YG, YG, YG, YG, YG, YG },
{ VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
{ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
{ VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
};
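The byte offsets noted on each field are what the assembly hard-codes: every vec8/vec16 field occupies 16 bytes, so pmaddubsw 128(%[kYuvConstants]) in YVUTORGB below reads kVUToB. The kVUTo* rows are the kUVTo* coefficients with each (U,V) byte pair swapped; since pmaddubsw multiplies adjacent byte pairs, swapping both the data order (VU for NV21) and the coefficient order yields identical products, letting NV21 reuse READNV12 unchanged. A hypothetical sanity check of the layout (not in the commit; e.g. with <cassert>):

assert(reinterpret_cast<const char*>(&kYuvConstants.kVUToB) -
       reinterpret_cast<const char*>(&kYuvConstants) == 128);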
// Read 8 UV from 444
#define READYUV444 \
"movq (%1),%%xmm0 \n" \
"movq (%1,%2,1),%%xmm1 \n" \
"lea 0x8(%1),%1 \n" \
"movq (%[u_buf]),%%xmm0 \n" \
"movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x8(%[u_buf]),%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
"movd (%1),%%xmm0 \n" \
"movd (%1,%2,1),%%xmm1 \n" \
"lea 0x4(%1),%1 \n" \
"movd (%[u_buf]),%%xmm0 \n" \
"movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x4(%[u_buf]),%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \
// Read 2 UV from 411, upsample to 8 UV
#define READYUV411 \
"movd (%1),%%xmm0 \n" \
"movd (%1,%2,1),%%xmm1 \n" \
"lea 0x2(%1),%1 \n" \
"movd (%[u_buf]),%%xmm0 \n" \
"movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x2(%[u_buf]),%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \
"punpckldq %%xmm0,%%xmm0 \n" \
// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
"movq (%[uv_buf]),%%xmm0 \n" \
"lea 0x8(%[uv_buf]),%[uv_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
"pmaddubsw (%5),%%xmm0 \n" \
"pmaddubsw 16(%5),%%xmm1 \n" \
"pmaddubsw 32(%5),%%xmm2 \n" \
"psubw 48(%5),%%xmm0 \n" \
"psubw 64(%5),%%xmm1 \n" \
"psubw 80(%5),%%xmm2 \n" \
"movq (%0),%%xmm3 \n" \
"lea 0x8(%0),%0 \n" \
"pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
"pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
"pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
"psubw 48(%[kYuvConstants]),%%xmm0 \n" \
"psubw 64(%[kYuvConstants]),%%xmm1 \n" \
"psubw 80(%[kYuvConstants]),%%xmm2 \n" \
"movq (%[y_buf]),%%xmm3 \n" \
"lea 0x8(%[y_buf]),%[y_buf] \n" \
"punpcklbw %%xmm4,%%xmm3 \n" \
"psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
"pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
"paddsw %%xmm3,%%xmm0 \n" \
"paddsw %%xmm3,%%xmm1 \n" \
"paddsw %%xmm3,%%xmm2 \n" \
"psraw $0x6,%%xmm0 \n" \
"psraw $0x6,%%xmm1 \n" \
"psraw $0x6,%%xmm2 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n" \
// Convert 8 pixels: 8 VU and 8 Y
#define YVUTORGB \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
"pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
"pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
"pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
"psubw 48(%[kYuvConstants]),%%xmm0 \n" \
"psubw 64(%[kYuvConstants]),%%xmm1 \n" \
"psubw 80(%[kYuvConstants]),%%xmm2 \n" \
"movq (%[y_buf]),%%xmm3 \n" \
"lea 0x8(%[y_buf]),%[y_buf] \n" \
"punpcklbw %%xmm4,%%xmm3 \n" \
"psubsw 96(%5),%%xmm3 \n" \
"pmullw 112(%5),%%xmm3 \n" \
"psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
"pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
"paddsw %%xmm3,%%xmm0 \n" \
"paddsw %%xmm3,%%xmm1 \n" \
"paddsw %%xmm3,%%xmm2 \n" \
@@ -1297,7 +1335,7 @@ struct {
"psraw $0x6,%%xmm2 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
"packuswb %%xmm2,%%xmm2 \n" \
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -1305,7 +1343,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
uint8* argb_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
@@ -1317,17 +1355,17 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0,(%3) \n"
"movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"movdqa %%xmm0,(%[argb_buf]) \n"
"movdqa %%xmm1,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(argb_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[argb_buf]"+r"(argb_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1341,7 +1379,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
uint8* argb_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
@@ -1353,17 +1391,17 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0,(%3) \n"
"movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"movdqa %%xmm0,(%[argb_buf]) \n"
"movdqa %%xmm1,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(argb_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[argb_buf]"+r"(argb_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1377,7 +1415,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
uint8* argb_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
@@ -1389,17 +1427,83 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0,(%3) \n"
"movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"movdqa %%xmm0,(%[argb_buf]) \n"
"movdqa %%xmm1,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(argb_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[argb_buf]"+r"(argb_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
READNV12
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0,(%[argb_buf]) \n"
"movdqa %%xmm1,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[argb_buf]"+r"(argb_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* vu_buf,
uint8* argb_buf,
int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
READNV12
YVUTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0,(%[argb_buf]) \n"
"movdqa %%xmm1,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(vu_buf), // %[uv_buf]
[argb_buf]"+r"(argb_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1413,7 +1517,7 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
uint8* argb_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
@@ -1425,17 +1529,17 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqu %%xmm0,(%3) \n"
"movdqu %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"movdqu %%xmm0,(%[argb_buf]) \n"
"movdqu %%xmm1,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(argb_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[argb_buf]"+r"(argb_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1449,7 +1553,7 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
uint8* argb_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
@@ -1461,17 +1565,17 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqu %%xmm0,(%3) \n"
"movdqu %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"movdqu %%xmm0,(%[argb_buf]) \n"
"movdqu %%xmm1,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(argb_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[argb_buf]"+r"(argb_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1485,7 +1589,7 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
uint8* argb_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
@@ -1497,17 +1601,83 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqu %%xmm0,(%3) \n"
"movdqu %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"movdqu %%xmm0,(%[argb_buf]) \n"
"movdqu %%xmm1,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(argb_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[argb_buf]"+r"(argb_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
READNV12
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqu %%xmm0,(%[argb_buf]) \n"
"movdqu %%xmm1,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[argb_buf]"+r"(argb_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* vu_buf,
uint8* argb_buf,
int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
READNV12
YVUTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqu %%xmm0,(%[argb_buf]) \n"
"movdqu %%xmm1,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(vu_buf), // %[uv_buf]
[argb_buf]"+r"(argb_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1521,7 +1691,7 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
uint8* bgra_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
@@ -1534,17 +1704,17 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
"movdqa %%xmm5,(%3) \n"
"movdqa %%xmm0,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"movdqa %%xmm5,(%[argb_buf]) \n"
"movdqa %%xmm0,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(bgra_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[argb_buf]"+r"(bgra_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1558,7 +1728,7 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
uint8* abgr_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
@@ -1570,17 +1740,17 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,(%3) \n"
"movdqa %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"movdqa %%xmm2,(%[argb_buf]) \n"
"movdqa %%xmm1,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(abgr_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[argb_buf]"+r"(abgr_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1594,7 +1764,7 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
uint8* bgra_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
@@ -1607,17 +1777,17 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
"movdqu %%xmm5,(%3) \n"
"movdqu %%xmm0,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"movdqu %%xmm5,(%[argb_buf]) \n"
"movdqu %%xmm0,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(bgra_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[argb_buf]"+r"(bgra_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1631,7 +1801,7 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
uint8* abgr_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
@@ -1643,24 +1813,23 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"movdqu %%xmm2,(%3) \n"
"movdqu %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x8,%4 \n"
"movdqu %%xmm2,(%[argb_buf]) \n"
"movdqu %%xmm1,0x10(%[argb_buf]) \n"
"lea 0x20(%[argb_buf]),%[argb_buf] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(abgr_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[argb_buf]"+r"(abgr_buf), // %[argb_buf]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
#endif // HAS_I422TOARGBROW_SSSE3
#ifdef HAS_YTOARGBROW_SSE2
@@ -1230,6 +1230,18 @@ static const vec8 kUVToG = {
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};
static const vec8 kVUToB = {
VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
};
static const vec8 kVUToR = {
VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
};
static const vec8 kVUToG = {
VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
};
static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
@@ -1265,6 +1277,13 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
__asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
}
// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 __asm { \
__asm movq xmm0, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
}
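READNV12 loads four interleaved UV pairs and duplicates each 16-bit pair so that every pair covers two horizontal pixels; a register-contents sketch (illustrative):

// xmm0 after movq:       U0 V0 U1 V1 U2 V2 U3 V3 .. .. .. .. .. .. .. ..
// xmm0 after punpcklwd:  U0 V0 U0 V0 U1 V1 U1 V1 U2 V2 U2 V2 U3 V3 U3 V3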
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
@@ -1293,6 +1312,34 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
__asm packuswb xmm2, xmm2 /* R */ \
}
// Convert 8 pixels: 8 VU and 8 Y
#define YVUTORGB __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
__asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
__asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
__asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
__asm psubw xmm1, kUVBiasG \
__asm psubw xmm2, kUVBiasR \
/* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \
__asm lea eax, [eax + 8] \
__asm punpcklbw xmm3, xmm4 \
__asm psubsw xmm3, kYSub16 \
__asm pmullw xmm3, kYToRgb \
__asm paddsw xmm0, xmm3 /* B += Y */ \
__asm paddsw xmm1, xmm3 /* G += Y */ \
__asm paddsw xmm2, xmm3 /* R += Y */ \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
__asm packuswb xmm0, xmm0 /* B */ \
__asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \
}
// 8 pixels, dest aligned 16.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes)
__declspec(naked) __declspec(align(16))
@@ -1423,6 +1470,82 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
}
}
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // Y
mov esi, [esp + 4 + 8] // UV
mov edx, [esp + 4 + 12] // argb
mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
align 16
convertloop:
READNV12
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
pop esi
ret
}
}
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* vu_buf,
uint8* argb_buf,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // Y
mov esi, [esp + 4 + 8] // VU
mov edx, [esp + 4 + 12] // argb
mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
align 16
convertloop:
READNV12
YVUTORGB
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
pop esi
ret
}
}
// 8 pixels, unaligned.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes)
__declspec(naked) __declspec(align(16))
@@ -1553,6 +1676,83 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
}
}
// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // Y
mov esi, [esp + 4 + 8] // UV
mov edx, [esp + 4 + 12] // argb
mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
align 16
convertloop:
READNV12
YUVTORGB
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
pop esi
ret
}
}
// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes)
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* vu_buf,
uint8* argb_buf,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // Y
mov esi, [esp + 4 + 8] // VU
mov edx, [esp + 4 + 12] // argb
mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
align 16
convertloop:
READNV12
YVUTORGB
// Step 3: Weave into ARGB
punpcklbw xmm0, xmm1 // BG
punpcklbw xmm2, xmm5 // RA
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
pop esi
ret
}
}
__declspec(naked) __declspec(align(16))
void I422ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -26,7 +26,7 @@
namespace libyuv {
#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \
TEST_F(libyuvTest, ##FMT_PLANAR##To##FMT_B##_CvsOPT) { \
TEST_F(libyuvTest, ##FMT_PLANAR##To##FMT_B##_OptVsC) { \
const int kWidth = 1280; \
const int kHeight = 720; \
align_buffer_16(src_y, kWidth * kHeight); \
@@ -88,8 +88,60 @@ TESTPLANARTOB(I411, 4, 1, ARGB, 4)
TESTPLANARTOB(I422, 2, 1, ARGB, 4)
TESTPLANARTOB(I444, 1, 1, ARGB, 4)
#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \
TEST_F(libyuvTest, ##FMT_PLANAR##To##FMT_B##_OptVsC) { \
const int kWidth = 1280; \
const int kHeight = 720; \
align_buffer_16(src_y, kWidth * kHeight); \
align_buffer_16(src_uv, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y * 2); \
align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \
align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \
srandom(time(NULL)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
src_y[(i * kWidth) + j] = (random() & 0xff); \
for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) \
for (int j = 0; j < kWidth / SUBSAMP_X * 2; ++j) { \
src_uv[(i * kWidth / SUBSAMP_X) * 2 + j] = (random() & 0xff); \
} \
MaskCpuFlags(kCpuInitialized); \
##FMT_PLANAR##To##FMT_B(src_y, kWidth, \
src_uv, kWidth / SUBSAMP_X * 2, \
dst_argb_c, kWidth * BPP_B, \
kWidth, kHeight); \
MaskCpuFlags(-1); \
const int runs = 1000; \
for (int i = 0; i < runs; ++i) { \
##FMT_PLANAR##To##FMT_B(src_y, kWidth, \
src_uv, kWidth / SUBSAMP_X * 2, \
dst_argb_opt, kWidth * BPP_B, \
kWidth, kHeight); \
} \
int err = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth * BPP_B; ++j) { \
int diff = static_cast<int>(dst_argb_c[i * kWidth * BPP_B + j]) - \
static_cast<int>(dst_argb_opt[i * kWidth * BPP_B + j]); \
if (abs(diff) > 2) { \
++err; \
} \
} \
} \
EXPECT_EQ(err, 0); \
free_aligned_buffer_16(src_y) \
free_aligned_buffer_16(src_uv) \
free_aligned_buffer_16(dst_argb_c) \
free_aligned_buffer_16(dst_argb_opt) \
}
TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4)
TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4)
TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2)
TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2)
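Each TESTBIPLANARTOB line instantiates a test named <FMT_PLANAR>To<FMT_B>_OptVsC that renders the frame once with the C path (CPU flags masked to kCpuInitialized), 1000 times with the optimized path, then compares the outputs byte-wise with a tolerance of 2. For example, TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4) generates, abridged for illustration:

TEST_F(libyuvTest, NV12ToARGB_OptVsC) {
  // ... 1280x720 buffers filled with random bytes, as in the macro ...
  NV12ToARGB(src_y, 1280,           // kWidth
             src_uv, 1280,          // kWidth / SUBSAMP_X * 2
             dst_argb_c, 1280 * 4,  // kWidth * BPP_B
             1280, 720);
  // ... optimized runs and the |diff| > 2 error count follow ...
}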
#define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
TEST_F(libyuvTest, ##FMT_A##To##FMT_PLANAR##_CvsOPT) { \
TEST_F(libyuvTest, ##FMT_A##To##FMT_PLANAR##_OptVsC) { \
const int kWidth = 1280; \
const int kHeight = 720; \
align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight); \
@@ -171,36 +223,34 @@ TESTATOPLANAR(ARGB, 4, I422, 2, 1)
//TESTATOPLANAR(ARGB, 4, I444, 1, 1)
// TODO(fbarchard): Implement and test 411 and 444
#define TESTATOB(FMT_A, BPP_A, FMT_B, BPP_B) \
TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_CvsOPT) { \
#define TESTATOB(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \
TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_OptVsC) { \
const int kWidth = 1280; \
const int kHeight = 720; \
align_buffer_16(src_argb, kWidth * kHeight * BPP_A); \
align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight); \
align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \
align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \
srandom(time(NULL)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth * BPP_A; ++j) \
src_argb[(i * kWidth * BPP_A) + j] = (random() & 0xff); \
for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) { \
src_argb[i] = (random() & 0xff); \
} \
MaskCpuFlags(kCpuInitialized); \
##FMT_A##To##FMT_B(src_argb, kWidth * BPP_A, \
##FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
dst_argb_c, kWidth * BPP_B, \
kWidth, kHeight); \
MaskCpuFlags(-1); \
const int runs = 1000; \
for (int i = 0; i < runs; ++i) { \
##FMT_A##To##FMT_B(src_argb, kWidth * BPP_A, \
##FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
dst_argb_opt, kWidth * BPP_B, \
kWidth, kHeight); \
} \
int err = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth * BPP_B; ++j) { \
int diff = static_cast<int>(dst_argb_c[i * kWidth * BPP_B + j]) - \
static_cast<int>(dst_argb_opt[i * kWidth * BPP_B + j]); \
if (abs(diff) > 2) \
err++; \
} \
for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) { \
int diff = static_cast<int>(dst_argb_c[i]) - \
static_cast<int>(dst_argb_opt[i]); \
if (abs(diff) > 2) \
err++; \
} \
EXPECT_EQ(err, 0); \
free_aligned_buffer_16(src_argb) \
@@ -208,25 +258,26 @@ TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_CvsOPT) { \
free_aligned_buffer_16(dst_argb_opt) \
}
TESTATOB(ARGB, 4, ARGB, 4)
TESTATOB(ARGB, 4, BGRA, 4)
TESTATOB(ARGB, 4, ABGR, 4)
TESTATOB(ARGB, 4, RAW, 3)
TESTATOB(ARGB, 4, RGB24, 3)
TESTATOB(ARGB, 4, RGB565, 2)
TESTATOB(ARGB, 4, ARGB1555, 2)
TESTATOB(ARGB, 4, ARGB4444, 2)
TESTATOB(ARGB, 4, 4, ARGB, 4)
TESTATOB(ARGB, 4, 4, BGRA, 4)
TESTATOB(ARGB, 4, 4, ABGR, 4)
TESTATOB(ARGB, 4, 4, RAW, 3)
TESTATOB(ARGB, 4, 4, RGB24, 3)
TESTATOB(ARGB, 4, 4, RGB565, 2)
TESTATOB(ARGB, 4, 4, ARGB1555, 2)
TESTATOB(ARGB, 4, 4, ARGB4444, 2)
TESTATOB(BGRA, 4, ARGB, 4)
TESTATOB(ABGR, 4, ARGB, 4)
TESTATOB(RAW, 3, ARGB, 4)
TESTATOB(RGB24, 3, ARGB, 4)
TESTATOB(RGB565, 2, ARGB, 4)
TESTATOB(ARGB1555, 2, ARGB, 4)
TESTATOB(ARGB4444, 2, ARGB, 4)
TESTATOB(BGRA, 4, 4, ARGB, 4)
TESTATOB(ABGR, 4, 4, ARGB, 4)
TESTATOB(RAW, 3, 3, ARGB, 4)
TESTATOB(RGB24, 3, 3, ARGB, 4)
TESTATOB(RGB565, 2, 2, ARGB, 4)
TESTATOB(ARGB1555, 2, 2, ARGB, 4)
TESTATOB(ARGB4444, 2, 2, ARGB, 4)
TESTATOB(YUY2, 2, ARGB, 4)
TESTATOB(UYVY, 2, ARGB, 4)
TESTATOB(YUY2, 2, 2, ARGB, 4)
TESTATOB(UYVY, 2, 2, ARGB, 4)
TESTATOB(M420, 3 / 2, 1, ARGB, 4)
TEST_F(libyuvTest, TestAttenuate) {
SIMD_ALIGNED(uint8 orig_pixels[256][4]);