Commit 8af6ea41 authored by Frank Barchard's avatar Frank Barchard Committed by Commit Bot

I420ToAR30 in 1 step SSSE3 assembly

Bug: libyuv:751
Test: LibYUVConvertTest.I420ToAR30_Opt
Change-Id: Ie89c3eb2526354cf11175746bc8af72be83a1e00
Reviewed-on: https://chromium-review.googlesource.com/877541Reviewed-by: 's avatarCheng Wang <wangcheng@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
parent 09db0c4c
......@@ -258,6 +258,7 @@ extern "C" {
// I210 is for H010. 2 = 422. I for 601 vs H for 709.
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3
#define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#endif
......@@ -1683,6 +1684,12 @@ void I422ToARGBRow_C(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToAR30Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void I210ToAR30Row_C(const uint16* src_y,
const uint16* src_u,
const uint16* src_v,
......@@ -1798,6 +1805,12 @@ void I422ToARGBRow_SSSE3(const uint8* src_y,
const struct YuvConstants* yuvconstants,
int width);
void I422ToAR30Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void I210ToAR30Row_SSSE3(const uint16* src_y,
const uint16* src_u,
const uint16* src_v,
......@@ -1960,6 +1973,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToAR30Row_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void I210ToAR30Row_Any_SSSE3(const uint16* src_y,
const uint16* src_u,
const uint16* src_v,
......
......@@ -1149,12 +1149,10 @@ static int I420ToAR30Matrix(const uint8* src_y,
int width,
int height) {
int y;
void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
void (*I422ToAR30Row)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToARGBRow_C;
void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToAR30Row_C;
I422ToAR30Row_C;
if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
return -1;
......@@ -1166,71 +1164,31 @@ static int I420ToAR30Matrix(const uint8* src_y,
dst_stride_ar30 = -dst_stride_ar30;
}
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
#if defined(HAS_I422TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
ARGBToAR30Row = ARGBToAR30Row_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOAR30ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToAR30Row = ARGBToAR30Row_AVX2;
I422ToAR30Row = I422ToAR30Row_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
#if defined(HAS_I422TOAR30ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
I422ToAR30Row = I422ToAR30Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
I422ToAR30Row = I422ToAR30Row_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_MSA;
}
}
#endif
{
// Row buffer for ARGB.
align_buffer_64(row_argb, width * 4);
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row_argb, yuvconstants, width);
ARGBToAR30Row(row_argb, dst_ar30, width);
dst_ar30 += dst_stride_ar30;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
for (y = 0; y < height; ++y) {
I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
dst_ar30 += dst_stride_ar30;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
free_aligned_buffer_64(row_argb);
}
return 0;
}
......
......@@ -145,6 +145,9 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
#ifdef HAS_I422TOARGBROW_SSSE3
ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I422TOAR30ROW_SSSE3
ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I444TOARGBROW_SSSE3
ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
......
......@@ -1261,6 +1261,8 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
#undef YG
// C reference code that mimics the YUV assembly.
// Reads 8 bit YUV and leaves result as 16 bit.
static __inline void YuvPixel(uint8 y,
uint8 u,
uint8 v,
......@@ -1303,14 +1305,14 @@ static __inline void YuvPixel(uint8 y,
*r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
}
// C reference code that mimics the YUV 10 bit assembly.
static __inline void YuvPixel10(uint16 y,
uint16 u,
uint16 v,
uint8* b,
uint8* g,
uint8* r,
const struct YuvConstants* yuvconstants) {
// Reads 8 bit YUV and leaves result as 16 bit.
static __inline void YuvPixel8_16(uint8 y,
uint8 u,
uint8 v,
int* b,
int* g,
int* r,
const struct YuvConstants* yuvconstants) {
#if defined(__aarch64__)
int ub = -yuvconstants->kUVToRB[0];
int ug = yuvconstants->kUVToG[0];
......@@ -1340,15 +1342,14 @@ static __inline void YuvPixel10(uint16 y,
int yg = yuvconstants->kYToRgb[0];
#endif
uint32 y1 = (uint32)((y << 6) * yg) >> 16;
u = clamp255(u >> 2);
v = clamp255(v >> 2);
*b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6);
*g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
*r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
*b = (int)(-(u * ub) + y1 + bb);
*g = (int)(-(u * ug + v * vg) + y1 + bg);
*r = (int)(-(v * vr) + y1 + br);
}
// C reference code that mimics the YUV 16 bit assembly.
// Reads 10 bit YUV and leaves result as 16 bit.
static __inline void YuvPixel16(int16 y,
int16 u,
int16 v,
......@@ -1391,11 +1392,24 @@ static __inline void YuvPixel16(int16 y,
*b = (int)(-(u * ub) + y1 + bb);
*g = (int)(-(u * ug + v * vg) + y1 + bg);
*r = (int)(-(v * vr) + y1 + br);
}
if ((int16)(*b & 0xffff) != *b) {
printf("%d vs %d bb %d y1 %d\n",(int16)*b, *b, bb, y1);
}
// C reference code that mimics the YUV 10 bit assembly.
// Reads 10 bit YUV and clamps down to 8 bit RGB.
static __inline void YuvPixel10(uint16 y,
uint16 u,
uint16 v,
uint8* b,
uint8* g,
uint8* r,
const struct YuvConstants* yuvconstants) {
int b16;
int g16;
int r16;
YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants);
*b = Clamp(b16 >> 6);
*g = Clamp(g16 >> 6);
*r = Clamp(r16 >> 6);
}
// Y contribution to R,G,B. Scale and bias.
......@@ -1560,6 +1574,35 @@ void I210ToAR30Row_C(const uint16* src_y,
}
}
// 8 bit YUV to 10 bit AR30
// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits.
void I422ToAR30Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
int b;
int g;
int r;
for (x = 0; x < width - 1; x += 2) {
YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
StoreAR30(rgb_buf, b, g, r);
YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
StoreAR30(rgb_buf + 4, b, g, r);
src_y += 2;
src_u += 1;
src_v += 1;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
StoreAR30(rgb_buf, b, g, r);
}
}
void I422AlphaToARGBRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
......
......@@ -1901,6 +1901,40 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
);
}
void OMITFP I422ToAR30Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pxor %%xmm6,%%xmm6 \n"
"pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
READYUV422
YUVTORGB16(yuvconstants)
STOREAR30
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
// 10 bit YUV to ARGB
void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf,
const uint16* u_buf,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment