Commit 09db0c4c authored by Frank Barchard, committed by Commit Bot

H010ToAR30 in 1 step with SSSE3 assembly

Switch YUV conversion macro to output 16 bits per channel.
STOREAR30 macro to output AR30.

[ RUN      ] LibYUVConvertTest.TestH420ToARGB
uniques: B 220, G, 220, R 220
[       OK ] LibYUVConvertTest.TestH420ToARGB (0 ms)
[ RUN      ] LibYUVConvertTest.TestH010ToARGB
uniques: B 256, G, 256, R 256
[       OK ] LibYUVConvertTest.TestH010ToARGB (0 ms)
[ RUN      ] LibYUVConvertTest.TestH010ToAR30
uniques: B 883, G, 883, R 883
[       OK ] LibYUVConvertTest.TestH010ToAR30 (0 ms)

Bug: libyuv:751
Test: LibYUVConvertTest.H010ToAR30_Opt
Change-Id: I902b718e2c8b68ede69625ccafebc6519d5af70d
Reviewed-on: https://chromium-review.googlesource.com/869511
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Miguel Casas <mcasas@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
parent 37f97210
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1689 Version: 1690
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -420,6 +420,19 @@ int H010ToARGB(const uint16* src_y, ...@@ -420,6 +420,19 @@ int H010ToARGB(const uint16* src_y,
int width, int width,
int height); int height);
// Convert I010 to AR30.
// The definition forwards to I010ToAR30Matrix with kYuvI601Constants.
// src_y, src_u and src_v are planes of 16 bit samples, each with its own
// stride; dst_ar30 receives one 32 bit word per pixel.
// Returns 0 on success, or -1 if any pointer is NULL, width <= 0 or
// height == 0.
LIBYUV_API
int I010ToAR30(const uint16* src_y,
int src_stride_y,
const uint16* src_u,
int src_stride_u,
const uint16* src_v,
int src_stride_v,
uint8* dst_ar30,
int dst_stride_ar30,
int width,
int height);
// Convert H010 to AR30. // Convert H010 to AR30.
LIBYUV_API LIBYUV_API
int H010ToAR30(const uint16* src_y, int H010ToAR30(const uint16* src_y,
......
...@@ -256,6 +256,7 @@ extern "C" { ...@@ -256,6 +256,7 @@ extern "C" {
#define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2 #define HAS_CONVERT8TO16ROW_SSE2
// I210 is for H010. 2 = 422. I for 601 vs H for 709. // I210 is for H010. 2 = 422. I for 601 vs H for 709.
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3
#define HAS_MERGERGBROW_SSSE3 #define HAS_MERGERGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3
...@@ -1682,6 +1683,12 @@ void I422ToARGBRow_C(const uint8* src_y, ...@@ -1682,6 +1683,12 @@ void I422ToARGBRow_C(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// C reference row function: converts one row of 10 bit YUV to AR30.
// Each U/V sample is shared by two consecutive Y samples; every output
// pixel occupies 4 bytes of dst_ar30.
void I210ToAR30Row_C(const uint16* src_y,
const uint16* src_u,
const uint16* src_v,
uint8* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void I210ToARGBRow_C(const uint16* src_y, void I210ToARGBRow_C(const uint16* src_y,
const uint16* src_u, const uint16* src_u,
const uint16* src_v, const uint16* src_v,
...@@ -1791,6 +1798,12 @@ void I422ToARGBRow_SSSE3(const uint8* src_y, ...@@ -1791,6 +1798,12 @@ void I422ToARGBRow_SSSE3(const uint8* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// SSSE3 row function: 10 bit YUV to AR30, 8 pixels per loop iteration.
// The converter selects it directly only when width is a multiple of 8.
void I210ToAR30Row_SSSE3(const uint16* src_y,
const uint16* src_u,
const uint16* src_v,
uint8* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void I210ToARGBRow_SSSE3(const uint16* src_y, void I210ToARGBRow_SSSE3(const uint16* src_y,
const uint16* src_u, const uint16* src_u,
const uint16* src_v, const uint16* src_v,
...@@ -1947,6 +1960,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y, ...@@ -1947,6 +1960,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// Any-width wrapper around I210ToAR30Row_SSSE3; the converter uses it when
// SSSE3 is available but width is not a multiple of 8.
void I210ToAR30Row_Any_SSSE3(const uint16* src_y,
const uint16* src_u,
const uint16* src_v,
uint8* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void I210ToARGBRow_Any_SSSE3(const uint16* src_y, void I210ToARGBRow_Any_SSSE3(const uint16* src_y,
const uint16* src_u, const uint16* src_u,
const uint16* src_v, const uint16* src_v,
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1689 #define LIBYUV_VERSION 1690
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -413,7 +413,7 @@ int H422ToABGR(const uint8* src_y, ...@@ -413,7 +413,7 @@ int H422ToABGR(const uint8* src_y,
// Convert 10 bit YUV to ARGB with matrix // Convert 10 bit YUV to ARGB with matrix
// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
// multiply 10 bit yuv into high bits to allow any number of bits. // multiply 10 bit yuv into high bits to allow any number of bits.
static int H010ToAR30Matrix(const uint16* src_y, static int I010ToAR30Matrix(const uint16* src_y,
int src_stride_y, int src_stride_y,
const uint16* src_u, const uint16* src_u,
int src_stride_u, int src_stride_u,
...@@ -425,12 +425,10 @@ static int H010ToAR30Matrix(const uint16* src_y, ...@@ -425,12 +425,10 @@ static int H010ToAR30Matrix(const uint16* src_y,
int width, int width,
int height) { int height) {
int y; int y;
void (*I210ToARGBRow)(const uint16* y_buf, const uint16* u_buf, void (*I210ToAR30Row)(const uint16* y_buf, const uint16* u_buf,
const uint16* v_buf, uint8* rgb_buf, const uint16* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) = const struct YuvConstants* yuvconstants, int width) =
I210ToARGBRow_C; I210ToAR30Row_C;
void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToAR30Row_C;
if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
return -1; return -1;
} }
...@@ -440,46 +438,24 @@ static int H010ToAR30Matrix(const uint16* src_y, ...@@ -440,46 +438,24 @@ static int H010ToAR30Matrix(const uint16* src_y,
dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
dst_stride_ar30 = -dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30;
} }
#if defined(HAS_I210TOARGBROW_SSSE3) #if defined(HAS_I210TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I210ToARGBRow = I210ToARGBRow_Any_SSSE3; I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
I210ToARGBRow = I210ToARGBRow_SSSE3; I210ToAR30Row = I210ToAR30Row_SSSE3;
} }
} }
#endif #endif
#if defined(HAS_I210TOARGBROW_AVX2) #if defined(HAS_I210TOAR30ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
I210ToARGBRow = I210ToARGBRow_Any_AVX2; I210ToAR30Row = I210ToAR30Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
I210ToARGBRow = I210ToARGBRow_AVX2; I210ToAR30Row = I210ToAR30Row_AVX2;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
ARGBToAR30Row = ARGBToAR30Row_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOAR30ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToAR30Row = ARGBToAR30Row_AVX2;
}
}
#endif
{
// Row buffers for 8 bit YUV and RGB.
align_buffer_64(row_argb, width * 4);
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I210ToARGBRow(src_y, src_u, src_v, row_argb, yuvconstants, width); I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
ARGBToAR30Row(row_argb, dst_ar30, width);
dst_ar30 += dst_stride_ar30; dst_ar30 += dst_stride_ar30;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
...@@ -487,13 +463,26 @@ static int H010ToAR30Matrix(const uint16* src_y, ...@@ -487,13 +463,26 @@ static int H010ToAR30Matrix(const uint16* src_y,
src_v += src_stride_v; src_v += src_stride_v;
} }
} }
free_aligned_buffer_64(row_argb);
}
return 0; return 0;
} }
// I010 (10 bit YUV, 601 constants) to AR30.
// Thin wrapper: picks kYuvI601Constants and defers to the shared
// matrix-driven converter, returning its status unchanged.
LIBYUV_API
int I010ToAR30(const uint16* src_y,
               int src_stride_y,
               const uint16* src_u,
               int src_stride_u,
               const uint16* src_v,
               int src_stride_v,
               uint8* dst_ar30,
               int dst_stride_ar30,
               int width,
               int height) {
  const struct YuvConstants* const matrix = &kYuvI601Constants;
  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
                          src_stride_v, dst_ar30, dst_stride_ar30, matrix,
                          width, height);
}
// Convert H010 to AR30. // Convert H010 to AR30.
LIBYUV_API LIBYUV_API
int H010ToAR30(const uint16* src_y, int H010ToAR30(const uint16* src_y,
...@@ -506,7 +495,7 @@ int H010ToAR30(const uint16* src_y, ...@@ -506,7 +495,7 @@ int H010ToAR30(const uint16* src_y,
int dst_stride_ar30, int dst_stride_ar30,
int width, int width,
int height) { int height) {
return H010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_ar30, dst_stride_ar30, src_stride_v, dst_ar30, dst_stride_ar30,
&kYuvH709Constants, width, height); &kYuvH709Constants, width, height);
} }
......
...@@ -214,6 +214,9 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) ...@@ -214,6 +214,9 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \
} }
// Any-width wrapper for the SSSE3 10 bit YUV to AR30 row function.
// NOTE(review): arguments mirror the I210ToARGBRow_Any_SSSE3 instantiation
// (uint16 source samples, presumably 2 source bytes and 4 output bytes per
// pixel, width mask 7) - confirm against the ANY31CT definition.
#ifdef HAS_I210TOAR30ROW_SSSE3
ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16, 2, 4, 7)
#endif
#ifdef HAS_I210TOARGBROW_SSSE3 #ifdef HAS_I210TOARGBROW_SSSE3
ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16, 2, 4, 7) ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16, 2, 4, 7)
#endif #endif
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include "libyuv/row.h" #include "libyuv/row.h"
#include <string.h> // For memcpy and memset. #include <string.h> // For memcpy and memset.
#include <stdio.h>
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
...@@ -31,9 +32,8 @@ static __inline int32 clamp255(int32 v) { ...@@ -31,9 +32,8 @@ static __inline int32 clamp255(int32 v) {
return (((255 - (v)) >> 31) | (v)) & 255; return (((255 - (v)) >> 31) | (v)) & 255;
} }
static __inline uint32 Clamp(int32 val) { static __inline int32 clamp1023(int32 v) {
int v = clamp0(val); return (((1023 - (v)) >> 31) | (v)) & 1023;
return (uint32)(clamp255(v));
} }
static __inline uint32 Abs(int32 v) { static __inline uint32 Abs(int32 v) {
...@@ -49,15 +49,23 @@ static __inline int32 clamp255(int32 v) { ...@@ -49,15 +49,23 @@ static __inline int32 clamp255(int32 v) {
return (v > 255) ? 255 : v; return (v > 255) ? 255 : v;
} }
static __inline uint32 Clamp(int32 val) { static __inline int32 clamp1023(int32 v) {
int v = clamp0(val); return (v > 1023) ? 1023 : v;
return (uint32)(clamp255(v));
} }
static __inline uint32 Abs(int32 v) { static __inline uint32 Abs(int32 v) {
return (v < 0) ? -v : v; return (v < 0) ? -v : v;
} }
#endif // USE_BRANCHLESS #endif // USE_BRANCHLESS
// Clamp for 8 bit channels: applies clamp0, then clamp255, to val.
static __inline uint32 Clamp(int32 val) {
  return (uint32)clamp255(clamp0(val));
}
// Clamp for 10 bit channels: applies clamp0, then clamp1023, to val.
static __inline uint32 Clamp10(int32 val) {
  return (uint32)clamp1023(clamp0(val));
}
#ifdef LIBYUV_LITTLE_ENDIAN #ifdef LIBYUV_LITTLE_ENDIAN
#define WRITEWORD(p, v) *(uint32*)(p) = v #define WRITEWORD(p, v) *(uint32*)(p) = v
...@@ -1340,6 +1348,56 @@ static __inline void YuvPixel10(uint16 y, ...@@ -1340,6 +1348,56 @@ static __inline void YuvPixel10(uint16 y,
*r = Clamp((int32)(-(v * vr) + y1 + br) >> 6); *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
} }
// C reference code that mimics the YUV 16 bit assembly.
//
// Converts one 10 bit YUV sample triple to unclamped fixed-point B, G, R.
//   y, u, v       - input samples; u and v are reduced to 8 bits (>> 2 and
//                   clamp255) before the matrix multiply.
//   b, g, r       - outputs, written unclamped and unshifted; StoreAR30
//                   shifts them right by 4 and clamps them to 10 bits.
//   yuvconstants  - conversion coefficients and biases; the field layout
//                   read here depends on the target architecture.
//
// This function has no side effects other than writing *b, *g and *r.
static __inline void YuvPixel16(int16 y,
                                int16 u,
                                int16 v,
                                int* b,
                                int* g,
                                int* r,
                                const struct YuvConstants* yuvconstants) {
#if defined(__aarch64__)
  int ub = -yuvconstants->kUVToRB[0];
  int ug = yuvconstants->kUVToG[0];
  int vg = yuvconstants->kUVToG[1];
  int vr = -yuvconstants->kUVToRB[1];
  int bb = yuvconstants->kUVBiasBGR[0];
  int bg = yuvconstants->kUVBiasBGR[1];
  int br = yuvconstants->kUVBiasBGR[2];
  int yg = yuvconstants->kYToRgb[0] / 0x0101;
#elif defined(__arm__)
  int ub = -yuvconstants->kUVToRB[0];
  int ug = yuvconstants->kUVToG[0];
  int vg = yuvconstants->kUVToG[4];
  int vr = -yuvconstants->kUVToRB[4];
  int bb = yuvconstants->kUVBiasBGR[0];
  int bg = yuvconstants->kUVBiasBGR[1];
  int br = yuvconstants->kUVBiasBGR[2];
  int yg = yuvconstants->kYToRgb[0] / 0x0101;
#else
  int ub = yuvconstants->kUVToB[0];
  int ug = yuvconstants->kUVToG[0];
  int vg = yuvconstants->kUVToG[1];
  int vr = yuvconstants->kUVToR[1];
  int bb = yuvconstants->kUVBiasB[0];
  int bg = yuvconstants->kUVBiasG[0];
  int br = yuvconstants->kUVBiasR[0];
  int yg = yuvconstants->kYToRgb[0];
#endif

  // Scale Y up to 16 bits, multiply by the Y gain and keep the high word.
  uint32 y1 = (uint32)((y << 6) * yg) >> 16;

  // The chroma coefficients are applied to 8 bit U and V.
  u = clamp255(u >> 2);
  v = clamp255(v >> 2);

  *b = (int)(-(u * ub) + y1 + bb);
  *g = (int)(-(u * ug + v * vg) + y1 + bg);
  *r = (int)(-(v * vr) + y1 + br);
}
// Y contribution to R,G,B. Scale and bias. // Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
...@@ -1460,6 +1518,48 @@ void I210ToARGBRow_C(const uint16* src_y, ...@@ -1460,6 +1518,48 @@ void I210ToARGBRow_C(const uint16* src_y,
} }
} }
// Pack one pixel into AR30 and store it at rgb_buf.
//   rgb_buf  - destination for 4 bytes; no alignment is required.
//   b, g, r  - fixed-point channel values as produced by YuvPixel16.
// Each channel is shifted right by 4, clamped with Clamp10, then packed as
// B in bits 0-9, G in bits 10-19, R in bits 20-29 and both alpha bits set
// (0xc0000000).
static void StoreAR30(uint8* rgb_buf,
                      int b,
                      int g,
                      int r) {
  uint32 ar30;
  b = b >> 4;  // scale the fixed-point value down to the 10 bit range.
  g = g >> 4;
  r = r >> 4;
  b = Clamp10(b);
  g = Clamp10(g);
  r = Clamp10(r);
  ar30 = b | ((uint32)g << 10) | ((uint32)r << 20) | 0xc0000000;
  // memcpy instead of a cast store: rgb_buf is a byte pointer and may not be
  // 4 byte aligned. The stored byte order is unchanged (native order).
  memcpy(rgb_buf, &ar30, sizeof(ar30));
}
// 10 bit YUV to 10 bit AR30
// Converts `width` pixels of one row. Pixels are handled in pairs that
// share a single U/V sample; an odd trailing pixel uses the next U/V sample.
void I210ToAR30Row_C(const uint16* src_y,
                     const uint16* src_u,
                     const uint16* src_v,
                     uint8* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    // First pixel of the pair.
    YuvPixel16(src_y[0], *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
    // Second pixel reuses the same chroma.
    YuvPixel16(src_y[1], *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf + 4, blue, green, red);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
    YuvPixel16(*src_y, *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
  }
}
void I422AlphaToARGBRow_C(const uint8* src_y, void I422AlphaToARGBRow_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
......
...@@ -1696,7 +1696,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, ...@@ -1696,7 +1696,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
"movdqa 160(%[yuvconstants]),%%xmm13 \n" \ "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
"movdqa 192(%[yuvconstants]),%%xmm14 \n" "movdqa 192(%[yuvconstants]),%%xmm14 \n"
// Convert 8 pixels: 8 UV and 8 Y // Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB(yuvconstants) \ #define YUVTORGB16(yuvconstants) \
"movdqa %%xmm0,%%xmm1 \n" \ "movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \ "movdqa %%xmm0,%%xmm2 \n" \
"movdqa %%xmm0,%%xmm3 \n" \ "movdqa %%xmm0,%%xmm3 \n" \
...@@ -1712,20 +1712,14 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, ...@@ -1712,20 +1712,14 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
"pmulhuw %%xmm14,%%xmm4 \n" \ "pmulhuw %%xmm14,%%xmm4 \n" \
"paddsw %%xmm4,%%xmm0 \n" \ "paddsw %%xmm4,%%xmm0 \n" \
"paddsw %%xmm4,%%xmm1 \n" \ "paddsw %%xmm4,%%xmm1 \n" \
"paddsw %%xmm4,%%xmm2 \n" \ "paddsw %%xmm4,%%xmm2 \n"
"psraw $0x6,%%xmm0 \n" \
"psraw $0x6,%%xmm1 \n" \
"psraw $0x6,%%xmm2 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
#define YUVTORGB_REGS \ #define YUVTORGB_REGS \
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
#else #else
#define YUVTORGB_SETUP(yuvconstants) #define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y // Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB(yuvconstants) \ #define YUVTORGB16(yuvconstants) \
"movdqa %%xmm0,%%xmm1 \n" \ "movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \ "movdqa %%xmm0,%%xmm2 \n" \
"movdqa %%xmm0,%%xmm3 \n" \ "movdqa %%xmm0,%%xmm3 \n" \
...@@ -1741,15 +1735,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, ...@@ -1741,15 +1735,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
"pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
"paddsw %%xmm4,%%xmm0 \n" \ "paddsw %%xmm4,%%xmm0 \n" \
"paddsw %%xmm4,%%xmm1 \n" \ "paddsw %%xmm4,%%xmm1 \n" \
"paddsw %%xmm4,%%xmm2 \n" \ "paddsw %%xmm4,%%xmm2 \n"
#define YUVTORGB_REGS
#endif
#define YUVTORGB(yuvconstants) \
YUVTORGB16(yuvconstants) \
"psraw $0x6,%%xmm0 \n" \ "psraw $0x6,%%xmm0 \n" \
"psraw $0x6,%%xmm1 \n" \ "psraw $0x6,%%xmm1 \n" \
"psraw $0x6,%%xmm2 \n" \ "psraw $0x6,%%xmm2 \n" \
"packuswb %%xmm0,%%xmm0 \n" \ "packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n" "packuswb %%xmm2,%%xmm2 \n"
#define YUVTORGB_REGS
#endif
// Store 8 ARGB values. // Store 8 ARGB values.
#define STOREARGB \ #define STOREARGB \
...@@ -1774,6 +1771,32 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, ...@@ -1774,6 +1771,32 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
"movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
"lea 0x20(%[dst_rgba]),%[dst_rgba] \n" "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
// Store 8 AR30 values.
// Inputs: xmm0, xmm1, xmm2 hold 8 signed 16 bit channel values each; they
// end up in bits 0-9, 10-19 and 20-29 of each output dword respectively,
// matching the B, G, R layout of the C StoreAR30.
// Expects xmm5 = 0x0030 in every word (becomes the 2 alpha bits after the
// 10 bit dword shift), xmm6 = 0 (lower clamp) and xmm7 = 1023 in every word
// (upper clamp).
// Each channel is shifted right by 4 and clamped to 0..1023 before packing.
// Writes 32 bytes to dst_ar30 and advances it by 32. Overwrites xmm0-xmm3.
#define STOREAR30 \
"psraw $0x4,%%xmm0 \n" \
"psraw $0x4,%%xmm1 \n" \
"psraw $0x4,%%xmm2 \n" \
"pminsw %%xmm7,%%xmm0 \n" \
"pminsw %%xmm7,%%xmm1 \n" \
"pminsw %%xmm7,%%xmm2 \n" \
"pmaxsw %%xmm6,%%xmm0 \n" \
"pmaxsw %%xmm6,%%xmm1 \n" \
"pmaxsw %%xmm6,%%xmm2 \n" \
"psllw $0x4,%%xmm2 \n" \
"movdqa %%xmm0,%%xmm3 \n" \
"punpcklwd %%xmm2,%%xmm0 \n" \
"punpckhwd %%xmm2,%%xmm3 \n" \
"movdqa %%xmm1,%%xmm2 \n" \
"punpcklwd %%xmm5,%%xmm1 \n" \
"punpckhwd %%xmm5,%%xmm2 \n" \
"pslld $0xa,%%xmm1 \n" \
"pslld $0xa,%%xmm2 \n" \
"por %%xmm1,%%xmm0 \n" \
"por %%xmm2,%%xmm3 \n" \
"movdqu %%xmm0,(%[dst_ar30]) \n" \
"movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
"lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -1908,6 +1931,41 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf, ...@@ -1908,6 +1931,41 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf,
); );
} }
// 10 bit YUV to AR30
// Converts 8 pixels per loop iteration: READYUV210 loads the samples,
// YUVTORGB16 produces 16 bit per channel results and STOREAR30 clamps,
// packs and stores them. The loop runs until width, decremented by 8 each
// pass, is no longer positive.
// Constants set up before the loop for STOREAR30:
//   xmm5 = 0x0030 per word (alpha), xmm6 = 0 (min), xmm7 = 1023 (max).
void OMITFP I210ToAR30Row_SSSE3(const uint16* y_buf,
const uint16* u_buf,
const uint16* v_buf,
uint8* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
// v_buf becomes an offset relative to u_buf.
// NOTE(review): presumably READYUV210 addresses V through u_buf - confirm.
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n" // all ones
"psrlw $14,%%xmm5 \n" // 3 in every word
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n" // all ones
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
READYUV210
YUVTORGB16(yuvconstants)
STOREAR30
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#ifdef HAS_I422ALPHATOARGBROW_SSSE3 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment