Commit 925c3d9e authored by Frank Barchard's avatar Frank Barchard

I420ToARGB conversion with matrix.

Take the color conversion constants as a parameter to the row function I422ToARGBMatrixRow_SSSE3.
This allows future color space variations to share a single low-level row function.

R=harryjin@google.com
BUG=libyuv:488

Review URL: https://webrtc-codereview.appspot.com/56669004 .
parent 0bc626a5
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1475 Version: 1476
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -88,6 +88,7 @@ extern "C" { ...@@ -88,6 +88,7 @@ extern "C" {
#define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3
#define HAS_I422TOARGBMATRIXROW_SSSE3
#define HAS_I422TOBGRAROW_SSSE3 #define HAS_I422TOBGRAROW_SSSE3
#define HAS_I422TORAWROW_SSSE3 #define HAS_I422TORAWROW_SSSE3
#define HAS_I422TORGB24ROW_SSSE3 #define HAS_I422TORGB24ROW_SSSE3
...@@ -161,6 +162,7 @@ extern "C" { ...@@ -161,6 +162,7 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ #if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
(!defined(__clang__) || defined(__SSSE3__)) (!defined(__clang__) || defined(__SSSE3__))
#define HAS_I422TOARGBROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3
#define HAS_I422TOARGBMATRIXROW_SSSE3
#endif #endif
// GCC >= 4.7.0 required for AVX2. // GCC >= 4.7.0 required for AVX2.
...@@ -223,6 +225,7 @@ extern "C" { ...@@ -223,6 +225,7 @@ extern "C" {
#define HAS_I400TOARGBROW_AVX2 #define HAS_I400TOARGBROW_AVX2
#define HAS_I422TOABGRROW_AVX2 #define HAS_I422TOABGRROW_AVX2
#define HAS_I422TOARGBROW_AVX2 #define HAS_I422TOARGBROW_AVX2
#define HAS_I422TOARGBMATRIXROW_AVX2
#define HAS_I422TOBGRAROW_AVX2 #define HAS_I422TOBGRAROW_AVX2
#define HAS_I422TORAWROW_AVX2 #define HAS_I422TORAWROW_AVX2
#define HAS_I422TORGB24ROW_AVX2 #define HAS_I422TORGB24ROW_AVX2
...@@ -290,6 +293,8 @@ extern "C" { ...@@ -290,6 +293,8 @@ extern "C" {
#define HAS_I422TOARGB1555ROW_NEON #define HAS_I422TOARGB1555ROW_NEON
#define HAS_I422TOARGB4444ROW_NEON #define HAS_I422TOARGB4444ROW_NEON
#define HAS_I422TOARGBROW_NEON #define HAS_I422TOARGBROW_NEON
// TODO(fbarchard): Implement NEON version of I422ToARGBMatrixRow.
#define HAS_I422TOARGBMATRIXROW_NEON
#define HAS_I422TOBGRAROW_NEON #define HAS_I422TOBGRAROW_NEON
#define HAS_I422TORAWROW_NEON #define HAS_I422TORAWROW_NEON
#define HAS_I422TORGB24ROW_NEON #define HAS_I422TORGB24ROW_NEON
...@@ -414,6 +419,21 @@ typedef uint32 ulvec32[8]; ...@@ -414,6 +419,21 @@ typedef uint32 ulvec32[8];
typedef uint8 ulvec8[32]; typedef uint8 ulvec8[32];
#endif #endif
// Constants for YUV to RGB color conversion on Intel (x86/x64) builds.
#if defined(_M_IX86) || defined(_M_X64) || \
defined(__x86_64__) || defined(__i386__)
struct YuvConstants {
lvec8 kUVToB;
lvec8 kUVToG;
lvec8 kUVToR;
lvec16 kUVBiasB;
lvec16 kUVBiasG;
lvec16 kUVBiasR;
lvec16 kYToRgb;
};
#endif
#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
#define OMITFP #define OMITFP
#else #else
...@@ -509,6 +529,12 @@ void I422ToARGBRow_NEON(const uint8* src_y, ...@@ -509,6 +529,12 @@ void I422ToARGBRow_NEON(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void I422ToARGBMatrixRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width);
void I411ToARGBRow_NEON(const uint8* src_y, void I411ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -962,6 +988,12 @@ void I422ToARGBRow_C(const uint8* src_y, ...@@ -962,6 +988,12 @@ void I422ToARGBRow_C(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void I422ToARGBMatrixRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width);
void I411ToARGBRow_C(const uint8* src_y, void I411ToARGBRow_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -1039,6 +1071,12 @@ void I422ToARGBRow_AVX2(const uint8* src_y, ...@@ -1039,6 +1071,12 @@ void I422ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void I422ToARGBMatrixRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width);
void I422ToBGRARow_AVX2(const uint8* src_y, void I422ToBGRARow_AVX2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -1069,6 +1107,12 @@ void I422ToARGBRow_SSSE3(const uint8* src_y, ...@@ -1069,6 +1107,12 @@ void I422ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void I422ToARGBMatrixRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width);
void I411ToARGBRow_SSSE3(const uint8* src_y, void I411ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -1203,6 +1247,12 @@ void I422ToARGBRow_Any_AVX2(const uint8* src_y, ...@@ -1203,6 +1247,12 @@ void I422ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void I422ToARGBMatrixRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width);
void I422ToBGRARow_Any_AVX2(const uint8* src_y, void I422ToBGRARow_Any_AVX2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -1233,6 +1283,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y, ...@@ -1233,6 +1283,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void I422ToARGBMatrixRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width);
void I411ToARGBRow_Any_SSSE3(const uint8* src_y, void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -1460,6 +1516,12 @@ void I444ToARGBRow_Any_NEON(const uint8* src_y, ...@@ -1460,6 +1516,12 @@ void I444ToARGBRow_Any_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width); int width);
void I422ToARGBRow_Any_NEON(const uint8* src_y, void I422ToARGBRow_Any_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width);
void I422ToARGBMatrixRow_Any_NEON(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
uint8* dst_argb, uint8* dst_argb,
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1475 #define LIBYUV_VERSION 1476
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -2156,6 +2156,51 @@ void I422ToUYVYRow_C(const uint8* src_y, ...@@ -2156,6 +2156,51 @@ void I422ToUYVYRow_C(const uint8* src_y,
} }
} }
#if defined(HAS_I422TOARGBMATRIXROW_SSSE3)
extern struct YuvConstants kYuvConstants;
extern struct YuvConstants kYuvJConstants;
// JPEG color space version of I422ToARGB.
void J422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
I422ToARGBMatrixRow_SSSE3(y_buf, u_buf, v_buf, dst_argb,
&kYuvJConstants, width);
}
void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
I422ToARGBMatrixRow_SSSE3(y_buf, u_buf, v_buf, dst_argb,
&kYuvConstants, width);
}
#if defined(HAS_I422TOARGBMATRIXROW_AVX2)
// JPEG color space version of I422ToARGB.
void J422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
I422ToARGBMatrixRow_AVX2(y_buf, u_buf, v_buf, dst_argb,
&kYuvJConstants, width);
}
void I422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
I422ToARGBMatrixRow_AVX2(y_buf, u_buf, v_buf, dst_argb,
&kYuvConstants, width);
}
#endif
#endif
// Maximum temporary width for wrappers to process at a time, in pixels. // Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048 #define MAXTWIDTH 2048
......
...@@ -1319,16 +1319,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, ...@@ -1319,16 +1319,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
struct YuvConstants {
lvec8 kUVToB; // 0
lvec8 kUVToG; // 32
lvec8 kUVToR; // 64
lvec16 kUVBiasB; // 96
lvec16 kUVBiasG; // 128
lvec16 kUVBiasR; // 160
lvec16 kYToRgb; // 192
};
// BT.601 YUV to RGB reference // BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596 // R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 // G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
...@@ -1351,7 +1341,7 @@ struct YuvConstants { ...@@ -1351,7 +1341,7 @@ struct YuvConstants {
#define BR (VR * 128 + YGB) #define BR (VR * 128 + YGB)
// BT601 constants for YUV to RGB. // BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = { YuvConstants SIMD_ALIGNED(kYuvConstants) = {
{ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
{ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
...@@ -1365,7 +1355,7 @@ static YuvConstants SIMD_ALIGNED(kYuvConstants) = { ...@@ -1365,7 +1355,7 @@ static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
}; };
// BT601 constants for NV21 where chroma plane is VU instead of UV. // BT601 constants for NV21 where chroma plane is VU instead of UV.
static YuvConstants SIMD_ALIGNED(kYvuConstants) = { YuvConstants SIMD_ALIGNED(kYvuConstants) = {
{ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
{ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
...@@ -1658,10 +1648,11 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, ...@@ -1658,10 +1648,11 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
); );
} }
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, void OMITFP I422ToARGBMatrixRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_argb, uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width) { int width) {
asm volatile ( asm volatile (
"sub %[u_buf],%[v_buf] \n" "sub %[u_buf],%[v_buf] \n"
...@@ -1678,33 +1669,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1678,33 +1669,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
[v_buf]"+r"(v_buf), // %[v_buf] [v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb] [dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : [kYuvConstants]"r"(YuvConstants) // %[kYuvConstants]
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
READYUV422
YUVTORGB(kYuvConstants)
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
); );
...@@ -1939,55 +1904,14 @@ void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, ...@@ -1939,55 +1904,14 @@ void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
} }
#endif // HAS_I422TOBGRAROW_AVX2 #endif // HAS_I422TOBGRAROW_AVX2
#if defined(HAS_I422TOARGBROW_AVX2) #if defined(HAS_I422TOARGBMATRIXROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(kYuvConstants)
// Step 3: Weave into ARGB
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
"vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
"vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
"vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
"lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_I422TOARGBROW_AVX2
#if defined(HAS_J422TOARGBROW_AVX2)
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf, void OMITFP I422ToARGBMatrixRow_AVX2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_argb, uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width) { int width) {
asm volatile ( asm volatile (
"sub %[u_buf],%[v_buf] \n" "sub %[u_buf],%[v_buf] \n"
...@@ -2016,12 +1940,12 @@ void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf, ...@@ -2016,12 +1940,12 @@ void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
[v_buf]"+r"(v_buf), // %[v_buf] [v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb] [dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants] : [kYuvConstants]"r"(YuvConstants) // %[kYuvConstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
); );
} }
#endif // HAS_J422TOARGBROW_AVX2 #endif // HAS_I422TOARGBMATRIXROW_AVX2
#if defined(HAS_I422TOABGRROW_AVX2) #if defined(HAS_I422TOABGRROW_AVX2)
// 16 pixels // 16 pixels
......
...@@ -25,16 +25,6 @@ extern "C" { ...@@ -25,16 +25,6 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \ #if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
struct YuvConstants {
lvec8 kUVToB;
lvec8 kUVToG;
lvec8 kUVToR;
lvec16 kUVBiasB;
lvec16 kUVBiasG;
lvec16 kUVBiasR;
lvec16 kYToRgb;
};
#define KUVTOB 0 #define KUVTOB 0
#define KUVTOG 32 #define KUVTOG 32
#define KUVTOR 64 #define KUVTOR 64
...@@ -65,7 +55,7 @@ struct YuvConstants { ...@@ -65,7 +55,7 @@ struct YuvConstants {
#define BR (VR * 128 + YGB) #define BR (VR * 128 + YGB)
// BT601 constants for YUV to RGB. // BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = { YuvConstants SIMD_ALIGNED(kYuvConstants) = {
{ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
{ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
...@@ -79,7 +69,7 @@ static YuvConstants SIMD_ALIGNED(kYuvConstants) = { ...@@ -79,7 +69,7 @@ static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
}; };
// BT601 constants for NV21 where chroma plane is VU instead of UV. // BT601 constants for NV21 where chroma plane is VU instead of UV.
static YuvConstants SIMD_ALIGNED(kYvuConstants) = { YuvConstants SIMD_ALIGNED(kYvuConstants) = {
{ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
{ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
...@@ -124,7 +114,7 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = { ...@@ -124,7 +114,7 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
#define BRJ (VRJ * 128 + YGBJ) #define BRJ (VRJ * 128 + YGBJ)
// JPEG constants for YUV to RGB. // JPEG constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
{ UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
{ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
...@@ -155,11 +145,12 @@ static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { ...@@ -155,11 +145,12 @@ static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
// 64 bit // 64 bit
#if defined(_M_X64) #if defined(_M_X64)
#if defined(HAS_I422TOARGBROW_SSSE3) #if defined(HAS_I422TOARGBMATRIXROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8* y_buf, void I422ToARGBMatrixRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_argb, uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width) { int width) {
__m128i xmm0, xmm1, xmm2, xmm3; __m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1); const __m128i xmm5 = _mm_set1_epi8(-1);
...@@ -172,15 +163,15 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -172,15 +163,15 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
xmm1 = _mm_loadu_si128(&xmm0); xmm1 = _mm_loadu_si128(&xmm0);
xmm2 = _mm_loadu_si128(&xmm0); xmm2 = _mm_loadu_si128(&xmm0);
xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB); xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)YuvConstants->kUVToB);
xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG); xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)YuvConstants->kUVToG);
xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR); xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)YuvConstants->kUVToR);
xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0); xmm0 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasB, xmm0);
xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1); xmm1 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasG, xmm1);
xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2); xmm2 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasR, xmm2);
xmm3 = _mm_loadl_epi64((__m128i*)y_buf); xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
xmm3 = _mm_unpacklo_epi8(xmm3, xmm3); xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb); xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)YuvConstants->kYToRgb);
xmm0 = _mm_adds_epi16(xmm0, xmm3); xmm0 = _mm_adds_epi16(xmm0, xmm3);
xmm1 = _mm_adds_epi16(xmm1, xmm3); xmm1 = _mm_adds_epi16(xmm1, xmm3);
xmm2 = _mm_adds_epi16(xmm2, xmm3); xmm2 = _mm_adds_epi16(xmm2, xmm3);
...@@ -2012,77 +2003,45 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -2012,77 +2003,45 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm lea edx, [edx + 64] \ __asm lea edx, [edx + 64] \
} }
#ifdef HAS_I422TOARGBROW_AVX2 #ifdef HAS_I422TOARGBMATRIXROW_AVX2
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(naked)
void I422ToARGBRow_AVX2(const uint8* y_buf, void I422ToARGBMatrixRow_AVX2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_argb, uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width) { int width) {
__asm { __asm {
push esi push esi
push edi push edi
mov eax, [esp + 8 + 4] // Y push ebp
mov esi, [esp + 8 + 8] // U mov eax, [esp + 12 + 4] // Y
mov edi, [esp + 8 + 12] // V mov esi, [esp + 12 + 8] // U
mov edx, [esp + 8 + 16] // argb mov edi, [esp + 12 + 12] // V
mov ecx, [esp + 8 + 20] // width mov edx, [esp + 12 + 16] // argb
sub edi, esi mov ebp, [esp + 12 + 20] // YuvConstants
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha mov ecx, [esp + 12 + 24] // width
convertloop:
READYUV422_AVX2
YUVTORGB_AVX2(kYuvConstants)
STOREARGB_AVX2
sub ecx, 16
jg convertloop
pop edi
pop esi
vzeroupper
ret
}
}
#endif // HAS_I422TOARGBROW_AVX2
#ifdef HAS_J422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked)
void J422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // argb
mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop: convertloop:
READYUV422_AVX2 READYUV422_AVX2
YUVTORGB_AVX2(kYuvJConstants) YUVTORGB_AVX2(ebp)
STOREARGB_AVX2 STOREARGB_AVX2
sub ecx, 16 sub ecx, 16
jg convertloop jg convertloop
pop ebp
pop edi pop edi
pop esi pop esi
vzeroupper vzeroupper
ret ret
} }
} }
#endif // HAS_J422TOARGBROW_AVX2 #endif // HAS_I422TOARGBMATRIXROW_AVX2
#ifdef HAS_I444TOARGBROW_AVX2 #ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels // 16 pixels
...@@ -2691,10 +2650,11 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, ...@@ -2691,10 +2650,11 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
// 8 pixels. // 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(naked)
void I422ToARGBRow_SSSE3(const uint8* y_buf, void I422ToARGBMatrixRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_argb, uint8* dst_argb,
struct YuvConstants* YuvConstants,
int width) { int width) {
__asm { __asm {
push esi push esi
...@@ -2704,8 +2664,9 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2704,8 +2664,9 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ecx, [esp + 12 + 20] // width mov ebp, [esp + 12 + 20] // YuvConstants
lea ebp, kYuvConstants mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -2724,40 +2685,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2724,40 +2685,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
} }
} }
// 8 pixels.
// JPeg color space version of I422ToARGB
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked)
void J422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // U
mov edi, [esp + 8 + 12] // V
mov edx, [esp + 8 + 16] // argb
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
READYUV422
YUVTORGB(kYuvJConstants)
STOREARGB
sub ecx, 8
jg convertloop
pop edi
pop esi
ret
}
}
// 8 pixels. // 8 pixels.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more. // Similar to I420 but duplicate UV once more.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment