Commit 5f3d4270 authored by Frank Barchard

yuy2 to rgb gcc versions

read in read function for yuv conversion

R=harryjin@google.com
BUG=libyuv:488

Review URL: https://codereview.chromium.org/1355393002 .
parent 03cd8584
...@@ -127,8 +127,6 @@ extern "C" { ...@@ -127,8 +127,6 @@ extern "C" {
#define HAS_MIRRORUVROW_SSSE3 #define HAS_MIRRORUVROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3 #define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV12TORGB565ROW_SSSE3 #define HAS_NV12TORGB565ROW_SSSE3
#define HAS_NV21TOARGBROW_SSSE3
#define HAS_NV21TORGB565ROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3
#define HAS_RAWTOYROW_SSSE3 #define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3
...@@ -209,8 +207,6 @@ extern "C" { ...@@ -209,8 +207,6 @@ extern "C" {
#define HAS_J400TOARGBROW_AVX2 #define HAS_J400TOARGBROW_AVX2
#define HAS_NV12TOARGBROW_AVX2 #define HAS_NV12TOARGBROW_AVX2
#define HAS_NV12TORGB565ROW_AVX2 #define HAS_NV12TORGB565ROW_AVX2
#define HAS_NV21TOARGBROW_AVX2
#define HAS_NV21TORGB565ROW_AVX2
#define HAS_RGB565TOARGBROW_AVX2 #define HAS_RGB565TOARGBROW_AVX2
#endif #endif
...@@ -321,8 +317,6 @@ extern "C" { ...@@ -321,8 +317,6 @@ extern "C" {
#define HAS_MIRRORUVROW_NEON #define HAS_MIRRORUVROW_NEON
#define HAS_NV12TOARGBROW_NEON #define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB565ROW_NEON #define HAS_NV12TORGB565ROW_NEON
#define HAS_NV21TOARGBROW_NEON
#define HAS_NV21TORGB565ROW_NEON
#define HAS_RAWTOARGBROW_NEON #define HAS_RAWTOARGBROW_NEON
#define HAS_RAWTOUVROW_NEON #define HAS_RAWTOUVROW_NEON
#define HAS_RAWTOYROW_NEON #define HAS_RAWTOYROW_NEON
...@@ -1068,11 +1062,6 @@ void NV12ToARGBRow_C(const uint8* src_y, ...@@ -1068,11 +1062,6 @@ void NV12ToARGBRow_C(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
struct YuvConstants* yuvconstants, struct YuvConstants* yuvconstants,
int width); int width);
void NV21ToRGB565Row_C(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB565Row_C(const uint8* src_y, void NV12ToRGB565Row_C(const uint8* src_y,
const uint8* src_uv, const uint8* src_uv,
uint8* dst_argb, uint8* dst_argb,
...@@ -1433,21 +1422,11 @@ void NV12ToARGBRow_Any_SSSE3(const uint8* src_y, ...@@ -1433,21 +1422,11 @@ void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
struct YuvConstants* yuvconstants, struct YuvConstants* yuvconstants,
int width); int width);
void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_Any_AVX2(const uint8* src_y, void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_uv, const uint8* src_uv,
uint8* dst_argb, uint8* dst_argb,
struct YuvConstants* yuvconstants, struct YuvConstants* yuvconstants,
int width); int width);
void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y, void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
const uint8* src_uv, const uint8* src_uv,
uint8* dst_argb, uint8* dst_argb,
......
...@@ -2476,48 +2476,6 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y, ...@@ -2476,48 +2476,6 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y,
} }
#endif #endif
#if defined(HAS_YUY2TOARGBROW_SSSE3)
// Converts one row of packed YUY2 to ARGB.
// The row is processed in chunks of at most MAXTWIDTH pixels: each chunk is
// unpacked into temporary Y, U and V rows, which are then handed to the
// I422 to ARGB row function.
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
                         uint8* dst_argb,
                         struct YuvConstants* yuvconstants,
                         int width) {
  // Scratch rows for one chunk of unpacked samples (U and V are half width).
  SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
  SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
  SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
  int chunk;
  for (; width > 0; width -= chunk) {
    // Clamp the chunk to the scratch row capacity.
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, chunk);
    YUY2ToYRow_SSE2(src_yuy2, row_y, chunk);
    I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, yuvconstants, chunk);
    src_yuy2 += chunk * 2;  // Source advances 2 bytes per pixel.
    dst_argb += chunk * 4;  // Destination advances 4 bytes per pixel.
  }
}
#endif  // HAS_YUY2TOARGBROW_SSSE3
#if defined(HAS_UYVYTOARGBROW_SSSE3)
// Converts one row of packed UYVY to ARGB.
// The row is processed in chunks of at most MAXTWIDTH pixels: each chunk is
// unpacked into temporary Y, U and V rows, which are then handed to the
// I422 to ARGB row function.
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
                         uint8* dst_argb,
                         struct YuvConstants* yuvconstants,
                         int width) {
  // Scratch rows for one chunk of unpacked samples (U and V are half width).
  SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
  SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
  SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
  int chunk;
  for (; width > 0; width -= chunk) {
    // Clamp the chunk to the scratch row capacity.
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, chunk);
    UYVYToYRow_SSE2(src_uyvy, row_y, chunk);
    I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, yuvconstants, chunk);
    src_uyvy += chunk * 2;  // Source advances 2 bytes per pixel.
    dst_argb += chunk * 4;  // Destination advances 4 bytes per pixel.
  }
}
#endif  // HAS_UYVYTOARGBROW_SSSE3
#if defined(HAS_I422TORGB565ROW_AVX2) #if defined(HAS_I422TORGB565ROW_AVX2)
void I422ToRGB565Row_AVX2(const uint8* src_y, void I422ToRGB565Row_AVX2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
......
...@@ -1326,6 +1326,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, ...@@ -1326,6 +1326,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \
"movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
<<<<<<< HEAD
"punpcklbw %%xmm4,%%xmm4 \n" \
=======
>>>>>>> refs/remotes/origin/master
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
// Read 4 UV from 422, upsample to 8 UV // Read 4 UV from 422, upsample to 8 UV
...@@ -1336,6 +1340,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, ...@@ -1336,6 +1340,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"punpcklbw %%xmm1,%%xmm0 \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \
"movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
<<<<<<< HEAD
"punpcklbw %%xmm4,%%xmm4 \n" \
=======
>>>>>>> refs/remotes/origin/master
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
// Read 2 UV from 411, upsample to 8 UV // Read 2 UV from 411, upsample to 8 UV
...@@ -1347,6 +1355,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, ...@@ -1347,6 +1355,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"punpcklwd %%xmm0,%%xmm0 \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \
"punpckldq %%xmm0,%%xmm0 \n" \ "punpckldq %%xmm0,%%xmm0 \n" \
"movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
<<<<<<< HEAD
"punpcklbw %%xmm4,%%xmm4 \n" \
=======
>>>>>>> refs/remotes/origin/master
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
// Read 4 UV from NV12, upsample to 8 UV // Read 4 UV from NV12, upsample to 8 UV
...@@ -1355,7 +1367,48 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, ...@@ -1355,7 +1367,48 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \
"movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
<<<<<<< HEAD
"punpcklbw %%xmm4,%%xmm4 \n" \
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
// YUY2 shuf 8 Y to 16 Y.
static const vec8 kShuffleYUY2Y = {
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
};
// YUY2 shuf 4 UV to 8 UV.
static const vec8 kShuffleYUY2UV = {
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
};
// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
#define READYUY2 \
"movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
"pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
"movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
"pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
"lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
// UYVY shuf 8 Y to 16 Y.
static const vec8 kShuffleUYVYY = {
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
};
// UYVY shuf 4 UV to 8 UV.
static const vec8 kShuffleUYVYUV = {
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};
// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
#define READUYVY \
"movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
"pshufb %[kShuffleUYVYY], %%xmm4 \n" \
"movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
"pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
"lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
=======
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
>>>>>>> refs/remotes/origin/master
// Convert 8 pixels: 8 UV and 8 Y // Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB(yuvconstants) \ #define YUVTORGB(yuvconstants) \
...@@ -1371,7 +1424,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, ...@@ -1371,7 +1424,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
"pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
"psubw %%xmm3,%%xmm2 \n" \ "psubw %%xmm3,%%xmm2 \n" \
<<<<<<< HEAD
=======
"punpcklbw %%xmm4,%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \
>>>>>>> refs/remotes/origin/master
"pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \ "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
"paddsw %%xmm4,%%xmm0 \n" \ "paddsw %%xmm4,%%xmm0 \n" \
"paddsw %%xmm4,%%xmm1 \n" \ "paddsw %%xmm4,%%xmm1 \n" \
...@@ -1452,7 +1508,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1452,7 +1508,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
...@@ -1479,7 +1535,7 @@ void OMITFP I444ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -1479,7 +1535,7 @@ void OMITFP I444ToABGRRow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
...@@ -1525,7 +1581,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, ...@@ -1525,7 +1581,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
[kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
[kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
); );
} }
...@@ -1570,7 +1626,7 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, ...@@ -1570,7 +1626,7 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
[kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
[kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
); );
} }
...@@ -1597,7 +1653,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1597,7 +1653,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
...@@ -1624,7 +1680,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1624,7 +1680,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
...@@ -1648,7 +1704,55 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1648,7 +1704,55 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
// Does not use r14. // Does not use r14.
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
// Converts a row of packed YUY2 to ARGB with SSSE3 inline assembly.
//   yuy2_buf     - source row of packed YUY2; advanced 16 bytes per pass.
//   dst_argb     - destination row of ARGB pixels.
//   yuvconstants - conversion constants read by YUVTORGB.
//   width        - pixel count; decremented by 8 per pass.
// The loop repeats while the remaining width is positive after the subtract,
// so a width that is not a multiple of 8 still runs a full 8 pixel pass.
// NOTE(review): whether that final pass can write past the row depends on
// STOREARGB and on the callers, neither visible here - confirm.
void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width) {
asm volatile (
// Set every bit of xmm5. Presumably used by STOREARGB as the alpha channel
// (the MSVC versions comment the same idiom as generating alpha) - confirm.
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
// READYUY2 loads 16 source bytes, shuffles Y into xmm4 and UV into xmm0,
// then advances yuy2_buf by 16 bytes.
READYUY2
// Convert 8 pixels: 8 UV and 8 Y.
YUVTORGB(yuvconstants)
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
// Shuffle tables are passed as memory operands for pshufb.
[kShuffleYUY2Y]"m"(kShuffleYUY2Y),
[kShuffleYUY2UV]"m"(kShuffleYUY2UV)
// Does not use r14.
// xmm4 is listed because READYUY2 writes the Y samples into it.
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
// Converts a row of packed UYVY to ARGB with SSSE3 inline assembly.
//   uyvy_buf     - source row of packed UYVY; advanced 16 bytes per pass.
//   dst_argb     - destination row of ARGB pixels.
//   yuvconstants - conversion constants read by YUVTORGB.
//   width        - pixel count; decremented by 8 per pass.
// The loop repeats while the remaining width is positive after the subtract,
// so a width that is not a multiple of 8 still runs a full 8 pixel pass.
// NOTE(review): whether that final pass can write past the row depends on
// STOREARGB and on the callers, neither visible here - confirm.
void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
uint8* dst_argb,
struct YuvConstants* yuvconstants,
int width) {
asm volatile (
// Set every bit of xmm5. Presumably used by STOREARGB as the alpha channel
// (the MSVC versions comment the same idiom as generating alpha) - confirm.
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
// READUYVY loads 16 source bytes, shuffles Y into xmm4 and UV into xmm0,
// then advances uyvy_buf by 16 bytes.
READUYVY
// Convert 8 pixels: 8 UV and 8 Y.
YUVTORGB(yuvconstants)
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
// Shuffle tables are passed as memory operands for pshufb.
[kShuffleUYVYY]"m"(kShuffleUYVYY),
[kShuffleUYVYUV]"m"(kShuffleUYVYUV)
// Does not use r14.
// xmm4 is listed because READUYVY writes the Y samples into it.
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
...@@ -1675,7 +1779,7 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -1675,7 +1779,7 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
...@@ -1702,7 +1806,7 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -1702,7 +1806,7 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
...@@ -1729,7 +1833,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, ...@@ -1729,7 +1833,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
...@@ -1808,7 +1912,7 @@ void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, ...@@ -1808,7 +1912,7 @@ void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
#endif // HAS_I422TOBGRAROW_AVX2 #endif // HAS_I422TOBGRAROW_AVX2
...@@ -1851,7 +1955,7 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, ...@@ -1851,7 +1955,7 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
#endif // HAS_I422TOARGBROW_AVX2 #endif // HAS_I422TOARGBROW_AVX2
...@@ -1893,7 +1997,7 @@ void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -1893,7 +1997,7 @@ void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
#endif // HAS_I422TOABGRROW_AVX2 #endif // HAS_I422TOABGRROW_AVX2
...@@ -1935,7 +2039,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, ...@@ -1935,7 +2039,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
[width]"+rm"(width) // %[width] [width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
#endif // HAS_I422TORGBAROW_AVX2 #endif // HAS_I422TORGBAROW_AVX2
......
...@@ -36,6 +36,10 @@ extern "C" { ...@@ -36,6 +36,10 @@ extern "C" {
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
u_buf += 4; \ u_buf += 4; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
<<<<<<< HEAD
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
=======
>>>>>>> refs/remotes/origin/master
y_buf += 8; \ y_buf += 8; \
// Convert 8 pixels: 8 UV and 8 Y. // Convert 8 pixels: 8 UV and 8 Y.
...@@ -48,7 +52,10 @@ extern "C" { ...@@ -48,7 +52,10 @@ extern "C" {
xmm0 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasB, xmm0); \ xmm0 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasB, xmm0); \
xmm1 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasG, xmm1); \ xmm1 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasG, xmm1); \
xmm2 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasR, xmm2); \ xmm2 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasR, xmm2); \
<<<<<<< HEAD
=======
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
>>>>>>> refs/remotes/origin/master
xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)YuvConstants->kYToRgb); \ xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)YuvConstants->kYToRgb); \
xmm0 = _mm_adds_epi16(xmm0, xmm4); \ xmm0 = _mm_adds_epi16(xmm0, xmm4); \
xmm1 = _mm_adds_epi16(xmm1, xmm4); \ xmm1 = _mm_adds_epi16(xmm1, xmm4); \
...@@ -1853,6 +1860,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1853,6 +1860,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm vpermq ymm1, ymm1, 0xd8 \ __asm vpermq ymm1, ymm1, 0xd8 \
__asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
__asm vmovdqu xmm4, [eax] /* Y */ \ __asm vmovdqu xmm4, [eax] /* Y */ \
<<<<<<< HEAD
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
=======
>>>>>>> refs/remotes/origin/master
__asm lea eax, [eax + 16] \ __asm lea eax, [eax + 16] \
} }
...@@ -1865,6 +1877,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1865,6 +1877,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm vpermq ymm0, ymm0, 0xd8 \ __asm vpermq ymm0, ymm0, 0xd8 \
__asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
__asm vmovdqu xmm4, [eax] /* Y */ \ __asm vmovdqu xmm4, [eax] /* Y */ \
<<<<<<< HEAD
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
=======
>>>>>>> refs/remotes/origin/master
__asm lea eax, [eax + 16] \ __asm lea eax, [eax + 16] \
} }
...@@ -1878,6 +1895,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1878,6 +1895,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm vpermq ymm0, ymm0, 0xd8 \ __asm vpermq ymm0, ymm0, 0xd8 \
__asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
__asm vmovdqu xmm4, [eax] /* Y */ \ __asm vmovdqu xmm4, [eax] /* Y */ \
<<<<<<< HEAD
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
=======
>>>>>>> refs/remotes/origin/master
__asm lea eax, [eax + 16] \ __asm lea eax, [eax + 16] \
} }
...@@ -1888,6 +1910,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1888,6 +1910,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm vpermq ymm0, ymm0, 0xd8 \ __asm vpermq ymm0, ymm0, 0xd8 \
__asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
__asm vmovdqu xmm4, [eax] /* Y */ \ __asm vmovdqu xmm4, [eax] /* Y */ \
<<<<<<< HEAD
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
=======
>>>>>>> refs/remotes/origin/master
__asm lea eax, [eax + 16] \ __asm lea eax, [eax + 16] \
} }
...@@ -1903,8 +1930,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1903,8 +1930,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
__asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
__asm vpsubw ymm0, ymm3, ymm0 \ __asm vpsubw ymm0, ymm3, ymm0 \
/* Step 2: Find Y contribution to 16 R,G,B values */ \ /* Step 2: Find Y contribution to 16 R,G,B values */ \
<<<<<<< HEAD
=======
__asm vpermq ymm4, ymm4, 0xd8 \ __asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \
>>>>>>> refs/remotes/origin/master
__asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
__asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
__asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
...@@ -1987,7 +2017,7 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, ...@@ -1987,7 +2017,7 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
...@@ -2027,7 +2057,7 @@ void I444ToARGBRow_AVX2(const uint8* y_buf, ...@@ -2027,7 +2057,7 @@ void I444ToARGBRow_AVX2(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
...@@ -2066,7 +2096,7 @@ void I444ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2066,7 +2096,7 @@ void I444ToABGRRow_AVX2(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // abgr mov edx, [esp + 12 + 16] // abgr
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
...@@ -2105,7 +2135,7 @@ void I411ToARGBRow_AVX2(const uint8* y_buf, ...@@ -2105,7 +2135,7 @@ void I411ToARGBRow_AVX2(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // abgr mov edx, [esp + 12 + 16] // abgr
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
...@@ -2142,7 +2172,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf, ...@@ -2142,7 +2172,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
mov eax, [esp + 8 + 4] // Y mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // UV mov esi, [esp + 8 + 8] // UV
mov edx, [esp + 8 + 12] // argb mov edx, [esp + 8 + 12] // argb
mov ebp, [esp + 8 + 16] // YuvConstants mov ebp, [esp + 8 + 16] // yuvconstants
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
...@@ -2181,7 +2211,7 @@ void I422ToBGRARow_AVX2(const uint8* y_buf, ...@@ -2181,7 +2211,7 @@ void I422ToBGRARow_AVX2(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // abgr mov edx, [esp + 12 + 16] // abgr
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
...@@ -2221,7 +2251,7 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, ...@@ -2221,7 +2251,7 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // abgr mov edx, [esp + 12 + 16] // abgr
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
...@@ -2261,7 +2291,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2261,7 +2291,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
...@@ -2293,6 +2323,10 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2293,6 +2323,10 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm lea esi, [esi + 8] \ __asm lea esi, [esi + 8] \
__asm punpcklbw xmm0, xmm1 /* UV */ \ __asm punpcklbw xmm0, xmm1 /* UV */ \
__asm movq xmm4, qword ptr [eax] \ __asm movq xmm4, qword ptr [eax] \
<<<<<<< HEAD
__asm punpcklbw xmm4, xmm4 \
=======
>>>>>>> refs/remotes/origin/master
__asm lea eax, [eax + 8] \ __asm lea eax, [eax + 8] \
} }
...@@ -2304,6 +2338,10 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2304,6 +2338,10 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm punpcklbw xmm0, xmm1 /* UV */ \ __asm punpcklbw xmm0, xmm1 /* UV */ \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \ __asm movq xmm4, qword ptr [eax] \
<<<<<<< HEAD
__asm punpcklbw xmm4, xmm4 \
=======
>>>>>>> refs/remotes/origin/master
__asm lea eax, [eax + 8] \ __asm lea eax, [eax + 8] \
} }
...@@ -2316,6 +2354,10 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2316,6 +2354,10 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
__asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \ __asm movq xmm4, qword ptr [eax] \
<<<<<<< HEAD
__asm punpcklbw xmm4, xmm4 \
=======
>>>>>>> refs/remotes/origin/master
__asm lea eax, [eax + 8] \ __asm lea eax, [eax + 8] \
} }
...@@ -2325,9 +2367,52 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2325,9 +2367,52 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm lea esi, [esi + 8] \ __asm lea esi, [esi + 8] \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \ __asm movq xmm4, qword ptr [eax] \
<<<<<<< HEAD
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8] \ __asm lea eax, [eax + 8] \
} }
// YUY2 shuf 8 Y to 16 Y.
static const vec8 kShuffleYUY2Y = {
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
};
// YUY2 shuf 4 UV to 8 UV.
static const vec8 kShuffleYUY2UV = {
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
};
// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
#define READYUY2 __asm { \
__asm movdqu xmm4, [eax] /* YUY2 */ \
__asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
__asm movdqu xmm0, [eax] /* UV */ \
__asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 16] \
}
// UYVY shuf 8 Y to 16 Y.
static const vec8 kShuffleUYVYY = {
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
};
// UYVY shuf 4 UV to 8 UV.
static const vec8 kShuffleUYVYUV = {
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};
// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
#define READUYVY __asm { \
__asm movdqu xmm4, [eax] /* UYVY */ \
__asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
__asm movdqu xmm0, [eax] /* UV */ \
__asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 16] \
=======
__asm lea eax, [eax + 8] \
>>>>>>> refs/remotes/origin/master
}
// Convert 8 pixels: 8 UV and 8 Y. // Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) __asm { \ #define YUVTORGB(YuvConstants) __asm { \
__asm movdqa xmm1, xmm0 \ __asm movdqa xmm1, xmm0 \
...@@ -2342,7 +2427,10 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2342,7 +2427,10 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
__asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
__asm psubw xmm2, xmm3 \ __asm psubw xmm2, xmm3 \
/* xmm4 already holds each Y byte duplicated; the READ* macros widen Y. */ \
__asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
__asm paddsw xmm0, xmm4 /* B += Y */ \ __asm paddsw xmm0, xmm4 /* B += Y */ \
__asm paddsw xmm1, xmm4 /* G += Y */ \ __asm paddsw xmm1, xmm4 /* G += Y */ \
...@@ -2492,7 +2580,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2492,7 +2580,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -2529,7 +2617,7 @@ void I444ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -2529,7 +2617,7 @@ void I444ToABGRRow_SSSE3(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // abgr mov edx, [esp + 12 + 16] // abgr
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -2566,7 +2654,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, ...@@ -2566,7 +2654,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
...@@ -2604,7 +2692,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, ...@@ -2604,7 +2692,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
movdqa xmm5, xmmword ptr kShuffleMaskARGBToRAW_0 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRAW_0
...@@ -2642,7 +2730,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, ...@@ -2642,7 +2730,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate mask 0x0000001f pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
...@@ -2685,7 +2773,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2685,7 +2773,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -2723,7 +2811,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2723,7 +2811,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // abgr mov edx, [esp + 12 + 16] // abgr
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -2757,7 +2845,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2757,7 +2845,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
mov eax, [esp + 8 + 4] // Y mov eax, [esp + 8 + 4] // Y
mov esi, [esp + 8 + 8] // UV mov esi, [esp + 8 + 8] // UV
mov edx, [esp + 8 + 12] // argb mov edx, [esp + 8 + 12] // argb
mov ebp, [esp + 8 + 16] // YuvConstants mov ebp, [esp + 8 + 16] // yuvconstants
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -2775,6 +2863,62 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2775,6 +2863,62 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
} }
} }
// Convert one row of packed YUY2 to ARGB, 8 pixels per loop pass.
// Each pass reads 16 bytes of YUY2 (8 Y plus 4 UV pairs) through READYUY2 and
// hands the result to YUVTORGB / STOREARGB to produce 8 ARGB pixels (32 bytes).
// The loop body runs once before width is tested and width drops by 8 per
// pass; no remainder handling is done here.
__declspec(naked)
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
                         uint8* dst_argb,
                         struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       ebp                   // ebp is reused below; restored before ret
    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
    // Arguments are addressed as [esp + 4 + n]: the extra 4 skips the saved ebp.
    mov        ecx, [esp + 4 + 16]   // width
    mov        ebp, [esp + 4 + 12]   // yuvconstants
    mov        edx, [esp + 4 + 8]    // dst_argb
    mov        eax, [esp + 4 + 4]    // src_yuy2

 convertloop:
    READYUY2
    YUVTORGB(ebp)
    STOREARGB

    sub        ecx, 8                // 8 pixels handled this pass
    jg         convertloop           // continue while width is still positive

    pop        ebp
    ret
  }
}
// Convert one row of packed UYVY to ARGB, 8 pixels per loop pass.
// Each pass reads 16 bytes of UYVY (8 Y plus 4 UV pairs) through READUYVY and
// hands the result to YUVTORGB / STOREARGB to produce 8 ARGB pixels (32 bytes).
// The loop body runs once before width is tested and width drops by 8 per
// pass; no remainder handling is done here.
__declspec(naked)
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
                         uint8* dst_argb,
                         struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       ebp                   // ebp is reused below; restored before ret
    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
    // Arguments are addressed as [esp + 4 + n]: the extra 4 skips the saved ebp.
    mov        ecx, [esp + 4 + 16]   // width
    mov        ebp, [esp + 4 + 12]   // yuvconstants
    mov        edx, [esp + 4 + 8]    // dst_argb
    mov        eax, [esp + 4 + 4]    // src_uyvy

 convertloop:
    READUYVY
    YUVTORGB(ebp)
    STOREARGB

    sub        ecx, 8                // 8 pixels handled this pass
    jg         convertloop           // continue while width is still positive

    pop        ebp
    ret
  }
}
__declspec(naked) __declspec(naked)
void I422ToBGRARow_SSSE3(const uint8* y_buf, void I422ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
...@@ -2790,7 +2934,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -2790,7 +2934,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
...@@ -2824,7 +2968,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -2824,7 +2968,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
...@@ -2859,7 +3003,7 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, ...@@ -2859,7 +3003,7 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
mov esi, [esp + 12 + 8] // U mov esi, [esp + 12 + 8] // U
mov edi, [esp + 12 + 12] // V mov edi, [esp + 12 + 12] // V
mov edx, [esp + 12 + 16] // argb mov edx, [esp + 12 + 16] // argb
mov ebp, [esp + 12 + 20] // YuvConstants mov ebp, [esp + 12 + 20] // yuvconstants
mov ecx, [esp + 12 + 24] // width mov ecx, [esp + 12 + 24] // width
sub edi, esi sub edi, esi
...@@ -3524,8 +3668,7 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { ...@@ -3524,8 +3668,7 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
#ifdef HAS_YUY2TOYROW_AVX2 #ifdef HAS_YUY2TOYROW_AVX2
__declspec(naked) __declspec(naked)
void YUY2ToYRow_AVX2(const uint8* src_yuy2, void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
uint8* dst_y, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_yuy2 mov eax, [esp + 4] // src_yuy2
mov edx, [esp + 8] // dst_y mov edx, [esp + 8] // dst_y
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment