Commit 5452cce4 authored by Frank Barchard's avatar Frank Barchard

port row to clangcl

BUG=libyuv:487
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/53799005.
parent fa7ce4af
...@@ -23,18 +23,26 @@ extern "C" { ...@@ -23,18 +23,26 @@ extern "C" {
// This module is for Visual C 32/64 bit and clangcl 32 bit // This module is for Visual C 32/64 bit and clangcl 32 bit
#if !defined(LIBYUV_DISABLE_X86) && \ #if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
struct YuvConstants { struct YuvConstants {
lvec8 kUVToB; // 0 lvec8 kUVToB;
lvec8 kUVToG; // 32 lvec8 kUVToG;
lvec8 kUVToR; // 64 lvec8 kUVToR;
lvec16 kUVBiasB; // 96 lvec16 kUVBiasB;
lvec16 kUVBiasG; // 128 lvec16 kUVBiasG;
lvec16 kUVBiasR; // 160 lvec16 kUVBiasR;
lvec16 kYToRgb; // 192 lvec16 kYToRgb;
}; };
#define KUVTOB 0
#define KUVTOG 32
#define KUVTOR 64
#define KUVBIASB 96
#define KUVBIASG 128
#define KUVBIASR 160
#define KYTORGB 192
// BT.601 YUV to RGB reference // BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596 // R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 // G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
...@@ -389,7 +397,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { ...@@ -389,7 +397,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24 pslld xmm5, 24
movdqa xmm4, kShuffleMaskRGB24ToARGB movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -428,7 +436,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, ...@@ -428,7 +436,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24 pslld xmm5, 24
movdqa xmm4, kShuffleMaskRAWToARGB movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -759,7 +767,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -759,7 +767,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
movdqa xmm6, kShuffleMaskARGBToRGB24 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
convertloop: convertloop:
movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm0, [eax] // fetch 16 pixels of argb
...@@ -797,7 +805,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -797,7 +805,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb mov edx, [esp + 8] // dst_rgb
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
movdqa xmm6, kShuffleMaskARGBToRAW movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
convertloop: convertloop:
movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm0, [eax] // fetch 16 pixels of argb
...@@ -1142,8 +1150,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1142,8 +1150,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */ mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */ mov ecx, [esp + 12] /* pix */
movdqa xmm4, kARGBToY movdqa xmm4, xmmword ptr kARGBToY
movdqa xmm5, kAddY16 movdqa xmm5, xmmword ptr kAddY16
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -1177,8 +1185,8 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1177,8 +1185,8 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */ mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */ mov ecx, [esp + 12] /* pix */
movdqa xmm4, kARGBToYJ movdqa xmm4, xmmword ptr kARGBToYJ
movdqa xmm5, kAddYJ64 movdqa xmm5, xmmword ptr kAddYJ64
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -1218,9 +1226,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1218,9 +1226,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */ mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */ mov ecx, [esp + 12] /* pix */
vbroadcastf128 ymm4, kARGBToY vbroadcastf128 ymm4, xmmword ptr kARGBToY
vbroadcastf128 ymm5, kAddY16 vbroadcastf128 ymm5, xmmword ptr kAddY16
vmovdqu ymm6, kPermdARGBToY_AVX vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
convertloop: convertloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
...@@ -1257,9 +1265,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1257,9 +1265,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */ mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */ mov ecx, [esp + 12] /* pix */
vbroadcastf128 ymm4, kARGBToYJ vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
vbroadcastf128 ymm5, kAddYJ64 vbroadcastf128 ymm5, xmmword ptr kAddYJ64
vmovdqu ymm6, kPermdARGBToY_AVX vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
convertloop: convertloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
...@@ -1296,8 +1304,8 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1296,8 +1304,8 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */ mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */ mov ecx, [esp + 12] /* pix */
movdqa xmm4, kBGRAToY movdqa xmm4, xmmword ptr kBGRAToY
movdqa xmm5, kAddY16 movdqa xmm5, xmmword ptr kAddY16
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -1329,8 +1337,8 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1329,8 +1337,8 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */ mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */ mov ecx, [esp + 12] /* pix */
movdqa xmm4, kABGRToY movdqa xmm4, xmmword ptr kABGRToY
movdqa xmm5, kAddY16 movdqa xmm5, xmmword ptr kAddY16
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -1362,8 +1370,8 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1362,8 +1370,8 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */ mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */ mov ecx, [esp + 12] /* pix */
movdqa xmm4, kRGBAToY movdqa xmm4, xmmword ptr kRGBAToY
movdqa xmm5, kAddY16 movdqa xmm5, xmmword ptr kAddY16
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -1400,9 +1408,9 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1400,9 +1408,9 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix mov ecx, [esp + 8 + 20] // pix
movdqa xmm5, kAddUV128 movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm6, kARGBToV movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, kARGBToU movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
convertloop: convertloop:
...@@ -1470,9 +1478,9 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1470,9 +1478,9 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix mov ecx, [esp + 8 + 20] // pix
movdqa xmm5, kAddUVJ128 movdqa xmm5, xmmword ptr kAddUVJ128
movdqa xmm6, kARGBToVJ movdqa xmm6, xmmword ptr kARGBToVJ
movdqa xmm7, kARGBToUJ movdqa xmm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
convertloop: convertloop:
...@@ -1542,9 +1550,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, ...@@ -1542,9 +1550,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix mov ecx, [esp + 8 + 20] // pix
vbroadcastf128 ymm5, kAddUV128 vbroadcastf128 ymm5, xmmword ptr kAddUV128
vbroadcastf128 ymm6, kARGBToV vbroadcastf128 ymm6, xmmword ptr kARGBToV
vbroadcastf128 ymm7, kARGBToU vbroadcastf128 ymm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
convertloop: convertloop:
...@@ -1578,7 +1586,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, ...@@ -1578,7 +1586,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
vpsraw ymm0, ymm0, 8 vpsraw ymm0, ymm0, 8
vpacksswb ymm0, ymm1, ymm0 // mutates vpacksswb ymm0, ymm1, ymm0 // mutates
vpermq ymm0, ymm0, 0xd8 // For vpacksswb vpermq ymm0, ymm0, 0xd8 // For vpacksswb
vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
vpaddb ymm0, ymm0, ymm5 // -> unsigned vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values // step 3 - store 16 U and 16 V values
...@@ -1605,9 +1613,9 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, ...@@ -1605,9 +1613,9 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
mov edx, [esp + 4 + 8] // dst_u mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix mov ecx, [esp + 4 + 16] // pix
movdqa xmm5, kAddUV128 movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm6, kARGBToV movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, kARGBToU movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
convertloop: convertloop:
...@@ -1662,9 +1670,9 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, ...@@ -1662,9 +1670,9 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
mov edx, [esp + 4 + 8] // dst_u mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix mov ecx, [esp + 4 + 16] // pix
movdqa xmm5, kAddUV128 movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm6, kARGBToV movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, kARGBToU movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
convertloop: convertloop:
...@@ -1722,9 +1730,9 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1722,9 +1730,9 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix mov ecx, [esp + 8 + 20] // pix
movdqa xmm5, kAddUV128 movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm6, kBGRAToV movdqa xmm6, xmmword ptr kBGRAToV
movdqa xmm7, kBGRAToU movdqa xmm7, xmmword ptr kBGRAToU
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
convertloop: convertloop:
...@@ -1792,9 +1800,9 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1792,9 +1800,9 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix mov ecx, [esp + 8 + 20] // pix
movdqa xmm5, kAddUV128 movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm6, kABGRToV movdqa xmm6, xmmword ptr kABGRToV
movdqa xmm7, kABGRToU movdqa xmm7, xmmword ptr kABGRToU
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
convertloop: convertloop:
...@@ -1862,9 +1870,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1862,9 +1870,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix mov ecx, [esp + 8 + 20] // pix
movdqa xmm5, kAddUV128 movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm6, kRGBAToV movdqa xmm6, xmmword ptr kRGBAToV
movdqa xmm7, kRGBAToU movdqa xmm7, xmmword ptr kRGBAToU
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
convertloop: convertloop:
...@@ -1964,21 +1972,21 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1964,21 +1972,21 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
// Convert 16 pixels: 16 UV and 16 Y. // Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) __asm { \ #define YUVTORGB_AVX2(YuvConstants) __asm { \
/* Step 1: Find 8 UV contributions to 16 R,G,B values */ \ /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \
__asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \ __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
__asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \ __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
__asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \ __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
__asm vmovdqu ymm3, YuvConstants.kUVBiasR \ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
__asm vpsubw ymm2, ymm3, ymm2 \ __asm vpsubw ymm2, ymm3, ymm2 \
__asm vmovdqu ymm3, YuvConstants.kUVBiasG \ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
__asm vpsubw ymm1, ymm3, ymm1 \ __asm vpsubw ymm1, ymm3, ymm1 \
__asm vmovdqu ymm3, YuvConstants.kUVBiasB \ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
__asm vpsubw ymm0, ymm3, ymm0 \ __asm vpsubw ymm0, ymm3, ymm0 \
/* Step 2: Find Y contribution to 16 R,G,B values */ \ /* Step 2: Find Y contribution to 16 R,G,B values */ \
__asm vmovdqu xmm3, [eax] /* NOLINT */ \ __asm vmovdqu xmm3, [eax] /* NOLINT */ \
__asm lea eax, [eax + 16] \ __asm lea eax, [eax + 16] \
__asm vpermq ymm3, ymm3, 0xd8 \ __asm vpermq ymm3, ymm3, 0xd8 \
__asm vpunpcklbw ymm3, ymm3, ymm3 \ __asm vpunpcklbw ymm3, ymm3, ymm3 \
__asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \ __asm vpmulhuw ymm3, ymm3, ymmword ptr [YuvConstants + KYTORGB] \
__asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \ __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \
__asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \ __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \
__asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \ __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \
...@@ -2393,20 +2401,20 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2393,20 +2401,20 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
__asm movdqa xmm1, xmm0 \ __asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \ __asm movdqa xmm2, xmm0 \
__asm movdqa xmm3, xmm0 \ __asm movdqa xmm3, xmm0 \
__asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \ __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
__asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \ __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
__asm psubw xmm0, xmm1 \ __asm psubw xmm0, xmm1 \
__asm movdqa xmm1, YuvConstants.kUVBiasG \ __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
__asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \ __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
__asm psubw xmm1, xmm2 \ __asm psubw xmm1, xmm2 \
__asm movdqa xmm2, YuvConstants.kUVBiasR \ __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
__asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \ __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
__asm psubw xmm2, xmm3 \ __asm psubw xmm2, xmm3 \
/* Step 2: Find Y contribution to 8 R,G,B values */ \ /* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
__asm lea eax, [eax + 8] \ __asm lea eax, [eax + 8] \
__asm punpcklbw xmm3, xmm3 \ __asm punpcklbw xmm3, xmm3 \
__asm pmulhuw xmm3, YuvConstants.kYToRgb \ __asm pmulhuw xmm3, xmmword ptr [YuvConstants + KYTORGB] \
__asm paddsw xmm0, xmm3 /* B += Y */ \ __asm paddsw xmm0, xmm3 /* B += Y */ \
__asm paddsw xmm1, xmm3 /* G += Y */ \ __asm paddsw xmm1, xmm3 /* G += Y */ \
__asm paddsw xmm2, xmm3 /* R += Y */ \ __asm paddsw xmm2, xmm3 /* R += Y */ \
...@@ -2592,8 +2600,8 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, ...@@ -2592,8 +2600,8 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // rgb24 mov edx, [esp + 8 + 16] // rgb24
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
movdqa xmm5, kShuffleMaskARGBToRGB24_0 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
movdqa xmm6, kShuffleMaskARGBToRGB24 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
convertloop: convertloop:
READYUV422 READYUV422
...@@ -2626,8 +2634,8 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, ...@@ -2626,8 +2634,8 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // raw mov edx, [esp + 8 + 16] // raw
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
movdqa xmm5, kShuffleMaskARGBToRAW_0 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRAW_0
movdqa xmm6, kShuffleMaskARGBToRAW movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
convertloop: convertloop:
READYUV422 READYUV422
...@@ -3045,7 +3053,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { ...@@ -3045,7 +3053,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
movdqa xmm5, kShuffleMirror movdqa xmm5, xmmword ptr kShuffleMirror
convertloop: convertloop:
movdqu xmm0, [eax - 16 + ecx] movdqu xmm0, [eax - 16 + ecx]
...@@ -3066,7 +3074,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -3066,7 +3074,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
vbroadcastf128 ymm5, kShuffleMirror vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
convertloop: convertloop:
vmovdqu ymm0, [eax - 32 + ecx] vmovdqu ymm0, [eax - 32 + ecx]
...@@ -3123,7 +3131,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, ...@@ -3123,7 +3131,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
mov edx, [esp + 4 + 8] // dst_u mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width mov ecx, [esp + 4 + 16] // width
movdqa xmm1, kShuffleMirrorUV movdqa xmm1, xmmword ptr kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16] lea eax, [eax + ecx * 2 - 16]
sub edi, edx sub edi, edx
...@@ -3177,7 +3185,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -3177,7 +3185,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
vmovdqu ymm5, kARGBShuffleMirror_AVX2 vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
convertloop: convertloop:
vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
...@@ -4133,7 +4141,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4133,7 +4141,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
movdqa xmm0, xmm3 // src argb movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha pxor xmm3, xmm4 // ~alpha
movdqu xmm2, [esi] // _r_b movdqu xmm2, [esi] // _r_b
pshufb xmm3, kShuffleAlpha // alpha pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
pand xmm2, xmm6 // _r_b pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha pmullw xmm2, xmm3 // _r_b * alpha
...@@ -4162,7 +4170,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4162,7 +4170,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
movdqa xmm0, xmm3 // src argb movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha pxor xmm3, xmm4 // ~alpha
movd xmm2, [esi] // _r_b movd xmm2, [esi] // _r_b
pshufb xmm3, kShuffleAlpha // alpha pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
pand xmm2, xmm6 // _r_b pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha pmullw xmm2, xmm3 // _r_b * alpha
...@@ -4246,8 +4254,8 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -4246,8 +4254,8 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
pcmpeqb xmm3, xmm3 // generate mask 0xff000000 pcmpeqb xmm3, xmm3 // generate mask 0xff000000
pslld xmm3, 24 pslld xmm3, 24
movdqa xmm4, kShuffleAlpha0 movdqa xmm4, xmmword ptr kShuffleAlpha0
movdqa xmm5, kShuffleAlpha1 movdqa xmm5, xmmword ptr kShuffleAlpha1
convertloop: convertloop:
movdqu xmm0, [eax] // read 4 pixels movdqu xmm0, [eax] // read 4 pixels
...@@ -4289,7 +4297,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -4289,7 +4297,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
mov edx, [esp + 8] // dst_argb mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
sub edx, eax sub edx, eax
vbroadcastf128 ymm4,kShuffleAlpha_AVX2 vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
vpslld ymm5, ymm5, 24 vpslld ymm5, ymm5, 24
...@@ -4381,7 +4389,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ...@@ -4381,7 +4389,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
mov edx, [esp + 8] // dst_argb mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
sub edx, eax sub edx, eax
vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
convertloop: convertloop:
vmovdqu ymm6, [eax] // read 8 pixels. vmovdqu ymm6, [eax] // read 8 pixels.
...@@ -4416,7 +4424,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ...@@ -4416,7 +4424,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
mov edx, [esp + 8] // dst_argb mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
sub edx, eax sub edx, eax
vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
push esi push esi
push edi push edi
...@@ -4480,8 +4488,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -4480,8 +4488,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_argb */ mov edx, [esp + 8] /* dst_argb */
mov ecx, [esp + 12] /* width */ mov ecx, [esp + 12] /* width */
movdqa xmm4, kARGBToYJ movdqa xmm4, xmmword ptr kARGBToYJ
movdqa xmm5, kAddYJ64 movdqa xmm5, xmmword ptr kAddYJ64
convertloop: convertloop:
movdqu xmm0, [eax] // G movdqu xmm0, [eax] // G
...@@ -4538,9 +4546,9 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { ...@@ -4538,9 +4546,9 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
__asm { __asm {
mov eax, [esp + 4] /* dst_argb */ mov eax, [esp + 4] /* dst_argb */
mov ecx, [esp + 8] /* width */ mov ecx, [esp + 8] /* width */
movdqa xmm2, kARGBToSepiaB movdqa xmm2, xmmword ptr kARGBToSepiaB
movdqa xmm3, kARGBToSepiaG movdqa xmm3, xmmword ptr kARGBToSepiaG
movdqa xmm4, kARGBToSepiaR movdqa xmm4, xmmword ptr kARGBToSepiaR
convertloop: convertloop:
movdqu xmm0, [eax] // B movdqu xmm0, [eax] // B
...@@ -6245,7 +6253,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, ...@@ -6245,7 +6253,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
// 4 pixel loop. // 4 pixel loop.
convertloop: convertloop:
movdqu xmm0, qword ptr [eax] // generate luma ptr movdqu xmm0, xmmword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3 pmaddubsw xmm0, xmm3
phaddw xmm0, xmm0 phaddw xmm0, xmm0
pand xmm0, xmm4 // mask out low bits pand xmm0, xmm4 // mask out low bits
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment