/* * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "row.h" extern "C" { #ifdef HAS_ARGBTOYROW_SSSE3 // Constant multiplication table for converting ARGB to I400. static const vec8 kARGBToY = { 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 }; static const vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; static const vec8 kARGBToV = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; // Constants for BGRA static const vec8 kBGRAToY = { 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 }; static const vec8 kBGRAToU = { 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 }; static const vec8 kBGRAToV = { 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 }; // Constants for ABGR static const vec8 kABGRToY = { 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 }; static const vec8 kABGRToU = { -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 }; static const vec8 kABGRToV = { 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 }; static const uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; static const uvec8 kAddUV128 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u }; // Shuffle table for converting BG24 to ARGB. static const uvec8 kShuffleMaskBG24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u }; // Shuffle table for converting RAW to ARGB. static const uvec8 kShuffleMaskRAWToARGB = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; // Shuffle table for converting ABGR to ARGB. static const uvec8 kShuffleMaskABGRToARGB = { 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u }; // Shuffle table for converting BGRA to ARGB. static const uvec8 kShuffleMaskBGRAToARGB = { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u }; __declspec(naked) void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_y mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // pix pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 convertloop: movq xmm0, qword ptr [eax] lea eax, [eax + 8] punpcklbw xmm0, xmm0 movdqa xmm1, xmm0 punpcklwd xmm0, xmm0 punpckhwd xmm1, xmm1 por xmm0, xmm5 por xmm1, xmm5 movdqa [edx], xmm0 movdqa [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 ja convertloop ret } } __declspec(naked) void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_abgr mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // pix movdqa xmm5, kShuffleMaskABGRToARGB convertloop: movdqa xmm0, [eax] lea eax, [eax + 16] pshufb xmm0, xmm5 movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 ja convertloop ret } } __declspec(naked) void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_bgra mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // pix movdqa xmm5, kShuffleMaskBGRAToARGB convertloop: movdqa xmm0, [eax] lea eax, [eax + 16] pshufb xmm0, xmm5 movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 ja convertloop ret } } __declspec(naked) void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_bg24 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // pix pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 movdqa xmm4, kShuffleMaskBG24ToARGB convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm3, [eax + 32] lea eax, [eax + 48] movdqa xmm2, xmm3 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} pshufb xmm2, xmm4 por xmm2, xmm5 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 movdqa [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 movdqa [edx], xmm0 por xmm1, xmm5 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 movdqa [edx + 16], xmm1 por xmm3, xmm5 movdqa [edx + 48], xmm3 lea edx, [edx + 64] sub ecx, 16 ja convertloop ret } } __declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_raw mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // pix pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 movdqa xmm4, kShuffleMaskRAWToARGB convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm3, [eax + 32] lea eax, [eax + 48] movdqa xmm2, xmm3 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} pshufb xmm2, xmm4 por xmm2, xmm5 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 movdqa [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 movdqa [edx], xmm0 por xmm1, xmm5 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 movdqa [edx + 16], xmm1 por xmm3, xmm5 movdqa [edx + 48], xmm3 lea edx, [edx + 64] sub ecx, 16 ja convertloop ret } } // Convert 16 ARGB pixels (64 bytes) to 16 Y values __declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ movdqa xmm5, kAddY16 movdqa xmm4, kARGBToY convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + 32] movdqa xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 lea eax, [eax + 64] phaddw xmm0, xmm1 phaddw xmm2, xmm3 psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 paddb xmm0, xmm5 movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 ja convertloop ret } } __declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ movdqa xmm5, kAddY16 movdqa xmm4, kBGRAToY convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + 32] movdqa xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 lea eax, [eax + 64] phaddw xmm0, xmm1 phaddw xmm2, xmm3 psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 paddb xmm0, xmm5 movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 ja convertloop ret } } __declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ movdqa xmm5, kAddY16 movdqa xmm4, kABGRToY convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + 32] movdqa xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 lea eax, [eax + 64] phaddw xmm0, xmm1 phaddw xmm2, xmm3 psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 paddb xmm0, xmm5 movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 ja convertloop ret } } __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { push esi push edi mov eax, [esp + 8 + 4] // src_argb mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix movdqa xmm7, kARGBToU movdqa xmm6, kARGBToV movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + 32] movdqa xmm3, [eax + 48] pavgb xmm0, [eax + esi] pavgb xmm1, [eax + esi + 16] pavgb xmm2, [eax + esi + 32] pavgb xmm3, [eax + esi + 48] lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 shufps xmm4, xmm1, 0xdd pavgb xmm0, xmm4 movdqa xmm4, xmm2 shufps xmm2, xmm3, 0x88 shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 // step 2 - convert to U and V // from here down is very similar to Y code except // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U pmaddubsw xmm2, xmm7 pmaddubsw xmm1, xmm6 // V pmaddubsw xmm3, xmm6 phaddw xmm0, xmm2 phaddw xmm1, xmm3 psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 ja convertloop pop edi pop esi ret } } __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { push esi push edi mov eax, [esp + 8 + 4] // src_argb mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix movdqa xmm7, kBGRAToU movdqa xmm6, kBGRAToV movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + 32] movdqa xmm3, [eax + 48] pavgb xmm0, [eax + esi] pavgb xmm1, [eax + esi + 16] pavgb xmm2, [eax + esi + 32] pavgb xmm3, [eax + esi + 48] lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 shufps xmm4, xmm1, 0xdd pavgb xmm0, xmm4 movdqa xmm4, xmm2 shufps xmm2, xmm3, 0x88 shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 // step 2 - convert to U and V // from here down is very similar to Y code except // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U pmaddubsw xmm2, xmm7 pmaddubsw xmm1, xmm6 // V pmaddubsw xmm3, xmm6 phaddw xmm0, xmm2 phaddw xmm1, xmm3 psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 ja convertloop pop edi pop esi ret } } __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { push esi push edi mov eax, [esp + 8 + 4] // src_argb mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix movdqa xmm7, kABGRToU movdqa xmm6, kABGRToV movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + 32] movdqa xmm3, [eax + 48] pavgb xmm0, [eax + esi] pavgb xmm1, [eax + esi + 16] pavgb xmm2, [eax + esi + 32] pavgb xmm3, [eax + esi + 48] lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 shufps xmm4, xmm1, 0xdd pavgb xmm0, xmm4 movdqa xmm4, xmm2 shufps xmm2, xmm3, 0x88 shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 // step 2 - convert to U and V // from here down is very similar to Y code except // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U pmaddubsw xmm2, xmm7 pmaddubsw xmm1, xmm6 // V pmaddubsw xmm3, xmm6 phaddw xmm0, xmm2 phaddw xmm1, xmm3 psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 ja convertloop pop edi pop esi ret } } #ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3 #define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */ #define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */ #define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */ #define UR 0 #define VB 0 #define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */ #define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */ // Bias #define BB UB * 128 + VB * 128 #define BG UG * 128 + VG * 128 #define BR UR * 128 + VR * 128 static const vec8 kUVToB = { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB }; static const vec8 kUVToR = { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR }; static const vec8 kUVToG = { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }; static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; #define YUVTORGB __asm { \ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ __asm movd xmm0, [esi] /* U */ \ __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm movdqa xmm1, xmm0 \ __asm movdqa xmm2, xmm0 \ __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ __asm psubw xmm1, kUVBiasG \ __asm psubw xmm2, kUVBiasR \ /* Step 2: Find Y contribution to 8 R,G,B values */ \ __asm movq xmm3, qword ptr [eax] \ __asm lea eax, [eax + 8] \ __asm punpcklbw xmm3, xmm4 \ __asm psubsw xmm3, kYSub16 \ __asm pmullw xmm3, kYToRgb \ __asm paddsw xmm0, xmm3 /* B += Y */ \ __asm paddsw xmm1, xmm3 /* G += Y */ \ __asm paddsw xmm2, xmm3 /* R += Y */ \ __asm psraw xmm0, 6 \ __asm psraw xmm1, 6 \ __asm psraw xmm2, 6 \ __asm packuswb xmm0, xmm0 /* B */ \ __asm packuswb xmm1, xmm1 /* G */ \ __asm packuswb xmm2, xmm2 /* R */ \ } __declspec(naked) void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width) { __asm { push esi push edi mov eax, [esp + 8 + 4] // Y mov esi, [esp + 8 + 8] // U mov edi, [esp + 8 + 12] // V mov edx, [esp + 8 + 16] // rgb mov ecx, [esp + 8 + 20] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 convertloop: YUVTORGB // Step 3: Weave into ARGB punpcklbw xmm0, xmm1 // BG punpcklbw xmm2, xmm5 // RA movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels movdqa [edx], xmm0 punpckhwd xmm1, xmm2 // BGRA next 4 pixels movdqa [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 ja convertloop pop edi pop esi ret } } __declspec(naked) void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width) { __asm { push esi push edi mov eax, [esp + 8 + 4] // Y mov esi, [esp + 8 + 8] // U mov edi, [esp + 8 + 12] // V mov edx, [esp + 8 + 16] // rgb mov ecx, [esp + 8 + 20] // width sub edi, esi pxor xmm4, xmm4 convertloop: YUVTORGB // Step 3: Weave into BGRA pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha punpcklbw xmm1, xmm0 // GB punpcklbw xmm5, xmm2 // AR movdqa xmm0, xmm5 punpcklwd xmm5, xmm1 // BGRA first 4 pixels movdqa [edx], xmm5 punpckhwd xmm0, xmm1 // BGRA next 4 pixels movdqa [edx + 16], xmm0 lea edx, [edx + 32] sub ecx, 8 ja convertloop pop edi pop esi ret } } __declspec(naked) void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width) { __asm { push esi push edi mov eax, [esp + 8 + 4] // Y mov esi, [esp + 8 + 8] // U mov edi, [esp + 8 + 12] // V mov edx, [esp + 8 + 16] // rgb mov ecx, [esp + 8 + 20] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 convertloop: YUVTORGB // Step 3: Weave into ARGB punpcklbw xmm2, xmm1 // RG punpcklbw xmm0, xmm5 // BA movdqa xmm1, xmm2 punpcklwd xmm2, xmm0 // RGBA first 4 pixels movdqa [edx], xmm2 punpckhwd xmm1, xmm0 // RGBA next 4 pixels movdqa [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 ja convertloop pop edi pop esi ret } } __declspec(naked) void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width) { __asm { push esi push edi mov eax, [esp + 8 + 4] // Y mov esi, [esp + 8 + 8] // U mov edi, [esp + 8 + 12] // V mov edx, [esp + 8 + 16] // rgb mov ecx, [esp + 8 + 20] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 convertloop: // Step 1: Find 4 UV contributions to 4 R,G,B values movd xmm0, [esi] // U movd xmm1, [esi + edi] // V lea esi, [esi + 4] punpcklbw xmm0, xmm1 // UV movdqa xmm1, xmm0 movdqa xmm2, xmm0 pmaddubsw xmm0, kUVToB // scale B UV pmaddubsw xmm1, kUVToG // scale G UV pmaddubsw xmm2, kUVToR // scale R UV psubw xmm0, kUVBiasB // unbias back to signed psubw xmm1, kUVBiasG psubw xmm2, kUVBiasR // Step 2: Find Y contribution to 4 R,G,B values movd xmm3, [eax] lea eax, [eax + 4] punpcklbw xmm3, xmm4 psubsw xmm3, kYSub16 pmullw xmm3, kYToRgb paddsw xmm0, xmm3 // B += Y paddsw xmm1, xmm3 // G += Y paddsw xmm2, xmm3 // R += Y psraw xmm0, 6 psraw xmm1, 6 psraw xmm2, 6 packuswb xmm0, xmm0 // B packuswb xmm1, xmm1 // G packuswb xmm2, xmm2 // R // Step 3: Weave into ARGB punpcklbw xmm0, xmm1 // BG punpcklbw xmm2, xmm5 // RA punpcklwd xmm0, xmm2 // BGRA 4 pixels movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 ja convertloop pop edi pop esi ret } } #endif #ifdef HAS_FASTCONVERTYTOARGBROW_SSE2 __declspec(naked) void FastConvertYToARGBRow_SSE2(const uint8* y_buf, uint8* rgb_buf, int width) { __asm { mov eax, [esp + 4] // Y mov edx, [esp + 8] // rgb mov ecx, [esp + 12] // width pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 pxor xmm4, xmm4 movdqa xmm3, kYSub16 movdqa xmm2, kYToRgb convertloop: // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 movq xmm0, qword ptr [eax] lea eax, [eax + 8] punpcklbw xmm0, xmm4 psubsw xmm0, xmm3 pmullw xmm0, xmm2 psraw xmm0, 6 packuswb xmm0, xmm0 // G // Step 2: Weave into ARGB punpcklbw xmm0, xmm0 // GG movdqa xmm1, xmm0 punpcklwd xmm0, xmm0 // BGRA first 4 pixels por xmm0, xmm5 movdqa [edx], xmm0 punpckhwd xmm1, xmm1 // BGRA next 4 pixels por xmm1, xmm5 movdqa [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 ja convertloop ret } } #endif #endif #ifdef HAS_REVERSE_ROW_SSSE3 // Shuffle table for reversing the bytes. static const uvec8 kShuffleReverse = { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u }; __declspec(naked) void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width movdqa xmm5, kShuffleReverse lea eax, [eax + ecx - 16] convertloop: movdqa xmm0, [eax] lea eax, [eax - 16] pshufb xmm0, xmm5 movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 ja convertloop ret } } #endif } // extern "C"