Commit c7161d1c authored by fbarchard@google.com

Remove code alignment declspec from Visual C versions for vs2014 compatibility.

BUG=422
TESTED=local vs2013 build still passes.

Review URL: https://webrtc-codereview.appspot.com/45959004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1365 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 1eb51bcf
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1364 Version: 1365
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1364 #define LIBYUV_VERSION 1365
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -19,7 +19,7 @@ extern "C" { ...@@ -19,7 +19,7 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__) defined(_MSC_VER) && !defined(__clang__)
__declspec(naked) __declspec(align(16)) __declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
__asm { __asm {
mov eax, [esp + 4] // src_a mov eax, [esp + 4] // src_a
...@@ -60,7 +60,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { ...@@ -60,7 +60,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
#if _MSC_VER >= 1700 #if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable: 4752) #pragma warning(disable: 4752)
__declspec(naked) __declspec(align(16)) __declspec(naked)
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
__asm { __asm {
mov eax, [esp + 4] // src_a mov eax, [esp + 4] // src_a
...@@ -134,7 +134,7 @@ static uvec32 kHashMul3 = { ...@@ -134,7 +134,7 @@ static uvec32 kHashMul3 = {
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg _asm _emit 0x40 _asm _emit reg
__declspec(naked) __declspec(align(16)) __declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
...@@ -185,7 +185,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { ...@@ -185,7 +185,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
// Visual C 2012 required for AVX2. // Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700 #if _MSC_VER >= 1700
__declspec(naked) __declspec(align(16)) __declspec(naked)
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
......
...@@ -23,7 +23,7 @@ extern "C" { ...@@ -23,7 +23,7 @@ extern "C" {
#ifdef ENABLE_SCASB #ifdef ENABLE_SCASB
// Multiple of 1. // Multiple of 1.
__declspec(naked) __declspec(align(16)) __declspec(naked)
const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) { const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
__asm { __asm {
mov edx, edi mov edx, edi
......
...@@ -73,7 +73,7 @@ void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, ...@@ -73,7 +73,7 @@ void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
#if !defined(LIBYUV_DISABLE_X86) && \ #if !defined(LIBYUV_DISABLE_X86) && \
defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__) defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
#define HAS_TRANSPOSE_WX8_SSSE3 #define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked) __declspec(align(16)) __declspec(naked)
static void TransposeWx8_SSSE3(const uint8* src, int src_stride, static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) { uint8* dst, int dst_stride, int width) {
__asm { __asm {
...@@ -165,7 +165,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ...@@ -165,7 +165,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
} }
#define HAS_TRANSPOSE_UVWX8_SSE2 #define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked) __declspec(align(16)) __declspec(naked)
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
......
...@@ -147,8 +147,6 @@ static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { ...@@ -147,8 +147,6 @@ static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
// 64 bit // 64 bit
#if defined(_M_X64) #if defined(_M_X64)
__declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf, void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -198,10 +196,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -198,10 +196,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
width -= 8; width -= 8;
} }
} }
// 32 bit // 32 bit
#else // defined(_M_X64) #else // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYROW_SSSE3
// Constants for ARGB. // Constants for ARGB.
...@@ -324,7 +320,7 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = { ...@@ -324,7 +320,7 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = {
}; };
// Duplicates gray value 3 times and fills in alpha opaque. // Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_y mov eax, [esp + 4] // src_y
...@@ -353,7 +349,7 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { ...@@ -353,7 +349,7 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
#ifdef HAS_J400TOARGBROW_AVX2 #ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque. // Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_y mov eax, [esp + 4] // src_y
...@@ -383,7 +379,7 @@ void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { ...@@ -383,7 +379,7 @@ void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
} }
#endif // HAS_J400TOARGBROW_AVX2 #endif // HAS_J400TOARGBROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_rgb24 mov eax, [esp + 4] // src_rgb24
...@@ -421,7 +417,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { ...@@ -421,7 +417,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) { int pix) {
__asm { __asm {
...@@ -467,7 +463,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, ...@@ -467,7 +463,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
// v * (256 + 8) // v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions. // 20 instructions.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
int pix) { int pix) {
__asm { __asm {
...@@ -523,7 +519,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, ...@@ -523,7 +519,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
// v * 256 + v * 8 // v * 256 + v * 8
// v * (256 + 8) // v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked) __declspec(align(16)) __declspec(naked)
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
int pix) { int pix) {
__asm { __asm {
...@@ -574,7 +570,7 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, ...@@ -574,7 +570,7 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
#endif // HAS_RGB565TOARGBROW_AVX2 #endif // HAS_RGB565TOARGBROW_AVX2
#ifdef HAS_ARGB1555TOARGBROW_AVX2 #ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
int pix) { int pix) {
__asm { __asm {
...@@ -624,7 +620,7 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, ...@@ -624,7 +620,7 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
#endif // HAS_ARGB1555TOARGBROW_AVX2 #endif // HAS_ARGB1555TOARGBROW_AVX2
#ifdef HAS_ARGB4444TOARGBROW_AVX2 #ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
int pix) { int pix) {
__asm { __asm {
...@@ -660,7 +656,7 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, ...@@ -660,7 +656,7 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
#endif // HAS_ARGB4444TOARGBROW_AVX2 #endif // HAS_ARGB4444TOARGBROW_AVX2
// 24 instructions // 24 instructions
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
int pix) { int pix) {
__asm { __asm {
...@@ -713,7 +709,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, ...@@ -713,7 +709,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
} }
// 18 instructions. // 18 instructions.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
int pix) { int pix) {
__asm { __asm {
...@@ -751,7 +747,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, ...@@ -751,7 +747,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
...@@ -789,7 +785,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -789,7 +785,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
...@@ -828,7 +824,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -828,7 +824,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
} }
// 4 pixels // 4 pixels
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
...@@ -866,7 +862,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -866,7 +862,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
} }
// 8 pixels // 8 pixels
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix) { const uint32 dither4, int pix) {
__asm { __asm {
...@@ -912,7 +908,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, ...@@ -912,7 +908,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
} }
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix) { const uint32 dither4, int pix) {
__asm { __asm {
...@@ -955,7 +951,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, ...@@ -955,7 +951,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
#endif // HAS_ARGBTORGB565DITHERROW_AVX2 #endif // HAS_ARGBTORGB565DITHERROW_AVX2
// TODO(fbarchard): Improve sign extension/packing. // TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
...@@ -996,7 +992,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -996,7 +992,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
...@@ -1026,7 +1022,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -1026,7 +1022,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
} }
#ifdef HAS_ARGBTORGB565ROW_AVX2 #ifdef HAS_ARGBTORGB565ROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
...@@ -1063,7 +1059,7 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -1063,7 +1059,7 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
#endif // HAS_ARGBTORGB565ROW_AVX2 #endif // HAS_ARGBTORGB565ROW_AVX2
#ifdef HAS_ARGBTOARGB1555ROW_AVX2 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
...@@ -1103,7 +1099,7 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -1103,7 +1099,7 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
#endif // HAS_ARGBTOARGB1555ROW_AVX2 #endif // HAS_ARGBTOARGB1555ROW_AVX2
#ifdef HAS_ARGBTOARGB4444ROW_AVX2 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
...@@ -1134,7 +1130,7 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -1134,7 +1130,7 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
#endif // HAS_ARGBTOARGB4444ROW_AVX2 #endif // HAS_ARGBTOARGB4444ROW_AVX2
// Convert 16 ARGB pixels (64 bytes) to 16 Y values. // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm { __asm {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
...@@ -1169,7 +1165,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1169,7 +1165,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm { __asm {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
...@@ -1288,7 +1284,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1288,7 +1284,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
} }
#endif // HAS_ARGBTOYJROW_AVX2 #endif // HAS_ARGBTOYJROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm { __asm {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
...@@ -1321,7 +1317,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1321,7 +1317,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm { __asm {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
...@@ -1354,7 +1350,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1354,7 +1350,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm { __asm {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
...@@ -1387,7 +1383,7 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -1387,7 +1383,7 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) { uint8* dst_u, uint8* dst_v, int width) {
__asm { __asm {
...@@ -1457,7 +1453,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1457,7 +1453,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) { uint8* dst_u, uint8* dst_v, int width) {
__asm { __asm {
...@@ -1594,7 +1590,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, ...@@ -1594,7 +1590,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
} }
#endif // HAS_ARGBTOUVROW_AVX2 #endif // HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToUV444Row_SSSE3(const uint8* src_argb0, void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) { uint8* dst_u, uint8* dst_v, int width) {
__asm { __asm {
...@@ -1651,7 +1647,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, ...@@ -1651,7 +1647,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToUV422Row_SSSE3(const uint8* src_argb0, void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) { uint8* dst_u, uint8* dst_v, int width) {
__asm { __asm {
...@@ -1709,7 +1705,7 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, ...@@ -1709,7 +1705,7 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) { uint8* dst_u, uint8* dst_v, int width) {
__asm { __asm {
...@@ -1779,7 +1775,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1779,7 +1775,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) { uint8* dst_u, uint8* dst_v, int width) {
__asm { __asm {
...@@ -1849,7 +1845,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1849,7 +1845,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) { uint8* dst_u, uint8* dst_v, int width) {
__asm { __asm {
...@@ -2005,7 +2001,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -2005,7 +2001,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
#ifdef HAS_I422TOARGBROW_AVX2 #ifdef HAS_I422TOARGBROW_AVX2
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToARGBRow_AVX2(const uint8* y_buf, void I422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2041,7 +2037,7 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, ...@@ -2041,7 +2037,7 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_J422TOARGBROW_AVX2 #ifdef HAS_J422TOARGBROW_AVX2
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void J422ToARGBRow_AVX2(const uint8* y_buf, void J422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2077,7 +2073,7 @@ void J422ToARGBRow_AVX2(const uint8* y_buf, ...@@ -2077,7 +2073,7 @@ void J422ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_I444TOARGBROW_AVX2 #ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels // 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes). // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I444ToARGBRow_AVX2(const uint8* y_buf, void I444ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2113,7 +2109,7 @@ void I444ToARGBRow_AVX2(const uint8* y_buf, ...@@ -2113,7 +2109,7 @@ void I444ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_I411TOARGBROW_AVX2 #ifdef HAS_I411TOARGBROW_AVX2
// 16 pixels // 16 pixels
// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I411ToARGBRow_AVX2(const uint8* y_buf, void I411ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2149,7 +2145,7 @@ void I411ToARGBRow_AVX2(const uint8* y_buf, ...@@ -2149,7 +2145,7 @@ void I411ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_NV12TOARGBROW_AVX2 #ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels. // 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void NV12ToARGBRow_AVX2(const uint8* y_buf, void NV12ToARGBRow_AVX2(const uint8* y_buf,
const uint8* uv_buf, const uint8* uv_buf,
uint8* dst_argb, uint8* dst_argb,
...@@ -2180,7 +2176,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf, ...@@ -2180,7 +2176,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_NV21TOARGBROW_AVX2 #ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels. // 16 pixels.
// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void NV21ToARGBRow_AVX2(const uint8* y_buf, void NV21ToARGBRow_AVX2(const uint8* y_buf,
const uint8* uv_buf, const uint8* uv_buf,
uint8* dst_argb, uint8* dst_argb,
...@@ -2212,7 +2208,7 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf, ...@@ -2212,7 +2208,7 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf,
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToBGRARow_AVX2(const uint8* y_buf, void I422ToBGRARow_AVX2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2258,7 +2254,7 @@ void I422ToBGRARow_AVX2(const uint8* y_buf, ...@@ -2258,7 +2254,7 @@ void I422ToBGRARow_AVX2(const uint8* y_buf,
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToRGBARow_AVX2(const uint8* y_buf, void I422ToRGBARow_AVX2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2304,7 +2300,7 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, ...@@ -2304,7 +2300,7 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToABGRRow_AVX2(const uint8* y_buf, void I422ToABGRRow_AVX2(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2542,7 +2538,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -2542,7 +2538,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
// 8 pixels. // 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I444ToARGBRow_SSSE3(const uint8* y_buf, void I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2575,7 +2571,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2575,7 +2571,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
// 8 pixels. // 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToRGB24Row_SSSE3(const uint8* y_buf, void I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2609,7 +2605,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, ...@@ -2609,7 +2605,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
// 8 pixels. // 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToRAWRow_SSSE3(const uint8* y_buf, void I422ToRAWRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2643,7 +2639,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, ...@@ -2643,7 +2639,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
// 8 pixels // 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToRGB565Row_SSSE3(const uint8* y_buf, void I422ToRGB565Row_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2682,7 +2678,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, ...@@ -2682,7 +2678,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
// 8 pixels. // 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToARGBRow_SSSE3(const uint8* y_buf, void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2716,7 +2712,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2716,7 +2712,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
// 8 pixels. // 8 pixels.
// JPeg color space version of I422ToARGB // JPeg color space version of I422ToARGB
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void J422ToARGBRow_SSSE3(const uint8* y_buf, void J422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2750,7 +2746,7 @@ void J422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2750,7 +2746,7 @@ void J422ToARGBRow_SSSE3(const uint8* y_buf,
// 8 pixels. // 8 pixels.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more. // Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I411ToARGBRow_SSSE3(const uint8* y_buf, void I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2785,7 +2781,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2785,7 +2781,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
// 8 pixels. // 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void NV12ToARGBRow_SSSE3(const uint8* y_buf, void NV12ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf, const uint8* uv_buf,
uint8* dst_argb, uint8* dst_argb,
...@@ -2813,7 +2809,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2813,7 +2809,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
// 8 pixels. // 8 pixels.
// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void NV21ToARGBRow_SSSE3(const uint8* y_buf, void NV21ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf, const uint8* uv_buf,
uint8* dst_argb, uint8* dst_argb,
...@@ -2839,7 +2835,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2839,7 +2835,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToBGRARow_SSSE3(const uint8* y_buf, void I422ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2869,7 +2865,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -2869,7 +2865,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToABGRRow_SSSE3(const uint8* y_buf, void I422ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2900,7 +2896,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -2900,7 +2896,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToRGBARow_SSSE3(const uint8* y_buf, void I422ToRGBARow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
...@@ -2934,7 +2930,7 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, ...@@ -2934,7 +2930,7 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
#ifdef HAS_I400TOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I400ToARGBRow_SSE2(const uint8* y_buf, void I400ToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
...@@ -2982,7 +2978,7 @@ void I400ToARGBRow_SSE2(const uint8* y_buf, ...@@ -2982,7 +2978,7 @@ void I400ToARGBRow_SSE2(const uint8* y_buf,
#ifdef HAS_I400TOARGBROW_AVX2 #ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates. // note: vpunpcklbw mutates and vpackuswb unmutates.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I400ToARGBRow_AVX2(const uint8* y_buf, void I400ToARGBRow_AVX2(const uint8* y_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
...@@ -3037,7 +3033,7 @@ static const uvec8 kShuffleMirror = { ...@@ -3037,7 +3033,7 @@ static const uvec8 kShuffleMirror = {
}; };
// TODO(fbarchard): Replace lea with -16 offset. // TODO(fbarchard): Replace lea with -16 offset.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
...@@ -3058,7 +3054,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { ...@@ -3058,7 +3054,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
#endif // HAS_MIRRORROW_SSSE3 #endif // HAS_MIRRORROW_SSSE3
#ifdef HAS_MIRRORROW_AVX2 #ifdef HAS_MIRRORROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
...@@ -3081,7 +3077,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -3081,7 +3077,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_MIRRORROW_AVX2 #endif // HAS_MIRRORROW_AVX2
#ifdef HAS_MIRRORROW_SSE2 #ifdef HAS_MIRRORROW_SSE2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
...@@ -3112,7 +3108,7 @@ static const uvec8 kShuffleMirrorUV = { ...@@ -3112,7 +3108,7 @@ static const uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
}; };
__declspec(naked) __declspec(align(16)) __declspec(naked)
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
int width) { int width) {
__asm { __asm {
...@@ -3142,7 +3138,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, ...@@ -3142,7 +3138,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
#endif // HAS_MIRRORROW_UV_SSSE3 #endif // HAS_MIRRORROW_UV_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2 #ifdef HAS_ARGBMIRRORROW_SSE2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
...@@ -3169,7 +3165,7 @@ static const ulvec32 kARGBShuffleMirror_AVX2 = { ...@@ -3169,7 +3165,7 @@ static const ulvec32 kARGBShuffleMirror_AVX2 = {
7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
}; };
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
...@@ -3190,7 +3186,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -3190,7 +3186,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_ARGBMIRRORROW_AVX2 #endif // HAS_ARGBMIRRORROW_AVX2
#ifdef HAS_SPLITUVROW_SSE2 #ifdef HAS_SPLITUVROW_SSE2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
__asm { __asm {
push edi push edi
...@@ -3228,7 +3224,7 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { ...@@ -3228,7 +3224,7 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
#endif // HAS_SPLITUVROW_SSE2 #endif // HAS_SPLITUVROW_SSE2
#ifdef HAS_SPLITUVROW_AVX2 #ifdef HAS_SPLITUVROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
__asm { __asm {
push edi push edi
...@@ -3266,7 +3262,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { ...@@ -3266,7 +3262,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
#endif // HAS_SPLITUVROW_AVX2 #endif // HAS_SPLITUVROW_AVX2
#ifdef HAS_MERGEUVROW_SSE2 #ifdef HAS_MERGEUVROW_SSE2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) { int width) {
__asm { __asm {
...@@ -3297,7 +3293,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -3297,7 +3293,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
#endif // HAS_MERGEUVROW_SSE2 #endif // HAS_MERGEUVROW_SSE2
#ifdef HAS_MERGEUVROW_AVX2 #ifdef HAS_MERGEUVROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) { int width) {
__asm { __asm {
...@@ -3331,7 +3327,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -3331,7 +3327,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
#ifdef HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_SSE2
// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time. // CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
...@@ -3354,7 +3350,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { ...@@ -3354,7 +3350,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
#ifdef HAS_COPYROW_AVX #ifdef HAS_COPYROW_AVX
// CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time. // CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void CopyRow_AVX(const uint8* src, uint8* dst, int count) { void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
...@@ -3378,7 +3374,7 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) { ...@@ -3378,7 +3374,7 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
#endif // HAS_COPYROW_AVX #endif // HAS_COPYROW_AVX
// Multiple of 1. // Multiple of 1.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
__asm { __asm {
mov eax, esi mov eax, esi
...@@ -3395,7 +3391,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { ...@@ -3395,7 +3391,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
#ifdef HAS_ARGBCOPYALPHAROW_SSE2 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels // width in pixels
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
...@@ -3431,7 +3427,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { ...@@ -3431,7 +3427,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBCOPYALPHAROW_AVX2 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels // width in pixels
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
...@@ -3460,7 +3456,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -3460,7 +3456,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels // width in pixels
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
...@@ -3498,7 +3494,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { ...@@ -3498,7 +3494,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels // width in pixels
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
...@@ -3530,7 +3526,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -3530,7 +3526,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_SETROW_X86 #ifdef HAS_SETROW_X86
// Write 'count' bytes using an 8 bit value repeated. // Write 'count' bytes using an 8 bit value repeated.
// Count should be multiple of 4. // Count should be multiple of 4.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void SetRow_X86(uint8* dst, uint8 v8, int count) { void SetRow_X86(uint8* dst, uint8 v8, int count) {
__asm { __asm {
movzx eax, byte ptr [esp + 8] // v8 movzx eax, byte ptr [esp + 8] // v8
...@@ -3547,7 +3543,7 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) { ...@@ -3547,7 +3543,7 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) {
} }
// Write 'count' bytes using an 8 bit value repeated. // Write 'count' bytes using an 8 bit value repeated.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void SetRow_ERMS(uint8* dst, uint8 v8, int count) { void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
__asm { __asm {
mov edx, edi mov edx, edi
...@@ -3561,7 +3557,7 @@ void SetRow_ERMS(uint8* dst, uint8 v8, int count) { ...@@ -3561,7 +3557,7 @@ void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
} }
// Write 'count' 32 bit values. // Write 'count' 32 bit values.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
__asm { __asm {
mov edx, edi mov edx, edi
...@@ -3576,7 +3572,7 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { ...@@ -3576,7 +3572,7 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
#endif // HAS_SETROW_X86 #endif // HAS_SETROW_X86
#ifdef HAS_YUY2TOYROW_AVX2 #ifdef HAS_YUY2TOYROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void YUY2ToYRow_AVX2(const uint8* src_yuy2, void YUY2ToYRow_AVX2(const uint8* src_yuy2,
uint8* dst_y, int pix) { uint8* dst_y, int pix) {
__asm { __asm {
...@@ -3603,7 +3599,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, ...@@ -3603,7 +3599,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
__asm { __asm {
...@@ -3647,7 +3643,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, ...@@ -3647,7 +3643,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
__asm { __asm {
...@@ -3686,7 +3682,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, ...@@ -3686,7 +3682,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void UYVYToYRow_AVX2(const uint8* src_uyvy, void UYVYToYRow_AVX2(const uint8* src_uyvy,
uint8* dst_y, int pix) { uint8* dst_y, int pix) {
__asm { __asm {
...@@ -3711,7 +3707,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, ...@@ -3711,7 +3707,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
__asm { __asm {
...@@ -3755,7 +3751,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, ...@@ -3755,7 +3751,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void UYVYToUV422Row_AVX2(const uint8* src_uyvy, void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
__asm { __asm {
...@@ -3796,7 +3792,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, ...@@ -3796,7 +3792,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
#endif // HAS_YUY2TOYROW_AVX2 #endif // HAS_YUY2TOYROW_AVX2
#ifdef HAS_YUY2TOYROW_SSE2 #ifdef HAS_YUY2TOYROW_SSE2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void YUY2ToYRow_SSE2(const uint8* src_yuy2, void YUY2ToYRow_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) { uint8* dst_y, int pix) {
__asm { __asm {
...@@ -3821,7 +3817,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, ...@@ -3821,7 +3817,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
__asm { __asm {
...@@ -3864,7 +3860,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, ...@@ -3864,7 +3860,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
__asm { __asm {
...@@ -3900,7 +3896,7 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, ...@@ -3900,7 +3896,7 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void UYVYToYRow_SSE2(const uint8* src_uyvy, void UYVYToYRow_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) { uint8* dst_y, int pix) {
__asm { __asm {
...@@ -3923,7 +3919,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, ...@@ -3923,7 +3919,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
__asm { __asm {
...@@ -3966,7 +3962,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -3966,7 +3962,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void UYVYToUV422Row_SSE2(const uint8* src_uyvy, void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
__asm { __asm {
...@@ -4005,7 +4001,7 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, ...@@ -4005,7 +4001,7 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
#ifdef HAS_ARGBBLENDROW_SSE2 #ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time. // Blend 8 pixels at a time.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
__asm { __asm {
...@@ -4139,7 +4135,7 @@ static const uvec8 kShuffleAlpha = { ...@@ -4139,7 +4135,7 @@ static const uvec8 kShuffleAlpha = {
// pshufb xmm3, kShuffleAlpha // alpha // pshufb xmm3, kShuffleAlpha // alpha
// Blend 8 pixels at a time. // Blend 8 pixels at a time.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
__asm { __asm {
...@@ -4255,7 +4251,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4255,7 +4251,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBATTENUATEROW_SSE2 #ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time. // Attenuate 4 pixels at a time.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
__asm { __asm {
mov eax, [esp + 4] // src_argb0 mov eax, [esp + 4] // src_argb0
...@@ -4304,7 +4300,7 @@ static const uvec8 kShuffleAlpha1 = { ...@@ -4304,7 +4300,7 @@ static const uvec8 kShuffleAlpha1 = {
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
}; };
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
__asm { __asm {
mov eax, [esp + 4] // src_argb0 mov eax, [esp + 4] // src_argb0
...@@ -4348,7 +4344,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -4348,7 +4344,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
static const uvec8 kShuffleAlpha_AVX2 = { static const uvec8 kShuffleAlpha_AVX2 = {
6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
}; };
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
__asm { __asm {
mov eax, [esp + 4] // src_argb0 mov eax, [esp + 4] // src_argb0
...@@ -4385,7 +4381,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -4385,7 +4381,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBUNATTENUATEROW_SSE2 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time. // Unattenuate 4 pixels at a time.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) { int width) {
__asm { __asm {
...@@ -4439,7 +4435,7 @@ static const uvec8 kUnattenShuffleAlpha_AVX2 = { ...@@ -4439,7 +4435,7 @@ static const uvec8 kUnattenShuffleAlpha_AVX2 = {
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction. // USE_GATHER is not on by default, due to being a slow instruction.
#ifdef USE_GATHER #ifdef USE_GATHER
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
int width) { int width) {
__asm { __asm {
...@@ -4473,7 +4469,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ...@@ -4473,7 +4469,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
} }
} }
#else // USE_GATHER #else // USE_GATHER
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
int width) { int width) {
__asm { __asm {
...@@ -4540,7 +4536,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ...@@ -4540,7 +4536,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBGRAYROW_SSSE3 #ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels. // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
__asm { __asm {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
...@@ -4599,7 +4595,7 @@ static const vec8 kARGBToSepiaR = { ...@@ -4599,7 +4595,7 @@ static const vec8 kARGBToSepiaR = {
}; };
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
__asm { __asm {
mov eax, [esp + 4] /* dst_argb */ mov eax, [esp + 4] /* dst_argb */
...@@ -4656,7 +4652,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { ...@@ -4656,7 +4652,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
// Same as Sepia except matrix is provided. // Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) { const int8* matrix_argb, int width) {
__asm { __asm {
...@@ -4717,7 +4713,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, ...@@ -4717,7 +4713,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBQUANTIZEROW_SSE2 #ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes). // Quantize 4 ARGB pixels (16 bytes).
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) { int interval_offset, int width) {
__asm { __asm {
...@@ -4762,7 +4758,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, ...@@ -4762,7 +4758,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
#ifdef HAS_ARGBSHADEROW_SSE2 #ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value. // Shade 4 pixels at a time by specified value.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) { uint32 value) {
__asm { __asm {
...@@ -4796,7 +4792,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -4796,7 +4792,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
#ifdef HAS_ARGBMULTIPLYROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
__asm { __asm {
...@@ -4835,7 +4831,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4835,7 +4831,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBADDROW_SSE2 #ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time. // Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions. // TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
__asm { __asm {
...@@ -4883,7 +4879,7 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4883,7 +4879,7 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBSUBTRACTROW_SSE2 #ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
__asm { __asm {
...@@ -4912,7 +4908,7 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4912,7 +4908,7 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBMULTIPLYROW_AVX2 #ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
__asm { __asm {
...@@ -4949,7 +4945,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4949,7 +4945,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBADDROW_AVX2 #ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time. // Add 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
__asm { __asm {
...@@ -4978,7 +4974,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4978,7 +4974,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBSUBTRACTROW_AVX2 #ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
__asm { __asm {
...@@ -5010,7 +5006,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -5010,7 +5006,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
// -1 0 1 // -1 0 1
// -2 0 2 // -2 0 2
// -1 0 1 // -1 0 1
__declspec(naked) __declspec(align(16)) __declspec(naked)
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width) { const uint8* src_y2, uint8* dst_sobelx, int width) {
__asm { __asm {
...@@ -5066,7 +5062,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, ...@@ -5066,7 +5062,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
// -1 -2 -1 // -1 -2 -1
// 0 0 0 // 0 0 0
// 1 2 1 // 1 2 1
__declspec(naked) __declspec(align(16)) __declspec(naked)
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) { uint8* dst_sobely, int width) {
__asm { __asm {
...@@ -5119,7 +5115,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, ...@@ -5119,7 +5115,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
// R = Sobel // R = Sobel
// G = Sobel // G = Sobel
// B = Sobel // B = Sobel
__declspec(naked) __declspec(align(16)) __declspec(naked)
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
__asm { __asm {
...@@ -5166,7 +5162,7 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -5166,7 +5162,7 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
#ifdef HAS_SOBELTOPLANEROW_SSE2 #ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane. // Adds Sobel X and Sobel Y and stores Sobel into a plane.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_y, int width) { uint8* dst_y, int width) {
__asm { __asm {
...@@ -5199,7 +5195,7 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -5199,7 +5195,7 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// R = Sobel X // R = Sobel X
// G = Sobel // G = Sobel
// B = Sobel Y // B = Sobel Y
__declspec(naked) __declspec(align(16)) __declspec(naked)
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
__asm { __asm {
...@@ -5486,7 +5482,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, ...@@ -5486,7 +5482,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
#ifdef HAS_ARGBAFFINEROW_SSE2 #ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination. // Copy ARGB pixels from source image with slope to a row of destination.
__declspec(naked) __declspec(align(16)) __declspec(naked)
LIBYUV_API LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width) { uint8* dst_argb, const float* uv_dudv, int width) {
...@@ -5571,7 +5567,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, ...@@ -5571,7 +5567,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
#ifdef HAS_INTERPOLATEROW_AVX2 #ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1 // Bilinear filter 32x2 -> 32x1
__declspec(naked) __declspec(align(16)) __declspec(naked)
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width, ptrdiff_t src_stride, int dst_width,
int source_y_fraction) { int source_y_fraction) {
...@@ -5668,7 +5664,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5668,7 +5664,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
#endif // HAS_INTERPOLATEROW_AVX2 #endif // HAS_INTERPOLATEROW_AVX2
// Bilinear filter 16x2 -> 16x1 // Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16)) __declspec(naked)
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width, ptrdiff_t src_stride, int dst_width,
int source_y_fraction) { int source_y_fraction) {
...@@ -5769,7 +5765,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5769,7 +5765,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
#ifdef HAS_INTERPOLATEROW_SSE2 #ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1 // Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16)) __declspec(naked)
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width, ptrdiff_t src_stride, int dst_width,
int source_y_fraction) { int source_y_fraction) {
...@@ -5876,7 +5872,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5876,7 +5872,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
#endif // HAS_INTERPOLATEROW_SSE2 #endif // HAS_INTERPOLATEROW_SSE2
// Specialized ARGB to Bayer that just isolates G channel. // Specialized ARGB to Bayer that just isolates G channel.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) { uint32 selector, int pix) {
__asm { __asm {
...@@ -5906,7 +5902,7 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, ...@@ -5906,7 +5902,7 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
} }
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
__asm { __asm {
...@@ -5932,7 +5928,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, ...@@ -5932,7 +5928,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
} }
#ifdef HAS_ARGBSHUFFLEROW_AVX2 #ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
__asm { __asm {
...@@ -5960,7 +5956,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, ...@@ -5960,7 +5956,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
} }
#endif // HAS_ARGBSHUFFLEROW_AVX2 #endif // HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
__asm { __asm {
...@@ -6082,7 +6078,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, ...@@ -6082,7 +6078,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
// UYVY - Macro-pixel = 2 image pixels // UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1 // U0Y0V0Y1
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToYUY2Row_SSE2(const uint8* src_y, void I422ToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -6119,7 +6115,7 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, ...@@ -6119,7 +6115,7 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
} }
} }
__declspec(naked) __declspec(align(16)) __declspec(naked)
void I422ToUYVYRow_SSE2(const uint8* src_y, void I422ToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -6157,7 +6153,7 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, ...@@ -6157,7 +6153,7 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
} }
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBPolynomialRow_SSE2(const uint8* src_argb, void ARGBPolynomialRow_SSE2(const uint8* src_argb,
uint8* dst_argb, const float* poly, uint8* dst_argb, const float* poly,
int width) { int width) {
...@@ -6216,7 +6212,7 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, ...@@ -6216,7 +6212,7 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
#endif // HAS_ARGBPOLYNOMIALROW_SSE2 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBPolynomialRow_AVX2(const uint8* src_argb, void ARGBPolynomialRow_AVX2(const uint8* src_argb,
uint8* dst_argb, const float* poly, uint8* dst_argb, const float* poly,
int width) { int width) {
...@@ -6256,7 +6252,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, ...@@ -6256,7 +6252,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
#ifdef HAS_ARGBCOLORTABLEROW_X86 #ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table. // Transform ARGB pixels with color table.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
int width) { int width) {
__asm { __asm {
...@@ -6290,7 +6286,7 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, ...@@ -6290,7 +6286,7 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
#ifdef HAS_RGBCOLORTABLEROW_X86 #ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table. // Transform RGB pixels with color table.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
__asm { __asm {
push esi push esi
...@@ -6321,7 +6317,7 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { ...@@ -6321,7 +6317,7 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table. // Transform RGB pixels with luma table.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
int width, int width,
const uint8* luma, uint32 lumacoeff) { const uint8* luma, uint32 lumacoeff) {
......
...@@ -95,7 +95,7 @@ static uvec16 kScaleAb2 = ...@@ -95,7 +95,7 @@ static uvec16 kScaleAb2 =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
// Reads 32 pixels, throws half away and writes 16 pixels. // Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -121,7 +121,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -121,7 +121,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Blends 32x1 rectangle to 16x1. // Blends 32x1 rectangle to 16x1.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -157,7 +157,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -157,7 +157,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Blends 32x2 rectangle to 16x1. // Blends 32x2 rectangle to 16x1.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -200,7 +200,7 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -200,7 +200,7 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
#ifdef HAS_SCALEROWDOWN2_AVX2 #ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels. // Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -228,7 +228,7 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -228,7 +228,7 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Blends 64x1 rectangle to 32x1. // Blends 64x1 rectangle to 32x1.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -265,7 +265,7 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -265,7 +265,7 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Blends 64x2 rectangle to 32x1. // Blends 64x2 rectangle to 32x1.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -307,7 +307,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -307,7 +307,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
#endif // HAS_SCALEROWDOWN2_AVX2 #endif // HAS_SCALEROWDOWN2_AVX2
// Point samples 32 pixels to 8 pixels. // Point samples 32 pixels to 8 pixels.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -338,7 +338,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -338,7 +338,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Blends 32x4 rectangle to 8x1. // Blends 32x4 rectangle to 8x1.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -401,7 +401,7 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -401,7 +401,7 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// Then shuffled to do the scaling. // Then shuffled to do the scaling.
// Note that movdqa+palign may be better than movdqu. // Note that movdqa+palign may be better than movdqu.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -448,7 +448,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -448,7 +448,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// xmm7 kRound34 // xmm7 kRound34
// Note that movdqa+palign may be better than movdqu. // Note that movdqa+palign may be better than movdqu.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
...@@ -505,7 +505,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, ...@@ -505,7 +505,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
} }
// Note that movdqa+palign may be better than movdqu. // Note that movdqa+palign may be better than movdqu.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
...@@ -567,7 +567,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, ...@@ -567,7 +567,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
// 3/8 point sampler // 3/8 point sampler
// Scale 32 pixels to 12 // Scale 32 pixels to 12
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -598,7 +598,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -598,7 +598,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Scale 16x3 pixels to 6x1 with interpolation // Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
...@@ -663,7 +663,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ...@@ -663,7 +663,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
} }
// Scale 16x2 pixels to 6x1 with interpolation // Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
...@@ -709,7 +709,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ...@@ -709,7 +709,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
// Reads 16xN bytes and produces 16 shorts at a time. // Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. // TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, uint16* dst_ptr, int src_width,
int src_height) { int src_height) {
...@@ -775,7 +775,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -775,7 +775,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// when drmemory bug fixed. // when drmemory bug fixed.
// https://code.google.com/p/drmemory/issues/detail?id=1396 // https://code.google.com/p/drmemory/issues/detail?id=1396
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
...@@ -852,7 +852,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -852,7 +852,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
} }
// Reads 16 pixels, duplicates them and writes 32 pixels. // Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
...@@ -877,7 +877,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -877,7 +877,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
} }
// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb, void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
...@@ -902,7 +902,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ...@@ -902,7 +902,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
} }
// Blends 8x1 rectangle to 4x1. // Blends 8x1 rectangle to 4x1.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
...@@ -930,7 +930,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ...@@ -930,7 +930,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
} }
// Blends 8x2 rectangle to 4x1. // Blends 8x2 rectangle to 4x1.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
...@@ -964,7 +964,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ...@@ -964,7 +964,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
} }
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, int src_stepx,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
...@@ -1000,7 +1000,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, ...@@ -1000,7 +1000,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
} }
// Blends four 2x2 to 4x1. // Blends four 2x2 to 4x1.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
int src_stepx, int src_stepx,
...@@ -1048,7 +1048,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ...@@ -1048,7 +1048,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
} }
// Column scaling unfiltered. SSE2 version. // Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
...@@ -1139,7 +1139,7 @@ static uvec8 kShuffleFractions = { ...@@ -1139,7 +1139,7 @@ static uvec8 kShuffleFractions = {
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
}; };
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
...@@ -1210,7 +1210,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, ...@@ -1210,7 +1210,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
} }
// Reads 4 pixels, duplicates them and writes 8 pixels. // Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
...@@ -1235,7 +1235,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, ...@@ -1235,7 +1235,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
} }
// Divide num by div and return as 16.16 fixed point result. // Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16)) __declspec(naked)
int FixedDiv_X86(int num, int div) { int FixedDiv_X86(int num, int div) {
__asm { __asm {
mov eax, [esp + 4] // num mov eax, [esp + 4] // num
...@@ -1248,7 +1248,7 @@ int FixedDiv_X86(int num, int div) { ...@@ -1248,7 +1248,7 @@ int FixedDiv_X86(int num, int div) {
} }
// Divide num by div and return as 16.16 fixed point result. // Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16)) __declspec(naked)
int FixedDiv1_X86(int num, int div) { int FixedDiv1_X86(int num, int div) {
__asm { __asm {
mov eax, [esp + 4] // num mov eax, [esp + 4] // num
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment