Commit d2f4413d authored by fbarchard@google.com

Remove old alpha blend, expose GetARGBBlend, fix ComputeSumSquareErrorPlane on SSE2

BUG=29
TEST=none
Review URL: https://webrtc-codereview.appspot.com/469005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@234 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c757f308
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 233
+Version: 234
License: BSD
License File: LICENSE
......
@@ -133,24 +133,19 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
              uint8* dst_argb, int dst_stride_argb,
              int width, int height);

-// Alpha Blend ARGB row of pixels.
-void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width);
-
-// Alpha Blend 2 rows of ARGB pixels and store to destination.
-void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1,
-                   uint8* dst_argb, int width);
-
-// Alpha Blend ARGB.
-int ARGBBlend(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
-
-// Alpha Blend 2 ARGB images and store to destination.
-int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
-               const uint8* src_argb1, int src_stride_argb1,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+typedef void (*ARGBBlendRow)(const uint8* src_argb0,
+                             const uint8* src_argb1,
+                             uint8* dst_argb, int width);
+
+// Get function to Alpha Blend ARGB pixels and store to destination.
+ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width);
+
+// Alpha Blend ARGB images and store to destination.
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+              const uint8* src_argb1, int src_stride_argb1,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);

// Convert I422 to YUY2.
int I422ToYUY2(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
......
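For readers of the new API above, a minimal caller sketch (not part of this change): it obtains a row blender once via GetARGBBlend and applies it per row, which mirrors what the new two-source ARGBBlend does internally. The helper name BlendArgbRows and the include path are illustrative assumptions.

#include "libyuv/planar_functions.h"  // assumed include path for the header above

// Illustrative helper: blend src0 (premultiplied ARGB) over src1 into dst.
void BlendArgbRows(const uint8* src0, int stride0,
                   const uint8* src1, int stride1,
                   uint8* dst, int dst_stride, int width, int height) {
  // Pick the best row blender once for this destination and width.
  libyuv::ARGBBlendRow blend_row =
      libyuv::GetARGBBlend(dst, dst_stride, width);
  for (int y = 0; y < height; ++y) {
    blend_row(src0, src1, dst, width);
    src0 += stride0;
    src1 += stride1;
    dst += dst_stride;
  }
}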
@@ -20,7 +20,7 @@ extern "C" {
// Supported rotation
enum RotationMode {
  kRotate0 = 0,  // No rotation
  kRotate90 = 90,  // Rotate 90 degrees clockwise
  kRotate180 = 180,  // Rotate 180 degrees
  kRotate270 = 270,  // Rotate 270 degrees clockwise
......
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define INCLUDE_LIBYUV_VERSION 233
+#define LIBYUV_VERSION 234
#endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -25,18 +25,37 @@ namespace libyuv {
extern "C" {
#endif

-// hash seed of 5381 recommended.
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+// Internal C version of HashDjb2 with int sized count for efficiency.
+static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
  uint32 hash = seed;
-  if (count > 0) {
-    do {
-      hash = hash * 33 + *src++;
-    } while (--count);
+  for (int i = 0; i < count; ++i) {
+    hash += (hash << 5) + src[i];
  }
  return hash;
}

+// hash seed of 5381 recommended.
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+  const int kBlockSize = 1 << 15;  // 32768;
+  while (count >= static_cast<uint64>(kBlockSize)) {
+    seed = HashDjb2_C(src, kBlockSize, seed);
+    src += kBlockSize;
+    count -= kBlockSize;
+  }
+  int remainder = static_cast<int>(count) & ~15;
+  if (remainder) {
+    seed = HashDjb2_C(src, remainder, seed);
+    src += remainder;
+    count -= remainder;
+  }
+  remainder = static_cast<int>(count) & 15;
+  if (remainder) {
+    seed = HashDjb2_C(src, remainder, seed);
+  }
+  return seed;
+}
+
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SUMSQUAREERROR_NEON

static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
...@@ -75,9 +94,9 @@ static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, ...@@ -75,9 +94,9 @@ static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
return sse; return sse;
} }
-#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SUMSQUAREERROR_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
int count) { int count) {
__asm { __asm {
...@@ -94,7 +113,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, ...@@ -94,7 +113,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
movdqa xmm2, [eax + edx] movdqa xmm2, [eax + edx]
lea eax, [eax + 16] lea eax, [eax + 16]
sub ecx, 16 sub ecx, 16
-    movdqa     xmm3, xmm1
+    movdqa     xmm3, xmm1  // abs trick
psubusb xmm1, xmm2 psubusb xmm1, xmm2
psubusb xmm2, xmm3 psubusb xmm2, xmm3
por xmm1, xmm2 por xmm1, xmm2
...@@ -116,7 +135,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, ...@@ -116,7 +135,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
} }
} }
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2 #define HAS_SUMSQUAREERROR_SSE2
static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
int count) { int count) {
@@ -167,11 +186,9 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b,
                               int count) {
  uint32 sse = 0u;
-  for (int x = 0; x < count; ++x) {
-    int diff = src_a[0] - src_b[0];
+  for (int i = 0; i < count; ++i) {
+    int diff = src_a[i] - src_b[i];
    sse += static_cast<uint32>(diff * diff);
-    src_a += 1;
-    src_b += 1;
  }
  return sse;
}
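The "// abs trick" comment added in the SSE2 loop above refers to computing the per-byte absolute difference with two unsigned saturating subtracts ORed together; one of the two differences always saturates to zero. A hypothetical scalar equivalent (not part of the change):

// Scalar sketch of the SSE2 "abs trick": psubusb a,b / psubusb b,a / por.
static unsigned char AbsDiffU8(unsigned char a, unsigned char b) {
  unsigned char d0 = (a > b) ? (unsigned char)(a - b) : 0;  // psubusb a, b
  unsigned char d1 = (b > a) ? (unsigned char)(b - a) : 0;  // psubusb b, a
  return d0 | d1;                                           // por
}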
@@ -187,6 +204,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
#elif defined(HAS_SUMSQUAREERROR_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+    // Note only used for multiples of 16 so count is not checked.
    SumSquareError = SumSquareError_SSE2;
  }
#endif
@@ -225,8 +243,9 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
    SumSquareError = SumSquareError_NEON;
  }
#elif defined(HAS_SUMSQUAREERROR_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
+      IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
    SumSquareError = SumSquareError_SSE2;
  }
#endif
......
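Because djb2 is a running fold over the input bytes, threading the seed through fixed-size blocks (as the new HashDjb2 above does) yields the same value as a single pass over the whole buffer. A standalone check sketching that property, assuming HashDjb2 is declared in libyuv/compare.h as in this tree:

#include <assert.h>
#include "libyuv/compare.h"  // assumed header that declares HashDjb2

// Hash a buffer in two chunks, feeding the first result in as the seed for
// the second chunk; the result must match hashing the whole buffer at once.
void CheckChunkedHashMatches(const uint8* data, int len) {
  uint32 whole = libyuv::HashDjb2(data, len, 5381);
  uint32 first = libyuv::HashDjb2(data, len / 2, 5381);
  uint32 both = libyuv::HashDjb2(data + len / 2, len - len / 2, first);
  assert(both == whole);
}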
...@@ -61,9 +61,9 @@ int I420Copy(const uint8* src_y, int src_stride_y, ...@@ -61,9 +61,9 @@ int I420Copy(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_HALFROW_SSE2 #define HAS_HALFROW_SSE2
__declspec(naked) __declspec(naked) __declspec(align(16))
static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) { uint8* dst_uv, int pix) {
__asm { __asm {
...@@ -86,7 +86,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -86,7 +86,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
} }
} }
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) #elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_HALFROW_SSE2 #define HAS_HALFROW_SSE2
static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) { uint8* dst_uv, int pix) {
...@@ -179,12 +179,13 @@ int I422ToI420(const uint8* src_y, int src_stride_y, ...@@ -179,12 +179,13 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
// Blends 32x2 pixels to 16x1 // Blends 32x2 pixels to 16x1
// source in scale.cc // source in scale.cc
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SCALEROWDOWN2_NEON #define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst, int dst_width); uint8* dst, int dst_width);
-#elif defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) && \
-    !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
#endif #endif
...@@ -450,9 +451,9 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420, ...@@ -450,9 +451,9 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
width, height); width, height);
} }
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SPLITYUY2_SSE2 #define HAS_SPLITYUY2_SSE2
__declspec(naked) __declspec(naked) __declspec(align(16))
static void SplitYUY2_SSE2(const uint8* src_yuy2, static void SplitYUY2_SSE2(const uint8* src_yuy2,
uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
__asm { __asm {
...@@ -498,7 +499,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, ...@@ -498,7 +499,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
} }
} }
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) #elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SPLITYUY2_SSE2 #define HAS_SPLITYUY2_SSE2
static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
......
...@@ -205,9 +205,9 @@ int I400Copy(const uint8* src_y, int src_stride_y, ...@@ -205,9 +205,9 @@ int I400Copy(const uint8* src_y, int src_stride_y,
// UYVY - Macro-pixel = 2 image pixels // UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1 // U0Y0V0Y1
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_I42XTOYUY2ROW_SSE2 #define HAS_I42XTOYUY2ROW_SSE2
__declspec(naked) __declspec(naked) __declspec(align(16))
static void I42xToYUY2Row_SSE2(const uint8* src_y, static void I42xToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -246,7 +246,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y, ...@@ -246,7 +246,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
} }
#define HAS_I42XTOUYVYROW_SSE2 #define HAS_I42XTOUYVYROW_SSE2
__declspec(naked) __declspec(naked) __declspec(align(16))
static void I42xToUYVYRow_SSE2(const uint8* src_y, static void I42xToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
...@@ -283,7 +283,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y, ...@@ -283,7 +283,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
ret ret
} }
} }
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) #elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_I42XTOYUY2ROW_SSE2 #define HAS_I42XTOYUY2ROW_SSE2
static void I42xToYUY2Row_SSE2(const uint8* src_y, static void I42xToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
......
...@@ -24,9 +24,9 @@ extern "C" { ...@@ -24,9 +24,9 @@ extern "C" {
// and vst would select which 2 components to write. The low level would need // and vst would select which 2 components to write. The low level would need
// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR // to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBTOBAYERROW_SSSE3 #define HAS_ARGBTOBAYERROW_SSSE3
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ARGBToBayerRow_SSSE3(const uint8* src_argb, static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) { uint8* dst_bayer, uint32 selector, int pix) {
__asm { __asm {
...@@ -36,6 +36,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, ...@@ -36,6 +36,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
mov ecx, [esp + 16] // pix mov ecx, [esp + 16] // pix
pshufd xmm5, xmm5, 0 pshufd xmm5, xmm5, 0
align 16
wloop: wloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -48,7 +49,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, ...@@ -48,7 +49,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
} }
} }
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) #elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_ARGBTOBAYERROW_SSSE3 #define HAS_ARGBTOBAYERROW_SSSE3
static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
......
@@ -137,87 +137,38 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
  return 0;
}

-// Alpha Blend ARGB
-void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width) {
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlendRow_SSSE3(src_argb, dst_argb, width);
-    return;
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlendRow_SSE2(src_argb, dst_argb, width);
-    return;
-  }
-#endif
-  ARGBBlendRow_C(src_argb, dst_argb, width);
-}
-
-// Alpha Blend 2 rows of ARGB pixels and store to destination.
-void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1,
-                   uint8* dst_argb, int width) {
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlend2Row_SSSE3(src_argb0, src_argb1, dst_argb, width);
-    return;
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlend2Row_SSE2(src_argb0, src_argb1, dst_argb, width);
-    return;
-  }
-#endif
-  ARGBBlend2Row_C(src_argb0, src_argb1, dst_argb, width);
-}
-
-// Alpha Blend ARGB
-// TODO(fbarchard): Call 3 pointer low levels to reduce code size.
-int ARGBBlend(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  void (*ARGBBlendRow)(const uint8* src_argb, uint8* dst_argb, int width) =
-      ARGBBlendRow_C;
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlendRow = ARGBBlendRow_SSE2;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlendRow = ARGBBlendRow_Aligned_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlendRow = ARGBBlendRow_SSSE3;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
-    }
-  }
-#endif
-
-  for (int y = 0; y < height; ++y) {
-    ARGBBlendRow(src_argb, dst_argb, width);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
+// Get a blender that optimized for the CPU, alignment and pixel count.
+// As there are 6 blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
+ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width) {
+  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBBlendRow = ARGBBlendRow1_SSE2;
+    if (width >= 4) {
+      ARGBBlendRow = ARGBBlendRow_Any_SSE2;
+      if (IS_ALIGNED(width, 4) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        ARGBBlendRow = ARGBBlendRow_Aligned_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+    ARGBBlendRow = ARGBBlendRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4) &&
+        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+      ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
+    }
+  }
+#endif
+  return ARGBBlendRow;
+}

// Alpha Blend 2 ARGB images and store to destination.
-int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
              const uint8* src_argb1, int src_stride_argb1,
              uint8* dst_argb, int dst_stride_argb,
              int width, int height) {
@@ -230,30 +181,12 @@ int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
-  void (*ARGBBlend2Row)(const uint8* src_argb, const uint8* src_argb1,
-                        uint8* dst_argb, int width) = ARGBBlend2Row_C;
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlend2Row = ARGBBlend2Row_SSE2;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlend2Row = ARGBBlend2Row_Aligned_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlend2Row = ARGBBlend2Row_SSSE3;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlend2Row = ARGBBlend2Row_Aligned_SSSE3;
-    }
-  }
-#endif
+  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width) =
+      GetARGBBlend(dst_argb, dst_stride_argb, width);

  for (int y = 0; y < height; ++y) {
-    ARGBBlend2Row(src_argb0, src_argb1, dst_argb, width);
+    ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
    src_argb0 += src_stride_argb0;
    src_argb1 += src_stride_argb1;
    dst_argb += dst_stride_argb;
@@ -725,7 +658,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
// SetRow8 writes 'count' bytes using a 32 bit value repeated // SetRow8 writes 'count' bytes using a 32 bit value repeated
// SetRow32 writes 'count' words using a 32 bit value repeated // SetRow32 writes 'count' words using a 32 bit value repeated
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SETROW_NEON #define HAS_SETROW_NEON
static void SetRow8_NEON(uint8* dst, uint32 v32, int count) { static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
asm volatile ( asm volatile (
...@@ -749,9 +682,9 @@ static void SetRows32_NEON(uint8* dst, uint32 v32, int width, ...@@ -749,9 +682,9 @@ static void SetRows32_NEON(uint8* dst, uint32 v32, int width,
} }
} }
#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM) #elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SETROW_X86 #define HAS_SETROW_X86
__declspec(naked) __declspec(naked) __declspec(align(16))
static void SetRow8_X86(uint8* dst, uint32 v32, int count) { static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
__asm { __asm {
mov edx, edi mov edx, edi
...@@ -765,7 +698,7 @@ static void SetRow8_X86(uint8* dst, uint32 v32, int count) { ...@@ -765,7 +698,7 @@ static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
} }
} }
__declspec(naked) __declspec(naked) __declspec(align(16))
static void SetRows32_X86(uint8* dst, uint32 v32, int width, static void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) { int dst_stride, int height) {
__asm { __asm {
...@@ -793,7 +726,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width, ...@@ -793,7 +726,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
} }
} }
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) #elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SETROW_X86 #define HAS_SETROW_X86
static void SetRow8_X86(uint8* dst, uint32 v32, int width) { static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = static_cast<size_t>(width); size_t width_tmp = static_cast<size_t>(width);
...@@ -903,6 +836,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y, ...@@ -903,6 +836,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
return 0; return 0;
} }
// TODO(fbarchard): Add TestCpuFlag(kCpuHasX86) to allow C code to be tested.
// Draw a rectangle into ARGB // Draw a rectangle into ARGB
int ARGBRect(uint8* dst_argb, int dst_stride_argb, int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int dst_x, int dst_y, int dst_x, int dst_y,
@@ -916,12 +850,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
#if defined(HAS_SETROW_X86)
  SetRows32_X86(dst, value, width, dst_stride_argb, height);
-#elif defined(HAS_SETROW_NEON)
+#else
+#if defined(HAS_SETROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
    SetRows32_NEON(dst, value, width, dst_stride_argb, height);
    return 0;
  }
+#endif
  SetRows32_C(dst, value, width, dst_stride_argb, height);
#endif
  return 0;
......
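A hypothetical call site for the ARGBRect path above, assuming the parameters that are cut off in this hunk are width, height and a packed 32-bit ARGB value, as the body (width, height, value) suggests:

#include "libyuv/planar_functions.h"  // assumed include path

// Fill a 64x32 region at (16, 8) with opaque red (ARGB = 0xFFFF0000),
// assuming ARGBRect(dst, dst_stride, dst_x, dst_y, width, height, value).
void FillRedBox(uint8* frame_argb, int stride_argb) {
  libyuv::ARGBRect(frame_argb, stride_argb, 16, 8, 64, 32, 0xFFFF0000u);
}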
...@@ -21,8 +21,8 @@ namespace libyuv { ...@@ -21,8 +21,8 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
-#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
-    !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__) #if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \ #define DECLARE_FUNCTION(name) \
".text \n" \ ".text \n" \
...@@ -59,9 +59,9 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, ...@@ -59,9 +59,9 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
int width); int width);
#endif #endif
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_TRANSPOSE_WX8_SSSE3 #define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked) __declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride, static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) { uint8* dst, int dst_stride, int width) {
__asm { __asm {
...@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ...@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
} }
#define HAS_TRANSPOSE_UVWX8_SSE2 #define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked) __declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
...@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
ret ret
} }
} }
#elif defined(__i386__) || defined(__x86_64__) && !defined(YUV_DISABLE_ASM) #elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSE_WX8_SSSE3 #define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride, static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) { uint8* dst, int dst_stride, int width) {
...@@ -369,7 +369,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ...@@ -369,7 +369,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
); );
} }
#if defined (__i386__) #if !defined(YUV_DISABLE_ASM) && defined (__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2 #define HAS_TRANSPOSE_UVWX8_SSE2
extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
...@@ -491,7 +491,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -491,7 +491,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"pop %ebx \n" "pop %ebx \n"
"ret \n" "ret \n"
); );
#elif defined(__x86_64__) #elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time. // 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
......
...@@ -17,7 +17,7 @@ namespace libyuv { ...@@ -17,7 +17,7 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
static const uvec8 vtbl_4x4_transpose = static const uvec8 vtbl_4x4_transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
......
...@@ -18,6 +18,7 @@ namespace libyuv { ...@@ -18,6 +18,7 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// TODO(fbarchard): Remove kMaxStride
#define kMaxStride (2560 * 4) #define kMaxStride (2560 * 4)
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
...@@ -26,8 +27,9 @@ extern "C" { ...@@ -26,8 +27,9 @@ extern "C" {
#endif #endif
// The following are available on all x86 platforms // The following are available on all x86 platforms
-#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
-    !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_ABGRTOARGBROW_SSSE3 #define HAS_ABGRTOARGBROW_SSSE3
#define HAS_BGRATOARGBROW_SSSE3 #define HAS_BGRATOARGBROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3
...@@ -66,7 +68,7 @@ extern "C" { ...@@ -66,7 +68,7 @@ extern "C" {
#endif #endif
// The following are available on Neon platforms // The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_MIRRORROW_NEON #define HAS_MIRRORROW_NEON
#define HAS_MIRRORROWUV_NEON #define HAS_MIRRORROWUV_NEON
#define HAS_SPLITUV_NEON #define HAS_SPLITUV_NEON
...@@ -78,7 +80,7 @@ extern "C" { ...@@ -78,7 +80,7 @@ extern "C" {
// The following are only available on Win32 // The following are only available on Win32
// TODO(fbarchard): Port to GCC // TODO(fbarchard): Port to GCC
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBBLENDROW_SSSE3
#endif #endif
@@ -265,25 +267,18 @@ void YToARGBRow_SSE2(const uint8* y_buf,
                     int width);

// ARGB preattenuated alpha blend.
-void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                int width);
-void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
-                               int width);
-void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-
-// ARGB preattenuated alpha blend with 2 sources and a destination.
-void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                                 uint8* dst_argb, int width);
-void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                                uint8* dst_argb, int width);
-void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width);
-void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
+void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                                uint8* dst_argb, int width);
+void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                               uint8* dst_argb, int width);
+void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width);
+void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                            uint8* dst_argb, int width);
+void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                           uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                    uint8* dst_argb, int width);

// 'Any' functions handle any size and alignment.
void I420ToARGBRow_Any_SSSE3(const uint8* y_buf,
......
@@ -454,73 +454,10 @@ void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
}

#define BLENDER(f, b, a) (((256 - a) * b) >> 8) + f
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
for (int x = 0; x < width - 1; x += 2) {
uint32 a = src_argb[3];
if (a) {
if (a < 255) {
const uint32 fb = src_argb[0];
const uint32 fg = src_argb[1];
const uint32 fr = src_argb[2];
const uint32 bb = dst_argb[0];
const uint32 bg = dst_argb[1];
const uint32 br = dst_argb[2];
dst_argb[0] = BLENDER(fb, bb, a);
dst_argb[1] = BLENDER(fg, bg, a);
dst_argb[2] = BLENDER(fr, br, a);
dst_argb[3] = 255u;
} else {
*reinterpret_cast<uint32*>(dst_argb) =
*reinterpret_cast<const uint32*>(src_argb);
}
}
a = src_argb[4 + 3];
if (a) {
if (a < 255) {
const uint32 fb = src_argb[4 + 0];
const uint32 fg = src_argb[4 + 1];
const uint32 fr = src_argb[4 + 2];
const uint32 bb = dst_argb[4 + 0];
const uint32 bg = dst_argb[4 + 1];
const uint32 br = dst_argb[4 + 2];
dst_argb[4 + 0] = BLENDER(fb, bb, a);
dst_argb[4 + 1] = BLENDER(fg, bg, a);
dst_argb[4 + 2] = BLENDER(fr, br, a);
dst_argb[4 + 3] = 255u;
} else {
*reinterpret_cast<uint32*>(dst_argb + 4) =
*reinterpret_cast<const uint32*>(src_argb + 4);
}
}
src_argb += 8;
dst_argb += 8;
}
if (width & 1) {
const uint32 a = src_argb[3];
if (a) {
if (a < 255) {
const uint32 fb = src_argb[0];
const uint32 fg = src_argb[1];
const uint32 fr = src_argb[2];
const uint32 bb = dst_argb[0];
const uint32 bg = dst_argb[1];
const uint32 br = dst_argb[2];
dst_argb[0] = BLENDER(fb, bb, a);
dst_argb[1] = BLENDER(fg, bg, a);
dst_argb[2] = BLENDER(fr, br, a);
dst_argb[3] = 255u;
} else {
*reinterpret_cast<uint32*>(dst_argb) =
*reinterpret_cast<const uint32*>(src_argb);
}
}
}
}
// Blend src_argb0 over src_argb1 and store to dst_argb.
// dst_argb may be src_argb0 or src_argb1.
-void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                    uint8* dst_argb, int width) {
  for (int x = 0; x < width - 1; x += 2) {
    uint32 a = src_argb0[3];
......
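The BLENDER macro above implements a preattenuated (premultiplied) alpha blend per channel: the foreground channel f is already scaled by its alpha, so only the background channel b is attenuated by (256 - a). A scalar sketch with a worked example (not part of the change):

// dst = f + b * (256 - a) / 256, with f premultiplied by a.
static unsigned int BlendChannel(unsigned int f, unsigned int b,
                                 unsigned int a) {
  return (((256 - a) * b) >> 8) + f;
}
// Example: f = 100 (premultiplied), b = 200, a = 128:
// ((128 * 200) >> 8) + 100 = 100 + 100 = 200.
// The SSE2 row versions above do the same math 4 pixels at a time and use a
// saturating add (paddusb) so the sum cannot wrap past 255.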
...@@ -16,7 +16,7 @@ extern "C" { ...@@ -16,7 +16,7 @@ extern "C" {
#endif #endif
// This module is for GCC Neon // This module is for GCC Neon
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define YUVTORGB \ #define YUVTORGB \
"vld1.u8 {d0}, [%0]! \n" \ "vld1.u8 {d0}, [%0]! \n" \
......
...@@ -18,7 +18,7 @@ extern "C" { ...@@ -18,7 +18,7 @@ extern "C" {
#endif #endif
// This module is for GCC x86 and x64 // This module is for GCC x86 and x64
#if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
// GCC 4.2 on OSX has link error when passing static or const to inline. // GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped. // TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
...@@ -2031,162 +2031,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -2031,162 +2031,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
#ifdef HAS_ARGBBLENDROW_SSE2 #ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time // Blend 8 pixels at a time
// Destination aligned to 16 bytes, multiple of 4 pixels // Destination aligned to 16 bytes, multiple of 4 pixels
-void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
+void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
int width) {
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $0xf,%%xmm7 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x8,%%xmm6 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"psllw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
// 8 pixel loop
"1: \n"
"movdqu (%0),%%xmm3 \n" // first 4 pixels
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movdqa (%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqa (%1),%%xmm1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"movdqu 0x10(%0),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1) \n"
"jle 9f \n"
"movdqa %%xmm3,%%xmm0 \n" // next 4 pixels
"pxor %%xmm4,%%xmm3 \n"
"movdqa 0x10(%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqa 0x10(%1),%%xmm1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"jg 1b \n"
"9: \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
// Blend 1 pixel at a time, unaligned
void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $0xf,%%xmm7 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x8,%%xmm6 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"psllw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
// 1 pixel loop
"1: \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movd (%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movd (%1),%%xmm1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%2 \n"
"movd %%xmm0,(%1) \n"
"lea 0x4(%1),%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
// Do 1 to 3 pixels to get destination aligned.
if ((uintptr_t)(dst_argb) & 15) {
int count = width;
if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
count = (-(intptr_t)(dst_argb) >> 2) & 3;
}
ARGBBlendRow1_SSE2(src_argb, dst_argb, count);
src_argb += count * 4;
dst_argb += count * 4;
width -= count;
}
// Do multiple of 4 pixels
if (width & ~3) {
ARGBBlendRow_Aligned_SSE2(src_argb, dst_argb, width & ~3);
}
// Do remaining 1 to 3 pixels
if (width & 3) {
src_argb += (width & ~3) * 4;
dst_argb += (width & ~3) * 4;
width &= 3;
ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
}
}
#endif // HAS_ARGBBLENDROW_SSE2
#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time
// Destination aligned to 16 bytes, multiple of 4 pixels
void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n" "pcmpeqb %%xmm7,%%xmm7 \n"
...@@ -2259,7 +2104,7 @@ void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2259,7 +2104,7 @@ void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
} }
// Blend 1 pixel at a time, unaligned // Blend 1 pixel at a time, unaligned
-void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n" "pcmpeqb %%xmm7,%%xmm7 \n"
...@@ -2309,15 +2154,15 @@ void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2309,15 +2154,15 @@ void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
); );
} }
-void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
// Do 1 to 3 pixels to get destination aligned. // Do 1 to 3 pixels to get destination aligned.
if ((uintptr_t)(dst_argb) & 15) { if ((uintptr_t)(dst_argb) & 15) {
int count = width; int count = width;
if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
count = (-(intptr_t)(dst_argb) >> 2) & 3; count = (-(intptr_t)(dst_argb) >> 2) & 3;
} }
-    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
+    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count);
src_argb0 += count * 4; src_argb0 += count * 4;
src_argb1 += count * 4; src_argb1 += count * 4;
dst_argb += count * 4; dst_argb += count * 4;
...@@ -2325,7 +2170,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2325,7 +2170,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
} }
// Do multiple of 4 pixels // Do multiple of 4 pixels
if (width & ~3) { if (width & ~3) {
-    ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
+    ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
} }
// Do remaining 1 to 3 pixels // Do remaining 1 to 3 pixels
if (width & 3) { if (width & 3) {
...@@ -2333,19 +2178,11 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -2333,19 +2178,11 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
src_argb1 += (width & ~3) * 4; src_argb1 += (width & ~3) * 4;
dst_argb += (width & ~3) * 4; dst_argb += (width & ~3) * 4;
width &= 3; width &= 3;
-    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
+    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width);
} }
} }
#endif // HAS_ARGBBLENDROW_SSE2 #endif // HAS_ARGBBLENDROW_SSE2
#endif // defined(__x86_64__) || defined(__i386__) #endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus #ifdef __cplusplus
......
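The Any wrapper above peels off 1 to 3 single pixels so the remaining destination is 16-byte aligned before using the aligned 4-pixel kernel. With 4-byte ARGB pixels, the number of leading pixels it peels is (-(intptr_t)dst >> 2) & 3; a small standalone sketch of that arithmetic (assumes dst is at least 4-byte aligned, as the wrapper checks):

#include <stdint.h>

// Number of single pixels to blend before dst_argb reaches 16-byte alignment.
static int LeadingArgbPixelsToAlign16(const uint8_t* dst_argb) {
  return (int)((-(intptr_t)(dst_argb) >> 2) & 3);
}
// dst % 16 == 0 -> 0 pixels, 4 -> 3, 8 -> 2, 12 -> 1; after peeling that many
// 4-byte pixels, the remaining pixels start on a 16-byte boundary.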
...@@ -55,7 +55,7 @@ void SetUseReferenceImpl(bool use) { ...@@ -55,7 +55,7 @@ void SetUseReferenceImpl(bool use) {
* *
*/ */
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SCALEROWDOWN2_NEON #define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
...@@ -566,12 +566,13 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr, ...@@ -566,12 +566,13 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr,
*/ */
// Constants for SSE2 code // Constants for SSE2 code
-#elif defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) && \
-    !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && \
+    (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
#if defined(_MSC_VER)
#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
-#elif defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__) && \
-    defined(__i386__)
+#elif defined(__i386__) && \
+    (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__))
#define TALIGN16(t, var) t var __attribute__((aligned(16))) #define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else #else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) #define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
...@@ -670,12 +671,12 @@ extern "C" TALIGN16(const uint16, scaleab2[8]) = ...@@ -670,12 +671,12 @@ extern "C" TALIGN16(const uint16, scaleab2[8]) =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif #endif
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SCALEROWDOWN2_SSE2 #define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels. // Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -704,7 +705,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, ...@@ -704,7 +705,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
} }
// Blends 32x2 rectangle to 16x1. // Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(naked) __declspec(align(16))
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -749,7 +750,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, ...@@ -749,7 +750,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN4_SSE2 #define HAS_SCALEROWDOWN4_SSE2
// Point samples 32 pixels to 8 pixels. // Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -780,7 +781,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, ...@@ -780,7 +781,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
// Blends 32x4 rectangle to 8x1. // Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -842,7 +843,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, ...@@ -842,7 +843,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN8_SSE2 #define HAS_SCALEROWDOWN8_SSE2
// Point samples 32 pixels to 4 pixels. // Point samples 32 pixels to 4 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -874,7 +875,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, ...@@ -874,7 +875,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
// Blends 32x8 rectangle to 4x1. // Blends 32x8 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -952,7 +953,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, ...@@ -952,7 +953,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
// Note that movdqa+palign may be better than movdqu. // Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -1001,7 +1002,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1001,7 +1002,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
// Note that movdqa+palign may be better than movdqu. // Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -1059,7 +1060,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1059,7 +1060,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
// Note that movdqa+palign may be better than movdqu. // Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -1122,7 +1123,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1122,7 +1123,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
// 3/8 point sampler // 3/8 point sampler
// Scale 32 pixels to 12 // Scale 32 pixels to 12
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -1154,7 +1155,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1154,7 +1155,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
} }
// Scale 16x3 pixels to 6x1 with interpolation // Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -1221,7 +1222,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1221,7 +1222,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
} }
// Scale 16x2 pixels to 6x1 with interpolation // Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
...@@ -1269,7 +1270,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, ...@@ -1269,7 +1270,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
#define HAS_SCALEADDROWS_SSE2 #define HAS_SCALEADDROWS_SSE2
// Reads 16xN bytes and produces 16 shorts at a time. // Reads 16xN bytes and produces 16 shorts at a time.
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
uint16* dst_ptr, int src_width, uint16* dst_ptr, int src_width,
int src_height) { int src_height) {
...@@ -1329,7 +1330,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, ...@@ -1329,7 +1330,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
#define HAS_SCALEFILTERROWS_SSE2 #define HAS_SCALEFILTERROWS_SSE2
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int src_stride, int dst_width, int src_stride, int dst_width,
int source_y_fraction) { int source_y_fraction) {
...@@ -1420,7 +1421,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1420,7 +1421,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
#define HAS_SCALEFILTERROWS_SSSE3 #define HAS_SCALEFILTERROWS_SSSE3
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int src_stride, int dst_width, int src_stride, int dst_width,
int source_y_fraction) { int source_y_fraction) {
...@@ -1501,7 +1502,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1501,7 +1502,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
// Note that movdqa+palign may be better than movdqu. // Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(naked) __declspec(align(16))
static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width) { int dst_width) {
__asm { __asm {
...@@ -1547,7 +1548,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -1547,7 +1548,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
} }
} }
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) #elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
// GCC versions of row functions are verbatim conversions from Visual C. // GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file: // Generated using gcc disassembly on Visual C object file:
...@@ -1766,7 +1767,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, ...@@ -1766,7 +1767,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
); );
} }
#if defined(__i386__) #if !defined(YUV_DISABLE_ASM) && defined(__i386__)
extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
asm( asm(
...@@ -2260,7 +2261,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, ...@@ -2260,7 +2261,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"ret \n" "ret \n"
); );
#elif defined(__x86_64__) #elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
......