Commit d2f4413d authored by fbarchard@google.com

Remove old alpha blend, expose GetARGBBlend, fix ComputeSumSquareErrorPlane on SSE2

BUG=29
TEST=none
Review URL: https://webrtc-codereview.appspot.com/469005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@234 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c757f308
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 233
Version: 234
License: BSD
License File: LICENSE
......
......@@ -133,24 +133,19 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Alpha Blend ARGB row of pixels.
void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width);
typedef void (*ARGBBlendRow)(const uint8* src_argb0,
const uint8* src_argb1,
uint8* dst_argb, int width);
// Alpha Blend 2 rows of ARGB pixels and store to destination.
void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
// Get function to Alpha Blend ARGB pixels and store to destination.
ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width);
// Alpha Blend ARGB.
int ARGBBlend(const uint8* src_argb, int src_stride_argb,
// Alpha Blend ARGB images and store to destination.
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Alpha Blend 2 ARGB images and store to destination.
int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
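Aside (illustration only, not part of this diff): a minimal sketch of how a caller might use the new two-source API declared above, assuming these declarations live in libyuv's planar_functions.h and that uint8/uint32 are the usual libyuv typedefs. This mirrors what the new ARGBBlend() does internally.
// Hypothetical caller: blend src_argb0 over src_argb1 into dst_argb,
// picking the best row blender once up front via GetARGBBlend.
void BlendExample(const uint8* src_argb0, int src_stride_argb0,
                  const uint8* src_argb1, int src_stride_argb1,
                  uint8* dst_argb, int dst_stride_argb,
                  int width, int height) {
  ARGBBlendRow blend_row = GetARGBBlend(dst_argb, dst_stride_argb, width);
  for (int y = 0; y < height; ++y) {
    blend_row(src_argb0, src_argb1, dst_argb, width);
    src_argb0 += src_stride_argb0;
    src_argb1 += src_stride_argb1;
    dst_argb += dst_stride_argb;
  }
}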
// Convert I422 to YUY2.
int I422ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
......
......@@ -20,7 +20,7 @@ extern "C" {
// Supported rotation
enum RotationMode {
kRotate0 = 0, // No rotation
kRotate90 = 90, // Rotate 90 degrees clockwise
kRotate180 = 180, // Rotate 180 degrees
kRotate270 = 270, // Rotate 270 degrees clockwise
......
......@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION 233
#define LIBYUV_VERSION 234
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -25,18 +25,37 @@ namespace libyuv {
extern "C" {
#endif
// hash seed of 5381 recommended.
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
// Internal C version of HashDjb2 with int sized count for efficiency.
static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
uint32 hash = seed;
if (count > 0) {
do {
hash = hash * 33 + *src++;
} while (--count);
for (int i = 0; i < count; ++i) {
hash += (hash << 5) + src[i];
}
return hash;
}
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
// hash seed of 5381 recommended.
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
const int kBlockSize = 1 << 15; // 32768;
while (count >= static_cast<uint64>(kBlockSize)) {
seed = HashDjb2_C(src, kBlockSize, seed);
src += kBlockSize;
count -= kBlockSize;
}
int remainder = static_cast<int>(count) & ~15;
if (remainder) {
seed = HashDjb2_C(src, remainder, seed);
src += remainder;
count -= remainder;
}
remainder = static_cast<int>(count) & 15;
if (remainder) {
seed = HashDjb2_C(src, remainder, seed);
}
return seed;
}
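Aside (illustration only, not part of this diff): HashDjb2 now splits its 64-bit count into 32 KB blocks handled by the int-count helper above; a typical call simply fingerprints a buffer with the recommended seed.
// Sketch: compare a frame against a reference by djb2 hash (seed 5381).
bool FrameMatches(const uint8* frame, uint64 size_in_bytes,
                  uint32 expected_hash) {
  return HashDjb2(frame, size_in_bytes, 5381u) == expected_hash;
}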
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
......@@ -75,9 +94,9 @@ static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
return sse;
}
#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SUMSQUAREERROR_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
int count) {
__asm {
......@@ -94,7 +113,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
movdqa xmm2, [eax + edx]
lea eax, [eax + 16]
sub ecx, 16
movdqa xmm3, xmm1
movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2
psubusb xmm2, xmm3
por xmm1, xmm2
......@@ -116,7 +135,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
int count) {
......@@ -167,11 +186,9 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b,
int count) {
uint32 sse = 0u;
for (int x = 0; x < count; ++x) {
int diff = src_a[0] - src_b[0];
for (int i = 0; i < count; ++i) {
int diff = src_a[i] - src_b[i];
sse += static_cast<uint32>(diff * diff);
src_a += 1;
src_b += 1;
}
return sse;
}
......@@ -187,6 +204,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
#elif defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}
#endif
......@@ -225,8 +243,9 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
SumSquareError = SumSquareError_NEON;
}
#elif defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
SumSquareError = SumSquareError_SSE2;
}
#endif
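Aside (illustration only, not an API at this revision): the plane SSE that this change fixes for SSE2 is typically turned into a PSNR figure with the usual formula; the sketch below assumes the parameter order shown in the hunk above.
#include <math.h>

// Sketch: PSNR of two planes from ComputeSumSquareErrorPlane.
double PlanePsnr(const uint8* src_a, int stride_a,
                 const uint8* src_b, int stride_b,
                 int width, int height) {
  uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
                                          src_b, stride_b, width, height);
  if (sse == 0) {
    return 128.0;  // planes are identical; clamp to a large finite value
  }
  double mse = static_cast<double>(sse) /
               (static_cast<double>(width) * static_cast<double>(height));
  return 10.0 * log10(255.0 * 255.0 / mse);
}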
......
......@@ -61,9 +61,9 @@ int I420Copy(const uint8* src_y, int src_stride_y,
return 0;
}
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_HALFROW_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
__asm {
......@@ -86,7 +86,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_HALFROW_SSE2
static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
......@@ -179,12 +179,13 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
// Blends 32x2 pixels to 16x1
// source in scale.cc
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
uint8* dst, int dst_width);
#elif defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) && \
!defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
#endif
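Aside (reference only, not part of this diff): the ScaleRowDown2Int kernels declared here compute a rounded 2x2 box average; a plain C equivalent (mirroring ScaleRowDown2Int_C in scale.cc) looks like this.
static void ScaleRowDown2Int_Ref(const uint8* src_ptr, int src_stride,
                                 uint8* dst, int dst_width) {
  const uint8* src_row1 = src_ptr + src_stride;  // second source row
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (src_ptr[2 * x] + src_ptr[2 * x + 1] +
              src_row1[2 * x] + src_row1[2 * x + 1] + 2) >> 2;  // rounded average
  }
}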
......@@ -450,9 +451,9 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
width, height);
}
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SPLITYUY2_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void SplitYUY2_SSE2(const uint8* src_yuy2,
uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
__asm {
......@@ -498,7 +499,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SPLITYUY2_SSE2
static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
uint8* dst_u, uint8* dst_v, int pix) {
......
......@@ -205,9 +205,9 @@ int I400Copy(const uint8* src_y, int src_stride_y,
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_I42XTOYUY2ROW_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void I42xToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
......@@ -246,7 +246,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
}
#define HAS_I42XTOUYVYROW_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void I42xToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
......@@ -283,7 +283,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
ret
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_I42XTOYUY2ROW_SSE2
static void I42xToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
......
......@@ -24,9 +24,9 @@ extern "C" {
// and vst would select which 2 components to write. The low level would need
// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBTOBAYERROW_SSSE3
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) {
__asm {
......@@ -36,6 +36,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
mov ecx, [esp + 16] // pix
pshufd xmm5, xmm5, 0
align 16
wloop:
movdqa xmm0, [eax]
lea eax, [eax + 16]
......@@ -48,7 +49,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_ARGBTOBAYERROW_SSSE3
static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
......
......@@ -137,87 +137,38 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
return 0;
}
// Alpha Blend ARGB
void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width) {
#if defined(HAS_ARGBBLENDROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBBlendRow_SSSE3(src_argb, dst_argb, width);
return;
}
#endif
#if defined(HAS_ARGBBLENDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBBlendRow_SSE2(src_argb, dst_argb, width);
return;
}
#endif
ARGBBlendRow_C(src_argb, dst_argb, width);
}
// Alpha Blend 2 rows of ARGB pixels and store to destination.
void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
#if defined(HAS_ARGBBLENDROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBBlend2Row_SSSE3(src_argb0, src_argb1, dst_argb, width);
return;
}
#endif
#if defined(HAS_ARGBBLENDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBBlend2Row_SSE2(src_argb0, src_argb1, dst_argb, width);
return;
}
#endif
ARGBBlend2Row_C(src_argb0, src_argb1, dst_argb, width);
}
// Alpha Blend ARGB
// TODO(fbarchard): Call 3 pointer low levels to reduce code size.
int ARGBBlend(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
void (*ARGBBlendRow)(const uint8* src_argb, uint8* dst_argb, int width) =
ARGBBlendRow_C;
// Get a blender that is optimized for the CPU, alignment and pixel count.
// As there are 6 blenders to choose from, the caller should try to use
// the same blend function for all pixels if possible.
ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width) {
void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width) = ARGBBlendRow_C;
#if defined(HAS_ARGBBLENDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBBlendRow = ARGBBlendRow_SSE2;
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBBlendRow = ARGBBlendRow_Aligned_SSE2;
ARGBBlendRow = ARGBBlendRow1_SSE2;
if (width >= 4) {
ARGBBlendRow = ARGBBlendRow_Any_SSE2;
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBBlendRow = ARGBBlendRow_Aligned_SSE2;
}
}
}
#endif
#if defined(HAS_ARGBBLENDROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBBlendRow = ARGBBlendRow_SSSE3;
if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
ARGBBlendRow = ARGBBlendRow_Any_SSSE3;
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
}
}
#endif
for (int y = 0; y < height; ++y) {
ARGBBlendRow(src_argb, dst_argb, width);
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
return 0;
return ARGBBlendRow;
}
// Alpha Blend 2 ARGB images and store to destination.
int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
......@@ -230,30 +181,12 @@ int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*ARGBBlend2Row)(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width) = ARGBBlend2Row_C;
#if defined(HAS_ARGBBLENDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBBlend2Row = ARGBBlend2Row_SSE2;
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBBlend2Row = ARGBBlend2Row_Aligned_SSE2;
}
}
#endif
#if defined(HAS_ARGBBLENDROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBBlend2Row = ARGBBlend2Row_SSSE3;
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBBlend2Row = ARGBBlend2Row_Aligned_SSSE3;
}
}
#endif
void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width) =
GetARGBBlend(dst_argb, dst_stride_argb, width);
for (int y = 0; y < height; ++y) {
ARGBBlend2Row(src_argb0, src_argb1, dst_argb, width);
ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
src_argb0 += src_stride_argb0;
src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
......@@ -725,7 +658,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
// SetRow8 writes 'count' bytes using a 32 bit value repeated
// SetRow32 writes 'count' words using a 32 bit value repeated
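Aside (reference only, not part of this diff): a plain C version of the SetRows32 behaviour described above, assuming the same parameter order as the X86/NEON kernels below (width in 32-bit words, stride in bytes).
static void SetRows32_Ref(uint8* dst, uint32 v32, int width,
                          int dst_stride, int height) {
  for (int y = 0; y < height; ++y) {
    uint32* d = reinterpret_cast<uint32*>(dst);
    for (int x = 0; x < width; ++x) {
      d[x] = v32;  // repeat the 32-bit value 'width' times per row
    }
    dst += dst_stride;  // advance by the byte stride
  }
}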
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SETROW_NEON
static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
asm volatile (
......@@ -749,9 +682,9 @@ static void SetRows32_NEON(uint8* dst, uint32 v32, int width,
}
}
#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SETROW_X86
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
__asm {
mov edx, edi
......@@ -765,7 +698,7 @@ static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
}
}
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
__asm {
......@@ -793,7 +726,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SETROW_X86
static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = static_cast<size_t>(width);
......@@ -903,6 +836,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
return 0;
}
// TODO(fbarchard): Add TestCpuFlag(kCpuHasX86) to allow C code to be tested.
// Draw a rectangle into ARGB
int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int dst_x, int dst_y,
......@@ -916,12 +850,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
#if defined(HAS_SETROW_X86)
SetRows32_X86(dst, value, width, dst_stride_argb, height);
#elif defined(HAS_SETROW_NEON)
#else
#if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
SetRows32_NEON(dst, value, width, dst_stride_argb, height);
return 0;
}
#endif
SetRows32_C(dst, value, width, dst_stride_argb, height);
#endif
return 0;
......
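Aside (usage sketch, not part of this diff; the trailing ARGBRect parameters are assumed from the call sites above to be width, height and a 32-bit ARGB value): filling a rectangle of an ARGB frame with a solid colour.
void FillRedBox(uint8* frame_argb, int stride_argb) {
  const uint32 kOpaqueRed = 0xFFFF0000;  // A=FF R=FF G=00 B=00
  ARGBRect(frame_argb, stride_argb,
           16, 8,         // dst_x, dst_y
           64, 32,        // width, height of the rectangle
           kOpaqueRed);   // 32-bit ARGB value repeated per pixel
}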
......@@ -21,8 +21,8 @@ namespace libyuv {
extern "C" {
#endif
#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
......@@ -59,9 +59,9 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
int width);
#endif
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm {
......@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
}
#define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
......@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
ret
}
}
#elif defined(__i386__) || defined(__x86_64__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
......@@ -369,7 +369,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
);
}
#if defined (__i386__)
#if !defined(YUV_DISABLE_ASM) && defined (__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
......@@ -491,7 +491,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"pop %ebx \n"
"ret \n"
);
#elif defined(__x86_64__)
#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
......
......@@ -17,7 +17,7 @@ namespace libyuv {
extern "C" {
#endif
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
static const uvec8 vtbl_4x4_transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
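Aside (reference only, not part of this diff): the TransposeWx8 kernels in this file implement the operation below, shown as plain C (the role TransposeWx8_C plays in rotate.cc): read an 8-row by width-column tile and write it transposed, so dst[x][y] = src[y][x].
static void TransposeWx8_Ref(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride, int width) {
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}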
......
......@@ -18,6 +18,7 @@ namespace libyuv {
extern "C" {
#endif
// TODO(fbarchard): Remove kMaxStride
#define kMaxStride (2560 * 4)
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
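Aside (illustration only): IS_ALIGNED relies on the alignment being a power of two, so the low-bit mask is zero exactly when the pointer (or count) is a multiple of it.
#include <assert.h>
#include <stdint.h>

static void IsAlignedExamples() {
  assert(IS_ALIGNED(reinterpret_cast<void*>(0x1000), 16));   // 16-byte aligned
  assert(!IS_ALIGNED(reinterpret_cast<void*>(0x1004), 16));  // off by 4 bytes
  assert(IS_ALIGNED(64, 16));   // widths are tested the same way
  assert(!IS_ALIGNED(20, 16));
}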
......@@ -26,8 +27,9 @@ extern "C" {
#endif
// The following are available on all x86 platforms
#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_BGRATOARGBROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
......@@ -66,7 +68,7 @@ extern "C" {
#endif
// The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORROWUV_NEON
#define HAS_SPLITUV_NEON
......@@ -78,7 +80,7 @@ extern "C" {
// The following are only available on Win32
// TODO(fbarchard): Port to GCC
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBBLENDROW_SSSE3
#endif
......@@ -265,25 +267,18 @@ void YToARGBRow_SSE2(const uint8* y_buf,
int width);
// ARGB preattenuated alpha blend.
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
int width);
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
int width);
void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
// ARGB preattenuated alpha blend with 2 sources and a destination.
void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
// 'Any' functions handle any size and alignment.
void I420ToARGBRow_Any_SSSE3(const uint8* y_buf,
......
......@@ -454,73 +454,10 @@ void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
}
#define BLENDER(f, b, a) (((256 - a) * b) >> 8) + f
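Aside (worked example, not part of this diff): BLENDER assumes the foreground channel f is already premultiplied by alpha, so the result is f + b*(256 - a)/256, with >> 8 standing in for a divide by 255.
#include <assert.h>

static unsigned BlendChannel(unsigned f, unsigned b, unsigned a) {
  return (((256 - a) * b) >> 8) + f;  // same arithmetic as BLENDER(f, b, a)
}

static void BlenderExamples() {
  assert(BlendChannel(200, 90, 255) == 200);  // opaque: background ignored
  assert(BlendChannel(0, 90, 0) == 90);       // transparent: background kept
  assert(BlendChannel(100, 80, 128) == 140);  // half alpha: 100 + 80*128/256
}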
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
for (int x = 0; x < width - 1; x += 2) {
uint32 a = src_argb[3];
if (a) {
if (a < 255) {
const uint32 fb = src_argb[0];
const uint32 fg = src_argb[1];
const uint32 fr = src_argb[2];
const uint32 bb = dst_argb[0];
const uint32 bg = dst_argb[1];
const uint32 br = dst_argb[2];
dst_argb[0] = BLENDER(fb, bb, a);
dst_argb[1] = BLENDER(fg, bg, a);
dst_argb[2] = BLENDER(fr, br, a);
dst_argb[3] = 255u;
} else {
*reinterpret_cast<uint32*>(dst_argb) =
*reinterpret_cast<const uint32*>(src_argb);
}
}
a = src_argb[4 + 3];
if (a) {
if (a < 255) {
const uint32 fb = src_argb[4 + 0];
const uint32 fg = src_argb[4 + 1];
const uint32 fr = src_argb[4 + 2];
const uint32 bb = dst_argb[4 + 0];
const uint32 bg = dst_argb[4 + 1];
const uint32 br = dst_argb[4 + 2];
dst_argb[4 + 0] = BLENDER(fb, bb, a);
dst_argb[4 + 1] = BLENDER(fg, bg, a);
dst_argb[4 + 2] = BLENDER(fr, br, a);
dst_argb[4 + 3] = 255u;
} else {
*reinterpret_cast<uint32*>(dst_argb + 4) =
*reinterpret_cast<const uint32*>(src_argb + 4);
}
}
src_argb += 8;
dst_argb += 8;
}
if (width & 1) {
const uint32 a = src_argb[3];
if (a) {
if (a < 255) {
const uint32 fb = src_argb[0];
const uint32 fg = src_argb[1];
const uint32 fr = src_argb[2];
const uint32 bb = dst_argb[0];
const uint32 bg = dst_argb[1];
const uint32 br = dst_argb[2];
dst_argb[0] = BLENDER(fb, bb, a);
dst_argb[1] = BLENDER(fg, bg, a);
dst_argb[2] = BLENDER(fr, br, a);
dst_argb[3] = 255u;
} else {
*reinterpret_cast<uint32*>(dst_argb) =
*reinterpret_cast<const uint32*>(src_argb);
}
}
}
}
// Blend src_argb0 over src_argb1 and store to dst_argb.
// dst_argb may be src_argb0 or src_argb1.
void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
for (int x = 0; x < width - 1; x += 2) {
uint32 a = src_argb0[3];
......
......@@ -16,7 +16,7 @@ extern "C" {
#endif
// This module is for GCC Neon
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define YUVTORGB \
"vld1.u8 {d0}, [%0]! \n" \
......
......@@ -18,7 +18,7 @@ extern "C" {
#endif
// This module is for GCC x86 and x64
#if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
// GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
......@@ -2031,162 +2031,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time
// Destination aligned to 16 bytes, multiple of 4 pixels
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $0xf,%%xmm7 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x8,%%xmm6 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"psllw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
// 8 pixel loop
"1: \n"
"movdqu (%0),%%xmm3 \n" // first 4 pixels
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movdqa (%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqa (%1),%%xmm1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"movdqu 0x10(%0),%%xmm3 \n"
"lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1) \n"
"jle 9f \n"
"movdqa %%xmm3,%%xmm0 \n" // next 4 pixels
"pxor %%xmm4,%%xmm3 \n"
"movdqa 0x10(%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqa 0x10(%1),%%xmm1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"jg 1b \n"
"9: \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
// Blend 1 pixel at a time, unaligned
void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $0xf,%%xmm7 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x8,%%xmm6 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"psllw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
// 1 pixel loop
"1: \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movd (%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movd (%1),%%xmm1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%2 \n"
"movd %%xmm0,(%1) \n"
"lea 0x4(%1),%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
// Do 1 to 3 pixels to get destination aligned.
if ((uintptr_t)(dst_argb) & 15) {
int count = width;
if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
count = (-(intptr_t)(dst_argb) >> 2) & 3;
}
ARGBBlendRow1_SSE2(src_argb, dst_argb, count);
src_argb += count * 4;
dst_argb += count * 4;
width -= count;
}
// Do multiple of 4 pixels
if (width & ~3) {
ARGBBlendRow_Aligned_SSE2(src_argb, dst_argb, width & ~3);
}
// Do remaining 1 to 3 pixels
if (width & 3) {
src_argb += (width & ~3) * 4;
dst_argb += (width & ~3) * 4;
width &= 3;
ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
}
}
#endif // HAS_ARGBBLENDROW_SSE2
#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time
// Destination aligned to 16 bytes, multiple of 4 pixels
void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
......@@ -2259,7 +2104,7 @@ void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
}
// Blend 1 pixel at a time, unaligned
void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
......@@ -2309,15 +2154,15 @@ void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
);
}
void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
// Do 1 to 3 pixels to get destination aligned.
if ((uintptr_t)(dst_argb) & 15) {
int count = width;
if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
count = (-(intptr_t)(dst_argb) >> 2) & 3;
}
ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count);
src_argb0 += count * 4;
src_argb1 += count * 4;
dst_argb += count * 4;
......@@ -2325,7 +2170,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
}
// Do multiple of 4 pixels
if (width & ~3) {
ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
}
// Do remaining 1 to 3 pixels
if (width & 3) {
......@@ -2333,19 +2178,11 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
src_argb1 += (width & ~3) * 4;
dst_argb += (width & ~3) * 4;
width &= 3;
ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width);
}
}
#endif // HAS_ARGBBLENDROW_SSE2
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
......
......@@ -55,7 +55,7 @@ void SetUseReferenceImpl(bool use) {
*
*/
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
uint8* dst, int dst_width) {
......@@ -566,12 +566,13 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr,
*/
// Constants for SSE2 code
#elif defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) && \
!defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
#if defined(_MSC_VER)
#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
#elif defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__) && \
defined(__i386__)
#elif defined(__i386__) && \
(defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__))
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
......@@ -670,12 +671,12 @@ extern "C" TALIGN16(const uint16, scaleab2[8]) =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -704,7 +705,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
}
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -749,7 +750,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN4_SSE2
// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -780,7 +781,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -842,7 +843,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN8_SSE2
// Point samples 32 pixels to 4 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -874,7 +875,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
// Blends 32x8 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -952,7 +953,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1001,7 +1002,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1059,7 +1060,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1122,7 +1123,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
// 3/8 point sampler
// Scale 32 pixels to 12
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1154,7 +1155,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
}
// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1221,7 +1222,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
}
// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
......@@ -1269,7 +1270,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
#define HAS_SCALEADDROWS_SSE2
// Reads 16xN bytes and produces 16 shorts at a time.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
uint16* dst_ptr, int src_width,
int src_height) {
......@@ -1329,7 +1330,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
#define HAS_SCALEFILTERROWS_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int src_stride, int dst_width,
int source_y_fraction) {
......@@ -1420,7 +1421,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
#define HAS_SCALEFILTERROWS_SSSE3
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int src_stride, int dst_width,
int source_y_fraction) {
......@@ -1501,7 +1502,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width) {
__asm {
......@@ -1547,7 +1548,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
}
}
#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
......@@ -1766,7 +1767,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
);
}
#if defined(__i386__)
#if !defined(YUV_DISABLE_ASM) && defined(__i386__)
extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
asm(
......@@ -2260,7 +2261,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"ret \n"
);
#elif defined(__x86_64__)
#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
......