Commit 8af0ebf8 authored by Frank Barchard

planar blend use signed images

R=dhrosa@google.com, harryjin@google.com, jzern@chromium.org
BUG=libyuv:527

Review URL: https://codereview.chromium.org/1491533002 .
parent b6f37bd8
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1546
Version: 1547
License: BSD
License File: LICENSE
......
......@@ -252,6 +252,12 @@ extern "C" {
#define HAS_RGB565TOARGBROW_AVX2
#endif
// The following are available for 32 bit Visual C and clangcl 32 bit:
// TODO(fbarchard): Port to gcc.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#define HAS_BLENDPLANEROW_SSSE3
#endif
// The following are also available on x64 Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
(!defined(__clang__) || defined(__SSSE3__))
......@@ -1454,6 +1460,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
// Unattenuated planar alpha blend.
void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width);
void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width);
// ARGB multiply images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1546
#define LIBYUV_VERSION 1547
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -2016,6 +2016,18 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
}
}
#undef BLEND
// Reference planar alpha blend, one pixel at a time:
//   dst = (a * src0 + (255 - a) * src1 + 255) >> 8
// The +255 bias makes the >>8 reproduce src0 exactly when a == 255.
void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
                     const uint8* alpha, uint8* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint32 fg = src0[i];
    const uint32 bg = src1[i];
    const uint32 a = alpha[i];
    dst[i] = (a * fg + (255u - a) * bg + 255u) >> 8;
  }
}
#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
// Multiply source RGB by alpha and store to destination.
......
......@@ -4063,6 +4063,58 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
}
#endif // HAS_YUY2TOYROW_SSE2
#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time of two planes using a per-pixel alpha plane:
//   dst = (src0 * a + src1 * (255 - a) + 255) >> 8
// Images are biased to signed so pmaddubsw (u8 x s8) can form both products
// in one instruction; the bias is removed by the 0x807f rounding constant.
// =((G2*C2)+(H2*(D2))+32768+127)/256
__declspec(naked)
void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
  __asm {
    push       esi
    push       edi
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
    psllw      xmm5, 8
    mov        eax, 0x80808080  // 128 for biasing image to signed.
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0x00
    mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
    movd       xmm7, eax
    pshufd     xmm7, xmm7, 0x00
    mov        eax, [esp + 8 + 4]   // src0
    mov        edx, [esp + 8 + 8]   // src1
    mov        esi, [esp + 8 + 12]  // alpha
    mov        edi, [esp + 8 + 16]  // dst
    mov        ecx, [esp + 8 + 20]  // width
    // Address src0/src1/dst relative to the alpha pointer so a single
    // incrementing register (esi) walks all four rows.
    sub        eax, esi
    sub        edx, esi
    sub        edi, esi

    // 8 pixel loop.
  convertloop8:
    movq       xmm0, qword ptr [esi]        // alpha
    punpcklbw  xmm0, xmm0                   // duplicate: a,a per pixel
    pxor       xmm0, xmm5                   // a, 255-a
    movq       xmm1, qword ptr [eax + esi]  // src0
    movq       xmm2, qword ptr [edx + esi]  // src1
    punpcklbw  xmm1, xmm2                   // interleave src0,src1
    psubb      xmm1, xmm6                   // bias src0/1 - 128
    pmaddubsw  xmm0, xmm1                   // a*(f-128) + (255-a)*(b-128)
    paddw      xmm0, xmm7                   // unbias result - 32768 and round.
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edi + esi], xmm0
    lea        esi, [esi + 8]
    sub        ecx, 8
    // Fix: was jge, which ran one extra 8-pixel group after the count
    // reached 0 (e.g. width==8 processed 16 bytes), overrunning the
    // source reads and the dst write.
    jg         convertloop8

    pop        edi
    pop        esi
    ret
  }
}
#endif // HAS_BLENDPLANEROW_SSSE3
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {
......
......@@ -1163,6 +1163,87 @@ TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
EXPECT_LE(max_diff, 1);
}
#ifdef HAS_BLENDPLANEROW_SSSE3
// TODO(fbarchard): Switch to I420Blend.
// Verifies BlendPlaneRow_SSSE3 (when available) against BlendPlaneRow_C on
// random planes, and that alpha == 255 reproduces src0 exactly.
// 'invert' is currently unused; 'off' misaligns the source pointers.
// TODO(fbarchard): Switch to I420Blend.
static void TestBlendPlane(int width, int height, int benchmark_iterations,
                           int invert, int off) {
  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
  // Row functions are 1 dimensional: flatten the plane into a single row.
  width = width * height;
  height = 1;
  if (width < 256) {
    width = 256;  // Also covers width < 1.
  }
  const int kBpp = 1;
  const int kStride = width * kBpp;
  align_buffer_64(src_argb_a, kStride * height + off);
  align_buffer_64(src_argb_b, kStride * height + off);
  align_buffer_64(src_argb_alpha, kStride * height + off);
  align_buffer_64(dst_argb_c, kStride * height);
  align_buffer_64(dst_argb_opt, kStride * height);

  if (has_ssse3) {
    // With alpha == 255 the blend must copy src0 exactly.  Fill the whole
    // range (the original filled only the first 255 bytes, then compared the
    // full range against uninitialized data) and apply 'off' consistently
    // with the pointers passed to the row function.
    for (int i = 0; i < kStride * height; ++i) {
      src_argb_a[i + off] = i & 0xff;
      src_argb_b[i + off] = 255 - (i & 0xff);
      src_argb_alpha[i + off] = 255;
    }
    memset(dst_argb_opt, 0xfb, kStride * height);
    BlendPlaneRow_SSSE3(src_argb_a + off,
                        src_argb_b + off,
                        src_argb_alpha + off,
                        dst_argb_opt,
                        width * height);
    for (int i = 0; i < kStride * height; ++i) {
      EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i]);
    }
  }
  for (int i = 0; i < kStride * height; ++i) {
    src_argb_a[i + off] = (fastrand() & 0xff);
    src_argb_b[i + off] = (fastrand() & 0xff);
    src_argb_alpha[i + off] = (fastrand() & 0xff);
  }
  memset(dst_argb_c, 255, kStride * height);
  memset(dst_argb_opt, 255, kStride * height);
  // Reference result.
  BlendPlaneRow_C(src_argb_a + off,
                  src_argb_b + off,
                  src_argb_alpha + off,
                  dst_argb_c,
                  width * height);
  // Benchmark the optimized path (C fallback when SSSE3 is absent).
  for (int i = 0; i < benchmark_iterations; ++i) {
    if (has_ssse3) {
      BlendPlaneRow_SSSE3(src_argb_a + off,
                          src_argb_b + off,
                          src_argb_alpha + off,
                          dst_argb_opt,
                          width * height);
    } else {
      BlendPlaneRow_C(src_argb_a + off,
                      src_argb_b + off,
                      src_argb_alpha + off,
                      dst_argb_opt,
                      width * height);
    }
  }
  for (int i = 0; i < kStride * height; ++i) {
    EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);
  }
  free_aligned_buffer_64(src_argb_a);
  free_aligned_buffer_64(src_argb_b);
  free_aligned_buffer_64(src_argb_alpha);  // Was leaked in the original.
  free_aligned_buffer_64(dst_argb_c);
  free_aligned_buffer_64(dst_argb_opt);
}
// Runs TestBlendPlane at the benchmark dimensions with no source offset
// (invert=+1, off=0): C reference vs. optimized row must match exactly.
TEST_F(LibYUVPlanarTest, BlendPlane_Opt) {
TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
+1, 0);
}
#endif
TEST_F(LibYUVPlanarTest, TestAffine) {
SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
SIMD_ALIGNED(uint8 interpolate_pixels_C[1280][4]);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment