Commit bea690b3 authored by Frank Barchard

AVX2 YUV alpha blender and improved unittests

The AVX2 version processes 16 pixels at a time, for better memory bandwidth utilization and fewer instructions per pixel.

Unit tests improved to exercise unaligned memory and to verify exactness when alpha is 0 or 255.
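With the rounding these kernels use — dst = (a * src0 + (255 - a) * src1 + 255) >> 8 — the blend is bit-exact at both endpoints. A hypothetical standalone check, not part of this change (BlendScalar is an illustrative name):

#include <assert.h>
#include <stdint.h>

// Scalar model of the row kernels' arithmetic.
static uint8_t BlendScalar(uint8_t s0, uint8_t s1, uint8_t a) {
  return (uint8_t)((a * s0 + (255 - a) * s1 + 255) >> 8);
}

int main(void) {
  int s0, s1;
  for (s0 = 0; s0 < 256; ++s0) {
    for (s1 = 0; s1 < 256; ++s1) {
      assert(BlendScalar((uint8_t)s0, (uint8_t)s1, 255) == s0);  // alpha 255 -> src0
      assert(BlendScalar((uint8_t)s0, (uint8_t)s1, 0) == s1);    // alpha 0 -> src1
    }
  }
  return 0;
}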

R=dhrosa@google.com, harryjin@google.com
BUG=libyuv:527

Review URL: https://codereview.chromium.org/1505433002 .
parent fa2618ee
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1547
Version: 1548
License: BSD
License File: LICENSE
......
@@ -302,6 +302,7 @@ LIBYUV_API
ARGBBlendRow GetARGBBlend();
// Alpha Blend ARGB images and store to destination.
// Source is pre-multiplied by alpha using ARGBAttenuate.
// Alpha of destination is set to 255.
LIBYUV_API
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
@@ -309,6 +310,31 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
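ARGBBlend expects attenuated (pre-multiplied) sources, so a caller typically runs ARGBAttenuate first. A minimal sketch; fg_argb, bg_argb, and dst_argb are hypothetical tightly packed width x height ARGB buffers (stride = width * 4):

// Attenuate the foreground in place, then blend it over the background.
ARGBAttenuate(fg_argb, width * 4, fg_argb, width * 4, width, height);
ARGBBlend(fg_argb, width * 4, bg_argb, width * 4,
          dst_argb, width * 4, width, height);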
// Alpha Blend plane and store to destination.
// Source is not pre-multiplied by alpha.
LIBYUV_API
int BlendPlane(const uint8* src_y0, int src_stride_y0,
const uint8* src_y1, int src_stride_y1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Alpha Blend YUV images and store to destination.
// Source is not pre-multiplied by alpha.
// Alpha is full width x height and subsampled to half size to apply to UV.
LIBYUV_API
int I420Blend(const uint8* src_y0, int src_stride_y0,
const uint8* src_u0, int src_stride_u0,
const uint8* src_v0, int src_stride_v0,
const uint8* src_y1, int src_stride_y1,
const uint8* src_u1, int src_stride_u1,
const uint8* src_v1, int src_stride_v1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
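A usage sketch for the new entry point, assuming tightly packed I420 buffers (all buffer names hypothetical). The alpha plane is full resolution; I420Blend subsamples it internally before blending U and V:

int halfwidth = (width + 1) / 2;
I420Blend(fg_y, width, fg_u, halfwidth, fg_v, halfwidth,
          bg_y, width, bg_u, halfwidth, bg_v, halfwidth,
          alpha_plane, width,
          dst_y, width, dst_u, halfwidth, dst_v, halfwidth,
          width, height);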
// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
......
@@ -233,6 +233,7 @@ extern "C" {
#define HAS_ARGBMULTIPLYROW_AVX2
#define HAS_ARGBSUBTRACTROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
#define HAS_BLENDPLANEROW_AVX2
#endif
// The following are available for AVX2 Visual C and clangcl 32 bit:
@@ -253,12 +254,6 @@ extern "C" {
#define HAS_RGB565TOARGBROW_AVX2
#endif
// The following are available for 32 bit Visual C and clangcl 32 bit:
// TODO(fbarchard): Port to gcc.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#define HAS_BLENDPLANEROW_SSSE3
#endif
// The following are also available on x64 Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
(!defined(__clang__) || defined(__SSSE3__))
@@ -1464,6 +1459,12 @@ void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
// Unattenuated planar alpha blend.
void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width);
void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width);
void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width);
void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width);
void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width);
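For reference, a scalar sketch of what these rows compute, derived from the =((G2*C2)+(H2*(D2))+32768+127)/256 comment in the kernels; the shipped C body lives in row_common.cc, and BlendPlaneRowSketch is an illustrative name:

// Blend src0 over src1 with per-pixel alpha; the +255 makes alpha 0/255 exact.
void BlendPlaneRowSketch(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = (uint8)((alpha[x] * src0[x] +
                      (255 - alpha[x]) * src1[x] + 255) >> 8);
  }
}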
......
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1547
#define LIBYUV_VERSION 1548
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -17,6 +17,7 @@
#include "libyuv/mjpeg_decoder.h"
#endif
#include "libyuv/row.h"
#include "libyuv/scale_row.h" // for ScaleRowDown2
#ifdef __cplusplus
namespace libyuv {
@@ -577,6 +578,167 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
return 0;
}
// Alpha Blend plane and store to destination.
LIBYUV_API
int BlendPlane(const uint8* src_y0, int src_stride_y0,
const uint8* src_y1, int src_stride_y1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
int width, int height) {
int y;
void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_stride_y = -dst_stride_y;
}
// Coalesce rows: when all strides equal width, the planes are contiguous
// and the whole image can be blended as a single long row.
if (src_stride_y0 == width &&
src_stride_y1 == width &&
alpha_stride == width &&
dst_stride_y == width) {
width *= height;
height = 1;
src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;
}
#if defined(HAS_BLENDPLANEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
// TODO(fbarchard): Implement any versions for odd width.
// BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
BlendPlaneRow = BlendPlaneRow_SSSE3;
}
}
#endif
#if defined(HAS_BLENDPLANEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
// BlendPlaneRow = BlendPlaneRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
BlendPlaneRow = BlendPlaneRow_AVX2;
}
}
#endif
for (y = 0; y < height; ++y) {
BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
src_y0 += src_stride_y0;
src_y1 += src_stride_y1;
alpha += alpha_stride;
dst_y += dst_stride_y;
}
return 0;
}
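A hypothetical call: blend two 8-bit planes under a same-size alpha plane, all tightly packed so every stride equals width (fg, bg, alpha_plane, and out are illustrative names):

BlendPlane(fg, width, bg, width, alpha_plane, width,
           out, width, width, height);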
#define MAXTWIDTH 2048
// Alpha Blend YUV images and store to destination.
LIBYUV_API
int I420Blend(const uint8* src_y0, int src_stride_y0,
const uint8* src_u0, int src_stride_u0,
const uint8* src_v0, int src_stride_v0,
const uint8* src_y1, int src_stride_y1,
const uint8* src_u1, int src_stride_u1,
const uint8* src_v1, int src_stride_v1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
int y;
void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
!alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_stride_y = -dst_stride_y;
}
// Blend Y plane.
BlendPlane(src_y0, src_stride_y0,
src_y1, src_stride_y1,
alpha, alpha_stride,
dst_y, dst_stride_y,
width, height);
// Half width/height for UV.
width = (width + 1) >> 1;
height = (height + 1) >> 1;
#if defined(HAS_BLENDPLANEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
// TODO(fbarchard): Implement any versions for odd width.
// BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
BlendPlaneRow = BlendPlaneRow_SSSE3;
}
}
#endif
#if defined(HAS_BLENDPLANEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
// BlendPlaneRow = BlendPlaneRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
BlendPlaneRow = BlendPlaneRow_AVX2;
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowDown2 = ScaleRowDown2Box_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ScaleRowDown2 = ScaleRowDown2Box_NEON;
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown2 = ScaleRowDown2Box_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
ScaleRowDown2 = ScaleRowDown2Box_SSE2;
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ScaleRowDown2 = ScaleRowDown2Box_AVX2;
}
}
#endif
// Row buffer for intermediate alpha pixels.
align_buffer_64(halfalpha, width);
for (y = 0; y < height; ++y) {
// Subsample 2 rows of alpha to one half-width row.
ScaleRowDown2(alpha, alpha_stride, halfalpha, width);
alpha += alpha_stride * 2;
BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, width);
BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, width);
src_u0 += src_stride_u0;
src_u1 += src_stride_u1;
dst_u += dst_stride_u;
src_v0 += src_stride_v0;
src_v1 += src_stride_v1;
dst_v += dst_stride_v;
}
free_aligned_buffer_64(halfalpha);
return 0;
}
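The ScaleRowDown2 call in the loop above turns two full-width alpha rows into one half-width row. A scalar sketch of the box filter it selects, mirroring ScaleRowDown2Box_C's +2 rounding and assuming even width (HalveAlphaRow is an illustrative name):

// 2x2 box average over two adjacent alpha rows.
static void HalveAlphaRow(const uint8* a, ptrdiff_t a_stride,
                          uint8* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    const uint8* s = a + x * 2;
    dst[x] = (uint8)((s[0] + s[1] + s[a_stride] + s[a_stride + 1] + 2) >> 2);
  }
}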
// Multiply 2 ARGB images and store to destination.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
......
@@ -3467,7 +3467,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
}
#endif // HAS_ARGBBLENDROW_SSSE3
#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// dst = (src0 * a + src1 * (255 - a) + 255) >> 8, computed on biased values as
// =((G2*C2)+(H2*(D2))+32768+127)/256
@@ -3514,6 +3513,56 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
}
#endif // HAS_BLENDPLANEROW_SSSE3
#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 16 pixels at a time.
// dst = (src0 * a + src1 * (255 - a) + 255) >> 8, computed on biased values as
// =((G2*C2)+(H2*(D2))+32768+127)/256
void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsllw $0x8,%%ymm5,%%ymm5 \n"
"mov $0x80808080,%%eax \n"
"vmovd %%eax,%%xmm6 \n"
"vbroadcastss %%xmm6,%%ymm6 \n"
"mov $0x807f807f,%%eax \n"
"vmovd %%eax,%%xmm7 \n"
"vbroadcastss %%xmm7,%%ymm7 \n"
"sub %2,%0 \n"
"sub %2,%1 \n"
"sub %2,%3 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
"vmovdqu (%2),%%xmm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
"vpxor %%ymm5,%%ymm0,%%ymm0 \n"
"vmovdqu (%0,%2,1),%%xmm1 \n"
"vmovdqu (%1,%2,1),%%xmm2 \n"
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
"vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%xmm0,(%3,%2,1) \n"
"lea 0x10(%2),%2 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(alpha), // %2
"+r"(dst), // %3
"+r"(width) // %4
:: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_BLENDPLANEROW_AVX2
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
......
@@ -525,7 +525,7 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
vmovd xmm5, eax
vbroadcastss ymm5, xmm5
mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
movd xmm6, eax
vmovd xmm6, eax
vbroadcastss ymm6, xmm6
vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
vpsllw ymm3, ymm3, 11
@@ -576,7 +576,7 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
vmovd xmm5, eax
vbroadcastss ymm5, xmm5
mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
movd xmm6, eax
vmovd xmm6, eax
vbroadcastss ymm6, xmm6
vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
vpsllw ymm3, ymm3, 11
@@ -4106,7 +4106,7 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
movq qword ptr [edi + esi], xmm0
lea esi, [esi + 8]
sub ecx, 8
jge convertloop8
jg convertloop8
pop edi
pop esi
@@ -4115,6 +4115,62 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
}
#endif // HAS_BLENDPLANEROW_SSSE3
#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 16 pixels at a time.
// dst = (src0 * a + src1 * (255 - a) + 255) >> 8, computed on biased values as
// =((G2*C2)+(H2*(D2))+32768+127)/256
__declspec(naked)
void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
const uint8* alpha, uint8* dst, int width) {
__asm {
push esi
push edi
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
vpsllw ymm5, ymm5, 8
mov eax, 0x80808080 // 128 for biasing image to signed.
vmovd xmm6, eax
vbroadcastss ymm6, xmm6
mov eax, 0x807f807f // 32768 + 127 for unbias and round.
vmovd xmm7, eax
vbroadcastss ymm7, xmm7
mov eax, [esp + 8 + 4] // src0
mov edx, [esp + 8 + 8] // src1
mov esi, [esp + 8 + 12] // alpha
mov edi, [esp + 8 + 16] // dst
mov ecx, [esp + 8 + 20] // width
sub eax, esi // offset src0 relative to alpha
sub edx, esi // offset src1 relative to alpha
sub edi, esi // offset dst relative to alpha
// 16 pixel loop.
convertloop16:
vmovdqu xmm0, [esi] // alpha
vpermq ymm0, ymm0, 0xd8
vpunpcklbw ymm0, ymm0, ymm0
vpxor ymm0, ymm0, ymm5 // a, 255-a
vmovdqu xmm1, [eax + esi] // src0
vmovdqu xmm2, [edx + esi] // src1
vpermq ymm1, ymm1, 0xd8
vpermq ymm2, ymm2, 0xd8
vpunpcklbw ymm1, ymm1, ymm2
vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
vpmaddubsw ymm0, ymm0, ymm1
vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
vpsrlw ymm0, ymm0, 8
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8
vmovdqu [edi + esi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg convertloop16
pop edi
pop esi
vzeroupper
ret
}
}
#endif // HAS_BLENDPLANEROW_AVX2
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {
......