Commit 8fa76349 authored by fbarchard@google.com

Blend style multiply

BUG=175
TEST=Multiply unittest
Review URL: https://webrtc-codereview.appspot.com/1048004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@542 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8ec60334
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 541
Version: 542
License: BSD
License File: LICENSE
......
......@@ -209,6 +209,13 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Multiply ARGB image by ARGB image.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I422 to YUY2.
LIBYUV_API
int I422ToYUY2(const uint8* src_y, int src_stride_y,
......@@ -265,11 +272,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value);
// Multiply ARGB image by ARGB image.
int ARGBMultiply(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Interpolate between two ARGB images using specified amount of interpolation
// (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
......
......@@ -967,6 +967,15 @@ void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
// ARGB multiply images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
......@@ -1270,7 +1279,6 @@ void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
const int32* previous_cumsum, int width);
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
......@@ -1287,10 +1295,6 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride_argb, int dst_width,
int source_y_fraction);
void ARGBMultiplyRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
int width);
#ifdef __cplusplus
} // extern "C"
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 541
#define LIBYUV_VERSION 542
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -404,6 +404,50 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
return 0;
}
// Multiply 2 ARGB images together and store to destination.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
int width) = ARGBMultiplyRow_C;
#if defined(HAS_ARGBMULTIPLYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
}
}
#elif defined(HAS_ARGBMULTIPLYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_NEON;
}
#endif
// Multiply plane
for (int y = 0; y < height; ++y) {
ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
src_argb0 += src_stride_argb0;
src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
}
return 0;
}
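For orientation, here is a minimal caller sketch of the new two-source entry point. This block is not part of the diff; the dimensions, fill values, and the function name MultiplyExample are illustrative assumptions.
// Hypothetical caller of ARGBMultiply; uint8 is the libyuv typedef pulled in
// via the header.  ARGBMultiply returns 0 on success, -1 on bad arguments.
#include "libyuv/planar_functions.h"
#include <vector>
int MultiplyExample() {
  const int kWidth = 32;
  const int kHeight = 8;
  const int kStride = kWidth * 4;  // 4 bytes per ARGB pixel, no row padding.
  std::vector<uint8> src0(kStride * kHeight, 0x80);  // constant mid-grey
  std::vector<uint8> src1(kStride * kHeight, 0xff);  // constant white
  std::vector<uint8> dst(kStride * kHeight, 0);
  // A negative height would make ARGBMultiply write the output bottom-up,
  // as handled by the stride flip above.
  return libyuv::ARGBMultiply(&src0[0], kStride,
                              &src1[0], kStride,
                              &dst[0], kStride,
                              kWidth, kHeight);
}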
// Convert I422 to BGRA.
LIBYUV_API
int I422ToBGRA(const uint8* src_y, int src_stride_y,
......@@ -1170,47 +1214,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
return 0;
}
// ARGB multiply 2 images together.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
void (*ARGBMultiplyRow)(const uint8* src, uint8* dst, int width) =
ARGBMultiplyRow_C;
#if defined(HAS_ARGBMULTIPLYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
}
}
#elif defined(HAS_ARGBMULTIPLYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_NEON;
}
#endif
// Multiply plane
for (int y = 0; y < height; ++y) {
ARGBMultiplyRow(src_argb, dst_argb, width);
src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
return 0;
}
// Interpolate 2 ARGB images by specified amount (0 to 255).
// TODO(fbarchard): Consider selecting a specialization for interpolation so
// row function doesn't need to check interpolation on each row.
......
......@@ -373,10 +373,12 @@ MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
#undef MergeUVRow_ANY
#define MultiplyRow_ANY(NAMEANY, ARGBMULT_SIMD, ARGBMULT_C, MASK) \
void NAMEANY(const uint8* src_argb, uint8* dst_argb, int width) { \
void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \
uint8* dst_argb, int width) { \
int n = width & ~MASK; \
ARGBMULT_SIMD(src_argb, dst_argb, n); \
ARGBMULT_C(src_argb + n * 4, \
ARGBMULT_SIMD(src_argb0, src_argb1, dst_argb, n); \
ARGBMULT_C(src_argb0 + n * 4, \
src_argb1 + n * 4, \
dst_argb + n * 4, \
width & MASK); \
}
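To make the remainder handling concrete, this is how the updated macro would expand for an instantiation with a MASK of 3 (matching the 4-pixel SSE2 kernel). The instantiation line itself is clipped from this hunk, so the expansion below is an illustration, not diff content.
// Hand expansion of MultiplyRow_ANY(ARGBMultiplyRow_Any_SSE2,
//                                   ARGBMultiplyRow_SSE2,
//                                   ARGBMultiplyRow_C, 3):
void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                              uint8* dst_argb, int width) {
  int n = width & ~3;  // Largest multiple of 4 pixels.
  ARGBMultiplyRow_SSE2(src_argb0, src_argb1, dst_argb, n);
  // Finish the 0-3 leftover pixels with the C row; 4 bytes per ARGB pixel.
  ARGBMultiplyRow_C(src_argb0 + n * 4,
                    src_argb1 + n * 4,
                    dst_argb + n * 4,
                    width & 3);
}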
......
......@@ -704,21 +704,23 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
#define REPEAT8(v) (v) | ((v) << 8)
#define SHADE(f, v) v * f >> 16
void ARGBMultiplyRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
for (int i = 0; i < width; ++i) {
const uint32 b = REPEAT8(src_argb[0]);
const uint32 g = REPEAT8(src_argb[1]);
const uint32 r = REPEAT8(src_argb[2]);
const uint32 a = REPEAT8(src_argb[3]);
const uint32 b_scale = dst_argb[0];
const uint32 g_scale = dst_argb[1];
const uint32 r_scale = dst_argb[2];
const uint32 a_scale = dst_argb[3];
const uint32 b = REPEAT8(src_argb0[0]);
const uint32 g = REPEAT8(src_argb0[1]);
const uint32 r = REPEAT8(src_argb0[2]);
const uint32 a = REPEAT8(src_argb0[3]);
const uint32 b_scale = src_argb1[0];
const uint32 g_scale = src_argb1[1];
const uint32 r_scale = src_argb1[2];
const uint32 a_scale = src_argb1[3];
dst_argb[0] = SHADE(b, b_scale);
dst_argb[1] = SHADE(g, g_scale);
dst_argb[2] = SHADE(r, r_scale);
dst_argb[3] = SHADE(a, a_scale);
src_argb += 4;
src_argb0 += 4;
src_argb1 += 4;
dst_argb += 4;
}
}
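REPEAT8(v) widens an 8-bit channel to v | (v << 8), which equals v * 257, and SHADE keeps the high 16 bits of the product with the other image's channel, so each output is approximately src0 * src1 / 255 and can land one low because 257/65536 slightly undershoots 1/255. A standalone one-channel walk-through, with values chosen for illustration rather than taken from the commit:
// One-channel walk-through of the REPEAT8/SHADE fixed-point multiply.
#include <stdio.h>
int main() {
  unsigned v = 200;                   // channel from src_argb0
  unsigned scale = 51;                // channel from src_argb1 (51/255 == 0.2)
  unsigned wide = v | (v << 8);       // REPEAT8: 200 * 257 = 51400
  unsigned out = wide * scale >> 16;  // SHADE: 2621400 >> 16 = 39
  // Exact 200 * 51 / 255 is 40; the fixed-point form rounds down, so it
  // can come out one low.
  printf("multiply(200, 51) = %u\n", out);  // prints 39
  return 0;
}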
......
......@@ -3963,10 +3963,12 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Aligned to 16 bytes.
void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
"pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n"
"sub %0,%2 \n"
// 4 pixel loop.
".p2align 4 \n"
......@@ -3982,13 +3984,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%0,%1,1) \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0,(%0,%2,1) \n"
"lea 0x10(%0),%0 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
: "memory", "cc"
#if defined(__SSE2__)
......
......@@ -4280,18 +4280,22 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
push esi
mov eax, [esp + 4 + 4] // src_argb0
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
sub esi, eax
sub edx, eax
align 16
convertloop:
movdqa xmm0, [eax] // read 4 pixels
movdqa xmm2, [eax + edx] // read 4 dest pixels
movdqa xmm0, [eax] // read 4 pixels from src_argb0
movdqa xmm2, [eax + esi] // read 4 pixels from src_argb1
movdqa xmm1, xmm0
movdqa xmm3, xmm2
punpcklbw xmm0, xmm0 // first 2
......@@ -4306,6 +4310,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
lea eax, [eax + 16]
jg convertloop
pop esi
ret
}
}
......
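Both the GCC inline-asm and Visual C++ versions implement the same kernel: punpcklbw/punpckhbw of src_argb0 with itself duplicates each byte into a 16-bit lane (the REPEAT8 trick), the zero constant in xmm5 suggests src_argb1 is zero-extended, and pmulhuw keeps the high 16 bits of the product. Below is a hedged intrinsics rendering of that kernel, assuming 16-byte aligned pointers and a width that is a multiple of 4; it is not code from the commit.
// Illustrative SSE2 intrinsics sketch matching the C row semantics.
#include <emmintrin.h>  // SSE2
#include <stdint.h>
void ARGBMultiplyRow_SSE2_Sketch(const uint8_t* src_argb0,
                                 const uint8_t* src_argb1,
                                 uint8_t* dst_argb, int width) {
  const __m128i zero = _mm_setzero_si128();
  for (int i = 0; i < width; i += 4) {  // 4 ARGB pixels == 16 bytes
    __m128i s0 = _mm_load_si128((const __m128i*)(src_argb0 + i * 4));
    __m128i s1 = _mm_load_si128((const __m128i*)(src_argb1 + i * 4));
    // Each src_argb0 byte v becomes the 16-bit lane v * 257.
    __m128i s0_lo = _mm_unpacklo_epi8(s0, s0);
    __m128i s0_hi = _mm_unpackhi_epi8(s0, s0);
    // Zero-extend src_argb1 to 16 bits per channel.
    __m128i s1_lo = _mm_unpacklo_epi8(s1, zero);
    __m128i s1_hi = _mm_unpackhi_epi8(s1, zero);
    // High 16 bits of the product: (v0 * 257 * v1) >> 16, as in the C row.
    __m128i lo = _mm_mulhi_epu16(s0_lo, s1_lo);
    __m128i hi = _mm_mulhi_epu16(s0_hi, s1_hi);
    _mm_store_si128((__m128i*)(dst_argb + i * 4),
                    _mm_packus_epi16(lo, hi));
  }
}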
......@@ -913,17 +913,21 @@ static int TestMultiply(int width, int height, int benchmark_iterations,
src_argb_a[i + off] = (random() & 0xff);
src_argb_b[i + off] = (random() & 0xff);
}
memcpy(dst_argb_c, src_argb_b + off, kStride * height);
memcpy(dst_argb_opt, src_argb_b + off, kStride * height);
memset(dst_argb_c, 0, kStride * height);
memset(dst_argb_opt, 0, kStride * height);
MaskCpuFlags(0);
ARGBMultiply(src_argb_a + off, kStride,
src_argb_b + off, kStride,
dst_argb_c, kStride,
width, invert * height);
MaskCpuFlags(-1);
for (int i = 0; i < benchmark_iterations; ++i) {
ARGBMultiply(src_argb_a + off, kStride,
src_argb_b + off, kStride,
dst_argb_opt, kStride,
width, invert * height);
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
int abs_diff =
......@@ -933,12 +937,6 @@ static int TestMultiply(int width, int height, int benchmark_iterations,
max_diff = abs_diff;
}
}
// Benchmark.
for (int i = 0; i < benchmark_iterations - 1; ++i) {
ARGBMultiply(src_argb_a + off, kStride,
dst_argb_opt, kStride,
width, invert * height);
}
free_aligned_buffer_64(src_argb_a)
free_aligned_buffer_64(src_argb_b)
free_aligned_buffer_64(dst_argb_c)
......
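The reworked harness clears both destinations, runs the C reference once with SIMD masked off, then runs the optimized path for benchmark_iterations before diffing the two outputs. A hedged sketch of how a gtest case might consume the return value follows; the fixture name, member names, the trailing invert/off parameters, and the tolerance are inferred, not shown in this diff.
// Hypothetical gtest wrapper around TestMultiply; assumes the usual libyuv
// unit-test fixture and members rather than anything visible in this hunk.
TEST_F(libyuvTest, ARGBMultiply_Any) {
  int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_,
                              benchmark_iterations_,
                              +1 /* invert */, 0 /* off */);
  // Tolerance of 1 allows for kernels that round differently; the bound
  // actually used by the suite is not visible here.
  EXPECT_LE(max_diff, 1);
}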