Commit 8fa76349 authored by fbarchard@google.com's avatar fbarchard@google.com

Blend style multiple

BUG=175
TEST=Multiply unittest
Review URL: https://webrtc-codereview.appspot.com/1048004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@542 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 8ec60334
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 541 Version: 542
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -209,6 +209,13 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, ...@@ -209,6 +209,13 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height); int width, int height);
// Multiply ARGB image by ARGB image.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I422 to YUY2. // Convert I422 to YUY2.
LIBYUV_API LIBYUV_API
int I422ToYUY2(const uint8* src_y, int src_stride_y, int I422ToYUY2(const uint8* src_y, int src_stride_y,
...@@ -265,11 +272,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, ...@@ -265,11 +272,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value); int width, int height, uint32 value);
// Multiply ARGB image by ARGB image.
int ARGBMultiply(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Interpolate between two ARGB images using specified amount of interpolation // Interpolate between two ARGB images using specified amount of interpolation
// (0 to 255) and store to destination. // (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0 // 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
......
...@@ -967,6 +967,15 @@ void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1, ...@@ -967,6 +967,15 @@ void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width); uint8* dst_argb, int width);
// Multiply 2 rows of ARGB pixels together. Same API as Blend, but these
// require pointer and width alignment for SSE2.
void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
...@@ -1270,7 +1279,6 @@ void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft, ...@@ -1270,7 +1279,6 @@ void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
const int32* previous_cumsum, int width); const int32* previous_cumsum, int width);
LIBYUV_API LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width); uint8* dst_argb, const float* uv_dudv, int width);
...@@ -1287,10 +1295,6 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, ...@@ -1287,10 +1295,6 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb, void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride_argb, int dst_width, ptrdiff_t src_stride_argb, int dst_width,
int source_y_fraction); int source_y_fraction);
void ARGBMultiplyRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
int width);
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 541 #define LIBYUV_VERSION 542
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -404,6 +404,50 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, ...@@ -404,6 +404,50 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
return 0; return 0;
} }
// Multiply corresponding pixels of two ARGB images and store the product
// image to dst_argb. Returns 0 on success, -1 on bad arguments.
// A negative height writes the output bottom-up (vertically inverted).
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
                 const uint8* src_argb1, int src_stride_argb1,
                 uint8* dst_argb, int dst_stride_argb,
                 int width, int height) {
  // Reject null planes and degenerate dimensions.
  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image: start at the last output row
  // and step backwards through the destination.
  if (height < 0) {
    height = -height;
    dst_argb += (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
  // Select the fastest row multiplier this CPU and alignment allow,
  // defaulting to the portable C implementation.
  void (*MultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
                      int width) = ARGBMultiplyRow_C;
#if defined(HAS_ARGBMULTIPLYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
      IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
      IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
    // Exact multiples of 4 pixels take the aligned kernel directly; other
    // widths go through the Any wrapper, which handles the ragged tail in C.
    MultiplyRow = IS_ALIGNED(width, 4) ? ARGBMultiplyRow_SSE2
                                       : ARGBMultiplyRow_Any_SSE2;
  }
#elif defined(HAS_ARGBMULTIPLYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
    MultiplyRow = ARGBMultiplyRow_NEON;
  }
#endif
  // Multiply the two planes one row at a time.
  for (int y = 0; y < height; ++y) {
    MultiplyRow(src_argb0, src_argb1, dst_argb, width);
    src_argb0 += src_stride_argb0;
    src_argb1 += src_stride_argb1;
    dst_argb += dst_stride_argb;
  }
  return 0;
}
// Convert I422 to BGRA. // Convert I422 to BGRA.
LIBYUV_API LIBYUV_API
int I422ToBGRA(const uint8* src_y, int src_stride_y, int I422ToBGRA(const uint8* src_y, int src_stride_y,
...@@ -1170,47 +1214,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, ...@@ -1170,47 +1214,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
// ARGB multiply 2 images together.
// Legacy in-place variant (removed by this change): the src_argb plane is
// multiplied by the current contents of dst_argb — the old row function takes
// only (src, dst, width) and reads dst as the second operand — and the
// product is written back to dst_argb. Superseded by the three-pointer
// ARGBMultiply(src0, src1, dst) form added elsewhere in this commit.
// Returns 0 on success, -1 on bad arguments.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb, int src_stride_argb,
                 uint8* dst_argb, int dst_stride_argb,
                 int width, int height) {
  // Reject null pointers and degenerate sizes.
  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  // NOTE(review): this variant inverts by walking the *source* backwards;
  // the replacement version inverts the destination instead.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // Default to the portable C row function; upgrade below when CPU features
  // and pointer/stride alignment permit a SIMD path.
  void (*ARGBMultiplyRow)(const uint8* src, uint8* dst, int width) =
      ARGBMultiplyRow_C;
#if defined(HAS_ARGBMULTIPLYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
    // Any-width wrapper covers widths that are not a multiple of 4.
    ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
    if (IS_ALIGNED(width, 4)) {
      ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
    }
  }
#elif defined(HAS_ARGBMULTIPLYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
    ARGBMultiplyRow = ARGBMultiplyRow_NEON;
  }
#endif
  // Multiply plane: process each row, advancing both pointers by stride.
  for (int y = 0; y < height; ++y) {
    ARGBMultiplyRow(src_argb, dst_argb, width);
    src_argb += src_stride_argb;
    dst_argb += dst_stride_argb;
  }
  return 0;
}
// Interpolate 2 ARGB images by specified amount (0 to 255). // Interpolate 2 ARGB images by specified amount (0 to 255).
// TODO(fbarchard): Consider selecting a specialization for interpolation so // TODO(fbarchard): Consider selecting a specialization for interpolation so
// row function doesn't need to check interpolation on each row. // row function doesn't need to check interpolation on each row.
......
...@@ -373,10 +373,12 @@ MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15) ...@@ -373,10 +373,12 @@ MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
#undef MergeUVRow_ANY #undef MergeUVRow_ANY
#define MultiplyRow_ANY(NAMEANY, ARGBMULT_SIMD, ARGBMULT_C, MASK) \ #define MultiplyRow_ANY(NAMEANY, ARGBMULT_SIMD, ARGBMULT_C, MASK) \
void NAMEANY(const uint8* src_argb, uint8* dst_argb, int width) { \ void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \
uint8* dst_argb, int width) { \
int n = width & ~MASK; \ int n = width & ~MASK; \
ARGBMULT_SIMD(src_argb, dst_argb, n); \ ARGBMULT_SIMD(src_argb0, src_argb1, dst_argb, n); \
ARGBMULT_C(src_argb + n * 4, \ ARGBMULT_C(src_argb0 + n * 4, \
src_argb1 + n * 4, \
dst_argb + n * 4, \ dst_argb + n * 4, \
width & MASK); \ width & MASK); \
} }
......
...@@ -704,21 +704,23 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -704,21 +704,23 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
#define REPEAT8(v) (v) | ((v) << 8) #define REPEAT8(v) (v) | ((v) << 8)
#define SHADE(f, v) v * f >> 16 #define SHADE(f, v) v * f >> 16
void ARGBMultiplyRow_C(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
for (int i = 0; i < width; ++i) { for (int i = 0; i < width; ++i) {
const uint32 b = REPEAT8(src_argb[0]); const uint32 b = REPEAT8(src_argb0[0]);
const uint32 g = REPEAT8(src_argb[1]); const uint32 g = REPEAT8(src_argb0[1]);
const uint32 r = REPEAT8(src_argb[2]); const uint32 r = REPEAT8(src_argb0[2]);
const uint32 a = REPEAT8(src_argb[3]); const uint32 a = REPEAT8(src_argb0[3]);
const uint32 b_scale = dst_argb[0]; const uint32 b_scale = src_argb1[0];
const uint32 g_scale = dst_argb[1]; const uint32 g_scale = src_argb1[1];
const uint32 r_scale = dst_argb[2]; const uint32 r_scale = src_argb1[2];
const uint32 a_scale = dst_argb[3]; const uint32 a_scale = src_argb1[3];
dst_argb[0] = SHADE(b, b_scale); dst_argb[0] = SHADE(b, b_scale);
dst_argb[1] = SHADE(g, g_scale); dst_argb[1] = SHADE(g, g_scale);
dst_argb[2] = SHADE(r, r_scale); dst_argb[2] = SHADE(r, r_scale);
dst_argb[3] = SHADE(a, a_scale); dst_argb[3] = SHADE(a, a_scale);
src_argb += 4; src_argb0 += 4;
src_argb1 += 4;
dst_argb += 4; dst_argb += 4;
} }
} }
......
...@@ -3963,10 +3963,12 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -3963,10 +3963,12 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
#ifdef HAS_ARGBMULTIPLYROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiple 2 rows of ARGB pixels together, 4 pixels at a time. // Multiple 2 rows of ARGB pixels together, 4 pixels at a time.
// Aligned to 16 bytes. // Aligned to 16 bytes.
void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"pxor %%xmm5,%%xmm5 \n" "pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n" "sub %0,%1 \n"
"sub %0,%2 \n"
// 4 pixel loop. // 4 pixel loop.
".p2align 4 \n" ".p2align 4 \n"
...@@ -3982,13 +3984,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -3982,13 +3984,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"pmulhuw %%xmm2,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm3,%%xmm1 \n" "pmulhuw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n" "sub $0x4,%3 \n"
"movdqa %%xmm0,(%0,%1,1) \n" "movdqa %%xmm0,(%0,%2,1) \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb0), // %0
"+r"(dst_argb), // %1 "+r"(src_argb1), // %1
"+r"(width) // %2 "+r"(dst_argb), // %2
"+r"(width) // %3
: :
: "memory", "cc" : "memory", "cc"
#if defined(__SSE2__) #if defined(__SSE2__)
......
...@@ -4280,18 +4280,22 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -4280,18 +4280,22 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// Multiple 2 rows of ARGB pixels together, 4 pixels at a time. // Multiple 2 rows of ARGB pixels together, 4 pixels at a time.
// Aligned to 16 bytes. // Aligned to 16 bytes.
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
__asm { __asm {
mov eax, [esp + 4] // src_argb push esi
mov edx, [esp + 8] // dst_argb mov eax, [esp + 4 + 4] // src_argb0
mov ecx, [esp + 12] // width mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0 pxor xmm5, xmm5 // constant 0
sub esi, eax
sub edx, eax sub edx, eax
align 16 align 16
convertloop: convertloop:
movdqa xmm0, [eax] // read 4 pixels movdqa xmm0, [eax] // read 4 pixels from src_argb0
movdqa xmm2, [eax + edx] // read 4 dest pixels movdqa xmm2, [eax + esi] // read 4 pixels from src_argb1
movdqa xmm1, xmm0 movdqa xmm1, xmm0
movdqa xmm3, xmm2 movdqa xmm3, xmm2
punpcklbw xmm0, xmm0 // first 2 punpcklbw xmm0, xmm0 // first 2
...@@ -4306,6 +4310,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -4306,6 +4310,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
lea eax, [eax + 16] lea eax, [eax + 16]
jg convertloop jg convertloop
pop esi
ret ret
} }
} }
......
...@@ -913,17 +913,21 @@ static int TestMultiply(int width, int height, int benchmark_iterations, ...@@ -913,17 +913,21 @@ static int TestMultiply(int width, int height, int benchmark_iterations,
src_argb_a[i + off] = (random() & 0xff); src_argb_a[i + off] = (random() & 0xff);
src_argb_b[i + off] = (random() & 0xff); src_argb_b[i + off] = (random() & 0xff);
} }
memcpy(dst_argb_c, src_argb_b + off, kStride * height); memset(dst_argb_c, 0, kStride * height);
memcpy(dst_argb_opt, src_argb_b + off, kStride * height); memset(dst_argb_opt, 0, kStride * height);
MaskCpuFlags(0); MaskCpuFlags(0);
ARGBMultiply(src_argb_a + off, kStride, ARGBMultiply(src_argb_a + off, kStride,
src_argb_b + off, kStride,
dst_argb_c, kStride, dst_argb_c, kStride,
width, invert * height); width, invert * height);
MaskCpuFlags(-1); MaskCpuFlags(-1);
ARGBMultiply(src_argb_a + off, kStride, for (int i = 0; i < benchmark_iterations; ++i) {
dst_argb_opt, kStride, ARGBMultiply(src_argb_a + off, kStride,
width, invert * height); src_argb_b + off, kStride,
dst_argb_opt, kStride,
width, invert * height);
}
int max_diff = 0; int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) { for (int i = 0; i < kStride * height; ++i) {
int abs_diff = int abs_diff =
...@@ -933,12 +937,6 @@ static int TestMultiply(int width, int height, int benchmark_iterations, ...@@ -933,12 +937,6 @@ static int TestMultiply(int width, int height, int benchmark_iterations,
max_diff = abs_diff; max_diff = abs_diff;
} }
} }
// Benchmark.
for (int i = 0; i < benchmark_iterations - 1; ++i) {
ARGBMultiply(src_argb_a + off, kStride,
dst_argb_opt, kStride,
width, invert * height);
}
free_aligned_buffer_64(src_argb_a) free_aligned_buffer_64(src_argb_a)
free_aligned_buffer_64(src_argb_b) free_aligned_buffer_64(src_argb_b)
free_aligned_buffer_64(dst_argb_c) free_aligned_buffer_64(dst_argb_c)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment