Commit 51d3e236 authored by fbarchard@google.com's avatar fbarchard@google.com

AVX2 math functions for images

BUG=none
TEST=ARGBMultiply ARGBAdd and ARGBSubtract unittests.
Review URL: https://webrtc-codereview.appspot.com/1146006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@588 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f4951e7a
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 587 Version: 588
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -141,6 +141,9 @@ extern "C" { ...@@ -141,6 +141,9 @@ extern "C" {
// Effects // Effects
#define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBATTENUATEROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2
#define HAS_ARGBMULTIPLYROW_AVX2
#define HAS_ARGBADDROW_AVX2
#define HAS_ARGBSUBTRACTROW_AVX2
#endif #endif
#endif #endif
...@@ -1011,6 +1014,10 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1, ...@@ -1011,6 +1014,10 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width); uint8* dst_argb, int width);
void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width); uint8* dst_argb, int width);
void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1, void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width); uint8* dst_argb, int width);
void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
...@@ -1023,6 +1030,10 @@ void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1, ...@@ -1023,6 +1030,10 @@ void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width); uint8* dst_argb, int width);
void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width); uint8* dst_argb, int width);
void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1, void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width); uint8* dst_argb, int width);
void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
...@@ -1036,6 +1047,10 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1, ...@@ -1036,6 +1047,10 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width); uint8* dst_argb, int width);
void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width); uint8* dst_argb, int width);
void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1, void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width); uint8* dst_argb, int width);
void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 587 #define LIBYUV_VERSION 588
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -28,6 +28,7 @@ LIBYUV_API ...@@ -28,6 +28,7 @@ LIBYUV_API
void CopyPlane(const uint8* src_y, int src_stride_y, void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y, uint8* dst_y, int dst_stride_y,
int width, int height) { int width, int height) {
// Coalesce contiguous rows.
if (src_stride_y == width && dst_stride_y == width) { if (src_stride_y == width && dst_stride_y == width) {
CopyPlane(src_y, 0, dst_y, 0, width * height, 1); CopyPlane(src_y, 0, dst_y, 0, width * height, 1);
return; return;
...@@ -503,7 +504,7 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, ...@@ -503,7 +504,7 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
return 0; return 0;
} }
// Multiply 2 ARGB images together and store to destination. // Multiply 2 ARGB images and store to destination.
LIBYUV_API LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1, const uint8* src_argb1, int src_stride_argb1,
...@@ -518,6 +519,15 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, ...@@ -518,6 +519,15 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
// Coalesce contiguous rows.
if (src_stride_argb0 == width * 4 &&
src_stride_argb1 == width * 4 &&
dst_stride_argb == width * 4) {
return ARGBMultiply(src_argb0, 0,
src_argb1, 0,
dst_argb, 0,
width * height, 1);
}
void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
int width) = ARGBMultiplyRow_C; int width) = ARGBMultiplyRow_C;
...@@ -531,7 +541,18 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, ...@@ -531,7 +541,18 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
ARGBMultiplyRow = ARGBMultiplyRow_SSE2; ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
} }
} }
#elif defined(HAS_ARGBMULTIPLYROW_NEON) #endif
#if defined(HAS_ARGBMULTIPLYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBMULTIPLYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) { if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON; ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
...@@ -547,10 +568,16 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, ...@@ -547,10 +568,16 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1; src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBMULTIPLYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
// Add 2 ARGB images together and store to destination. // Add 2 ARGB images and store to destination.
LIBYUV_API LIBYUV_API
int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1, const uint8* src_argb1, int src_stride_argb1,
...@@ -565,6 +592,15 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, ...@@ -565,6 +592,15 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
// Coalesce contiguous rows.
if (src_stride_argb0 == width * 4 &&
src_stride_argb1 == width * 4 &&
dst_stride_argb == width * 4) {
return ARGBAdd(src_argb0, 0,
src_argb1, 0,
dst_argb, 0,
width * height, 1);
}
void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst, void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
int width) = ARGBAddRow_C; int width) = ARGBAddRow_C;
...@@ -578,7 +614,18 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, ...@@ -578,7 +614,18 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
ARGBAddRow = ARGBAddRow_SSE2; ARGBAddRow = ARGBAddRow_SSE2;
} }
} }
#elif defined(HAS_ARGBADDROW_NEON) #endif
#if defined(HAS_ARGBADDROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBAddRow = ARGBAddRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBADDROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) { if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBAddRow = ARGBAddRow_Any_NEON; ARGBAddRow = ARGBAddRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
...@@ -594,6 +641,12 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, ...@@ -594,6 +641,12 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1; src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBADDROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
...@@ -612,6 +665,15 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, ...@@ -612,6 +665,15 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
// Coalesce contiguous rows.
if (src_stride_argb0 == width * 4 &&
src_stride_argb1 == width * 4 &&
dst_stride_argb == width * 4) {
return ARGBSubtract(src_argb0, 0,
src_argb1, 0,
dst_argb, 0,
width * height, 1);
}
void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
int width) = ARGBSubtractRow_C; int width) = ARGBSubtractRow_C;
...@@ -625,7 +687,18 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, ...@@ -625,7 +687,18 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
ARGBSubtractRow = ARGBSubtractRow_SSE2; ARGBSubtractRow = ARGBSubtractRow_SSE2;
} }
} }
#elif defined(HAS_ARGBSUBTRACTROW_NEON) #endif
#if defined(HAS_ARGBSUBTRACTROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBSUBTRACTROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) { if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBSubtractRow = ARGBSubtractRow_Any_NEON; ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
...@@ -641,6 +714,12 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, ...@@ -641,6 +714,12 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1; src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBSUBTRACTROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
......
...@@ -420,6 +420,17 @@ MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3) ...@@ -420,6 +420,17 @@ MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3)
MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C, MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C,
3) 3)
#endif #endif
#ifdef HAS_ARGBMULTIPLYROW_AVX2
MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C,
7)
#endif
#ifdef HAS_ARGBADDROW_AVX2
MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7)
#endif
#ifdef HAS_ARGBSUBTRACTROW_AVX2
MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C,
7)
#endif
#ifdef HAS_ARGBMULTIPLYROW_NEON #ifdef HAS_ARGBMULTIPLYROW_NEON
MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C, MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C,
7) 7)
......
...@@ -751,7 +751,7 @@ void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, ...@@ -751,7 +751,7 @@ void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
} }
#undef SHADE #undef SHADE
#define SHADE(f, v) (v >= f) ? 0 : (f - v) #define SHADE(f, v) ((f - v) > f) ? 0 : (f - v)
void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
......
...@@ -4915,6 +4915,102 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4915,6 +4915,102 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
} }
#endif // HAS_ARGBSUBTRACTROW_SSE2 #endif // HAS_ARGBSUBTRACTROW_SSE2
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Per-channel math approximates dst = src0 * src1 / 255:
// unpacking a byte with itself yields the 16-bit value a*257 (a in both
// byte positions = a*256 + a), the other source is zero-extended to b,
// and vpmulhuw returns (a*257*b) >> 16, a close approximation of a*b/255.
// Requires width to be a multiple of 8 pixels (ARGBMultiplyRow_Any_AVX2
// handles remainders).  Leaves the ymm upper state dirty; the caller is
// expected to issue vzeroupper afterwards (see ARGBMultiply).
__declspec(naked) __declspec(align(16))
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width (in pixels; 32 bytes/iteration)
    vpxor      ymm5, ymm5, ymm5     // constant 0, for zero-extending bytes
    sub        esi, eax             // esi = src_argb1 - src_argb0
    sub        edx, eax             // edx = dst_argb - src_argb0; eax now
                                    // indexes all three rows at once
    align      16
 convertloop:
    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
    vmovdqu    ymm3, [eax + esi]  // read 8 pixels from src_argb1
    vpunpcklbw ymm0, ymm1, ymm1   // low 4: each byte a widened to a*257
    vpunpckhbw ymm1, ymm1, ymm1   // high 4
    vpunpcklbw ymm2, ymm3, ymm5   // low 4: zero-extend b to 16 bits
    vpunpckhbw ymm3, ymm3, ymm5   // high 4
    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4:
                                  // (a*257*b) >> 16 ~= a*b/255
    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
    vpackuswb  ymm0, ymm0, ymm1   // repack to bytes; unpack and pack both
                                  // operate per 128-bit lane, so the lane
                                  // interleave cancels and byte order holds
    sub        ecx, 8
    vmovdqu    [eax + edx], ymm0  // store 8 dst pixels
    lea        eax, [eax + 32]    // lea/vmovdqu preserve flags from sub
    jg         convertloop        // loop while pixels remain

    pop        esi
    ret
  }
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Per-channel math is dst = saturate(src0 + src1) via vpaddusb (unsigned
// saturating byte add), so results clamp at 255 instead of wrapping.
// Requires width to be a multiple of 8 pixels (ARGBAddRow_Any_AVX2 handles
// remainders).  Leaves the ymm upper state dirty; the caller is expected to
// issue vzeroupper afterwards (see ARGBAdd).
// Note: the previous version zeroed ymm5 here, but the register was never
// used in this loop (a leftover from the multiply variant); the dead
// instruction has been removed, matching ARGBSubtractRow_AVX2 below.
__declspec(naked) __declspec(align(16))
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width (in pixels; 32 bytes/iteration)
    sub        esi, eax             // esi = src_argb1 - src_argb0
    sub        edx, eax             // edx = dst_argb - src_argb0; eax now
                                    // indexes all three rows at once
    align      16
 convertloop:
    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
    vpaddusb   ymm0, ymm0, [eax + esi]  // add 8 pixels from src_argb1
    sub        ecx, 8
    vmovdqu    [eax + edx], ymm0        // store 8 dst pixels
    lea        eax, [eax + 32]          // lea/vmovdqu preserve sub's flags
    jg         convertloop              // loop while pixels remain

    pop        esi
    ret
  }
}
#endif  // HAS_ARGBADDROW_AVX2
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
// Per-channel math is dst = saturate(src0 - src1) via vpsubusb (unsigned
// saturating byte subtract), so results clamp at 0 instead of wrapping.
// Requires width to be a multiple of 8 pixels (ARGBSubtractRow_Any_AVX2
// handles remainders).  Leaves the ymm upper state dirty; the caller is
// expected to issue vzeroupper afterwards (see ARGBSubtract).
__declspec(naked) __declspec(align(16))
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width (in pixels; 32 bytes/iteration)
    sub        esi, eax             // esi = src_argb1 - src_argb0
    sub        edx, eax             // edx = dst_argb - src_argb0; eax now
                                    // indexes all three rows at once
    align      16
 convertloop:
    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
    vpsubusb   ymm0, ymm0, [eax + esi]  // src_argb0 - src_argb1, clamped
    sub        ecx, 8
    vmovdqu    [eax + edx], ymm0        // store 8 dst pixels
    lea        eax, [eax + 32]          // lea/vmovdqu preserve sub's flags
    jg         convertloop              // loop while pixels remain

    pop        esi
    ret
  }
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Consider float CumulativeSum. // Consider float CumulativeSum.
// Consider calling CumulativeSum one row at time as needed. // Consider calling CumulativeSum one row at time as needed.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment