Commit f51e8791 authored by fbarchard@google.com's avatar fbarchard@google.com

Blur functions

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/633005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@282 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 2d9fe082
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 281 Version: 282
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -275,6 +275,20 @@ int MJPGToARGB(const uint8* sample, ...@@ -275,6 +275,20 @@ int MJPGToARGB(const uint8* sample,
int w, int h, int w, int h,
int dw, int dh); int dw, int dh);
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
// dst_cumsum receives 4 int32 sums per pixel (one per ARGB channel);
// dst_stride32_cumsum is its row stride counted in int32 elements, while
// src_stride_argb is the source row stride in bytes.
// Returns 0 on success, or -1 if a pointer is NULL or width/height is not
// positive.
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
int width, int height);
// Blur ARGB image.
// Caller should allocate dst_cumsum table of width * height * 16 bytes aligned
// to 16 byte boundary.
// dst_cumsum is scratch storage: it is overwritten with the cumulative sum
// table of src_argb (4 int32 per pixel), and dst_stride32_cumsum is its row
// stride counted in int32 elements.
// radius controls the size of the averaging box; away from the image edges
// the box spans 2 * radius + 1 pixels horizontally.
// Returns 0 on success.
int ARGBBlur(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
int width, int height, int radius);
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 281 #define LIBYUV_VERSION 282
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -1676,6 +1676,86 @@ int MJPGToARGB(const uint8* sample, ...@@ -1676,6 +1676,86 @@ int MJPGToARGB(const uint8* sample,
} }
#endif #endif
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry (inclusive of the entry).
// Used by ARGBBlur.
//   src_argb / src_stride_argb: source ARGB pixels and row stride in bytes.
//   dst_cumsum / dst_stride32_cumsum: output table holding 4 int32 sums per
//     pixel, and its row stride counted in int32 elements.
// Returns 0 on success, or -1 if a pointer is NULL or width/height is not
// positive.
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
                             int32* dst_cumsum, int dst_stride32_cumsum,
                             int width, int height) {
  if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
    return -1;
  }
  void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
      int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
  // Select the SIMD row function under the macro that guards its definition,
  // so the two cannot get out of sync if only one kernel is disabled.
#if defined(HAS_COMPUTECUMULATIVESUMROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
  }
#endif
  // The first row has no row above it. Zero it and let it act as its own
  // "previous" row: each entry is read (as 0) before it is overwritten.
  memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4);  // 4 ints per pixel.
  int32* previous_cumsum = dst_cumsum;
  for (int y = 0; y < height; ++y) {
    ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
    previous_cumsum = dst_cumsum;
    dst_cumsum += dst_stride32_cumsum;
    src_argb += src_stride_argb;
  }
  return 0;
}
// Blur ARGB image.
// Caller should allocate cumsum table of width * height * 16 bytes aligned
// to 16 byte boundary.
//   dst_cumsum is scratch storage that is overwritten with the cumulative sum
//   table of src_argb; dst_stride32_cumsum is its row stride in int32 elements.
//   radius selects the box size: an unclipped box is 2 * radius + 1 pixels
//   wide. It is reduced if necessary so that such a box fits within width.
// Returns 0 on success. Returns -1 for a NULL pointer, a non-positive
// dimension, a single-row image, or a radius below 1 after clamping; each of
// those would otherwise produce a zero-area box (division by zero) or index
// outside the image rows.
// NOTE(review): at the clipped left/top edges the averaged box starts one
// column/row past the first one, because the top/left cumsum entries are
// subtracted - confirm whether that edge behavior is intended.
int ARGBBlur(const uint8* src_argb, int src_stride_argb,
             uint8* dst_argb, int dst_stride_argb,
             int32* dst_cumsum, int dst_stride32_cumsum,
             int width, int height, int radius) {
  if (!src_argb || !dst_argb || !dst_cumsum || width <= 0 || height <= 0) {
    return -1;
  }
  // The left-clipped loop writes pixels 0..radius and the middle run needs
  // width - 2 * radius - 1 >= 0 pixels, so the full box must fit in one row.
  if (radius > (width - 1) / 2) {
    radius = (width - 1) / 2;
  }
  // radius < 1 or a single row makes every box area zero.
  if (radius <= 0 || height < 2) {
    return -1;
  }
  void (*CumulativeSumToAverage)(const int32* topleft, const int32* botleft,
      int width, int area, uint8* dst, int count) = CumulativeSumToAverage_C;
#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    CumulativeSumToAverage = CumulativeSumToAverage_SSE2;
  }
#endif
  if (ARGBComputeCumulativeSum(src_argb, src_stride_argb,
                               dst_cumsum, dst_stride32_cumsum,
                               width, height) != 0) {
    return -1;
  }
  for (int y = 0; y < height; ++y) {
    // Rows bounding the box, clamped to the image.
    int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
    int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
    int32* cumsum_top_row = &dst_cumsum[top_y * dst_stride32_cumsum];
    int32* cumsum_bot_row = &dst_cumsum[bot_y * dst_stride32_cumsum];

    // Left clipped: the box grows by one column (4 ints) per output pixel.
    int area = radius * (bot_y - top_y);
    int boxwidth = radius * 4;
    int x;
    for (x = 0; x < radius + 1; ++x) {
      CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
                             boxwidth, area, &dst_argb[x * 4], 1);
      area += (bot_y - top_y);
      boxwidth += 4;
    }

    // Middle unclipped: constant box size, one call covers the whole run.
    int n = (width - 1) - radius - x + 1;
    CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
                           boxwidth, area, &dst_argb[x * 4], n);

    // Right clipped: the box shrinks by one column per output pixel.
    for (x += n; x <= width - 1; ++x) {
      area -= (bot_y - top_y);
      boxwidth -= 4;
      CumulativeSumToAverage(cumsum_top_row + (x - radius - 1) * 4,
                             cumsum_bot_row + (x - radius - 1) * 4,
                             boxwidth, area, &dst_argb[x * 4], 1);
    }
    dst_argb += dst_stride_argb;
  }
  return 0;
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -76,6 +76,8 @@ extern "C" { ...@@ -76,6 +76,8 @@ extern "C" {
#define HAS_YUY2TOYROW_SSE2 #define HAS_YUY2TOYROW_SSE2
#define HAS_ARGBGRAYROW_SSSE3 #define HAS_ARGBGRAYROW_SSSE3
#define HAS_ARGBSEPIAROW_SSSE3 #define HAS_ARGBSEPIAROW_SSSE3
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
#define HAS_CUMULATIVESUMTOAVERAGE_SSE2
#endif #endif
// The following are disabled when SSSE3 is available: // The following are disabled when SSSE3 is available:
...@@ -105,6 +107,7 @@ typedef __declspec(align(16)) int8 vec8[16]; ...@@ -105,6 +107,7 @@ typedef __declspec(align(16)) int8 vec8[16];
typedef __declspec(align(16)) uint8 uvec8[16]; typedef __declspec(align(16)) uint8 uvec8[16];
typedef __declspec(align(16)) int16 vec16[8]; typedef __declspec(align(16)) int16 vec16[8];
typedef __declspec(align(16)) uint16 uvec16[8]; typedef __declspec(align(16)) uint16 uvec16[8];
typedef __declspec(align(16)) int32 vec32[4];
typedef __declspec(align(16)) uint32 uvec32[4]; typedef __declspec(align(16)) uint32 uvec32[4];
#else // __GNUC__ #else // __GNUC__
#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
...@@ -112,6 +115,7 @@ typedef int8 __attribute__((vector_size(16))) vec8; ...@@ -112,6 +115,7 @@ typedef int8 __attribute__((vector_size(16))) vec8;
typedef uint8 __attribute__((vector_size(16))) uvec8; typedef uint8 __attribute__((vector_size(16))) uvec8;
typedef int16 __attribute__((vector_size(16))) vec16; typedef int16 __attribute__((vector_size(16))) vec16;
typedef uint16 __attribute__((vector_size(16))) uvec16; typedef uint16 __attribute__((vector_size(16))) uvec16;
typedef int32 __attribute__((vector_size(16))) vec32;
typedef uint32 __attribute__((vector_size(16))) uvec32; typedef uint32 __attribute__((vector_size(16))) uvec32;
#endif #endif
...@@ -485,6 +489,17 @@ void ARGBGrayRow_SSSE3(uint8* dst_argb, int width); ...@@ -485,6 +489,17 @@ void ARGBGrayRow_SSSE3(uint8* dst_argb, int width);
void ARGBSepiaRow_C(uint8* dst_argb, int width); void ARGBSepiaRow_C(uint8* dst_argb, int width);
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
// Used for blur.
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
int width, int area, uint8* dst, int count);
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
int32* previous_cumsum, int width);
void CumulativeSumToAverage_C(const int32* topleft, const int32* botleft,
int width, int area, uint8* dst, int count);
void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
int32* previous_cumsum, int width);
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -950,6 +950,35 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -950,6 +950,35 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
} }
} }
void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
int32* previous_cumsum, int width) {
int32 row_sum[4] = {0, 0, 0, 0};
for (int x = 0; x < width; ++x) {
row_sum[0] += row[x * 4 + 0];
row_sum[1] += row[x * 4 + 1];
row_sum[2] += row[x * 4 + 2];
row_sum[3] += row[x * 4 + 3];
cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
}
}
// Converts box sums taken from a cumulative sum table into averaged pixels.
//   tl / bl: top-left and bottom-left corners of the first box in the table.
//   w: offset from the left to the right edge of the box, in int32 elements.
//   area: divisor applied to each box sum.
//   dst: receives count pixels of 4 bytes each.
// Each successive pixel uses the box shifted right by one table entry (4 ints).
void CumulativeSumToAverage_C(const int32* tl, const int32* bl,
                              int w, int area, uint8* dst, int count) {
  const float ooa = 1.0f / area;  // "One over area".
  for (int px = 0; px < count; ++px) {
    const int base = px * 4;
    for (int c = 0; c < 4; ++c) {
      // Box sum = bottom-right + top-left - bottom-left - top-right.
      dst[base + c] = static_cast<uint8>(
          (bl[base + w + c] + tl[base + c] - bl[base + c] - tl[base + w + c]) *
          ooa);
    }
  }
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv
......
...@@ -2932,6 +2932,177 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { ...@@ -2932,6 +2932,177 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
} }
#endif // HAS_ARGBSEPIAROW_SSSE3 #endif // HAS_ARGBSEPIAROW_SSSE3
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
//   row: source ARGB pixels (4 bytes each).
//   cumsum: output row, 4 int32 per pixel.
//   previous_cumsum: the cumulative sum row above; may be the same row as
//     cumsum when that row has been zeroed first.
//   width: number of pixels.
// The 4-pixel loop is used only when cumsum is 16 byte aligned; it loads
// previous_cumsum with aligned moves, so that row is assumed to share the
// alignment. Everything else goes through the unaligned 1-pixel loop.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  int32* previous_cumsum, int width) {
  asm volatile (
    // %2 becomes the byte distance from cumsum to previous_cumsum so a single
    // advancing pointer (%1) addresses both rows.
    "sub       %1,%2                           \n"
    "pxor      %%xmm0,%%xmm0                   \n"  // xmm0 = running row sum.
    "pxor      %%xmm1,%%xmm1                   \n"  // xmm1 = zero for unpack.
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%1                         \n"
    "jne       49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqu    (%0),%%xmm2                     \n"  // 4 argb pixels, 16 bytes.
    "lea       0x10(%0),%0                     \n"
    "movdqa    %%xmm2,%%xmm4                   \n"
    // Widen bytes to dwords: xmm2..xmm5 = pixels 0..3.
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    // Add each pixel to the running sum, then add the row above.
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqa    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqa    0x10(%1,%2,1),%%xmm3            \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqa    0x20(%1,%2,1),%%xmm4            \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqa    0x30(%1,%2,1),%%xmm5            \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm2,(%1)                     \n"
    "movdqa    %%xmm3,0x10(%1)                 \n"
    "movdqa    %%xmm4,0x20(%1)                 \n"
    "movdqa    %%xmm5,0x30(%1)                 \n"
    "lea       0x40(%1),%1                     \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // Restore the remaining pixel count, biased by -1 for the jge loop below.
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movd      (%0),%%xmm2                     \n"  // 1 argb pixel, 4 bytes.
    "lea       0x4(%0),%0                      \n"
    // Unpack against the zero register xmm1. xmm4 is not zero here: it is
    // either uninitialized or holds cumsum data left by the 4 pixel loop.
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm2,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

  "19:                                         \n"
  : "+r"(row),  // %0
    "+r"(cumsum),  // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Convert cumulative sums for an area into averaged pixels (GCC inline asm).
// topleft / botleft point at the top-left and bottom-left corners of the
// first box in the cumulative sum table; width is the offset from the left to
// the right edge of the box measured in int32 elements; area is the divisor;
// dst receives count pixels of 4 bytes each.
// The table is read with aligned loads and memory operands, so topleft and
// botleft are expected to be 16 byte aligned.
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
int width, int area, uint8* dst, int count) {
asm volatile (
// xmm4 = reciprocal of area as float, broadcast to all 4 lanes. rcpss
// yields an approximation of the reciprocal.
"movd %5,%%xmm4 \n"
"cvtdq2ps %%xmm4,%%xmm4 \n"
"rcpss %%xmm4,%%xmm4 \n"
"pshufd $0x0,%%xmm4,%%xmm4 \n"
"sub $0x4,%3 \n"
"jl 49f \n"
// 4 pixel loop
".p2align 2 \n"
"40: \n"
// top left
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n"
"movdqa 0x30(%0),%%xmm3 \n"
// - top right (%4 is the box width in ints, scaled by 4 to bytes)
"psubd (%0,%4,4),%%xmm0 \n"
"psubd 0x10(%0,%4,4),%%xmm1 \n"
"psubd 0x20(%0,%4,4),%%xmm2 \n"
"psubd 0x30(%0,%4,4),%%xmm3 \n"
"lea 0x40(%0),%0 \n"
// - bottom left
"psubd (%1),%%xmm0 \n"
"psubd 0x10(%1),%%xmm1 \n"
"psubd 0x20(%1),%%xmm2 \n"
"psubd 0x30(%1),%%xmm3 \n"
// + bottom right
"paddd (%1,%4,4),%%xmm0 \n"
"paddd 0x10(%1,%4,4),%%xmm1 \n"
"paddd 0x20(%1,%4,4),%%xmm2 \n"
"paddd 0x30(%1,%4,4),%%xmm3 \n"
"lea 0x40(%1),%1 \n"
// Average = Sum * (1 / Area), computed in float.
"cvtdq2ps %%xmm0,%%xmm0 \n"
"cvtdq2ps %%xmm1,%%xmm1 \n"
"mulps %%xmm4,%%xmm0 \n"
"mulps %%xmm4,%%xmm1 \n"
"cvtdq2ps %%xmm2,%%xmm2 \n"
"cvtdq2ps %%xmm3,%%xmm3 \n"
"mulps %%xmm4,%%xmm2 \n"
"mulps %%xmm4,%%xmm3 \n"
// Back to int32, then pack 4 pixels down to 16 bytes with saturation.
"cvtps2dq %%xmm0,%%xmm0 \n"
"cvtps2dq %%xmm1,%%xmm1 \n"
"cvtps2dq %%xmm2,%%xmm2 \n"
"cvtps2dq %%xmm3,%%xmm3 \n"
"packssdw %%xmm1,%%xmm0 \n"
"packssdw %%xmm3,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,(%2) \n"
"lea 0x10(%2),%2 \n"
"sub $0x4,%3 \n"
"jge 40b \n"
"49: \n"
// Restore the remaining pixel count, biased by -1 for the jge loop below.
"add $0x3,%3 \n"
"jl 19f \n"
// 1 pixel loop
".p2align 2 \n"
"10: \n"
// Same box sum and scaling as above, for a single pixel.
"movdqa (%0),%%xmm0 \n"
"psubd (%0,%4,4),%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"psubd (%1),%%xmm0 \n"
"paddd (%1,%4,4),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
"cvtdq2ps %%xmm0,%%xmm0 \n"
"mulps %%xmm4,%%xmm0 \n"
"cvtps2dq %%xmm0,%%xmm0 \n"
"packssdw %%xmm0,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,(%2) \n"
"lea 0x4(%2),%2 \n"
"sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
: "+r"(topleft), // %0
"+r"(botleft), // %1
"+r"(dst), // %2
"+rm"(count) // %3
: "r"(static_cast<intptr_t>(width)), // %4
"rm"(area) // %5
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
);
}
#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
#endif // defined(__x86_64__) || defined(__i386__) #endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -3011,6 +3011,197 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { ...@@ -3011,6 +3011,197 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
} }
} }
#endif // HAS_ARGBSEPIAROW_SSSE3 #endif // HAS_ARGBSEPIAROW_SSSE3
#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Possible future improvements:
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
//
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is offset from left to right of area in CumulativeSum buffer measured
// in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
// aligned.
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
int width, int area, uint8* dst, int count) {
__asm {
mov eax, topleft // eax topleft
mov esi, botleft // esi botleft
mov edx, width
movd xmm4, area
mov edi, dst
mov ecx, count
cvtdq2ps xmm4, xmm4
rcpss xmm4, xmm4 // 1.0f / area
// Broadcast the (approximate) reciprocal to all 4 lanes.
pshufd xmm4, xmm4, 0
sub ecx, 4
jl l4b
// 4 pixel loop
align 4
l4:
// top left
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
// - top right
psubd xmm0, [eax + edx * 4]
psubd xmm1, [eax + edx * 4 + 16]
psubd xmm2, [eax + edx * 4 + 32]
psubd xmm3, [eax + edx * 4 + 48]
lea eax, [eax + 64]
// - bottom left
psubd xmm0, [esi]
psubd xmm1, [esi + 16]
psubd xmm2, [esi + 32]
psubd xmm3, [esi + 48]
// + bottom right
paddd xmm0, [esi + edx * 4]
paddd xmm1, [esi + edx * 4 + 16]
paddd xmm2, [esi + edx * 4 + 32]
paddd xmm3, [esi + edx * 4 + 48]
lea esi, [esi + 64]
cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
cvtdq2ps xmm1, xmm1
mulps xmm0, xmm4
mulps xmm1, xmm4
cvtdq2ps xmm2, xmm2
cvtdq2ps xmm3, xmm3
mulps xmm2, xmm4
mulps xmm3, xmm4
// Back to int32, then pack 4 pixels down to 16 bytes with saturation.
cvtps2dq xmm0, xmm0
cvtps2dq xmm1, xmm1
cvtps2dq xmm2, xmm2
cvtps2dq xmm3, xmm3
packssdw xmm0, xmm1
packssdw xmm2, xmm3
packuswb xmm0, xmm2
movdqu [edi], xmm0
lea edi, [edi + 16]
sub ecx, 4
jge l4
l4b:
// Restore the remaining pixel count, biased by -1 for the jge loop below.
add ecx, 4 - 1
jl l1b
// 1 pixel loop
align 4
l1:
// Same box sum and scaling as above, for a single pixel.
movdqa xmm0, [eax]
psubd xmm0, [eax + edx * 4]
lea eax, [eax + 16]
psubd xmm0, [esi]
paddd xmm0, [esi + edx * 4]
lea esi, [esi + 16]
cvtdq2ps xmm0, xmm0
mulps xmm0, xmm4
cvtps2dq xmm0, xmm0
packssdw xmm0, xmm0
packuswb xmm0, xmm0
movd dword ptr [edi], xmm0
lea edi, [edi + 4]
sub ecx, 1
jge l1
l1b:
}
}
#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
//   row: source ARGB pixels (4 bytes each).
//   cumsum: output row, 4 int32 per pixel.
//   previous_cumsum: the cumulative sum row above; may be the same row as
//     cumsum when that row has been zeroed first.
//   width: number of pixels.
// The 4-pixel loop is used only when cumsum is 16 byte aligned; it loads
// previous_cumsum with aligned moves, so that row is assumed to share the
// alignment. Everything else goes through the unaligned 1-pixel loop.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  int32* previous_cumsum, int width) {
  __asm {
    mov        eax, row
    mov        edx, cumsum
    mov        esi, previous_cumsum
    mov        ecx, width
    sub        esi, edx  // esi = byte distance from cumsum to previous row.
    pxor       xmm0, xmm0  // xmm0 = running row sum.
    pxor       xmm1, xmm1  // xmm1 = zero for unpacking.

    sub        ecx, 4
    jl         l4b
    test       edx, 15
    jne        l4b

    // 4 pixel loop
    align      4
  l4:
    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
    lea        eax, [eax + 16]
    movdqa     xmm4, xmm2

    // Widen bytes to dwords: xmm2..xmm5 = pixels 0..3.
    punpcklbw  xmm2, xmm1
    movdqa     xmm3, xmm2
    punpcklwd  xmm2, xmm1
    punpckhwd  xmm3, xmm1

    punpckhbw  xmm4, xmm1
    movdqa     xmm5, xmm4
    punpcklwd  xmm4, xmm1
    punpckhwd  xmm5, xmm1

    // Add each pixel to the running sum, then add the row above.
    paddd      xmm0, xmm2
    movdqa     xmm2, [edx + esi]  // previous row above.
    paddd      xmm2, xmm0

    paddd      xmm0, xmm3
    movdqa     xmm3, [edx + esi + 16]
    paddd      xmm3, xmm0

    paddd      xmm0, xmm4
    movdqa     xmm4, [edx + esi + 32]
    paddd      xmm4, xmm0

    paddd      xmm0, xmm5
    movdqa     xmm5, [edx + esi + 48]
    paddd      xmm5, xmm0

    movdqa     [edx], xmm2
    movdqa     [edx + 16], xmm3
    movdqa     [edx + 32], xmm4
    movdqa     [edx + 48], xmm5

    lea        edx, [edx + 64]
    sub        ecx, 4
    jge        l4

  l4b:
    // Restore the remaining pixel count, biased by -1 for the jge loop below.
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
    align      4
  l1:
    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
    lea        eax, [eax + 4]
    // Unpack against the zero register xmm1. xmm4 is not zero here: it is
    // either uninitialized or holds cumsum data left by the 4 pixel loop.
    punpcklbw  xmm2, xmm1
    punpcklwd  xmm2, xmm1
    paddd      xmm0, xmm2
    movdqu     xmm2, [edx + esi]
    paddd      xmm2, xmm0
    movdqu     [edx], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 1
    jge        l1

  l1b:
  }
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
#endif // _M_IX86 #endif // _M_IX86
......
...@@ -353,30 +353,30 @@ TEST_F(libyuvTest, TestAttenuate) { ...@@ -353,30 +353,30 @@ TEST_F(libyuvTest, TestAttenuate) {
EXPECT_EQ(255, atten_pixels[255][3]); EXPECT_EQ(255, atten_pixels[255][3]);
} }
TEST_F(libyuvTest, TestAddRow) { TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
SIMD_ALIGNED(uint8 orig_pixels[256]); SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
SIMD_ALIGNED(uint16 added_pixels[256]); SIMD_ALIGNED(int32 added_pixels[16][16][4]);
libyuv::AddRow AddRow = GetAddRow(added_pixels, 256); for (int y = 0; y < 16; ++y) {
libyuv::AddRow SubRow = GetSubRow(added_pixels, 256); for (int x = 0; x < 16; ++x) {
orig_pixels[y][x][0] = 1u;
for (int i = 0; i < 256; ++i) { orig_pixels[y][x][1] = 2u;
orig_pixels[i] = i; orig_pixels[y][x][2] = 3u;
orig_pixels[y][x][3] = 255u;
}
} }
memset(added_pixels, 0, sizeof(uint16) * 256);
AddRow(orig_pixels, added_pixels, 256); ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4,
EXPECT_EQ(7u, added_pixels[7]); &added_pixels[0][0][0], 16 * 4,
EXPECT_EQ(250u, added_pixels[250]); 16, 16);
AddRow(orig_pixels, added_pixels, 256);
EXPECT_EQ(14u, added_pixels[7]);
EXPECT_EQ(500u, added_pixels[250]);
SubRow(orig_pixels, added_pixels, 256);
EXPECT_EQ(7u, added_pixels[7]);
EXPECT_EQ(250u, added_pixels[250]);
for (int i = 0; i < 1000 * (1280 * 720 * 4 / 256); ++i) { for (int y = 0; y < 16; ++y) {
AddRow(orig_pixels, added_pixels, 256); for (int x = 0; x < 16; ++x) {
EXPECT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]);
EXPECT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]);
EXPECT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]);
EXPECT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]);
}
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment