Commit 013e8122 authored by fbarchard@google.com

Port box filter to AVX2.

BUG=libyuv:425
TESTED=c:\intelsde\sde -ast -hsw -- out\release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*libyuvTest.ScaleTo640x360_Box
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/43149004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1367 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent b5ea79d8
...@@ -52,6 +52,7 @@ extern "C" { ...@@ -52,6 +52,7 @@ extern "C" {
// The following are available on VS2012. // The following are available on VS2012.
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
#define HAS_SCALEADDROWS_AVX2
#define HAS_SCALEROWDOWN2_AVX2 #define HAS_SCALEROWDOWN2_AVX2
#endif #endif
...@@ -262,26 +263,22 @@ void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -262,26 +263,22 @@ void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, uint16* dst_ptr, int src_width, int src_height);
int src_height); void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height);
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx); int dst_width, int x, int dx);
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx); int dst_width, int x, int dx);
void ScaleARGBRowDown2_SSE2(const uint8* src_argb, void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width); uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width); uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width); uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, int src_stepx, uint8* dst_argb, int dst_width);
uint8* dst_argb, int dst_width); void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx, int src_stepx,
uint8* dst_argb, int dst_width); uint8* dst_argb, int dst_width);
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
......
...@@ -702,11 +702,22 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, ...@@ -702,11 +702,22 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
} }
} }
static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
const uint16* src_ptr, uint8* dst_ptr) {
int scaleval = 65536 / boxheight;
int i;
src_ptr += (x >> 16);
for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = src_ptr[i] * scaleval >> 16;
}
}
static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) { const uint16* src_ptr, uint8* dst_ptr) {
int boxwidth = (dx >> 16); int boxwidth = (dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight); int scaleval = 65536 / (boxwidth * boxheight);
int i; int i;
x >>= 16;
for (i = 0; i < dst_width; ++i) { for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
x += boxwidth; x += boxwidth;
...@@ -768,15 +779,20 @@ static void ScalePlaneBox(int src_width, int src_height, ...@@ -768,15 +779,20 @@ static void ScalePlaneBox(int src_width, int src_height,
align_buffer_64(row16, src_width * 2); align_buffer_64(row16, src_width * 2);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) = const uint16* src_ptr, uint8* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C; (dx & 0xffff) ? ScaleAddCols2_C:
((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride, void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C; uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
#if defined(HAS_SCALEADDROWS_SSE2) #if defined(HAS_SCALEADDROWS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
ScaleAddRows = ScaleAddRows_SSE2; ScaleAddRows = ScaleAddRows_SSE2;
} }
#endif #endif
#if defined(HAS_SCALEADDROWS_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(src_width, 32)) {
ScaleAddRows = ScaleAddRows_AVX2;
}
#endif
#if defined(HAS_SCALEADDROWS_NEON) #if defined(HAS_SCALEADDROWS_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 16)) { if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 16)) {
ScaleAddRows = ScaleAddRows_NEON; ScaleAddRows = ScaleAddRows_NEON;
...@@ -1419,8 +1435,7 @@ void ScalePlane(const uint8* src, int src_stride, ...@@ -1419,8 +1435,7 @@ void ScalePlane(const uint8* src, int src_stride,
enum FilterMode filtering) { enum FilterMode filtering) {
// Simplify filtering when possible. // Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, filtering = ScaleFilterReduce(src_width, src_height,
dst_width, dst_height, dst_width, dst_height, filtering);
filtering);
// Negative height means invert the image. // Negative height means invert the image.
if (src_height < 0) { if (src_height < 0) {
...@@ -1436,7 +1451,7 @@ void ScalePlane(const uint8* src, int src_stride, ...@@ -1436,7 +1451,7 @@ void ScalePlane(const uint8* src, int src_stride,
CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
return; return;
} }
if (dst_width == src_width) { if (dst_width == src_width && filtering != kFilterBox) {
int dy = FixedDiv(src_height, dst_height); int dy = FixedDiv(src_height, dst_height);
// Arbitrary scale vertically, but unscaled horizontally. // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, ScalePlaneVertical(src_height,
...@@ -1503,8 +1518,7 @@ void ScalePlane_16(const uint16* src, int src_stride, ...@@ -1503,8 +1518,7 @@ void ScalePlane_16(const uint16* src, int src_stride,
enum FilterMode filtering) { enum FilterMode filtering) {
// Simplify filtering when possible. // Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, filtering = ScaleFilterReduce(src_width, src_height,
dst_width, dst_height, dst_width, dst_height, filtering);
filtering);
// Negative height means invert the image. // Negative height means invert the image.
if (src_height < 0) { if (src_height < 0) {
......
...@@ -1030,10 +1030,6 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height, ...@@ -1030,10 +1030,6 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height,
if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
filtering = kFilterBilinear; filtering = kFilterBilinear;
} }
// If scaling to larger, switch from Box to Bilinear.
if (dst_width >= src_width || dst_height >= src_height) {
filtering = kFilterBilinear;
}
} }
if (filtering == kFilterBilinear) { if (filtering == kFilterBilinear) {
if (src_height == 1) { if (src_height == 1) {
......
...@@ -579,11 +579,11 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -579,11 +579,11 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
int tmp_height = 0; int tmp_height = 0;
intptr_t tmp_src = 0; intptr_t tmp_src = 0;
asm volatile ( asm volatile (
"pxor %%xmm4,%%xmm4 \n"
"mov %0,%3 \n" // row pointer "mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height "mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators "pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n" "pxor %%xmm1,%%xmm1 \n"
"pxor %%xmm4,%%xmm4 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
......
...@@ -721,16 +721,14 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -721,16 +721,14 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edi, [esp + 16 + 12] // dst_ptr mov edi, [esp + 16 + 12] // dst_ptr
mov ecx, [esp + 16 + 16] // dst_width mov ecx, [esp + 16 + 16] // dst_width
mov ebx, [esp + 16 + 20] // height mov ebx, [esp + 16 + 20] // height
pxor xmm4, xmm4
mov eax, esi // row pointer mov eax, esi // row pointer
mov ebp, ebx // height mov ebp, ebx // height
pxor xmm0, xmm0 // clear accumulators pxor xmm0, xmm0 // clear accumulators
pxor xmm1, xmm1 pxor xmm1, xmm1
pxor xmm4, xmm4
// sum rows
xloop: xloop:
// sum rows
yloop:
movdqu xmm2, [eax] // read 16 pixels movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row lea eax, [eax + edx] // advance to next row
movdqa xmm3, xmm2 movdqa xmm3, xmm2
...@@ -739,7 +737,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -739,7 +737,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
paddusw xmm0, xmm2 // sum 16 words paddusw xmm0, xmm2 // sum 16 words
paddusw xmm1, xmm3 paddusw xmm1, xmm3
sub ebp, 1 sub ebp, 1
jg yloop jg xloop
movdqu [edi], xmm0 movdqu [edi], xmm0
movdqu [edi + 16], xmm1 movdqu [edi + 16], xmm1
...@@ -760,15 +758,59 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -760,15 +758,59 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
} }
// Bilinear column filtering. SSSE3 version. // Reads 32xN bytes and produces 32 shorts at a time.
// NOTE(review): the next seven lines appear to come from a side-by-side diff
// view. Each one carries a removed TODO comment line followed by a line of the
// new ScaleAddRows_AVX2 opening (declaration, __asm {, push esi/edi/ebx).
// TODO(fbarchard): Switch the following: __declspec(naked)
// xor ebx, ebx void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
// mov bx, word ptr [esi + eax] // 2 source x0 pixels uint16* dst_ptr, int src_width, int src_height) {
// To __asm {
// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels push esi
// when drmemory bug fixed. push edi
// https://code.google.com/p/drmemory/issues/detail?id=1396 push ebx
push ebp
// Four registers are saved above (16 bytes), so the stack arguments are read
// at [esp + 16 + 4] onward.
//
// Register roles for the rest of the function:
//   esi = src_ptr for the current 32-pixel column group
//   edx = src_stride (bytes between rows)
//   edi = dst_ptr (uint16 sums)
//   ecx = pixels left to process
//   ebx = row count (kept so ebp can be reloaded per column group)
//   eax = pointer to the current row, ebp = rows left in this column group
//   ymm0/ymm1 = 2 x 16 word accumulators, ymm4 = zero
mov esi, [esp + 16 + 4] // src_ptr
mov edx, [esp + 16 + 8] // src_stride
mov edi, [esp + 16 + 12] // dst_ptr
mov ecx, [esp + 16 + 16] // src_width (4th argument)
mov ebx, [esp + 16 + 20] // src_height (5th argument)
mov eax, esi // row pointer
mov ebp, ebx // height
vpxor ymm0, ymm0, ymm0 // clear accumulators
vpxor ymm1, ymm1, ymm1
vpxor ymm4, ymm4, ymm4 // zero; supplies the high byte of every unpacked word
// sum rows
// xloop is the branch target of both loops: the inner one walks down the rows
// (jg after "sub ebp, 1"), the outer one moves to the next 32 pixels (jg after
// "sub ecx, 32"). Both tests come after the body, so the body runs at least
// once regardless of the width and height passed in.
xloop:
vmovdqu ymm2, [eax] // read 32 pixels
vpermq ymm2, ymm2, 0xd8 // unmutate for vpunpck: swap the two middle quadwords
                        // so the per-lane unpacks below keep pixel order
lea eax, [eax + edx] // advance to next row
vpunpckhbw ymm3, ymm2, ymm4 // pixels 16..31 zero-extended to words
vpunpcklbw ymm2, ymm2, ymm4 // pixels 0..15 zero-extended to words
vpaddusw ymm0, ymm0, ymm2 // sum 16 words (unsigned saturating add)
vpaddusw ymm1, ymm1, ymm3
sub ebp, 1
jg xloop
vmovdqu [edi], ymm0 // store sums for pixels 0..15
vmovdqu [edi + 32], ymm1 // store sums for pixels 16..31
lea edi, [edi + 64] // dst_ptr += 32 words
lea esi, [esi + 32] // src_ptr += 32 pixels
mov eax, esi // row pointer
mov ebp, ebx // height
vpxor ymm0, ymm0, ymm0 // clear accumulators
vpxor ymm1, ymm1, ymm1
sub ecx, 32 // 32 pixels consumed per outer iteration
jg xloop
// Restore the saved registers in reverse push order.
pop ebp
pop ebx
pop edi
pop esi
vzeroupper // clear upper ymm halves before returning to non-AVX code
ret
}
}
// Bilinear column filtering. SSSE3 version.
__declspec(naked) __declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment