Commit 72673ac8 authored by fbarchard@google.com

Linear and point sample scale to half size for AVX2.

BUG=314
TESTED=out\release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*.ScaleDownBy2*
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/44959004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1349 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 9ef8999f
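
For orientation, here is a minimal scalar sketch (not part of the commit) of what the two new row kernels compute. The helper names are invented; the arithmetic mirrors libyuv's ScaleRowDown2_C and ScaleRowDown2Linear_C reference paths, which the SIMD rows are meant to match.

#include <stdint.h>

// Point sample ("none" filter): keep one of every two source pixels. libyuv
// keeps the odd column, which is what the vpsrlw-by-8 trick in the AVX2 code
// selects.
static void HalfRowPoint(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[2 * x + 1];
  }
}

// Linear filter: average each horizontal pair with rounding, matching the
// (a + b + 1) >> 1 behavior of the vpavgw instruction.
static void HalfRowLinear(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((src[2 * x] + src[2 * x + 1] + 1) >> 1);
  }
}
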
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1348
+Version: 1349
 License: BSD
 License File: LICENSE
......
@@ -12,6 +12,7 @@
 #define INCLUDE_LIBYUV_SCALE_ROW_H_
 
 #include "libyuv/basic_types.h"
+#include "libyuv/scale.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -214,6 +215,10 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -242,6 +247,10 @@ void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width);
 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
......
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1348
+#define LIBYUV_VERSION 1349
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -77,13 +77,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
     }
   }
 #endif
-  // TODO(fbarchard): Do other filter modes.
 #if defined(HAS_SCALEROWDOWN2_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) &&
-      (filtering == kFilterBox || filtering == kFilterBilinear)) {
-    ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;
+  if (TestCpuFlag(kCpuHasAVX2)) {
+//     ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
+//         (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
+//         ScaleRowDown2Box_Any_AVX2);
     if (IS_ALIGNED(dst_width, 32)) {
-      ScaleRowDown2 = ScaleRowDown2Box_AVX2;
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
+          ScaleRowDown2Box_AVX2);
     }
   }
 #endif
......
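
The hunk above wires the new kernels into ScalePlaneDown2's filter dispatch. Note that in this revision the *_Any_AVX2 assignments are still commented out, so the AVX2 path is only taken when dst_width is a multiple of 32. Below is a hedged, standalone illustration of the intended selection pattern; the wrapper function and typedef names are invented, while the row functions, FilterMode values, and IS_ALIGNED macro are libyuv's (assuming its headers are included).

typedef void (*ScaleRowDown2Fn)(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width);

// Pick the widest usable AVX2 kernel for a given filter mode and row width.
static ScaleRowDown2Fn ChooseRowDown2AVX2(enum FilterMode filtering,
                                          int dst_width) {
  // "Any" wrappers tolerate arbitrary widths by finishing the tail in C.
  ScaleRowDown2Fn fn = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
      (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
      ScaleRowDown2Box_Any_AVX2);
  if (IS_ALIGNED(dst_width, 32)) {
    // Full-width kernels require dst_width to be a multiple of 32 pixels.
    fn = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
        (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
        ScaleRowDown2Box_AVX2);
  }
  return fn;
}
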
@@ -56,7 +56,10 @@ SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
       2, 1, 15)
 #endif
 #ifdef HAS_SCALEROWDOWN2_AVX2
-SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2,ScaleRowDown2Box_C,
+SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
+      ScaleRowDown2Linear_C, 2, 1, 31)
+SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
       2, 1, 31)
 #endif
 #ifdef HAS_SCALEROWDOWN2_NEON
......
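
The SDANY macro above generates the *_Any_AVX2 wrappers declared in scale_row.h: they run the SIMD kernel over the largest prefix of the row whose output width is a multiple of MASK + 1 (32 for these AVX2 kernels) and finish the remaining pixels with the C reference function. Below is a hedged sketch of what one of the new wrappers does; the expansion is paraphrased by hand, not copied from the macro.

// Roughly what SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2,
// ScaleRowDown2_C, 2, 1, 31) produces, written out as a sketch.
void ScaleRowDown2_Any_AVX2_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width) {
  int r = dst_width & 31;   // tail the 32-wide AVX2 kernel cannot handle
  int n = dst_width - r;    // bulk width, a multiple of 32 output pixels
  if (n > 0) {
    ScaleRowDown2_AVX2(src_ptr, src_stride, dst_ptr, n);
  }
  // Each output pixel consumes 2 source pixels (factor 2, 1 byte per pixel).
  ScaleRowDown2_C(src_ptr + n * 2, src_stride, dst_ptr + n, r);
}
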
@@ -199,6 +199,70 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 #ifdef HAS_SCALEROWDOWN2_AVX2
+// Reads 64 pixels, throws half away and writes 32 pixels.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8         // isolate odd pixels.
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x1 rectangle to 32x1.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+
+  wloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm2, ymm0, 8         // average columns (32 to 16 pixels)
+    vpsrlw     ymm3, ymm1, 8
+    vpand      ymm0, ymm0, ymm5
+    vpand      ymm1, ymm1, ymm5
+    vpavgw     ymm0, ymm0, ymm2
+    vpavgw     ymm1, ymm1, ymm3
+    vpackuswb  ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8      // unmutate
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         wloop
+
+    vzeroupper
+    ret
+  }
+}
+
 // Blends 64x2 rectangle to 32x1.
 __declspec(naked) __declspec(align(16))
 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -209,11 +273,8 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
     mov        ecx, [esp + 4 + 16]   // dst_width
-    vpcmpeqb   ymm4, ymm4, ymm4
-    vpsrlw     ymm4, ymm4, 15        // '1' constant, 16b
-    vpackuswb  ymm4, ymm4, ymm4      // '1' constant, 8b
-    vpxor      ymm5, ymm5, ymm5      // constant 0
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
 
   wloop:
     vmovdqu    ymm0, [eax]
@@ -222,12 +283,14 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vpavgb     ymm1, ymm1, [eax + esi + 32]
     lea        eax,  [eax + 64]
-    vpmaddubsw ymm0, ymm0, ymm4      // add horizontally
-    vpmaddubsw ymm1, ymm1, ymm4
-    vpavgw     ymm0, ymm0, ymm5      // (x+1) >> 1
-    vpavgw     ymm1, ymm1, ymm5
+    vpsrlw     ymm2, ymm0, 8         // average columns (32 to 16 pixels)
+    vpsrlw     ymm3, ymm1, 8
+    vpand      ymm0, ymm0, ymm5
+    vpand      ymm1, ymm1, ymm5
+    vpavgw     ymm0, ymm0, ymm2
+    vpavgw     ymm1, ymm1, ymm3
     vpackuswb  ymm0, ymm0, ymm1
-    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq     ymm0, ymm0, 0xd8      // unmutate
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
......
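
Both the new ScaleRowDown2Linear_AVX2 kernel and the reworked ScaleRowDown2Box_AVX2 average adjacent columns with the same word-level trick: mask out the even-column bytes with 0x00ff00ff (vpand), shift the odd-column bytes down into the low byte (vpsrlw by 8), and take a rounded average (vpavgw). The box version first averages the two source rows with vpavgb. A hedged scalar sketch of one 16-bit lane (names invented):

#include <stdint.h>

// One 16-bit lane of the column-averaging trick. On little-endian x86 the low
// byte holds the even-column pixel and the high byte holds the odd-column one.
static uint8_t AverageAdjacentColumns(uint16_t packed_pair) {
  uint16_t even = packed_pair & 0x00ff;     // vpand with the 0x00ff00ff mask
  uint16_t odd = packed_pair >> 8;          // vpsrlw ymmN, ymmN, 8
  return (uint8_t)((even + odd + 1) >> 1);  // vpavgw: rounded average
}

This replaces the previous box implementation's vpmaddubsw-by-ones horizontal add followed by vpavgw against zero; both sequences compute (a + b + 1) >> 1 per column pair, but the new form needs only one constant register and matches the linear kernel's instruction sequence.
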