Commit 0e83b64e authored by Frank Barchard

ScaleAddRow AVX2 bug fix: vpermq was reading ymm2 instead of ymm3 as its source.

R=harryjin@google.com
BUG=libyuv:462

Review URL: https://webrtc-codereview.appspot.com/56639004.
parent 715a2919
...@@ -50,8 +50,7 @@ extern "C" { ...@@ -50,8 +50,7 @@ extern "C" {
// The following are available on VS2012: // The following are available on VS2012:
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
// Some AVX2 versions disabled. See libyuv bug 462. #define HAS_SCALEADDROW_AVX2
// #define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2 #define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2 #define HAS_SCALEROWDOWN4_AVX2
#endif #endif
......
...@@ -838,17 +838,15 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { ...@@ -838,17 +838,15 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
// sum rows // sum rows
xloop: xloop:
vmovdqu ymm3, [eax] // read 32 bytes vmovdqu ymm3, [eax] // read 32 bytes
vpermq ymm3, ymm2, 0xd8 // unmutate for vpunpck
lea eax, [eax + 32] lea eax, [eax + 32]
vmovdqu ymm0, [edx] // read 32 words from destination vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
vmovdqu ymm1, [edx + 32]
vpunpcklbw ymm2, ymm3, ymm5 vpunpcklbw ymm2, ymm3, ymm5
vpunpckhbw ymm3, ymm3, ymm5 vpunpckhbw ymm3, ymm3, ymm5
vpaddusw ymm0, ymm0, ymm2 // sum 16 words vpaddusw ymm0, ymm2, [edx] // sum 16 words
vpaddusw ymm1, ymm1, ymm3 vpaddusw ymm1, ymm3, [edx + 32]
vmovdqu [edx], ymm0 // write 32 words to destination vmovdqu [edx], ymm0 // write 32 words to destination
vmovdqu [edx + 32], ymm1 vmovdqu [edx + 32], ymm1
lea edx, [edx + 64] lea edx, [edx + 64]
sub ecx, 32 sub ecx, 32
jg xloop jg xloop
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment