Commit 550cf829 authored by Frank Barchard

HalfFloat avx2 unpack bug fix.

The source operands of the AVX2 unpack instructions (vpunpckhwd and
vpunpcklwd) were in reversed order, causing incorrect results on AVX2
hardware.

TEST=/usr/local/google/home/fbarchard/intelsde/sde -skx -- out/Release/libyuv_unittest --gtest_filter=*Half*

BUG=libyuv:560
R=wangcheng@google.com

Review URL: https://codereview.chromium.org/2438893002 .
parent f553db2d
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1628 Version: 1629
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1628 #define LIBYUV_VERSION 1629
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -5350,17 +5350,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { ...@@ -5350,17 +5350,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
// 16 pixel loop. // 16 pixel loop.
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"vpunpckhwd %%ymm2,%%ymm5,%%ymm3 \n" "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
"vpunpcklwd %%ymm2,%%ymm5,%%ymm2 \n" "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
"vcvtdq2ps %%ymm3,%%ymm3 \n" "vcvtdq2ps %%ymm3,%%ymm3 \n"
"vcvtdq2ps %%ymm2,%%ymm2 \n" "vcvtdq2ps %%ymm2,%%ymm2 \n"
"vmulps %%ymm3,%%ymm4,%%ymm3 \n" "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
"vmulps %%ymm2,%%ymm4,%%ymm2 \n" "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
"vpsrld $0xd,%%ymm3,%%ymm3 \n" "vpsrld $0xd,%%ymm3,%%ymm3 \n"
"vpsrld $0xd,%%ymm2,%%ymm2 \n" "vpsrld $0xd,%%ymm2,%%ymm2 \n"
"vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
"vmovdqu %%ymm2," MEMACCESS(1) " \n" "vmovdqu %%ymm2," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
...@@ -5384,8 +5384,8 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { ...@@ -5384,8 +5384,8 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
// 16 pixel loop. // 16 pixel loop.
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts -> 8 ints "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
"vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" // 8 more "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"vcvtdq2ps %%ymm2,%%ymm2 \n" "vcvtdq2ps %%ymm2,%%ymm2 \n"
"vcvtdq2ps %%ymm3,%%ymm3 \n" "vcvtdq2ps %%ymm3,%%ymm3 \n"
......
...@@ -2099,8 +2099,8 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height, ...@@ -2099,8 +2099,8 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
MaskCpuFlags(disable_cpu_flags); MaskCpuFlags(disable_cpu_flags);
double c_time = get_time(); double c_time = get_time();
for (j = 0; j < benchmark_iterations; j++) { for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane((uint16*)orig_y, benchmark_width * 2, HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), benchmark_width * 2,
(uint16*)dst_c, benchmark_width * 2, reinterpret_cast<uint16*>(dst_c), benchmark_width * 2,
scale, benchmark_width, benchmark_height); scale, benchmark_width, benchmark_height);
} }
c_time = (get_time() - c_time) / benchmark_iterations; c_time = (get_time() - c_time) / benchmark_iterations;
...@@ -2109,8 +2109,8 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height, ...@@ -2109,8 +2109,8 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
MaskCpuFlags(benchmark_cpu_info); MaskCpuFlags(benchmark_cpu_info);
double opt_time = get_time(); double opt_time = get_time();
for (j = 0; j < benchmark_iterations; j++) { for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane((uint16*)orig_y, benchmark_width * 2, HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), benchmark_width * 2,
(uint16*)dst_opt, benchmark_width * 2, reinterpret_cast<uint16*>(dst_opt), benchmark_width * 2,
scale, benchmark_width, benchmark_height); scale, benchmark_width, benchmark_height);
} }
opt_time = (get_time() - opt_time) / benchmark_iterations; opt_time = (get_time() - opt_time) / benchmark_iterations;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment