Commit 550cf829 authored by Frank Barchard

HalfFloat avx2 unpack bug fix.

AVX unpack parameters were reverse-ordered, causing incorrect results
on AVX2 hardware.

TEST=/usr/local/google/home/fbarchard/intelsde/sde -skx -- out/Release/libyuv_unittest --gtest_filter=*Half*

BUG=libyuv:560
R=wangcheng@google.com

Review URL: https://codereview.chromium.org/2438893002 .
parent f553db2d
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1628
Version: 1629
License: BSD
License File: LICENSE
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1628
#define LIBYUV_VERSION 1629
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -5350,17 +5350,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
// 16 pixel loop.
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts
"vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts
"lea " MEMLEA(0x20,0) ",%0 \n"
"vpunpckhwd %%ymm2,%%ymm5,%%ymm3 \n"
"vpunpcklwd %%ymm2,%%ymm5,%%ymm2 \n"
"vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
"vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
"vcvtdq2ps %%ymm3,%%ymm3 \n"
"vcvtdq2ps %%ymm2,%%ymm2 \n"
"vmulps %%ymm3,%%ymm4,%%ymm3 \n"
"vmulps %%ymm2,%%ymm4,%%ymm2 \n"
"vpsrld $0xd,%%ymm3,%%ymm3 \n"
"vpsrld $0xd,%%ymm2,%%ymm2 \n"
"vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
"vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
"vmovdqu %%ymm2," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x10,%2 \n"
......@@ -5384,8 +5384,8 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
// 16 pixel loop.
LABELALIGN
"1: \n"
"vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts -> 8 ints
"vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" // 8 more
"vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
"vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"vcvtdq2ps %%ymm2,%%ymm2 \n"
"vcvtdq2ps %%ymm3,%%ymm3 \n"
......
......@@ -2099,8 +2099,8 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
MaskCpuFlags(disable_cpu_flags);
double c_time = get_time();
for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane((uint16*)orig_y, benchmark_width * 2,
(uint16*)dst_c, benchmark_width * 2,
HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), benchmark_width * 2,
reinterpret_cast<uint16*>(dst_c), benchmark_width * 2,
scale, benchmark_width, benchmark_height);
}
c_time = (get_time() - c_time) / benchmark_iterations;
......@@ -2109,8 +2109,8 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
MaskCpuFlags(benchmark_cpu_info);
double opt_time = get_time();
for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane((uint16*)orig_y, benchmark_width * 2,
(uint16*)dst_opt, benchmark_width * 2,
HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), benchmark_width * 2,
reinterpret_cast<uint16*>(dst_opt), benchmark_width * 2,
scale, benchmark_width, benchmark_height);
}
opt_time = (get_time() - opt_time) / benchmark_iterations;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment