HalfFloat AVX2 unpack bug fix.

The source operands of the AVX2 unpack instructions (vpunpckhwd/vpunpcklwd) were in reverse order, causing incorrect results on AVX2 hardware. TEST=/usr/local/google/home/fbarchard/intelsde/sde -skx -- out/Release/libyuv_unittest --gtest_filter=*Half* BUG=libyuv:560 R=wangcheng@google.com Review URL: https://codereview.chromium.org/2438893002

HalfFloat AVX2 unpack bug fix.
The source operands of the AVX2 unpack instructions (vpunpckhwd/vpunpcklwd) were in reverse order, causing incorrect results on AVX2 hardware. TEST=/usr/local/google/home/fbarchard/intelsde/sde -skx -- out/Release/libyuv_unittest --gtest_filter=*Half* BUG=libyuv:560 R=wangcheng@google.com Review URL: https://codereview.chromium.org/2438893002
550cf829 · Frank Barchard · f553db2d · 550cf829 · 550cf829 · 550cf829
Commit 550cf829 authored Oct 20, 2016 by Frank Barchard
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 12 deletions

README.chromium README.chromium +1 -1

version.h include/libyuv/version.h +1 -1

row_gcc.cc source/row_gcc.cc +6 -6

planar_test.cc unit_test/planar_test.cc +4 -4

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1628
+Version: 1629
 License: BSD
 License File: LICENSE

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1628
+#define LIBYUV_VERSION 1629
 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5350,17 +5350,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm2        \n"  // 8 shorts
+    "vmovdqu    " MEMACCESS(0) ",%%ymm2        \n"  // 16 shorts
    "lea        " MEMLEA(0x20,0) ",%0          \n"
-    "vpunpckhwd %%ymm2,%%ymm5,%%ymm3           \n"
+    "vpunpckhwd %%ymm5,%%ymm2,%%ymm3           \n"  // mutates
-    "vpunpcklwd %%ymm2,%%ymm5,%%ymm2           \n"
+    "vpunpcklwd %%ymm5,%%ymm2,%%ymm2           \n"
    "vcvtdq2ps  %%ymm3,%%ymm3                  \n"
    "vcvtdq2ps  %%ymm2,%%ymm2                  \n"
    "vmulps     %%ymm3,%%ymm4,%%ymm3           \n"
    "vmulps     %%ymm2,%%ymm4,%%ymm2           \n"
    "vpsrld     $0xd,%%ymm3,%%ymm3             \n"
    "vpsrld     $0xd,%%ymm2,%%ymm2             \n"
-    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // mutates
+    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // unmutates
    "vmovdqu    %%ymm2," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
@@ -5384,8 +5384,8 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
-    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // 8 shorts -> 8 ints
+    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // 16 shorts -> 16 ints
-    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"  // 8 more
+    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"
    "lea         " MEMLEA(0x20,0) ",%0         \n"
    "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
    "vcvtdq2ps   %%ymm3,%%ymm3                 \n"

--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2099,8 +2099,8 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
  MaskCpuFlags(disable_cpu_flags);
  double c_time = get_time();
  for (j = 0; j < benchmark_iterations; j++) {
-    HalfFloatPlane((uint16*)orig_y, benchmark_width * 2,
+    HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), benchmark_width * 2,
-                   (uint16*)dst_c, benchmark_width * 2,
+                   reinterpret_cast<uint16*>(dst_c), benchmark_width * 2,
                   scale, benchmark_width, benchmark_height);
  }
  c_time = (get_time() - c_time) / benchmark_iterations;
@@ -2109,8 +2109,8 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
  MaskCpuFlags(benchmark_cpu_info);
  double opt_time = get_time();
  for (j = 0; j < benchmark_iterations; j++) {
-    HalfFloatPlane((uint16*)orig_y, benchmark_width * 2,
+    HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), benchmark_width * 2,
-                   (uint16*)dst_opt, benchmark_width * 2,
+                   reinterpret_cast<uint16*>(dst_opt), benchmark_width * 2,
                   scale, benchmark_width, benchmark_height);
  }
  opt_time = (get_time() - opt_time) / benchmark_iterations;