Commit cda9d38a authored by Frank Barchard

Add xmmword ptr casts for clang-cl.

Have clang-cl use compare_win.cc for 32-bit builds, allowing fallback to the C code and enabling the AVX2 paths for clang; clang's inline assembler wants an explicit xmmword ptr size on memory operands that MSVC infers, hence the casts.
Move the shared defines and prototypes to compare_row.h.
Fix odd widths in the ARGBCopyAlpha functions by copying the destination tail to a temp buffer, doing the alpha copy there, then copying the result back to the destination (sketched below).
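
The odd-width fix corresponds to the new ANY11B macro in source/row_any.cc further down in this diff. A rough sketch of the pattern in plain C++ (names such as CopyAlphaRow_SIMD4 and CopyAlphaRow_Any are hypothetical stand-ins for the real row functions):

#include <cstdint>
#include <cstring>

// Stand-in for a SIMD kernel that copies the alpha byte of each ARGB pixel
// from src to dst, 4 pixels at a time (width assumed to be a multiple of 4).
static void CopyAlphaRow_SIMD4(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i * 4 + 3] = src[i * 4 + 3];
  }
}

// Handle any width: run the kernel on the multiple-of-4 part of the row, then
// finish the leftover pixels through a small temp buffer.  The destination
// tail is staged in the temp as well, because the kernel only rewrites alpha
// and the destination's RGB bytes must survive the copy back.
static void CopyAlphaRow_Any(const uint8_t* src, uint8_t* dst, int width) {
  uint8_t temp[16 * 2];      // 4 ARGB pixels of source + 4 of destination
  const int r = width & 3;   // leftover pixels
  const int n = width & ~3;  // multiple-of-4 part
  if (n > 0) {
    CopyAlphaRow_SIMD4(src, dst, n);
  }
  if (r > 0) {
    std::memset(temp, 0, sizeof(temp));          // avoid reading stale bytes
    std::memcpy(temp, src + n * 4, r * 4);       // tail of source
    std::memcpy(temp + 16, dst + n * 4, r * 4);  // tail of destination
    CopyAlphaRow_SIMD4(temp, temp + 16, 4);      // full SIMD width on temp
    std::memcpy(dst + n * 4, temp + 16, r * 4);  // write only r pixels back
  }
}

The plain ANY11 macro only stages the source tail; ANY11B also stages the destination tail, which is what makes partial writes like ARGBCopyAlpha safe for widths that are not a multiple of the SIMD width.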

R=harryjin@google.com
TBR=harryjin@google.com
BUG=libyuv:484

Review URL: https://webrtc-codereview.appspot.com/59379004.
parent baf6a3c1
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1465
Version: 1466
License: BSD
License File: LICENSE
......
/*
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_ // NOLINT
#define INCLUDE_LIBYUV_COMPARE_ROW_H_
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// clang >= 3.4.0 required for AVX2.
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
#define CLANG_HAS_AVX2 1
#endif // clang >= 3.4
#endif // __clang__
#if defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_HASHDJB2_AVX2
#endif
// The following are available for Visual C and GCC:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) || defined(_M_IX86)))
#define HAS_HASHDJB2_SSE41
#define HAS_SUMSQUAREERROR_SSE2
#endif
// The following are available for Visual C and clangcl 32 bit:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_HASHDJB2_AVX2
#define HAS_SUMSQUAREERROR_AVX2
#endif
// The following are available for Neon:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SUMSQUAREERROR_NEON
#endif
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_COMPARE_ROW_H_ NOLINT
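
With the prototypes and HAS_* defines centralized in compare_row.h, each compare module can share the usual libyuv run-time dispatch. A minimal sketch (the wrapper name SumSquareErrorRow is hypothetical; the real dispatch lives in source/compare.cc, shown below), assuming TestCpuFlag/kCpuHas* from "libyuv/cpu_id.h":

#include "libyuv/compare_row.h"
#include "libyuv/cpu_id.h"  // TestCpuFlag / kCpuHas*

namespace libyuv {

// Pick the fastest available row function at run time and fall back to C.
// For simplicity this sketch assumes count is already a multiple of the
// SIMD block size; the real ComputeSumSquareError handles remainders.
static uint32 SumSquareErrorRow(const uint8* a, const uint8* b, int count) {
  uint32 (*SumSquareErrorFn)(const uint8*, const uint8*, int) =
      SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    SumSquareErrorFn = SumSquareError_SSE2;
  }
#endif
#if defined(HAS_SUMSQUAREERROR_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    SumSquareErrorFn = SumSquareError_AVX2;
  }
#endif
  return SumSquareErrorFn(a, b, count);
}

}  // namespace libyuv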
......@@ -365,7 +365,7 @@ extern "C" {
#endif
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
#if defined(_MSC_VER) && !defined(__CLR_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#define SIMD_ALIGNED32(var) __declspec(align(64)) var
typedef __declspec(align(16)) int16 vec16[8];
......@@ -380,7 +380,7 @@ typedef __declspec(align(32)) int8 lvec8[32];
typedef __declspec(align(32)) uint16 ulvec16[16];
typedef __declspec(align(32)) uint32 ulvec32[8];
typedef __declspec(align(32)) uint8 ulvec8[32];
#elif defined(__GNUC__) || defined(__clang__)
#elif defined(__GNUC__)
// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1465
#define LIBYUV_VERSION 1466
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -17,6 +17,7 @@
#endif
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
#include "libyuv/video_common.h"
......@@ -26,30 +27,13 @@ namespace libyuv {
extern "C" {
#endif
// hash seed of 5381 recommended.
// Internal C version of HashDjb2 with int sized count for efficiency.
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
// This module is for Visual C x86
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
#define HAS_HASHDJB2_SSE41
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
#if defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_HASHDJB2_AVX2
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
#endif
#endif // HAS_HASHDJB2_SSE41
// hash seed of 5381 recommended.
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
const int kBlockSize = 1 << 15; // 32768;
int remainder;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) {
HashDjb2_SSE = HashDjb2_SSE41;
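
For reference, the scalar fallback this dispatch starts from is the classic DJB2 hash (hash = hash * 33 + byte, with the recommended seed of 5381); a minimal sketch with a hypothetical name, not the exact libyuv implementation:

#include "libyuv/basic_types.h"  // uint8 / uint32

// DJB2 over count bytes.  The SSE4.1 and AVX2 versions above compute the same
// result 16 bytes per iteration (hash *= 33^16, then 16 multiply-adds).
static uint32 HashDjb2_C_Sketch(const uint8* src, int count, uint32 seed) {
  uint32 hash = seed;
  for (int i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];
  }
  return hash;
}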
......@@ -127,23 +111,6 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
return fourcc;
}
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#endif
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif
#ifdef VISUALC_HAS_AVX2
#define HAS_SUMSQUAREERROR_AVX2
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
#endif
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
......
......@@ -10,6 +10,8 @@
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
......
......@@ -9,6 +9,8 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
......
......@@ -9,6 +9,8 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
......
......@@ -9,6 +9,8 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
......
......@@ -9,6 +9,8 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
......@@ -133,28 +135,28 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33
movdqa xmm6, xmmword ptr kHash16x33
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
movdqa xmm5, kHashMul0
movdqa xmm5, xmmword ptr kHashMul0
movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3]
pmulld xmm3, xmm5
movdqa xmm5, kHashMul1
movdqa xmm5, xmmword ptr kHashMul1
movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7]
pmulld xmm4, xmm5
movdqa xmm5, kHashMul2
movdqa xmm5, xmmword ptr kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11]
pmulld xmm2, xmm5
movdqa xmm5, kHashMul3
movdqa xmm5, xmmword ptr kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
......@@ -181,32 +183,32 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33
vmovd xmm0, [esp + 12] // seed
wloop:
vpmovzxbd xmm3, [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
vpmovzxbd xmm4, [eax + 4] // src[4-7]
pmulld xmm3, kHashMul0
vpmulld xmm3, xmm3, xmmword ptr kHashMul0
vpmovzxbd xmm2, [eax + 8] // src[8-11]
pmulld xmm4, kHashMul1
vpmulld xmm4, xmm4, xmmword ptr kHashMul1
vpmovzxbd xmm1, [eax + 12] // src[12-15]
pmulld xmm2, kHashMul2
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
lea eax, [eax + 16]
pmulld xmm1, kHashMul3
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
vpaddd xmm3, xmm3, xmm4 // add 16 results
vpaddd xmm1, xmm1, xmm2
vpaddd xmm1, xmm1, xmm3
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
vpaddd xmm1, xmm1, xmm2
vpshufd xmm2, xmm1, 0x01
vpaddd xmm1, xmm1, xmm2
vpaddd xmm0, xmm0, xmm1
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
vmovd eax, xmm0 // return hash
vzeroupper
ret
}
}
......
......@@ -245,18 +245,6 @@ ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
#ifdef HAS_COPYROW_NEON
ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
#endif
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
ANY11(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 1, 4, 15)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
ANY11(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 1, 4, 7)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
ANY11(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
ANY11(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
#endif
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
......@@ -410,6 +398,36 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
#endif
#undef ANY11
// Any 1 to 1 blended.
#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \
ANY_SIMD(temp, temp + 128, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
#endif
#undef ANY11B
// Any 1 to 1 with parameter.
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
......
......@@ -517,7 +517,7 @@ TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4)
#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
YALIGN, W1280, DIFF, N, NEG, OFF) \
TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
......@@ -558,21 +558,10 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
kWidth, NEG kHeight); \
} \
int max_diff = 0; \
/* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
align_buffer_64(dst_argb32_c, kWidth * BPP_C * kHeight); \
align_buffer_64(dst_argb32_opt, kWidth * BPP_C * kHeight); \
memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight); \
memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight); \
FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, \
dst_argb32_c, kWidth * BPP_C , \
kWidth, kHeight); \
FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, \
dst_argb32_opt, kWidth * BPP_C , \
kWidth, kHeight); \
for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
int abs_diff = \
abs(static_cast<int>(dst_argb32_c[i]) - \
static_cast<int>(dst_argb32_opt[i])); \
abs(static_cast<int>(dst_argb_c[i + OFF]) - \
static_cast<int>(dst_argb_opt[i + OFF])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
......@@ -584,22 +573,20 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
free_aligned_buffer_64(src_a); \
free_aligned_buffer_64(dst_argb_c); \
free_aligned_buffer_64(dst_argb_opt); \
free_aligned_buffer_64(dst_argb32_c); \
free_aligned_buffer_64(dst_argb32_opt); \
}
#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, DIFF, FMT_C, BPP_C) \
YALIGN, DIFF) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \
YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \
YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
YALIGN, benchmark_width_, DIFF, _Invert, -, 0) \
TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
YALIGN, benchmark_width_, DIFF, _Opt, +, 0)
TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)
#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
W1280, DIFF, N, NEG, OFF) \
......