ARGBMultiply_AVX2 ported to GCC.

BUG=269 TESTED=try bots R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/27139005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1160 16f28f9a-4ce2-e073-06de-1de4eb20be90

ARGBMultiply_AVX2 ported to GCC.
BUG=269 TESTED=try bots R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/27139005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1160 16f28f9a-4ce2-e073-06de-1de4eb20be90
a843cafb · fbarchard@google.com · 0387df51 · a843cafb · a843cafb · a843cafb
Commit a843cafb authored Nov 11, 2014 by fbarchard@google.com
Hide whitespace changes
Inline Side-by-side

Showing with 43 additions and 4 deletions

README.chromium README.chromium +1 -1

row.h include/libyuv/row.h +1 -1

version.h include/libyuv/version.h +1 -1

row_posix.cc source/row_posix.cc +40 -1

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1159
+Version: 1160
 License: BSD
 License File: LICENSE


--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -203,6 +203,7 @@ extern "C" {
 // Effects:
 #define HAS_ARGBADDROW_AVX2
 #define HAS_ARGBSUBTRACTROW_AVX2
+#define HAS_ARGBMULTIPLYROW_AVX2
 #endif

 // The following are require VS2012.
@@ -220,7 +221,6 @@ extern "C" {
 // Effects:
 #define HAS_ARGBATTENUATEROW_AVX2
 #define HAS_ARGBMIRRORROW_AVX2
-#define HAS_ARGBMULTIPLYROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #endif  // defined(VISUALC_HAS_AVX2)


--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1159
+#define LIBYUV_VERSION 1160

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3890,7 +3890,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
-    "pxor      %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm5,%%xmm5                  \n"

    // 4 pixel loop.
    LABELALIGN
@@ -3925,6 +3925,45 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBMULTIPLYROW_SSE2

+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
+    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
+    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
+    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea       " MEMLEA(0x20,2) ",%2           \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+#if defined(__AVX2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
+
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,