Scale Up2 ported to NaCl.

BUG=none TEST=none R=nfullagar@chromium.org, nfullagar@google.com Review URL: https://webrtc-codereview.appspot.com/3589004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@846 16f28f9a-4ce2-e073-06de-1de4eb20be90

Scale Up2 ported to NaCl.
BUG=none TEST=none R=nfullagar@chromium.org, nfullagar@google.com Review URL: https://webrtc-codereview.appspot.com/3589004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@846 16f28f9a-4ce2-e073-06de-1de4eb20be90
67a0987d · fbarchard@google.com · 1428b37c · 67a0987d · 67a0987d · 67a0987d
Commit 67a0987d authored Nov 07, 2013 by fbarchard@google.com
Showing with 36 additions and 5 deletions

README.chromium README.chromium +1 -1

planar_functions.h include/libyuv/planar_functions.h +1 -0

version.h include/libyuv/version.h +1 -1

scale_argb.cc source/scale_argb.cc +33 -3

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 845
+Version: 846
 License: BSD
 License File: LICENSE


--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -355,6 +355,7 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
 // dst_cumsum table of width * height * 16 bytes aligned to 16 byte boundary.
 // dst_stride32_cumsum is number of ints in a row (width * 4).
 // radius is number of pixels around the center.  e.g. 1 = 3x3. 2=5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
 LIBYUV_API
 int ARGBBlur(const uint8* src_argb, int src_stride_argb,
             uint8* dst_argb, int dst_stride_argb,

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 845
+#define LIBYUV_VERSION 846

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -371,7 +371,6 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,

 // Reads 4 pixels, duplicates them and writes 8 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-#define HAS_SCALEARGBCOLSUP2_SSE2
 __declspec(naked) __declspec(align(16))
 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int /* x */, int /* dx */) {
@@ -675,6 +674,39 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
  );
 }

+// Each loop pass reads 4 pixels, duplicates each one and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int /* x */, int /* dx */) {
+  asm volatile (
+    ".p2align  4                               \n"  // align loop to 16 bytes
+    BUNDLEALIGN  // presumably a NaCl bundle-align directive; defined elsewhere
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"  // xmm0 = 4 src pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_argb += 16 bytes
+    "movdqa    %%xmm0,%%xmm1                   \n"  // xmm1 = copy of xmm0
+    "punpckldq %%xmm0,%%xmm0                   \n"  // xmm0 = p0 p0 p1 p1
+    "punpckhdq %%xmm1,%%xmm1                   \n"  // xmm1 = p2 p2 p3 p3
+    "sub       $0x8,%2                         \n"  // dst_width -= 8
+    "movdqa    %%xmm0," MEMACCESS(0) "         \n"  // store dst pixels 0-3
+    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"  // store dst pixels 4-7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // dst_argb += 32 bytes
+    "jg        1b                              \n"  // loop while width > 0
+
+  : "+r"(dst_argb),    // %0 destination pointer, advanced by the loop
+    "+r"(src_argb),    // %1 source pointer, advanced by the loop
+    "+r"(dst_width)    // %2 remaining destination pixels
+  :
+  : "memory", "cc"  // asm stores to memory and modifies flags
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"  // presumably scratch for the MEM* macros under NaCl; confirm
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"  // SSE registers used as temporaries above
+#endif
+  );
+}
+
 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
 static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
@@ -1363,14 +1395,12 @@ static void ScaleARGBSimple(int src_width, int src_height,
 #if defined(HAS_SCALEARGBCOLS_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    ScaleARGBCols = ScaleARGBCols_SSE2;
-#if defined(HAS_SCALEARGBCOLS_SSE2)
    if (src_width * 2 == dst_width && IS_ALIGNED(dst_width, 8) &&
        (x >> 16) == 0 &&
        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
    }
-#endif
  }
 #endif