Commit ca26f860 authored by fbarchard@google.com's avatar fbarchard@google.com

ARGBRect: use SetRows32 to fill the full image. 744 ms -> 288 ms on Linux, 688 ms -> 277 ms on Mac. Started ARGBCopy function.
BUG=none
TEST=none
Review URL: http://webrtc-codereview.appspot.com/290006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@95 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 120b8d7e
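The speedup comes from moving the row loop into the fill primitive: the old ARGBRect called a SetRow helper once per scanline from C++, while the new SetRows32 functions take the stride and height and loop over rows right next to the store instructions. A minimal caller-side sketch of the public entry point (the FillBlue wrapper and the buffer setup are illustrative assumptions, not part of this commit):

#include "libyuv/planar_functions.h"
#include <vector>

// Fill an entire width x height ARGB image with one ARGBRect call.
void FillBlue(int width, int height) {
  std::vector<uint8> argb_frame(width * height * 4);
  libyuv::ARGBRect(&argb_frame[0], width * 4,  // dst and stride in bytes
                   0, 0,                       // rect origin (x, y)
                   width, height,              // full image
                   0xFF0000FFu);               // ARGB fill value
}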
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 94
Version: 95
License: BSD
License File: LICENSE
@@ -189,6 +189,11 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int width, int height,
uint32 value);
// Copy ARGB to ARGB.
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
} // namespace libyuv
#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
@@ -129,6 +129,7 @@ static void SplitUV_C(const uint8* src_uv,
// CopyRow copies 'count' bytes using a 16 byte load/store, 64 bytes at a time
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
__declspec(naked)
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
__asm {
@@ -148,6 +149,21 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
ret
}
}
__declspec(naked)
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
  __asm {
    push       esi
    push       edi
    mov        esi, [esp + 8 + 4]   // src (8 bytes = two pushed registers)
    mov        edi, [esp + 8 + 8]   // dst
    mov        ecx, [esp + 8 + 12]  // count in bytes
    shr        ecx, 2               // dword count; count must be a multiple of 4
    rep movsd
    pop        edi
    pop        esi
    ret
  }
}
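This commit adds the rep movsd fast path for MSVC only; the GCC/Clang branch below still defines just HAS_COPYROW_SSE2. For comparison, a hedged sketch of what an equivalent GCC inline-asm version could look like, mirroring the "rep stos" constraint style used for SetRow8_X86 later in this diff (hypothetical, not part of the commit):

// Hypothetical GCC/Clang counterpart of CopyRow_X86.
static void CopyRow_X86_Gcc(const uint8* src, uint8* dst, int count) {
  size_t count_tmp = static_cast<size_t>(count >> 2);  // dwords; count % 4 == 0
  asm volatile (
    "rep movsl                                 \n"  // copy ecx dwords esi -> edi
    : "+S"(src),        // %0
      "+D"(dst),        // %1
      "+c"(count_tmp)   // %2
    :
    : "memory", "cc"
  );
}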
#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
#define HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
@@ -184,9 +200,17 @@ static void CopyPlane(const uint8* src_y, int src_stride_y,
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 32) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) {
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
CopyRow = CopyRow_SSE2;
} else
#endif
#if defined(HAS_COPYROW_X86)
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
CopyRow = CopyRow_X86;
} else
#endif
{
CopyRow = CopyRow_C;
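CopyRow_C, the fallback assigned here, is not shown in this diff; in libyuv it is simply a memcpy per row, roughly:

// Sketch of the C fallback: one straight memcpy of 'count' bytes.
static void CopyRow_C(const uint8* src, uint8* dst, int count) {
  memcpy(dst, src, count);  // <string.h> / <cstring>
}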
@@ -233,6 +257,28 @@ int I420Copy(const uint8* src_y, int src_stride_y,
return 0;
}
// Copy ARGB with optional flipping
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_argb ||
!dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width * 4, height);
return 0;
}
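A usage sketch for the new entry point (the frame buffers are assumed to be width * height * 4 bytes): passing the height unchanged gives a plain copy, while a negative height produces a vertically flipped copy via the stride negation above.

// Copy src into dst flipped vertically (hypothetical helper, not in this commit).
void CopyFlipped(const uint8* src, uint8* dst, int width, int height) {
  libyuv::ARGBCopy(src, width * 4,   // source and its stride in bytes
                   dst, width * 4,   // destination and its stride
                   width, -height);  // negative height = invert
}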
int I420Mirror(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@@ -1673,109 +1719,111 @@ static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
);
}
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
asm volatile (
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"vst1.u32 {q0}, [%0]! \n" // store
"subs %1, %1, #4 \n" // 4 pixels per loop
"bhi 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
: "q0", "memory", "cc"
);
// TODO(fbarchard): Make fully assembler
static void SetRows32_NEON(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
SetRow8_NEON(dst, v32, width << 2);  // width * 4 bytes per ARGB row
dst += dst_stride;
}
}
#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#define HAS_SETROW_SSE2
#define HAS_SETROW_X86
__declspec(naked)
static void SetRow8_SSE2(uint8* dst, uint32 v32, int count) {
static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
__asm {
mov eax, [esp + 4] // dst
movd xmm5, [esp + 8] // v32
mov ecx, [esp + 12] // count
pshufd xmm5, xmm5, 0
convertloop:
movdqa [eax], xmm5
lea eax, [eax + 16]
sub ecx, 16
ja convertloop
push edi
mov edi, [esp + 4 + 4] // dst
mov eax, [esp + 4 + 8] // v32
mov ecx, [esp + 4 + 12] // count
shr ecx, 2
rep stosd
pop edi
ret
}
}
__declspec(naked)
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
static void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
__asm {
mov eax, [esp + 4] // dst
movd xmm5, [esp + 8] // v32
mov ecx, [esp + 12] // count
pshufd xmm5, xmm5, 0
push       edi
push       ebx
push       ebp
mov        edi, [esp + 12 + 4]   // dst
mov        eax, [esp + 12 + 8]   // v32
mov        ebp, [esp + 12 + 12]  // width
mov        edx, [esp + 12 + 16]  // dst_stride
mov        ebx, [esp + 12 + 20]  // height
lea        ecx, [ebp * 4]
sub        edx, ecx              // dst_stride - width * 4: bytes to skip per row
convertloop:
movdqa [eax], xmm5
lea eax, [eax + 16]
sub ecx, 4
mov        ecx, ebp              // width in pixels = dwords per row
rep stosd                        // fill one row with the value in eax
add        edi, edx              // skip stride padding to the next row
sub        ebx, 1
ja         convertloop
pop        ebp
pop        ebx
pop        edi
ret
}
}
#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
#define HAS_SETROW_SSE2
static void SetRow8_SSE2(uint8* dst, uint32 v32, int count) {
#define HAS_SETROW_X86
static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = static_cast<size_t>(width);
asm volatile (
"movd %2, %%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
"1: \n"
"movdqa %%xmm5,(%0) \n"
"lea 0x10(%0),%0 \n"
"sub $0x10,%1 \n"
"ja 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
"shr $0x2,%1 \n"
"rep stos %2,(%0) \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc"
#if defined(__SSE2__)
, "xmm5"
#endif
);
}
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
asm volatile (
"movd %2, %%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
"1: \n"
"movdqa %%xmm5,(%0) \n"
"lea 0x10(%0),%0 \n"
"sub $0x4,%1 \n"
"ja 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
: "memory", "cc"
#if defined(__SSE2__)
, "xmm5"
#endif
);
static void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
size_t width_tmp = static_cast<size_t>(width);
uint32* d = reinterpret_cast<uint32*>(dst);
asm volatile (
"rep stos %2,(%0) \n"
: "+D"(d), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc"
);
dst += dst_stride;
}
}
#endif
#if !defined(HAS_SETROW_X86)
static void SetRow8_C(uint8* dst, uint32 v8, int count) {
#ifdef _MSC_VER
for (int x = 0; x < count; ++x) {
dst[x] = v8;
}
#else
memset(dst, v8, count);
#endif
}
// count measured in bytes
static void SetRow32_C(uint8* dst, uint32 v32, int count) {
uint32* d = reinterpret_cast<uint32*>(dst);
for (int x = 0; x < count; ++x) {
d[x] = v32;
static void SetRows32_C(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
uint32* d = reinterpret_cast<uint32*>(dst);
for (int x = 0; x < width; ++x) {
d[x] = v32;
}
dst += dst_stride;
}
}
#endif
static void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
@@ -1795,7 +1843,11 @@ static void SetPlane(uint8* dst_y, int dst_stride_y,
} else
#endif
{
#if defined(HAS_SETROW_X86)
SetRow = SetRow8_X86;
#else
SetRow = SetRow8_C;
#endif
}
uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
@@ -1844,27 +1896,23 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
return -1;
}
uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
void (*SetRow)(uint8* dst, uint32 value, int count);
void (*SetRows)(uint8* dst, uint32 value, int width,
int dst_stride, int height);
#if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
SetRow = SetRow32_NEON;
} else
#elif defined(HAS_SETROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
SetRow = SetRow32_SSE2;
SetRows = SetRows32_NEON;
} else
#endif
{
SetRow = SetRow32_C;
}
for (int y = 0; y < height; ++y) {
SetRow(dst, value, width);
dst += dst_stride_argb;
#if defined(HAS_SETROW_X86)
SetRows = SetRows32_X86;
#else
SetRows = SetRows32_C;
#endif
}
SetRows(dst, value, width, dst_stride_argb, height);
return 0;
}