Commit 5bf29b59 authored by fbarchard@google.com's avatar fbarchard@google.com

p2align all loops, copy stride to local for scale, and copy last byte in bilinear more efficiently

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/547007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@255 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f906ae13
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 254 Version: 255
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 254 #define LIBYUV_VERSION 255
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -163,6 +163,7 @@ static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { ...@@ -163,6 +163,7 @@ static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"movd %2,%%xmm0 \n" "movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n" "pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n" "movdqa %4,%%xmm6 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movdqu (%0),%%xmm1 \n" "movdqu (%0),%%xmm1 \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
...@@ -331,7 +332,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, ...@@ -331,7 +332,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
"pxor %%xmm0,%%xmm0 \n" "pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n" "pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n" "sub %0,%1 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm1 \n" "movdqa (%0),%%xmm1 \n"
"movdqa (%0,%1,1),%%xmm2 \n" "movdqa (%0,%1,1),%%xmm2 \n"
......
...@@ -74,6 +74,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -74,6 +74,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
mov ecx, [esp + 4 + 16] // pix mov ecx, [esp + 4 + 16] // pix
sub edi, eax sub edi, eax
align 16
convertloop: convertloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
pavgb xmm0, [eax + edx] pavgb xmm0, [eax + edx]
...@@ -92,6 +93,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, ...@@ -92,6 +93,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) { uint8* dst_uv, int pix) {
asm volatile ( asm volatile (
"sub %0,%1 \n" "sub %0,%1 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"pavgb (%0,%3),%%xmm0 \n" "pavgb (%0,%3),%%xmm0 \n"
...@@ -467,6 +469,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, ...@@ -467,6 +469,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8 psrlw xmm5, 8
align 16
convertloop: convertloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
movdqa xmm1, [eax + 16] movdqa xmm1, [eax + 16]
...@@ -506,6 +509,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, ...@@ -506,6 +509,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
asm volatile ( asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n" "movdqa 0x10(%0),%%xmm1 \n"
......
...@@ -291,6 +291,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y, ...@@ -291,6 +291,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
uint8* dst_frame, int width) { uint8* dst_frame, int width) {
asm volatile ( asm volatile (
"sub %1,%2 \n" "sub %1,%2 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movq (%1),%%xmm2 \n" "movq (%1),%%xmm2 \n"
"movq (%1,%2,1),%%xmm3 \n" "movq (%1,%2,1),%%xmm3 \n"
...@@ -326,6 +327,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y, ...@@ -326,6 +327,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
uint8* dst_frame, int width) { uint8* dst_frame, int width) {
asm volatile ( asm volatile (
"sub %1,%2 \n" "sub %1,%2 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movq (%1),%%xmm2 \n" "movq (%1),%%xmm2 \n"
"movq (%1,%2,1),%%xmm3 \n" "movq (%1,%2,1),%%xmm3 \n"
......
...@@ -57,6 +57,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, ...@@ -57,6 +57,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
asm volatile ( asm volatile (
"movd %3,%%xmm5 \n" "movd %3,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n"
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n" "lea 0x10(%0),%0 \n"
......
...@@ -288,6 +288,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ...@@ -288,6 +288,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
asm volatile ( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
".p2align 4 \n"
"1: \n" "1: \n"
"movq (%0),%%xmm0 \n" "movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n" "movq (%0,%3),%%xmm1 \n"
...@@ -499,6 +500,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, ...@@ -499,6 +500,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
asm volatile ( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3),%%xmm1 \n" "movdqa (%0,%3),%%xmm1 \n"
...@@ -639,6 +641,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ...@@ -639,6 +641,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
asm volatile ( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
".p2align 4 \n"
"1: \n" "1: \n"
"movdqa (%0),%%xmm0 \n" "movdqa (%0),%%xmm0 \n"
"movdqa (%0,%4),%%xmm1 \n" "movdqa (%0,%4),%%xmm1 \n"
......
...@@ -32,6 +32,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, ...@@ -32,6 +32,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"sub %4, #8 \n" "sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane // handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n" "1: \n"
"mov r9, %0 \n" "mov r9, %0 \n"
...@@ -198,6 +199,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, ...@@ -198,6 +199,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"sub %6, #8 \n" "sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane // handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n" "1: \n"
"mov r9, %0 \n" "mov r9, %0 \n"
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment