Commit 38157bdc authored by fbarchard@google.com

Change Attenuate and Unattenuate to unaligned memory ops.

BUG=279
TEST=ARGBAttenuate_Unaligned
R=nfullagar@google.com, ryanpetrie@google.com

Review URL: https://webrtc-codereview.appspot.com/2709004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@821 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent d2371686
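
The change itself is mechanical: every movdqa (aligned 16-byte load/store, which faults on an unaligned address) in these two row functions becomes movdqu (unaligned), which in turn lets the dispatch code drop its IS_ALIGNED pointer and stride checks. A minimal intrinsics sketch of the same substitution (the function name is illustrative, not from libyuv):

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    // movdqa (_mm_load_si128) requires a 16-byte-aligned address and faults
    // otherwise; movdqu (_mm_loadu_si128) accepts any address. On older CPUs
    // movdqu was markedly slower, which is why libyuv kept separate aligned
    // paths; on newer cores the penalty on already-aligned data is small
    // enough that one unaligned path can serve both cases.
    static __m128i load4_pixels(const uint8_t* argb /* any alignment */) {
      return _mm_loadu_si128((const __m128i*)argb);  // was _mm_load_si128
    }
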
README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 820
+Version: 821
 License: BSD
 License File: LICENSE
include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 820
+#define LIBYUV_VERSION 821
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
source/planar_functions.cc
@@ -1134,9 +1134,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
     if (IS_ALIGNED(width, 4)) {
       ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
@@ -1191,9 +1189,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
   void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
                              int width) = ARGBUnattenuateRow_C;
 #if defined(HAS_ARGBUNATTENUATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
     ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
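
Both hunks keep libyuv's usual three-tier row dispatch: the C fallback, an _Any_ variant for widths that are not a multiple of 4, and the full SIMD row when the width is. With the alignment predicates gone, only the CPU flag and the width matter. A standalone sketch of the pattern with simplified stand-in rows (the real kernels live in the row_*.cc files; names and bodies here are illustrative):

    #include <stdint.h>

    #define IS_ALIGNED(p, a) ((((uintptr_t)(p)) & ((a)-1)) == 0)

    // Simplified stand-in for the C reference row (the real one also uses
    // a >> 8 approximation of rgb * a / 255).
    static void AttenuateRow_C(const uint8_t* src, uint8_t* dst, int width) {
      for (int i = 0; i < width; ++i) {
        uint32_t a = src[i * 4 + 3];
        for (int c = 0; c < 3; ++c)
          dst[i * 4 + c] = (uint8_t)((src[i * 4 + c] * a) >> 8);
        dst[i * 4 + 3] = (uint8_t)a;
      }
    }
    static void AttenuateRow_SIMD(const uint8_t* s, uint8_t* d, int w) {
      AttenuateRow_C(s, d, w);  // pretend this is the 4-pixel SSSE3 loop
    }
    // "_Any_" wrapper: SIMD on the largest multiple of 4 pixels, then the
    // C row mops up the 1-3 leftover pixels.
    static void AttenuateRow_Any(const uint8_t* s, uint8_t* d, int w) {
      int n = w & ~3;
      if (n > 0) AttenuateRow_SIMD(s, d, n);
      if (w & 3) AttenuateRow_C(s + n * 4, d + n * 4, w & 3);
    }

    // Dispatch mirrors the hunks above: pick _Any_ whenever SIMD is usable,
    // then upgrade to the full row if width is already a multiple of 4.
    typedef void (*RowFn)(const uint8_t*, uint8_t*, int);
    static RowFn PickRow(int has_simd, int width) {
      RowFn row = AttenuateRow_C;
      if (has_simd && width >= 4) {
        row = AttenuateRow_Any;
        if (IS_ALIGNED(width, 4)) row = AttenuateRow_SIMD;
      }
      return row;
    }
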
source/row_posix.cc
@@ -4117,17 +4117,17 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     // 4 pixel loop.
     ".p2align  4                               \n"
   "1:                                          \n"
-    "movdqa    "MEMACCESS(0)",%%xmm0           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm0           \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm1           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm1           \n"
     "punpcklbw %%xmm1,%%xmm1                   \n"
     "pmulhuw   %%xmm1,%%xmm0                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm1           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm1           \n"
     "pshufb    %%xmm5,%%xmm1                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm2           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm2           \n"
     "punpckhbw %%xmm2,%%xmm2                   \n"
     "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm2           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm2           \n"
     "lea       "MEMLEA(0x10,0)",%0             \n"
     "pand      %%xmm3,%%xmm2                   \n"
     "psrlw     $0x8,%%xmm0                     \n"
@@ -4135,7 +4135,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     "packuswb  %%xmm1,%%xmm0                   \n"
     "por       %%xmm2,%%xmm0                   \n"
     "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,"MEMACCESS(1)"           \n"
+    "movdqu    %%xmm0,"MEMACCESS(1)"           \n"
     "lea       "MEMLEA(0x10,1)",%1             \n"
     "jg        1b                              \n"
   : "+r"(src_argb),    // %0
@@ -4161,7 +4161,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     // 4 pixel loop.
     ".p2align  4                               \n"
   "1:                                          \n"
-    "movdqa    "MEMACCESS(0)",%%xmm0           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm0           \n"
     "movzb     "MEMACCESS2(0x03,0)",%3         \n"
     "punpcklbw %%xmm0,%%xmm0                   \n"
     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
@@ -4171,7 +4171,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
     "movlhps   %%xmm3,%%xmm2                   \n"
     "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm1           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm1           \n"
     "movzb     "MEMACCESS2(0x0b,0)",%3         \n"
     "punpckhbw %%xmm1,%%xmm1                   \n"
     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
@@ -4184,7 +4184,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     "lea       "MEMLEA(0x10,0)",%0             \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,"MEMACCESS(1)"           \n"
+    "movdqu    %%xmm0,"MEMACCESS(1)"           \n"
     "lea       "MEMLEA(0x10,1)",%1             \n"
     "jg        1b                              \n"
   : "+r"(src_argb),    // %0
source/row_win.cc
@@ -4586,7 +4586,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm7, xmm7       // generate constant 1
+    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
     psrlw      xmm7, 15
     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
     psrlw      xmm6, 8
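
This comment-only fix in ARGBBlendRow_SSSE3 documents a common SSE idiom: pcmpeqb reg,reg sets all 128 bits without touching memory, and a word shift then carves out the constant; shifting each 16-bit lane right by 15 leaves 0x0001 per word (so "constant 1" undersold it), and by 8 leaves the 0x00ff byte mask used just below. The same idiom in intrinsics (a sketch, not libyuv code):

    #include <emmintrin.h>

    // pcmpeqb x,x -> all ones; no constant-pool load needed.
    static __m128i AllOnes(void) {
      __m128i z = _mm_setzero_si128();
      return _mm_cmpeq_epi8(z, z);
    }
    static __m128i ConstWord0001(void) {     // psrlw reg, 15
      return _mm_srli_epi16(AllOnes(), 15);  // { 1 x 8 words }
    }
    static __m128i Mask00ff(void) {          // psrlw reg, 8
      return _mm_srli_epi16(AllOnes(), 8);   // { 0x00ff x 8 words }
    }
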
@@ -4788,17 +4788,17 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     align      16
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]      // read 4 pixels
     pshufb     xmm0, xmm4       // isolate first 2 alphas
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
     pmulhuw    xmm0, xmm1       // rgb * a
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     pshufb     xmm1, xmm5       // isolate next 2 alphas
-    movdqa     xmm2, [eax]      // read 4 pixels
+    movdqu     xmm2, [eax]      // read 4 pixels
     punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
     pmulhuw    xmm1, xmm2       // rgb * a
-    movdqa     xmm2, [eax]      // mask original alpha
+    movdqu     xmm2, [eax]      // mask original alpha
     lea        eax, [eax + 16]
     pand       xmm2, xmm3
     psrlw      xmm0, 8
@@ -4806,7 +4806,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     packuswb   xmm0, xmm1
     por        xmm0, xmm2       // copy original alpha
     sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
@@ -4874,7 +4874,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     align      16
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]      // read 4 pixels
     movzx      esi, byte ptr [eax + 3]  // first alpha
     movzx      edi, byte ptr [eax + 7]  // second alpha
     punpcklbw  xmm0, xmm0       // first 2
@@ -4885,7 +4885,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     movlhps    xmm2, xmm3
     pmulhuw    xmm0, xmm2       // rgb * a
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     movzx      esi, byte ptr [eax + 11]  // third alpha
     movzx      edi, byte ptr [eax + 15]  // fourth alpha
     punpckhbw  xmm1, xmm1       // next 2
@@ -4899,7 +4899,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     packuswb   xmm0, xmm1
     sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     pop        edi