Commit 205c1440 authored by fbarchard@google.com

Use movdqu then pavgb to allow unaligned memory for rgb subsampling code.

Allows this assembly to be used for unaligned pointers as well as aligned ones, with no performance hit when memory is aligned on a modern CPU.
BUG=365
TESTED=libyuvTest.ARGBToI420_Unaligned (453 ms)
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/30679004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1116 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 883ce64a
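
For illustration only (not part of the commit): SSE pavgb with a 16-byte memory operand faults unless the address is 16-byte aligned, while a movdqu load tolerates any alignment, so loading the second row into a scratch register and averaging register-to-register lifts the alignment requirement. A rough SSE2-intrinsics sketch of the new subsample step; the helper name is hypothetical:

    #include <emmintrin.h>  /* SSE2: _mm_loadu_si128, _mm_avg_epu8 */
    #include <stdint.h>

    /* Average 16 bytes from two rows that may both be unaligned.
     * Old pattern (alignment required): pavgb  xmm0, [eax + esi]
     * New pattern (any alignment):      movdqu xmm4, [eax + esi]
     *                                   pavgb  xmm0, xmm4        */
    static inline __m128i AverageRows16(const uint8_t* row0,
                                        const uint8_t* row1) {
      __m128i r0 = _mm_loadu_si128((const __m128i*)row0);  /* movdqu */
      __m128i r1 = _mm_loadu_si128((const __m128i*)row1);  /* movdqu */
      return _mm_avg_epu8(r0, r1);                         /* pavgb reg,reg */
    }

On modern cores movdqu on an aligned address performs the same as movdqa, which is why the commit message reports no performance hit for aligned memory.
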
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1115
+Version: 1116
 License: BSD
 License File: LICENSE
...
@@ -100,8 +100,8 @@ extern "C" {
 #define HAS_SOBELYROW_SSE2
 // Conversions:
-//#define HAS_ARGBTOUVROW_SSSE3
-//#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ABGRTOUVROW_SSSE3
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
 #define HAS_ARGB4444TOARGBROW_SSE2
@@ -116,10 +116,10 @@ extern "C" {
 #define HAS_ARGBTORGB565ROW_SSE2
 #define HAS_ARGBTOUV422ROW_SSSE3
 #define HAS_ARGBTOUV444ROW_SSSE3
-//#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
 #define HAS_ARGBTOYJROW_SSSE3
 #define HAS_ARGBTOYROW_SSSE3
-//#define HAS_BGRATOUVROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
@@ -153,7 +153,7 @@ extern "C" {
 #define HAS_RGB24TOARGBROW_SSSE3
 #define HAS_RGB24TOYROW_SSSE3
 #define HAS_RGB565TOARGBROW_SSE2
-//#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
 #define HAS_SETROW_X86
 #define HAS_SPLITUVROW_SSE2
...
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1115
+#define LIBYUV_VERSION 1116
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
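
Note, added for context (not part of the diff): re-enabling these HAS_ macros lets libyuv's runtime dispatch select the SSSE3 row functions again, which were presumably kept disabled while they still required aligned pointers. A simplified sketch of the usual gating pattern, using libyuv's real TestCpuFlag/kCpuHasSSSE3 CPU-detection API; the real dispatch code is more elaborate:

    #include "libyuv/cpu_id.h"  /* TestCpuFlag, kCpuHasSSSE3 */
    #include "libyuv/row.h"     /* HAS_ARGBTOUVROW_SSSE3, row prototypes */

    /* Simplified sketch of libyuv's dispatch pattern (abridged, not the
     * literal convert.cc code). */
    typedef void (*ARGBToUVRowFn)(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width);

    static ARGBToUVRowFn PickARGBToUVRow(void) {
      ARGBToUVRowFn fn = ARGBToUVRow_C;  /* portable fallback */
    #if defined(HAS_ARGBTOUVROW_SSSE3)
      if (TestCpuFlag(kCpuHasSSSE3)) {
        fn = ARGBToUVRow_SSSE3;  /* usable for unaligned rows after this change */
      }
    #endif
      return fn;
    }
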
@@ -807,14 +807,18 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     LABELALIGN
     "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -876,14 +880,18 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     LABELALIGN
     "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -1111,14 +1119,18 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
     LABELALIGN
     "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -1251,14 +1263,18 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
     LABELALIGN
     "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -1317,14 +1333,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
     LABELALIGN
     "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
...
@@ -977,13 +977,18 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
     movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
     movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
     movdqu     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
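
Note, added for context: the Visual C++ inline-asm versions make the same transformation with xmm4 as the scratch register; xmm4 holds nothing live at this point, since its next use is being overwritten by movdqa xmm4, xmm0 after the lea. The four hunks below repeat the pattern for the UVJ, BGRA, ABGR and RGBA rows.
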
@@ -1043,13 +1048,18 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
     movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
     movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
     movdqu     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1294,13 +1304,18 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
     movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
     movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
     movdqu     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1360,13 +1375,18 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
     movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
     movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
     movdqu     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1426,13 +1446,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
     movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
     movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
     movdqu     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
...