Commit 205c1440 authored by fbarchard@google.com's avatar fbarchard@google.com

Use movdqu then pavgb to allow unaligned memory for rgb subsampling code. …

Use movdqu then pavgb to allow unaligned memory for rgb subsampling code.  Allows this assembly to be used for unaligned pointers as well as aligned ones with no performance hit when memory is aligned on a modern cpu.
BUG=365
TESTED=libyuvTest.ARGBToI420_Unaligned (453 ms)
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/30679004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1116 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 883ce64a
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1115
Version: 1116
License: BSD
License File: LICENSE
......
......@@ -100,8 +100,8 @@ extern "C" {
#define HAS_SOBELYROW_SSE2
// Conversions:
//#define HAS_ARGBTOUVROW_SSSE3
//#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGB1555TOARGBROW_SSE2
#define HAS_ARGB4444TOARGBROW_SSE2
......@@ -116,10 +116,10 @@ extern "C" {
#define HAS_ARGBTORGB565ROW_SSE2
#define HAS_ARGBTOUV422ROW_SSSE3
#define HAS_ARGBTOUV444ROW_SSSE3
//#define HAS_ARGBTOUVJROW_SSSE3
#define HAS_ARGBTOUVJROW_SSSE3
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
//#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
......@@ -153,7 +153,7 @@ extern "C" {
#define HAS_RGB24TOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RGB565TOARGBROW_SSE2
//#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
#define HAS_SETROW_X86
#define HAS_SPLITUVROW_SSE2
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1115
#define LIBYUV_VERSION 1116
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -807,14 +807,18 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
......@@ -876,14 +880,18 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
......@@ -1111,14 +1119,18 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
......@@ -1251,14 +1263,18 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
......@@ -1317,14 +1333,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
......
......@@ -977,13 +977,18 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
movdqu xmm1, [eax + 16]
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
movdqu xmm2, [eax + 32]
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
......@@ -1043,13 +1048,18 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
movdqu xmm1, [eax + 16]
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
movdqu xmm2, [eax + 32]
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
......@@ -1294,13 +1304,18 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
movdqu xmm1, [eax + 16]
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
movdqu xmm2, [eax + 32]
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
......@@ -1360,13 +1375,18 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
movdqu xmm1, [eax + 16]
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
movdqu xmm2, [eax + 32]
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
......@@ -1426,13 +1446,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
movdqu xmm1, [eax + 16]
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
movdqu xmm2, [eax + 32]
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
movdqu xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment