Commit b1dd02d6 authored by fbarchard@google.com

Unaligned test/moves for some of the slower functions

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/368011

git-svn-id: http://libyuv.googlecode.com/svn/trunk@162 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent b5b27d13
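
This change relaxes several SSE2/SSSE3 conversion paths: the dispatch code stops requiring 16-byte-aligned source pointers and strides, the row kernels switch their source fetches to unaligned loads, and new Unaligned/Any row variants cover widths the aligned fast path cannot handle. Alignment throughout is tested with the IS_ALIGNED macro; a minimal sketch of the conventional power-of-two definition (assumed here, not shown in this diff):

#include <stdint.h>

// True when p (a pointer or a stride) is a multiple of a, for power-of-two a.
// Example: IS_ALIGNED(ptr, 16) holds when the low four address bits are zero.
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
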
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 161
+Version: 162
License: BSD
License File: LICENSE
@@ -16,7 +16,7 @@ namespace libyuv {
extern "C" {
#endif
-#define LIBYUV_VERSION 161
+#define LIBYUV_VERSION 162
#ifdef __cplusplus
} // extern "C"
@@ -548,8 +548,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RGB24TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
} else
#endif
@@ -561,6 +560,11 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
+    ARGBToYRow = ARGBToYAnyRow_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+    }
} else
#endif
{
@@ -569,6 +573,9 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) &&
+             IS_ALIGNED(width, 2) && width <= kMaxStride) {
+    ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
} else
#endif
{
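
The Y/UV row selection above repeats below for RAW, RGB565, ARGB1555, and ARGB4444 with the same shape: a fully aligned fast path, then (new in this commit) an unaligned SSSE3 kernel when width is a multiple of 16, then an Any wrapper for other widths up to kMaxStride, then the C fallback. A minimal C++ sketch of that three-tier dispatch, with stand-in kernel bodies and an assumed kMaxStride value:

#include <stdint.h>

#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))

typedef void (*ARGBToYRowFunc)(const uint8_t* src_argb, uint8_t* dst_y, int width);

// Stand-in kernels so the sketch compiles; the real ones live in libyuv's row files.
static void YRow_C(const uint8_t*, uint8_t*, int) {}
static void YRow_SSSE3(const uint8_t*, uint8_t*, int) {}
static void YRow_Unaligned_SSSE3(const uint8_t*, uint8_t*, int) {}
static void YAnyRow_SSSE3(const uint8_t*, uint8_t*, int) {}

static const int kMaxStride = 2048;  // assumed bound; libyuv defines its own

ARGBToYRowFunc PickARGBToYRow(bool has_ssse3, const uint8_t* dst_y,
                              int dst_stride_y, int width) {
  if (has_ssse3 && IS_ALIGNED(width, 16) &&
      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
    return YRow_SSSE3;  // aligned loads and stores, fastest
  }
  if (has_ssse3 && width <= kMaxStride) {
    if (IS_ALIGNED(width, 16)) {
      return YRow_Unaligned_SSSE3;  // full 16-pixel steps, unaligned moves
    }
    return YAnyRow_SSSE3;  // tolerates any width, hence the kMaxStride bound
  }
  return YRow_C;  // portable scalar fallback
}
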
@@ -610,8 +617,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RAWTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
} else
#endif
@@ -623,6 +629,11 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
+    ARGBToYRow = ARGBToYAnyRow_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+    }
} else
#endif
{
@@ -631,6 +642,9 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) &&
+             IS_ALIGNED(width, 2) && width <= kMaxStride) {
+    ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
} else
#endif
{
@@ -672,8 +686,7 @@ int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RGB565TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
} else
#endif
@@ -685,6 +698,11 @@ int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
+    ARGBToYRow = ARGBToYAnyRow_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+    }
} else
#endif
{
@@ -693,6 +711,9 @@ int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) &&
+             IS_ALIGNED(width, 2) && width <= kMaxStride) {
+    ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
} else
#endif
{
@@ -734,8 +755,7 @@ int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
} else
#endif
@@ -747,6 +767,11 @@ int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
+    ARGBToYRow = ARGBToYAnyRow_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+    }
} else
#endif
{
@@ -755,6 +780,9 @@ int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) &&
+             IS_ALIGNED(width, 2) && width <= kMaxStride) {
+    ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
} else
#endif
{
@@ -796,8 +824,7 @@ int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
} else
#endif
@@ -809,6 +836,11 @@ int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
+    ARGBToYRow = ARGBToYAnyRow_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+    }
} else
#endif
{
@@ -817,6 +849,9 @@ int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) &&
+             IS_ALIGNED(width, 2) && width <= kMaxStride) {
+    ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
} else
#endif
{
@@ -2147,7 +2147,6 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src_raw, 16) && IS_ALIGNED(src_stride_raw, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
} else
@@ -2177,7 +2176,6 @@ int BG24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src_rgb24, 16) && IS_ALIGNED(src_stride_rgb24, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
} else
@@ -172,9 +172,9 @@ __asm {
movdqa xmm4, kShuffleMaskRGB24ToARGB
convertloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
-    movdqa xmm3, [eax + 32]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    movdqu xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
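
Per pixel, the SSSE3 loop above computes the same result as this scalar sketch (a reference illustration with a hypothetical name, not libyuv's actual C fallback): 48 source bytes become 16 four-byte pixels, with the alpha channel forced to opaque.

#include <stdint.h>

// Scalar sketch of one RGB24-to-ARGB row: copy three colour bytes, append alpha.
void RGB24ToARGBRow_Scalar(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[0] = src[0];  // colour byte 0
    dst[1] = src[1];  // colour byte 1
    dst[2] = src[2];  // colour byte 2
    dst[3] = 255;     // alpha, fully opaque
    src += 3;
    dst += 4;
  }
}
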
@@ -211,9 +211,9 @@ __asm {
movdqa xmm4, kShuffleMaskRAWToARGB
convertloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
-    movdqa xmm3, [eax + 32]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    movdqu xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
@@ -270,7 +270,7 @@ __asm {
sub edx, eax
convertloop:
-    movdqa xmm0, [eax] // fetch 8 pixels of bgr565
+    movdqu xmm0, [eax] // fetch 8 pixels of bgr565
movdqa xmm1, xmm0
movdqa xmm2, xmm0
pand xmm1, xmm3 // R in upper 5 bits
@@ -320,7 +320,7 @@ __asm {
sub edx, eax
convertloop:
-    movdqa xmm0, [eax] // fetch 8 pixels of 1555
+    movdqu xmm0, [eax] // fetch 8 pixels of 1555
movdqa xmm1, xmm0
movdqa xmm2, xmm0
psllw xmm1, 1 // R in upper 5 bits
@@ -366,7 +366,7 @@ __asm {
sub edx, eax
convertloop:
-    movdqa xmm0, [eax] // fetch 8 pixels of bgra4444
+    movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
movdqa xmm2, xmm0
pand xmm0, xmm4 // mask low nibbles
pand xmm2, xmm5 // mask high nibbles
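
The pattern across all the assembly hunks is the same: source fetches change from movdqa, which faults on a misaligned address, to movdqu, which accepts any address, while register-to-register movdqa copies stay (alignment is irrelevant between registers). The intrinsic equivalents of the two loads, for illustration:

#include <emmintrin.h>  // SSE2 intrinsics
#include <stdint.h>

// _mm_load_si128 compiles to movdqa and requires a 16-byte-aligned address;
// _mm_loadu_si128 compiles to movdqu and works for any address, at some cost
// on older CPUs, which is why the aligned fast paths are kept.
__m128i Load16Bytes(const uint8_t* src, bool aligned) {
  if (aligned) {
    return _mm_load_si128(reinterpret_cast<const __m128i*>(src));
  }
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
}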