Commit 1dafd444 authored by fbarchard@google.com's avatar fbarchard@google.com

MergeUV backport to SSE2

BUG=135
TESTED=unittest I420ToNV12
Review URL: https://webrtc-codereview.appspot.com/943006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@447 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 6bb9f53f
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 446
Version: 447
License: BSD
License File: LICENSE
......
......@@ -86,6 +86,7 @@ extern "C" {
#define HAS_YUY2TOYROW_SSE2
#define HAS_I422TOYUY2ROW_SSE2
#define HAS_I422TOUYVYROW_SSE2
#define HAS_MERGEUV_SSE2
// Effects
#define HAS_ARGBAFFINEROW_SSE2
......@@ -120,6 +121,7 @@ extern "C" {
#define HAS_UYVYTOYROW_AVX2
#define HAS_YUY2TOYROW_MMX
#define HAS_UYVYTOYROW_MMX
#define HAS_MERGEUV_SSE2
#endif
// The following are disabled when SSSE3 is available:
......@@ -311,6 +313,8 @@ void SplitUV_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 446
#define LIBYUV_VERSION 447
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -521,7 +521,14 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
int halfwidth = (width + 1) >> 1;
void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUV_C;
#if defined(HAS_SPLITUV_NEON)
#if defined(HAS_MERGEUV_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 16) &&
IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
MergeUV = MergeUV_SSE2;
}
#elif defined(HAS_MERGEUV_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
MergeUV = MergeUV_NEON;
}
......@@ -529,7 +536,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
int halfheight = (height + 1) >> 1;
for (int y = 0; y < halfheight; ++y) {
// Copy a row of UV.
// Merge a row of U and V into a row of UV.
MergeUV(src_u, src_v, dst_uv, halfwidth);
src_u += src_stride_u;
src_v += src_stride_v;
......
......@@ -2547,6 +2547,37 @@ void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
}
#endif // HAS_SPLITUV_SSE2
#ifdef HAS_MERGEUV_SSE2
// Interleaves one row of U bytes with one row of V bytes into a packed UV
// row (U0 V0 U1 V1 ...), handling 16 U/V pairs per loop iteration.
//
// src_u  - source U row; read 16 bytes at a time with movdqa.
// src_v  - source V row; read 16 bytes at a time with movdqa.
// dst_uv - destination row; 32 bytes are written per iteration with movdqa.
// width  - number of U/V pairs; decremented by 16 per iteration.
//
// Every load and store is movdqa, which requires 16-byte aligned addresses,
// so all three pointers must be 16-byte aligned. The loop body executes
// before width is tested, so one full iteration always runs even when
// width <= 0, and a width that is not a multiple of 16 is rounded up.
void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
asm volatile (
// Turn src_v into the offset (src_v - src_u) so that (%0,%1,1) addresses
// the V row while only the U pointer has to be advanced.
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
// xmm0 = 16 U bytes, xmm1 = the matching 16 V bytes.
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%1,1),%%xmm1 \n"
"lea 0x10(%0),%0 \n"
// Keep a second copy of U because each unpack overwrites its destination.
"movdqa %%xmm0,%%xmm2 \n"
// xmm0 = UV pairs 0..7, xmm2 = UV pairs 8..15.
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm2 \n"
"movdqa %%xmm0,(%2) \n"
"movdqa %%xmm2,0x10(%2) \n"
"lea 0x20(%2),%2 \n"
// 16 pairs consumed; loop while the remaining count is still positive.
"sub $0x10,%3 \n"
"jg 1b \n"
// All four operands are read-write ("+r") because the asm modifies them.
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
:
: "memory", "cc"
// The xmm clobbers are only declared when the compiler targets SSE2.
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2"
#endif
);
}
#endif // HAS_MERGEUV_SSE2
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
asm volatile (
......
......@@ -2620,6 +2620,38 @@ void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
}
#endif // HAS_SPLITUV_SSE2
#ifdef HAS_MERGEUV_SSE2
// Interleaves one row of U bytes with one row of V bytes into a packed UV
// row (U0 V0 U1 V1 ...), handling 16 U/V pairs per loop iteration.
//
// src_u  - source U row; read 16 bytes at a time with movdqa.
// src_v  - source V row; read 16 bytes at a time with movdqa.
// dst_uv - destination row; 32 bytes are written per iteration with movdqa.
// width  - number of U/V pairs; decremented by 16 per iteration.
//
// Every load and store is movdqa, which requires 16-byte aligned addresses,
// so all three pointers must be 16-byte aligned. The loop body executes
// before width is tested, so one full iteration always runs.
//
// The function is naked: there is no compiler-generated prologue/epilogue,
// so arguments are fetched from the stack by hand and the function returns
// with an explicit ret.
__declspec(naked) __declspec(align(16))
void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
__asm {
// edi is used as the destination pointer; save the caller's value.
push edi
// Argument offsets: the first "4" skips the edi just pushed, the second
// term is the argument's offset past the return address.
mov eax, [esp + 4 + 4] // src_u
mov edx, [esp + 4 + 8] // src_v
mov edi, [esp + 4 + 12] // dst_uv
mov ecx, [esp + 4 + 16] // width
// edx = src_v - src_u, so [eax + edx] addresses V while only eax advances.
sub edx, eax
align 16
convertloop:
movdqa xmm0, [eax] // read 16 U's
movdqa xmm1, [eax + edx] // and 16 V's
lea eax, [eax + 16]
// Second copy of U: each unpack overwrites its destination register.
movdqa xmm2, xmm0
punpcklbw xmm0, xmm1 // first 8 UV pairs
punpckhbw xmm2, xmm1 // next 8 UV pairs
movdqa [edi], xmm0
movdqa [edi + 16], xmm2
lea edi, [edi + 32]
// 16 pairs consumed; loop while the remaining count is still positive.
sub ecx, 16
jg convertloop
pop edi
ret
}
}
#endif // HAS_MERGEUV_SSE2
#ifdef HAS_COPYROW_SSE2
// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
__declspec(naked) __declspec(align(16))
......
......@@ -17,6 +17,8 @@
SECTION .text
; cglobal numeric constants are parameters, gpr regs, mm regs
; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
%macro YUY2TOYROW 2-3
......@@ -63,8 +65,7 @@ YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
; void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
; int pix) {
; void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
%macro SPLITUV 1-2
cglobal SplitUV%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
......@@ -103,3 +104,36 @@ INIT_YMM AVX2
SPLITUV a,
SPLITUV u,_Unaligned
; void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                   int width);
;
; Interleaves a row of U bytes with a row of V bytes into a packed UV row
; (U0 V0 U1 V1 ...), processing mmsize U/V pairs per loop iteration.
;
; Macro parameters:
;   %1  'a' or 'u' - selects mova (aligned) or movu (unaligned) memory access.
;   %2  optional suffix appended to the exported function name.
;
; Registers:
;   src_uq  - current U pointer, advanced by mmsize per iteration.
;   src_vq  - converted to the offset (src_v - src_u) so that
;             [src_uq + src_vq] addresses V without a second pointer update.
;   dst_uvq - current output pointer, advanced by 2 * mmsize per iteration.
;   pixd    - remaining pair count; the loop body runs before it is tested,
;             so one full iteration always executes.
;   m0..m2  - vector scratch.

%macro MERGEUV 1-2
cglobal MergeUV%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq                ; src_vq = src_v - src_u (offset)

    ALIGN      16
.convertloop:
    mov%1      m0, [src_uq]                  ; mmsize U bytes
    mov%1      m1, [src_uq + src_vq]         ; matching mmsize V bytes
    lea        src_uq, [src_uq + mmsize]
    mova       m2, m0                        ; keep U for the high-half unpack
    punpcklbw  m0, m0, m1                    ; low-half UV pairs (per 128-bit lane)
    punpckhbw  m2, m2, m1                    ; high-half UV pairs (per 128-bit lane)
%if mmsize == 32
    ; The 256-bit unpacks interleave within each 128-bit lane, so m0 holds
    ; pairs 0-7 | 16-23 and m2 holds pairs 8-15 | 24-31. Regroup the lanes so
    ; the stores write the pairs in ascending order.
    vperm2i128 m1, m0, m2, 0x20              ; m0.low  : m2.low  = pairs 0-15
    vperm2i128 m2, m0, m2, 0x31              ; m0.high : m2.high = pairs 16-31
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m0
    mov%1      [dst_uvq + mmsize], m2
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize                  ; mmsize pairs consumed
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
MERGEUV a,
MERGEUV u,_Unaligned
INIT_XMM SSE2
MERGEUV a,
MERGEUV u,_Unaligned
INIT_YMM AVX2
MERGEUV a,
MERGEUV u,_Unaligned
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment