Commit 1dafd444 authored by fbarchard@google.com

MergeUV backport to SSE2

BUG=135
TESTED=unittest I420ToNV12
Review URL: https://webrtc-codereview.appspot.com/943006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@447 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 6bb9f53f
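For reference, MergeUV is the inverse of SplitUV: it interleaves a row of planar U samples and a row of planar V samples into the packed UV layout that NV12 expects. Below is a minimal scalar sketch of the operation, written against the same signature as libyuv's reference MergeUV_C; this standalone version is for illustration only, not the library's code.

    #include <stdint.h>

    // Interleave 'width' U/V pairs: dst_uv = U0 V0 U1 V1 ...
    static void MergeUV_Reference(const uint8_t* src_u, const uint8_t* src_v,
                                  uint8_t* dst_uv, int width) {
      for (int x = 0; x < width; ++x) {
        dst_uv[0] = src_u[x];  // U sample for this chroma position
        dst_uv[1] = src_v[x];  // V sample for this chroma position
        dst_uv += 2;
      }
    }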
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 446
+Version: 447
 License: BSD
 License File: LICENSE
...
@@ -86,6 +86,7 @@ extern "C" {
 #define HAS_YUY2TOYROW_SSE2
 #define HAS_I422TOYUY2ROW_SSE2
 #define HAS_I422TOUYVYROW_SSE2
+#define HAS_MERGEUV_SSE2
 // Effects
 #define HAS_ARGBAFFINEROW_SSE2
@@ -120,6 +121,7 @@ extern "C" {
 #define HAS_UYVYTOYROW_AVX2
 #define HAS_YUY2TOYROW_MMX
 #define HAS_UYVYTOYROW_MMX
+#define HAS_MERGEUV_SSE2
 #endif
 // The following are disabled when SSSE3 is available:
@@ -311,6 +313,8 @@ void SplitUV_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                int width);
+void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width);
 void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width);
...
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 446
+#define LIBYUV_VERSION 447
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -521,7 +521,14 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   int halfwidth = (width + 1) >> 1;
   void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width) = MergeUV_C;
-#if defined(HAS_SPLITUV_NEON)
+#if defined(HAS_MERGEUV_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 16) &&
+      IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+      IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+      IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+    MergeUV = MergeUV_SSE2;
+  }
+#elif defined(HAS_MERGEUV_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
     MergeUV = MergeUV_NEON;
   }
@@ -529,7 +536,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   int halfheight = (height + 1) >> 1;
   for (int y = 0; y < halfheight; ++y) {
-    // Copy a row of UV.
+    // Merge a row of U and V into a row of UV.
     MergeUV(src_u, src_v, dst_uv, halfwidth);
     src_u += src_stride_u;
     src_v += src_stride_v;
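The SSE2 row function uses aligned movdqa loads and stores and handles 16 UV pairs per iteration, so the dispatch above only selects it when halfwidth is a multiple of 16 and every pointer and stride is 16-byte aligned; any other input falls back to MergeUV_C. The IS_ALIGNED macro comes from libyuv's headers; the definition below is an assumed equivalent, shown for illustration:

    #include <stdint.h>

    // Assumed equivalent of libyuv's IS_ALIGNED (basic_types.h): true when
    // 'p' (a pointer or integer) is a multiple of the power-of-two 'a'.
    #define IS_ALIGNED(p, a) ((((uintptr_t)(p)) & ((a) - 1)) == 0)

    // Example: width 1280 gives halfwidth 640 (a multiple of 16), so aligned
    // buffers take the SSE2 path; width 1282 gives halfwidth 641 and stays
    // on MergeUV_C.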
...
@@ -2547,6 +2547,37 @@ void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 }
 #endif  // HAS_SPLITUV_SSE2
+#ifdef HAS_MERGEUV_SSE2
+void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  asm volatile (
+    "sub        %0,%1                        \n"  // src_v becomes an offset
+    ".p2align  4                             \n"
+  "1:                                        \n"
+    "movdqa    (%0),%%xmm0                   \n"  // read 16 U's
+    "movdqa    (%0,%1,1),%%xmm1              \n"  // and 16 V's
+    "lea       0x10(%0),%0                   \n"
+    "movdqa    %%xmm0,%%xmm2                 \n"
+    "punpcklbw %%xmm1,%%xmm0                 \n"  // first 8 UV pairs
+    "punpckhbw %%xmm1,%%xmm2                 \n"  // next 8 UV pairs
+    "movdqa    %%xmm0,(%2)                   \n"
+    "movdqa    %%xmm2,0x10(%2)               \n"
+    "lea       0x20(%2),%2                   \n"
+    "sub       $0x10,%3                      \n"
+    "jg        1b                            \n"
+  : "+r"(src_u),   // %0
+    "+r"(src_v),   // %1
+    "+r"(dst_uv),  // %2
+    "+r"(width)    // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
+  );
+}
+#endif  // HAS_MERGEUV_SSE2
 #ifdef HAS_COPYROW_SSE2
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   asm volatile (
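For readers who prefer intrinsics to AT&T-syntax inline assembly, the MergeUV_SSE2 loop above maps directly onto _mm_unpacklo_epi8/_mm_unpackhi_epi8. This rendering is not part of the commit, just a sketch of the same algorithm under the same contract (16-byte-aligned pointers, width a multiple of 16):

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    static void MergeUV_SSE2_Intrinsics(const uint8_t* src_u,
                                        const uint8_t* src_v,
                                        uint8_t* dst_uv, int width) {
      for (int x = 0; x < width; x += 16) {
        __m128i u = _mm_load_si128((const __m128i*)(src_u + x));  // 16 U's
        __m128i v = _mm_load_si128((const __m128i*)(src_v + x));  // 16 V's
        // punpcklbw / punpckhbw: byte-interleave the low and high halves.
        __m128i uv_lo = _mm_unpacklo_epi8(u, v);  // U0V0 .. U7V7
        __m128i uv_hi = _mm_unpackhi_epi8(u, v);  // U8V8 .. U15V15
        _mm_store_si128((__m128i*)(dst_uv + 2 * x), uv_lo);
        _mm_store_si128((__m128i*)(dst_uv + 2 * x + 16), uv_hi);
      }
    }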
...
@@ -2620,6 +2620,38 @@ void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 }
 #endif  // HAS_SPLITUV_SSE2
+#ifdef HAS_MERGEUV_SSE2
+__declspec(naked) __declspec(align(16))
+void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax              // src_v becomes an offset from src_u
+    align      16
+  convertloop:
+    movdqa     xmm0, [eax]           // read 16 U's
+    movdqa     xmm1, [eax + edx]     // and 16 V's
+    lea        eax, [eax + 16]
+    movdqa     xmm2, xmm0
+    punpcklbw  xmm0, xmm1            // first 8 UV pairs
+    punpckhbw  xmm2, xmm1            // next 8 UV pairs
+    movdqa     [edi], xmm0
+    movdqa     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+    pop        edi
+    ret
+  }
+}
+#endif  // HAS_MERGEUV_SSE2
 #ifdef HAS_COPYROW_SSE2
 // CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
 __declspec(naked) __declspec(align(16))
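The Windows block above mirrors the GCC version instruction for instruction. A quick way to sanity-check either variant, in the spirit of the I420ToNV12 unittest cited in the commit message (this harness is hypothetical, not the project's test), is to compare it against a scalar reference on 16-byte-aligned buffers:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    // Hypothetical harness: MergeUV_SSE2 is declared in the row.h diff above,
    // and MergeUV_Reference is the scalar sketch shown earlier. Width 64 is a
    // multiple of 16 and the arrays are 16-byte aligned, so the SSE2 contract
    // is satisfied.
    void CheckMergeUV(void) {
      _Alignas(16) uint8_t u[64], v[64];
      _Alignas(16) uint8_t got[128], want[128];
      for (int i = 0; i < 64; ++i) {
        u[i] = (uint8_t)i;
        v[i] = (uint8_t)(255 - i);
      }
      MergeUV_Reference(u, v, want, 64);
      MergeUV_SSE2(u, v, got, 64);
      assert(memcmp(got, want, sizeof(got)) == 0);
    }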
...
;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
@@ -17,6 +17,8 @@
 SECTION .text
+; cglobal numeric constants are parameters, gpr regs, mm regs
 ; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
 %macro YUY2TOYROW 2-3
@@ -63,8 +65,7 @@ YUY2TOYROW YUY2,u,_Unaligned
 YUY2TOYROW UYVY,a,
 YUY2TOYROW UYVY,u,_Unaligned
-; void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-;                             int pix) {
+; void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 %macro SPLITUV 1-2
 cglobal SplitUV%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
@@ -103,3 +104,36 @@ INIT_YMM AVX2
 SPLITUV a,
 SPLITUV u,_Unaligned
+; void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+;                   int width);
+%macro MERGEUV 1-2
+cglobal MergeUV%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+    sub    src_vq, src_uq             ; src_v becomes an offset from src_u
+    ALIGN  16
+.convertloop:
+    mov%1  m0, [src_uq]               ; read 16 U's
+    mov%1  m1, [src_uq + src_vq]      ; and 16 V's
+    lea    src_uq, [src_uq + mmsize]
+    mova   m2, m0
+    punpcklbw m0, m0, m1              ; first 8 UV pairs
+    punpckhbw m2, m2, m1              ; next 8 UV pairs
+    mov%1  [dst_uvq], m0
+    mov%1  [dst_uvq + mmsize], m2
+    lea    dst_uvq, [dst_uvq + mmsize * 2]
+    sub    pixd, mmsize
+    jg     .convertloop
+    REP_RET
+%endmacro
+INIT_MMX MMX
+MERGEUV a,
+MERGEUV u,_Unaligned
+INIT_XMM SSE2
+MERGEUV a,
+MERGEUV u,_Unaligned
+INIT_YMM AVX2
+MERGEUV a,
+MERGEUV u,_Unaligned