Commit 16a96645 authored by fbarchard@google.com's avatar fbarchard@google.com

SplitUV and MirrorUV in row use 2 pixels at a time in C

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/432006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@201 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f69e90a1
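
For reference, a minimal standalone sketch of the two-pixels-at-a-time pattern this change applies to the C row functions, with a one-pixel tail for odd widths (hypothetical harness; plain unsigned char rather than libyuv's uint8 typedef):

#include <assert.h>

// Deinterleave a packed UV row (U0 V0 U1 V1 ...) into planar U and V,
// two pixels per loop iteration, as SplitUV_C does after this change.
static void SplitUVSketch(const unsigned char* src_uv,
                          unsigned char* dst_u, unsigned char* dst_v,
                          int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_u[x] = src_uv[0];
    dst_u[x + 1] = src_uv[2];
    dst_v[x] = src_uv[1];
    dst_v[x + 1] = src_uv[3];
    src_uv += 4;
  }
  if (width & 1) {  // odd width: one trailing pixel remains
    dst_u[width - 1] = src_uv[0];
    dst_v[width - 1] = src_uv[1];
  }
}

int main(void) {
  const unsigned char uv[6] = {10, 20, 11, 21, 12, 22};  // 3 UV pixels
  unsigned char u[3], v[3];
  SplitUVSketch(uv, u, v, 3);
  assert(u[0] == 10 && u[1] == 11 && u[2] == 12);
  assert(v[0] == 20 && v[1] == 21 && v[2] == 22);
  return 0;
}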
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 200
Version: 201
License: BSD
License File: LICENSE
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 200
#define LIBYUV_VERSION 201
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -23,12 +23,6 @@ extern "C" {
#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(YUV_DISABLE_ASM)
// Note: static const is preferred, but it causes an internal compiler error on gcc 4.2
// Shuffle table for reversing the bytes of UV channels.
uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
@@ -759,8 +753,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int w) {
int i;
for (i = 0; i < w; ++i) {
for (int i = 0; i < w; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
@@ -777,9 +770,8 @@ static void TransposeWx8_C(const uint8* src, int src_stride,
static void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i, j;
for (i = 0; i < width; ++i)
for (j = 0; j < height; ++j)
for (int i = 0; i < width; ++i)
for (int j = 0; j < height; ++j)
dst[i * dst_stride + j] = src[j * src_stride + i];
}
@@ -1005,79 +997,6 @@ void RotateUV270(const uint8* src, int src_stride,
width, height);
}
#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_UV_SSSE3
__declspec(naked)
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
mov edx, [esp + 4 + 8] // dst_a
mov edi, [esp + 4 + 12] // dst_b
mov ecx, [esp + 4 + 16] // width
movdqa xmm1, kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16]
sub edi, edx
convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
ja convertloop
pop edi
ret
}
}
#elif (defined(__i386__) || defined(__x86_64__)) && \
!defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_UV_SSSE3
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %4,%%xmm1 \n"
"lea -16(%0,%3,2),%0 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -16(%0),%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
"movlpd %%xmm0,(%1) \n"
"movhpd %%xmm0,(%1,%2) \n"
"lea 8(%1),%1 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
#endif
static void MirrorRowUV_C(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
src += (width << 1) - 2;
for (int i = 0; i < width; ++i) {
dst_a[i] = src[0];
dst_b[i] = src[1];
src -= 2;
}
}
void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
@@ -19,94 +19,6 @@ extern "C" {
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// compute where to start writing destination
"add %1, %2 \n"
// work on segments that are multiples of 16
"lsrs r3, %2, #4 \n"
// the output is written in two blocks: 8 bytes followed
// by another 8. reading is done sequentially, from left to
// right. writing is done from right to left in 16 byte blocks.
// %1, the destination pointer, is incremented after writing
// the first of the two blocks, so subtract that 8 off
// along with 16 to get the next location.
"mov r3, #-24 \n"
"beq 2f \n"
// back up the destination by the size of the register that is
// going to be mirrored
"sub %1, #16 \n"
// the loop needs to run on blocks of 16. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %2, #16 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // src += 16
// mirror the bytes in each 64 bit segment. it is not
// possible to mirror the entire 128 bits in one go.
"vrev64.8 q0, q0 \n"
// because the entire 128 bits cannot be mirrored in one go,
// swap the order in which the two 64 bit segments are written.
"vst1.8 {d1}, [%1]! \n"
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16
"subs %2, #16 \n"
"bge 1b \n"
// add 16 back to the counter. if the result is 0 there are no
// residuals so jump past
"adds %2, #16 \n"
"beq 5f \n"
"add %1, #16 \n"
"2: \n"
"mov r3, #-3 \n"
"sub %1, #2 \n"
"subs %2, #2 \n"
// check for 16*n+1 scenarios where segments_of_2 should not
// be run, but there is something left over.
"blt 4f \n"
// do this in neon registers as per
// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
"vst1.8 {d1[0]}, [%1]! \n"
"vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2
"subs %2, #2 \n"
"bge 3b \n"
"adds %2, #2 \n"
"beq 5f \n"
"4: \n"
"add %1, #1 \n"
"vld1.8 {d0[0]}, [%0] \n"
"vst1.8 {d0[0]}, [%1] \n"
"5: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "r3", "q0"
);
}
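
The comments above follow the ARM "dealing with leftovers" approach: a wide loop on 16-byte blocks, then 2-byte pairs, then a final single byte. A rough scalar sketch of that control flow for the byte mirror (illustration only, not the shipped code):

static void MirrorRowSketch(const unsigned char* src, unsigned char* dst,
                            int width) {
  dst += width;               // start writing at the right edge of the row
  while (width >= 16) {       // main loop: whole 16-byte blocks
    for (int i = 0; i < 16; ++i) {
      dst[-1 - i] = src[i];   // copy the block reversed
    }
    src += 16;
    dst -= 16;
    width -= 16;
  }
  while (width >= 2) {        // leftovers: 2-byte pairs
    dst[-1] = src[0];
    dst[-2] = src[1];
    src += 2;
    dst -= 2;
    width -= 2;
  }
  if (width) {                // a final odd byte, if any
    dst[-1] = src[0];
  }
}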
static const uvec8 vtbl_4x4_transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
@@ -272,80 +184,6 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
);
}
void MirrorRowUV_NEON(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
asm volatile (
// compute where to start writing destination
"add %1, %3 \n" // dst_a + width
"add %2, %3 \n" // dst_b + width
// work on input segments that are multiples of 16, but the
// width that has been passed counts output pixels, which are
// half the size of the input.
"lsrs r12, %3, #3 \n"
"beq 2f \n"
// the output is written into two blocks.
"mov r12, #-8 \n"
// back up the destination by the size of the register that is
// going to be mirrored
"sub %1, #8 \n"
"sub %2, #8 \n"
// the loop needs to run on blocks of 8. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %3, #8 \n"
"1: \n"
"vld2.8 {d0, d1}, [%0]! \n" // src += 16
// mirror the bytes in the 64 bit segments
"vrev64.8 q0, q0 \n"
"vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
"vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
"subs %3, #8 \n"
"bge 1b \n"
// add 8 back to the counter. if the result is 0 there are no
// residuals so return
"adds %3, #8 \n"
"beq 4f \n"
"add %1, #8 \n"
"add %2, #8 \n"
"2: \n"
"mov r12, #-1 \n"
"sub %1, #1 \n"
"sub %2, #1 \n"
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
"vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
"vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1
"subs %3, %3, #1 \n"
"bgt 3b \n"
"4: \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
:
: "memory", "cc", "r12", "q0"
);
}
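
In the main loop above, vld2.8 deinterleaves on load (U bytes to d0, V bytes to d1) and vrev64.8 then reverses the bytes within each 64-bit register, so each store writes 8 already-mirrored samples. A scalar model of one iteration (illustration only):

// One iteration: read 8 UV pairs, emit 8 mirrored U and 8 mirrored V bytes.
static void MirrorUVBlock8(const unsigned char* src_uv,
                           unsigned char* dst_u, unsigned char* dst_v) {
  unsigned char d0[8], d1[8];
  for (int i = 0; i < 8; ++i) {  // vld2.8: deinterleave into d0/d1
    d0[i] = src_uv[2 * i];
    d1[i] = src_uv[2 * i + 1];
  }
  for (int i = 0; i < 8; ++i) {  // vrev64.8 + vst1.8: store reversed
    dst_u[i] = d0[7 - i];
    dst_v[i] = d1[7 - i];
  }
}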
static const uvec8 vtbl_4x4_transpose_di =
{ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
@@ -54,6 +54,7 @@ extern "C" {
#define HAS_I444TOARGBROW_SSSE3
#define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORROW_SSE2
#define HAS_MIRRORROW_UV_SSSE3
#define HAS_SPLITUV_SSE2
#define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
@@ -66,6 +67,7 @@ extern "C" {
// The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORROW_UV_NEON
#define HAS_SPLITUV_NEON
#define HAS_COPYROW_NEON
#define HAS_I420TOARGBROW_NEON
@@ -126,6 +128,10 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
void MirrorRow_C(const uint8* src, uint8* dst, int width);
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
void MirrorRowUV_NEON(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
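
Callers select among these variants per CPU at runtime via a function pointer. A sketch of that dispatch shape (MirrorUVPlane is a hypothetical wrapper; TestCpuFlag and IS_ALIGNED are libyuv's CPU-flag and alignment helpers, and the exact checks at each call site may differ):

void MirrorUVPlane(const uint8* src, uint8* dst_u, uint8* dst_v, int width) {
  void (*MirrorRowUV)(const uint8*, uint8*, uint8*, int) = MirrorRowUV_C;
#if defined(HAS_MIRRORROW_UV_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MirrorRowUV = MirrorRowUV_NEON;
  }
#endif
#if defined(HAS_MIRRORROW_UV_SSSE3)
  // SSSE3 version handles 8 UV pairs per loop from an aligned 16-byte load.
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 8) && IS_ALIGNED(src, 16)) {
    MirrorRowUV = MirrorRowUV_SSSE3;
  }
#endif
  MirrorRowUV(src, dst_u, dst_v, width);
}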
@@ -18,8 +18,8 @@ namespace libyuv {
extern "C" {
#endif
void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
// To support in-place conversion.
uint8 r = src_abgr[0];
uint8 g = src_abgr[1];
@@ -34,8 +34,8 @@ void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
}
}
void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
// To support in-place conversion.
uint8 a = src_bgra[0];
uint8 r = src_bgra[1];
@@ -50,8 +50,8 @@ void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
}
}
void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_rgb24[0];
uint8 g = src_rgb24[1];
uint8 r = src_rgb24[2];
@@ -64,8 +64,8 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix) {
}
}
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 r = src_raw[0];
uint8 g = src_raw[1];
uint8 b = src_raw[2];
@@ -78,8 +78,8 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
}
}
void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_rgb[0] & 0x1f;
uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3);
uint8 r = src_rgb[1] >> 3;
@@ -92,8 +92,8 @@ void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
}
}
void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_rgb[0] & 0x1f;
uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3);
uint8 r = (src_rgb[1] & 0x7c) >> 2;
@@ -107,8 +107,8 @@ void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
}
}
void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 a = src_rgb[1] >> 4;
uint8 r = src_rgb[1] & 0x0f;
uint8 g = src_rgb[0] >> 4;
@@ -122,8 +122,8 @@ void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
}
}
void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0];
uint8 g = src_argb[1];
uint8 r = src_argb[2];
@@ -135,8 +135,8 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0];
uint8 g = src_argb[1];
uint8 r = src_argb[2];
@@ -149,8 +149,8 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
// TODO(fbarchard): support big endian CPU
void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0] >> 3;
uint8 g = src_argb[1] >> 2;
uint8 r = src_argb[2] >> 3;
@@ -160,8 +160,8 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0] >> 3;
uint8 g = src_argb[1] >> 3;
uint8 r = src_argb[2] >> 3;
@@ -172,8 +172,8 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0] >> 4;
uint8 g = src_argb[1] >> 4;
uint8 r = src_argb[2] >> 4;
@@ -233,9 +233,9 @@ MAKEROWY(ARGB,2,1,0)
MAKEROWY(BGRA,1,2,3)
MAKEROWY(ABGR,0,1,2)
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
// Copy a Y to RGB.
for (int x = 0; x < pix; ++x) {
for (int x = 0; x < width; ++x) {
uint8 y = src_y[0];
dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
dst_argb[3] = 255u;
@@ -360,20 +360,42 @@ void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {
void MirrorRow_C(const uint8* src, uint8* dst, int width) {
src += width - 1;
for (int i = 0; i < width; ++i) {
dst[i] = src[0];
--src;
for (int x = 0; x < width - 1; x += 2) {
dst[x] = src[0];
dst[x + 1] = src[-1];
src -= 2;
}
if (width & 1) {
dst[width - 1] = src[0];
}
}
void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
// Copy a row of UV.
for (int x = 0; x < pix; ++x) {
dst_u[0] = src_uv[0];
dst_v[0] = src_uv[1];
src_uv += 2;
dst_u += 1;
dst_v += 1;
void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
src_uv += (width - 1) << 1;
for (int x = 0; x < width - 1; x += 2) {
dst_u[x] = src_uv[0];
dst_u[x + 1] = src_uv[-2];
dst_v[x] = src_uv[1];
dst_v[x + 1] = src_uv[-2 + 1];
src_uv -= 4;
}
if (width & 1) {
dst_u[width - 1] = src_uv[0];
dst_v[width - 1] = src_uv[1];
}
}
void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
for (int x = 0; x < width - 1; x += 2) {
dst_u[x] = src_uv[0];
dst_u[x + 1] = src_uv[2];
dst_v[x] = src_uv[1];
dst_v[x + 1] = src_uv[3];
src_uv += 4;
}
if (width & 1) {
dst_u[width - 1] = src_uv[0];
dst_v[width - 1] = src_uv[1];
}
}
@@ -383,9 +405,9 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {
// Filter 2 rows of YUY2 UV's (422) into U and V (420)
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values, filtering 2 rows of YUY2
for (int x = 0; x < pix; x += 2) {
for (int x = 0; x < width; x += 2) {
dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
src_yuy2 += 4;
@@ -394,20 +416,22 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
}
}
void YUY2ToYRow_C(const uint8* src_yuy2,
uint8* dst_y, int pix) {
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
// Copy a row of yuy2 Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_yuy2[0];
src_yuy2 += 2;
dst_y += 1;
for (int x = 0; x < width - 1; x += 2) {
dst_y[x] = src_yuy2[0];
dst_y[x + 1] = src_yuy2[2];
src_yuy2 += 4;
}
if (width & 1) {
dst_y[width - 1] = src_yuy2[0];
}
}
void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values, filtering 2 rows of UYVY
for (int x = 0; x < pix; x += 2) {
for (int x = 0; x < width; x += 2) {
dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
src_uyvy += 4;
@@ -416,13 +440,15 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
}
}
void UYVYToYRow_C(const uint8* src_uyvy,
uint8* dst_y, int pix) {
void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
// Copy a row of uyvy Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_uyvy[1];
src_uyvy += 2;
dst_y += 1;
for (int x = 0; x < width - 1; x += 2) {
dst_y[x] = src_uyvy[1];
dst_y[x + 1] = src_uyvy[3];
src_uyvy += 4;
}
if (width & 1) {
dst_y[width - 1] = src_uyvy[1];
}
}
@@ -1493,7 +1493,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
#endif
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleMirror = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
@@ -1524,7 +1523,6 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
#endif
#ifdef HAS_MIRRORROW_SSE2
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
@@ -1554,6 +1552,40 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
}
#endif
#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
CONST uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %4,%%xmm1 \n"
"lea -16(%0,%3,2),%0 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -16(%0),%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
"movlpd %%xmm0,(%1) \n"
"movhpd %%xmm0,(%1,%2) \n"
"lea 8(%1),%1 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
#endif
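
The kShuffleMirrorUV table above does double duty: a single pshufb both mirrors the 8 UV pairs and deinterleaves them, leaving mirrored U in the low 8 bytes and mirrored V in the high 8, which is why one movlpd/movhpd pair can store to the two planes. A scalar model of the shuffle (ignoring pshufb's zeroing when bit 7 of a mask byte is set, which is unused here):

// dst[i] = src[mask[i] & 15] for each of the 16 byte lanes.
static void PshufbModel(const unsigned char src[16],
                        const unsigned char mask[16],
                        unsigned char dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = src[mask[i] & 15];
  }
}
// With kShuffleMirrorUV, lanes 0-7 pick bytes 14,12,...,0 (U mirrored) and
// lanes 8-15 pick bytes 15,13,...,1 (V mirrored) from the 16 loaded bytes.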
#ifdef HAS_SPLITUV_SSE2
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
@@ -1501,7 +1501,6 @@ __asm {
#endif
#ifdef HAS_MIRRORROW_SSE2
// SSE2 version has movdqu so it can be used on unaligned buffers when the
// SSSE3 version cannot.
__declspec(naked)
@@ -1529,6 +1528,41 @@ __asm {
}
#endif
#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
__declspec(naked)
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
movdqa xmm1, kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16]
sub edi, edx
convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
ja convertloop
pop edi
ret
}
}
#endif
#ifdef HAS_SPLITUV_SSE2
__declspec(naked)
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {