Commit e6dd1fa0 authored by fbarchard@google.com's avatar fbarchard@google.com

Port I420ToARGB to intrinsics for win64

BUG=336
TESTED=out\release_x64\libyuv_unittest --gunit_also_run_disabled_tests --gtest_filter=*I420To*B*
R=bryan.bernhart@intel.com, tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/15809005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1018 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f67b426b
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1017
Version: 1018
License: BSD
License File: LICENSE
......
......@@ -152,6 +152,11 @@ extern "C" {
#define HAS_YUY2TOYROW_SSE2
#endif
// The following are available on x64 Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64)
#define HAS_I422TOARGBROW_SSSE3
#endif
// GCC >= 4.7.0 required for AVX2.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1017
#define LIBYUV_VERSION 1018
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -35,10 +35,12 @@ extern "C" {
}
#ifdef HAS_I422TOARGBROW_SSSE3
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
0, 4, 7)
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
1, 4, 7)
#endif // HAS_I422TOARGBROW_SSSE3
#ifdef HAS_I444TOARGBROW_SSSE3
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
0, 4, 7)
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
2, 4, 7)
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
......@@ -59,7 +61,7 @@ YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I422TOARGBROW_SSSE3
#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I422TOARGBROW_AVX2
YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
#endif // HAS_I422TOARGBROW_AVX2
......
......@@ -10,13 +10,177 @@
#include "libyuv/row.h"
#if defined (_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h> // For _mm_maddubs_epi16
#endif
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
// This module is for Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
#define UB 127 /* min(127,(int8)(2.018 * 64)) */
#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0
#define VB 0
#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128
static const vec8 kUVToB = {
UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};
static const vec8 kUVToR = {
UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};
static const vec8 kUVToG = {
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};
static const vec8 kVUToB = {
VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
};
static const vec8 kVUToR = {
VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
};
static const vec8 kVUToG = {
VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
};
static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
// 64 bit
#if defined(_M_X64)
// Aligned destination version.
__declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
__m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1);
const __m128i xmm4 = _mm_setzero_si128();
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
while (width > 0) {
xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
xmm1 = _mm_load_si128(&xmm0);
xmm2 = _mm_load_si128(&xmm0);
xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
xmm0 = _mm_adds_epi16(xmm0, xmm3);
xmm1 = _mm_adds_epi16(xmm1, xmm3);
xmm2 = _mm_adds_epi16(xmm2, xmm3);
xmm0 = _mm_srai_epi16(xmm0, 6);
xmm1 = _mm_srai_epi16(xmm1, 6);
xmm2 = _mm_srai_epi16(xmm2, 6);
xmm0 = _mm_packus_epi16(xmm0, xmm0);
xmm1 = _mm_packus_epi16(xmm1, xmm1);
xmm2 = _mm_packus_epi16(xmm2, xmm2);
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
xmm1 = _mm_load_si128(&xmm0);
xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
_mm_store_si128((__m128i *)dst_argb, xmm0);
_mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
y_buf += 8;
u_buf += 4;
dst_argb += 32;
width -= 8;
}
}
// Unaligned destination version.
void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
__m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1);
const __m128i xmm4 = _mm_setzero_si128();
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
while (width > 0) {
xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
xmm1 = _mm_load_si128(&xmm0);
xmm2 = _mm_load_si128(&xmm0);
xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
xmm0 = _mm_adds_epi16(xmm0, xmm3);
xmm1 = _mm_adds_epi16(xmm1, xmm3);
xmm2 = _mm_adds_epi16(xmm2, xmm3);
xmm0 = _mm_srai_epi16(xmm0, 6);
xmm1 = _mm_srai_epi16(xmm1, 6);
xmm2 = _mm_srai_epi16(xmm2, 6);
xmm0 = _mm_packus_epi16(xmm0, xmm0);
xmm1 = _mm_packus_epi16(xmm1, xmm1);
xmm2 = _mm_packus_epi16(xmm2, xmm2);
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
xmm1 = _mm_load_si128(&xmm0);
xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
_mm_storeu_si128((__m128i *)dst_argb, xmm0);
_mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
y_buf += 8;
u_buf += 4;
dst_argb += 32;
width -= 8;
}
}
// 32 bit
#else // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3
......@@ -2030,21 +2194,6 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
#endif // HAS_ARGBTOYROW_SSSE3
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
#define UB 127 /* min(63,(int8)(2.018 * 64)) */
#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0
#define VB 0
#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128
#ifdef HAS_I422TOARGBROW_AVX2
static const lvec8 kUVToB_AVX = {
......@@ -2079,10 +2228,10 @@ static const lvec16 kUVBiasR_AVX = {
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
__asm {
push esi
push edi
......@@ -2150,36 +2299,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_I422TOARGBROW_SSSE3
static const vec8 kUVToB = {
UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};
static const vec8 kUVToR = {
UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};
static const vec8 kUVToG = {
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};
static const vec8 kVUToB = {
VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
};
static const vec8 kVUToR = {
VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
};
static const vec8 kVUToG = {
VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
};
static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
// Read 8 UV from 444.
......@@ -7276,7 +7395,8 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#endif // defined(_M_X64)
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
#ifdef __cplusplus
} // extern "C"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment