Commit c3d09f60 authored by fbarchard@google.com

Improve accuracy of luma channel in YUV to RGB conversion

BUG=324
TESTED=TestFullYUV
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/36859004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1233 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 292c2286
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1231 Version: 1232
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1231 #define LIBYUV_VERSION 1232
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -962,7 +962,8 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { ...@@ -962,7 +962,8 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
// C reference code that mimics the YUV assembly. // C reference code that mimics the YUV assembly.
#define YG 74 /* round(1.164 * 64) */ #define YG 4901247 /* 19071 * 0x0101, where 19071 = round(1.164 * 64 * 256) */
#define YGB 1192 /* round(1.164 * 64 * 16) */
#define UB 127 /* min(127, round(2.018 * 64)) */ #define UB 127 /* min(127, round(2.018 * 64)) */
#define UG -25 /* round(-0.391 * 64) */ #define UG -25 /* round(-0.391 * 64) */
...@@ -973,13 +974,13 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { ...@@ -973,13 +974,13 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
#define VR 102 /* round(1.596 * 64) */ #define VR 102 /* round(1.596 * 64) */
// Bias // Bias
#define BB (UB * 128 + VB * 128 + YG * 16) #define BB (UB * 128 + VB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YG * 16) #define BG (UG * 128 + VG * 128 + YGB)
#define BR (UR * 128 + VR * 128 + YG * 16) #define BR (UR * 128 + VR * 128 + YGB)
static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) { uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * YG); uint32 y1 = (uint32)(y * YG) >> 16;
*b = Clamp((int32)(u * UB + v * VB + y1 - BB) >> 6); *b = Clamp((int32)(u * UB + v * VB + y1 - BB) >> 6);
*g = Clamp((int32)(u * UG + v * VG + y1 - BG) >> 6); *g = Clamp((int32)(u * UG + v * VG + y1 - BG) >> 6);
*r = Clamp((int32)(u * UR + v * VR + y1 - BR) >> 6); *r = Clamp((int32)(u * UR + v * VR + y1 - BR) >> 6);
......
...@@ -24,7 +24,8 @@ extern "C" { ...@@ -24,7 +24,8 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
(defined(_M_IX86) || defined(_M_X64)) (defined(_M_IX86) || defined(_M_X64))
#define YG 74 /* (int8)round(1.164 * 64 + 0.5) */ #define YG 19071 /* round(1.164 * 64 * 256) */
#define YGB 1192 /* round(1.164 * 64 * 16) */
#define UB 127 /* min(127,(int8)round(2.018 * 64)) */ #define UB 127 /* min(127,(int8)round(2.018 * 64)) */
#define UG -25 /* (int8)round(-0.391 * 64 - 0.5) */ #define UG -25 /* (int8)round(-0.391 * 64 - 0.5) */
...@@ -35,9 +36,9 @@ extern "C" { ...@@ -35,9 +36,9 @@ extern "C" {
#define VR 102 /* (int8)round(1.596 * 64 + 0.5) */ #define VR 102 /* (int8)round(1.596 * 64 + 0.5) */
// Bias // Bias
#define BB (UB * 128 + VB * 128 + YG * 16) #define BB (UB * 128 + VB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YG * 16) #define BG (UG * 128 + VG * 128 + YGB)
#define BR (UR * 128 + VR * 128 + YG * 16) #define BR (UR * 128 + VR * 128 + YGB)
static const vec8 kUVToB = { static const vec8 kUVToB = {
UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
...@@ -79,7 +80,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -79,7 +80,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
int width) { int width) {
__m128i xmm0, xmm1, xmm2, xmm3; __m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1); const __m128i xmm5 = _mm_set1_epi8(-1);
const __m128i xmm4 = _mm_setzero_si128();
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
while (width > 0) { while (width > 0) {
...@@ -96,8 +96,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -96,8 +96,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
xmm3 = _mm_loadl_epi64((__m128i*)y_buf); xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYToRgb);
xmm0 = _mm_adds_epi16(xmm0, xmm3); xmm0 = _mm_adds_epi16(xmm0, xmm3);
xmm1 = _mm_adds_epi16(xmm1, xmm3); xmm1 = _mm_adds_epi16(xmm1, xmm3);
xmm2 = _mm_adds_epi16(xmm2, xmm3); xmm2 = _mm_adds_epi16(xmm2, xmm3);
...@@ -1521,8 +1521,8 @@ static const lvec16 kUVBiasR_AVX = { ...@@ -1521,8 +1521,8 @@ static const lvec16 kUVBiasR_AVX = {
__asm vmovdqu xmm3, [eax] /* NOLINT */ \ __asm vmovdqu xmm3, [eax] /* NOLINT */ \
__asm lea eax, [eax + 16] \ __asm lea eax, [eax + 16] \
__asm vpermq ymm3, ymm3, 0xd8 \ __asm vpermq ymm3, ymm3, 0xd8 \
__asm vpunpcklbw ymm3, ymm3, ymm4 \ __asm vpunpcklbw ymm3, ymm3, ymm3 \
__asm vpmullw ymm3, ymm3, kYToRgb_AVX \ __asm vpmulhuw ymm3, ymm3, kYToRgb_AVX \
__asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \ __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \
__asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \ __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \
__asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \ __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \
...@@ -1553,7 +1553,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, ...@@ -1553,7 +1553,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
vpxor ymm4, ymm4, ymm4
convertloop: convertloop:
READYUV422_AVX2 READYUV422_AVX2
...@@ -1600,7 +1599,6 @@ void I422ToBGRARow_AVX2(const uint8* y_buf, ...@@ -1600,7 +1599,6 @@ void I422ToBGRARow_AVX2(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
vpxor ymm4, ymm4, ymm4
convertloop: convertloop:
READYUV422_AVX2 READYUV422_AVX2
...@@ -1647,7 +1645,6 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, ...@@ -1647,7 +1645,6 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
vpxor ymm4, ymm4, ymm4
convertloop: convertloop:
READYUV422_AVX2 READYUV422_AVX2
...@@ -1694,7 +1691,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -1694,7 +1691,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
vpxor ymm4, ymm4, ymm4
convertloop: convertloop:
READYUV422_AVX2 READYUV422_AVX2
...@@ -1774,8 +1770,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -1774,8 +1770,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
/* Step 2: Find Y contribution to 8 R,G,B values */ \ /* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
__asm lea eax, [eax + 8] \ __asm lea eax, [eax + 8] \
__asm punpcklbw xmm3, xmm4 \ __asm punpcklbw xmm3, xmm3 \
__asm pmullw xmm3, kYToRgb \ __asm pmulhuw xmm3, kYToRgb \
__asm paddsw xmm0, xmm3 /* B += Y */ \ __asm paddsw xmm0, xmm3 /* B += Y */ \
__asm paddsw xmm1, xmm3 /* G += Y */ \ __asm paddsw xmm1, xmm3 /* G += Y */ \
__asm paddsw xmm2, xmm3 /* R += Y */ \ __asm paddsw xmm2, xmm3 /* R += Y */ \
...@@ -1801,8 +1797,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -1801,8 +1797,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
/* Step 2: Find Y contribution to 8 R,G,B values */ \ /* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
__asm lea eax, [eax + 8] \ __asm lea eax, [eax + 8] \
__asm punpcklbw xmm3, xmm4 \ __asm punpcklbw xmm3, xmm3 \
__asm pmullw xmm3, kYToRgb \ __asm pmulhuw xmm3, kYToRgb \
__asm paddsw xmm0, xmm3 /* B += Y */ \ __asm paddsw xmm0, xmm3 /* B += Y */ \
__asm paddsw xmm1, xmm3 /* G += Y */ \ __asm paddsw xmm1, xmm3 /* G += Y */ \
__asm paddsw xmm2, xmm3 /* R += Y */ \ __asm paddsw xmm2, xmm3 /* R += Y */ \
...@@ -1832,7 +1828,6 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1832,7 +1828,6 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
convertloop: convertloop:
READYUV444 READYUV444
...@@ -1873,7 +1868,6 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, ...@@ -1873,7 +1868,6 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // rgb24 mov edx, [esp + 8 + 16] // rgb24
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pxor xmm4, xmm4
movdqa xmm5, kShuffleMaskARGBToRGB24_0 movdqa xmm5, kShuffleMaskARGBToRGB24_0
movdqa xmm6, kShuffleMaskARGBToRGB24 movdqa xmm6, kShuffleMaskARGBToRGB24
...@@ -1919,7 +1913,6 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, ...@@ -1919,7 +1913,6 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // raw mov edx, [esp + 8 + 16] // raw
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pxor xmm4, xmm4
movdqa xmm5, kShuffleMaskARGBToRAW_0 movdqa xmm5, kShuffleMaskARGBToRAW_0
movdqa xmm6, kShuffleMaskARGBToRAW movdqa xmm6, kShuffleMaskARGBToRAW
...@@ -1965,7 +1958,6 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, ...@@ -1965,7 +1958,6 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // rgb565 mov edx, [esp + 8 + 16] // rgb565
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pxor xmm4, xmm4
pcmpeqb xmm5, xmm5 // generate mask 0x0000001f pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
psrld xmm5, 27 psrld xmm5, 27
pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
...@@ -2038,7 +2030,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2038,7 +2030,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
convertloop: convertloop:
READYUV422 READYUV422
...@@ -2082,7 +2073,6 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2082,7 +2073,6 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
mov ecx, [esp + 12 + 20] // width mov ecx, [esp + 12 + 20] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
convertloop: convertloop:
READYUV411 // modifies EBX READYUV411 // modifies EBX
...@@ -2121,7 +2111,6 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2121,7 +2111,6 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
mov edx, [esp + 4 + 12] // argb mov edx, [esp + 4 + 12] // argb
mov ecx, [esp + 4 + 16] // width mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
convertloop: convertloop:
READNV12 READNV12
...@@ -2158,7 +2147,6 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2158,7 +2147,6 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
mov edx, [esp + 4 + 12] // argb mov edx, [esp + 4 + 12] // argb
mov ecx, [esp + 4 + 16] // width mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
convertloop: convertloop:
READNV12 READNV12
...@@ -2196,7 +2184,6 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -2196,7 +2184,6 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // bgra mov edx, [esp + 8 + 16] // bgra
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pxor xmm4, xmm4
convertloop: convertloop:
READYUV422 READYUV422
...@@ -2237,7 +2224,6 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -2237,7 +2224,6 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
convertloop: convertloop:
READYUV422 READYUV422
...@@ -2276,7 +2262,6 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, ...@@ -2276,7 +2262,6 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // rgba mov edx, [esp + 8 + 16] // rgba
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edi, esi sub edi, esi
pxor xmm4, xmm4
convertloop: convertloop:
READYUV422 READYUV422
...@@ -2303,21 +2288,24 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, ...@@ -2303,21 +2288,24 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
#endif // HAS_I422TOARGBROW_SSSE3 #endif // HAS_I422TOARGBROW_SSSE3
#define YG 19071 /* round(1.164 * 64 * 256) */
#define YGB 1192 /* round(1.164 * 64 * 16) */
#ifdef HAS_YTOARGBROW_SSE2 #ifdef HAS_YTOARGBROW_SSE2
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void YToARGBRow_SSE2(const uint8* y_buf, void YToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
__asm { __asm {
pxor xmm5, xmm5
pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24 pslld xmm4, 24
mov eax, 0x00100010 mov eax, 0x04a804a8 // 04a8 = 1192 = round(1.164 * 64 * 16)
movd xmm3, eax movd xmm3, eax
pshufd xmm3, xmm3, 0 pshufd xmm3, xmm3, 0
mov eax, 0x004a004a // 74 mov eax, 0x4a7f4a7f // 4a7f = 19071 = round(1.164 * 64 * 256)
movd xmm2, eax movd xmm2, eax
pshufd xmm2, xmm2,0 pshufd xmm2, xmm2,0
mov eax, [esp + 4] // Y mov eax, [esp + 4] // Y
mov edx, [esp + 8] // rgb mov edx, [esp + 8] // rgb
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
...@@ -2326,9 +2314,9 @@ void YToARGBRow_SSE2(const uint8* y_buf, ...@@ -2326,9 +2314,9 @@ void YToARGBRow_SSE2(const uint8* y_buf,
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
movq xmm0, qword ptr [eax] movq xmm0, qword ptr [eax]
lea eax, [eax + 8] lea eax, [eax + 8]
punpcklbw xmm0, xmm5 // 0.Y punpcklbw xmm0, xmm0 // Y.Y
psubusw xmm0, xmm3 pmulhuw xmm0, xmm2
pmullw xmm0, xmm2 psubusw xmm0, xmm3 // TODO(fbarchard): round 0.5
psrlw xmm0, 6 psrlw xmm0, 6
packuswb xmm0, xmm0 // G packuswb xmm0, xmm0 // G
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment