Commit 050b39a5 authored by fbarchard@google.com

Recomputed JPeg coefficients normalized to 128.  Apply to ARGBGray function reusing YJ function/coefficients and rounding.
BUG=201
TESTED=Gray unittest improved
Review URL: https://webrtc-codereview.appspot.com/1269006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@629 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 6a352141
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 628
+Version: 629
 License: BSD
 License File: LICENSE
......
@@ -54,6 +54,7 @@ extern "C" {
 #define HAS_ARGBTOUV422ROW_SSSE3
 #define HAS_ARGBTOUV444ROW_SSSE3
 #define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
 #define HAS_ARGBTOYROW_SSSE3
 #define HAS_ARGBTOYJROW_SSSE3
 #define HAS_BGRATOUVROW_SSSE3
@@ -203,6 +204,7 @@ extern "C" {
 #define HAS_ARGBTOUV422ROW_NEON
 #define HAS_ARGBTOUV444ROW_NEON
 #define HAS_ARGBTOUVROW_NEON
+#define HAS_ARGBTOUVJROW_NEON
 #define HAS_ARGBTOYROW_NEON
 #define HAS_ARGBTOYJROW_NEON
 #define HAS_BGRATOUVROW_NEON
@@ -423,6 +425,8 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int pix);
 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int pix);
 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int pix);
 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
@@ -481,6 +485,8 @@ void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
                           uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width);
 void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
                        uint8* dst_u, uint8* dst_v, int width);
 void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
@@ -489,6 +495,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
                        uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
+                                  uint8* dst_u, uint8* dst_v, int width);
 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra,
                                  uint8* dst_u, uint8* dst_v, int width);
 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr,
@@ -497,6 +505,8 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba,
                                  uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+                            uint8* dst_u, uint8* dst_v, int width);
 void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
                            uint8* dst_u, uint8* dst_v, int width);
 void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
@@ -511,6 +521,8 @@ void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                              int pix);
 void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
                           uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+                           uint8* dst_u, uint8* dst_v, int pix);
 void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
                           uint8* dst_u, uint8* dst_v, int pix);
 void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
@@ -531,6 +543,8 @@ void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
                               uint8* dst_u, uint8* dst_v, int pix);
 void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
                    uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_u, uint8* dst_v, int width);
 void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
                    uint8* dst_u, uint8* dst_v, int width);
 void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
......
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 628
+#define LIBYUV_VERSION 629
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
@@ -994,19 +994,19 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
   void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
       ARGBToYJRow_C;
 #if defined(HAS_ARGBTOYJROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
     ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+      ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
       ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
       if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
+        ARGBToUVJRow = ARGBToUVJRow_SSSE3;
         if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
           ARGBToYJRow = ARGBToYJRow_SSSE3;
         }
@@ -1021,16 +1021,16 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
       ARGBToYJRow = ARGBToYJRow_NEON;
     }
     if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
+      ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
       if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
+        ARGBToUVJRow = ARGBToUVJRow_NEON;
       }
     }
   }
 #endif

   for (int y = 0; y < height - 1; y += 2) {
-    ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+    ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
     ARGBToYJRow(src_argb, dst_yj, width);
     ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
     src_argb += src_stride_argb * 2;
@@ -1039,7 +1039,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
     dst_v += dst_stride_v;
   }
   if (height & 1) {
-    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
     ARGBToYJRow(src_argb, dst_yj, width);
   }
   return 0;
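Note for reviewers: before this change ARGBToJ420 produced full-range Y via the YJ rows but still computed chroma with the BT.601 studio-range ARGBToUVRow; the hunks above switch chroma to the new UVJ rows. Roughly, the scalar forms side by side (a sketch mirroring the C rows; the old-path form is stated from memory of row_common.cc, not shown in this diff):

    // BT.601 studio-range chroma (old path):  u = (112 * b -  74 * g - 38 * r + 0x8080) >> 8
    // JPeg full-range chroma (new UVJ path):  u = (127 * b -  84 * g - 43 * r + 0x8080) >> 8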
......
@@ -293,6 +293,8 @@ UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
 #endif
 #ifdef HAS_ARGBTOUVROW_SSSE3
 UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
+UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C,
+      4, 15)
 UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
 UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
 UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
@@ -301,6 +303,7 @@ UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
 #endif
 #ifdef HAS_ARGBTOUVROW_NEON
 UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
+UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
 UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
 UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
 UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
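The UVANY entries above wire the new UVJ rows into the "any width" wrappers: the SIMD row handles the largest multiple-of-16 prefix and the C row finishes the remainder. A rough sketch of what the macro expands to for the SSSE3 entry (hypothetical expansion; the real macro is parameterized by BPP = 4 and MASK = 15):

    void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
                                uint8* dst_u, uint8* dst_v, int width) {
      int n = width & ~15;  // largest multiple of 16
      ARGBToUVJRow_Unaligned_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, n);
      ARGBToUVJRow_C(src_argb + n * 4, src_stride_argb,   // 4 bytes per ARGB pixel
                     dst_u + (n >> 1), dst_v + (n >> 1),  // 2x2-subsampled chroma
                     width & 15);                         // leftover pixels
    }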
......
@@ -256,25 +256,44 @@ MAKEROWY(RGB24, 2, 1, 0, 3)
 MAKEROWY(RAW, 0, 1, 2, 3)
 #undef MAKEROWY

-// BT.601 mpeg range
+// JPeg uses a variation on BT.601-1 full range
+// y = 0.29900 * r + 0.58700 * g + 0.11400 * b
+// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center
+// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center
+// BT.601 Mpeg range uses:
 // b 0.1016 * 255 = 25.908 = 25
 // g 0.5078 * 255 = 129.489 = 129
 // r 0.2578 * 255 = 65.739 = 66
-// = 0.8672. 1/.8672 = 1.1531
-// BT.601 full range 8 bit (not used)
-// b 0.1016 * 1.1531 = 0.1172 * 255 = 29.886 = 30
-// g 0.5078 * 1.1531 = 0.5855 * 255 = 149.3025 = 149
-// r 0.2578 * 1.1531 = 0.2973 * 255 = 75.8115 = 76
-// 30 + 149 + 76 = 255
-// BT.601 full range 7 bit
-// b 0.1172 * 127 = 14.8844 = 15
-// g 0.5855 * 127 = 74.35855 = 74
-// r 0.2973 * 127 = 37.7571 = 38
+// JPeg 8 bit Y (not used):
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
+// JPeg 7 bit Y:
+// b 0.11400 * 128 = 14.592 = 15
+// g 0.58700 * 128 = 75.136 = 75
+// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit U:
+// b 0.50000 * 255 = 127.5 = 127
+// g -0.33126 * 255 = -84.4713 = -84
+// r -0.16874 * 255 = -43.0287 = -43
+// JPeg 8 bit V:
+// b -0.08131 * 255 = -20.73405 = -20
+// g -0.41869 * 255 = -106.76595 = -107
+// r 0.50000 * 255 = 127.5 = 127
 static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
-  return (38 * r + 74 * g + 15 * b + 64) >> 7;
+  return (38 * r + 75 * g + 15 * b + 64) >> 7;
 }

+static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+  return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
+}
+
+static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+  return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
+}
+
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+
 #define MAKEROWYJ(NAME, R, G, B, BPP) \
 void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
   for (int x = 0; x < width; ++x) { \
@@ -283,6 +302,31 @@ void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
     dst_y += 1; \
   } \
 } \
+void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+                        uint8* dst_u, uint8* dst_v, int width) { \
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+  for (int x = 0; x < width - 1; x += 2) { \
+    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
+    dst_u[0] = RGBToUJ(ar, ag, ab); \
+    dst_v[0] = RGBToVJ(ar, ag, ab); \
+    src_rgb0 += BPP * 2; \
+    src_rgb1 += BPP * 2; \
+    dst_u += 1; \
+    dst_v += 1; \
+  } \
+  if (width & 1) { \
+    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \
+    dst_u[0] = RGBToUJ(ar, ag, ab); \
+    dst_v[0] = RGBToVJ(ar, ag, ab); \
+  } \
+}

 MAKEROWYJ(ARGB, 2, 1, 0, 4)
 #undef MAKEROWYJ
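A quick check of the renormalization this commit performs (the arithmetic below restates the comment table; it is not part of the patch): the 7-bit Y coefficients are chosen so they sum to exactly 128, which is why G moves from 74 to 75 and why the +64 term gives round-to-nearest.

    // 0.29900 * 128 = 38.272 -> 38
    // 0.58700 * 128 = 75.136 -> 75
    // 0.11400 * 128 = 14.592 -> 15   (rounded up so 38 + 75 + 15 == 128)
    int y_white = (38 * 255 + 75 * 255 + 15 * 255 + 64) >> 7;  // (128 * 255 + 64) >> 7 == 255
    int y_black = (0 + 64) >> 7;                               // == 0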
@@ -537,16 +581,9 @@ void ARGBToUV411Row_C(const uint8* src_argb,
   }
 }

-// http://en.wikipedia.org/wiki/Grayscale.
-// 0.11 * B + 0.59 * G + 0.30 * R
-// Coefficients rounded to multiple of 2 for consistency with SSSE3 version.
-static __inline int RGBToGray(uint8 r, uint8 g, uint8 b) {
-  return (28 * b + 152 * g + 76 * r) >> 8;
-}
-
 void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
   for (int x = 0; x < width; ++x) {
-    uint8 y = RGBToGray(src_argb[2], src_argb[1], src_argb[0]);
+    uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
     dst_argb[3] = src_argb[3];
     dst_argb += 4;
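Reusing RGBToYJ for gray shifts the outputs slightly; the unittest updates at the bottom of this change encode the new values. For example, for pure blue (b = 255, g = r = 0), a worked sketch of old versus new:

    int old_gray = (28 * 255 + 152 * 0 + 76 * 0) >> 8;      // 7140 >> 8 == 27
    int new_gray = (38 * 0 + 75 * 0 + 15 * 255 + 64) >> 7;  // 3889 >> 7 == 30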
......
@@ -1338,9 +1338,9 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {

 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.1172 coefficient
-    "vmov.u8    d25, #74                       \n"  // G * 0.5855 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.2973 coefficient
+    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
     ".p2align   2                              \n"
   "1:                                          \n"
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
@@ -1348,7 +1348,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
     "vmull.u8   q2, d0, d24                    \n"  // B
     "vmlal.u8   q2, d1, d25                    \n"  // G
     "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1547,6 +1547,45 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
   );
 }

+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #127 / 4                  \n"  // UB / VR 0.500 coefficient
+    "vmov.s16   q11, #84 / 4                   \n"  // UG -0.33126 coefficient
+    "vmov.s16   q12, #43 / 4                   \n"  // UR -0.16874 coefficient
+    "vmov.s16   q13, #20 / 4                   \n"  // VB -0.08131 coefficient
+    "vmov.s16   q14, #107 / 4                  \n"  // VG -0.41869 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_stride_argb),  // %1
+    "+r"(dst_u),  // %2
+    "+r"(dst_v),  // %3
+    "+r"(pix)  // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
@@ -2365,13 +2404,13 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
 }

 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
-// Similar to ARGBToY but different constants, no round and stores ARGB.
-// C code is (28 * b + 152 * g + 76 * r) >> 8;
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
   asm volatile (
-    "vmov.u8    d24, #14                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #76                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
     ".p2align   2                              \n"
   "1:                                          \n"
     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
@@ -2379,7 +2418,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
     "vmull.u8   q2, d0, d24                    \n"  // B
     "vmlal.u8   q2, d1, d25                    \n"  // G
     "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqshrun.s16 d0, q2, #7                    \n"  // 16 bit to 8 bit B
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
     "vmov       d1, d0                         \n"  // G
     "vmov       d2, d0                         \n"  // R
     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
......
@@ -37,17 +37,25 @@ CONST vec8 kARGBToY = {

 // JPeg full range.
 CONST vec8 kARGBToYJ = {
-  15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0
+  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
 };

 CONST vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };

+CONST vec8 kARGBToUJ = {
+  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
 CONST vec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };

+CONST vec8 kARGBToVJ = {
+  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
 // Constants for BGRA
 CONST vec8 kBGRAToY = {
   0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
@@ -100,6 +108,10 @@ CONST uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };

+CONST uvec16 kAddUVJ128 = {
+  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+
 // Shuffle table for converting RGB24 to ARGB.
 CONST uvec8 kShuffleMaskRGB24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
@@ -830,6 +842,69 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   );
 }

+// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa     %0,%%xmm4                      \n"
+    "movdqa     %1,%%xmm3                      \n"
+    "movdqa     %2,%%xmm5                      \n"
+  :
+  : "m"(kARGBToUJ),  // %0
+    "m"(kARGBToVJ),  // %1
+    "m"(kAddUVJ128)  // %2
+  );
+  asm volatile (
+    "sub        %1,%2                          \n"
+    ".p2align   4                              \n"
+  "1:                                          \n"
+    "movdqa     (%0),%%xmm0                    \n"
+    "movdqa     0x10(%0),%%xmm1                \n"
+    "movdqa     0x20(%0),%%xmm2                \n"
+    "movdqa     0x30(%0),%%xmm6                \n"
+    "pavgb      (%0,%4,1),%%xmm0               \n"
+    "pavgb      0x10(%0,%4,1),%%xmm1           \n"
+    "pavgb      0x20(%0,%4,1),%%xmm2           \n"
+    "pavgb      0x30(%0,%4,1),%%xmm6           \n"
+    "lea        0x40(%0),%0                    \n"
+    "movdqa     %%xmm0,%%xmm7                  \n"
+    "shufps     $0x88,%%xmm1,%%xmm0            \n"
+    "shufps     $0xdd,%%xmm1,%%xmm7            \n"
+    "pavgb      %%xmm7,%%xmm0                  \n"
+    "movdqa     %%xmm2,%%xmm7                  \n"
+    "shufps     $0x88,%%xmm6,%%xmm2            \n"
+    "shufps     $0xdd,%%xmm6,%%xmm7            \n"
+    "pavgb      %%xmm7,%%xmm2                  \n"
+    "movdqa     %%xmm0,%%xmm1                  \n"
+    "movdqa     %%xmm2,%%xmm6                  \n"
+    "pmaddubsw  %%xmm4,%%xmm0                  \n"
+    "pmaddubsw  %%xmm4,%%xmm2                  \n"
+    "pmaddubsw  %%xmm3,%%xmm1                  \n"
+    "pmaddubsw  %%xmm3,%%xmm6                  \n"
+    "phaddw     %%xmm2,%%xmm0                  \n"
+    "phaddw     %%xmm6,%%xmm1                  \n"
+    "paddw      %%xmm5,%%xmm0                  \n"
+    "paddw      %%xmm5,%%xmm1                  \n"
+    "psraw      $0x8,%%xmm0                    \n"
+    "psraw      $0x8,%%xmm1                    \n"
+    "packsswb   %%xmm1,%%xmm0                  \n"
+    "sub        $0x10,%3                       \n"
+    "movlps     %%xmm0,(%1)                    \n"
+    "movhps     %%xmm0,(%1,%2,1)               \n"
+    "lea        0x8(%1),%1                     \n"
+    "jg         1b                             \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+rm"(width)      // %3
+  : "r"(static_cast<intptr_t>(src_stride_argb))
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
@@ -895,6 +970,72 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   );
 }

+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                  uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa     %0,%%xmm4                      \n"
+    "movdqa     %1,%%xmm3                      \n"
+    "movdqa     %2,%%xmm5                      \n"
+  :
+  : "m"(kARGBToUJ),  // %0
+    "m"(kARGBToVJ),  // %1
+    "m"(kAddUVJ128)  // %2
+  );
+  asm volatile (
+    "sub        %1,%2                          \n"
+    ".p2align   4                              \n"
+  "1:                                          \n"
+    "movdqu     (%0),%%xmm0                    \n"
+    "movdqu     0x10(%0),%%xmm1                \n"
+    "movdqu     0x20(%0),%%xmm2                \n"
+    "movdqu     0x30(%0),%%xmm6                \n"
+    "movdqu     (%0,%4,1),%%xmm7               \n"
+    "pavgb      %%xmm7,%%xmm0                  \n"
+    "movdqu     0x10(%0,%4,1),%%xmm7           \n"
+    "pavgb      %%xmm7,%%xmm1                  \n"
+    "movdqu     0x20(%0,%4,1),%%xmm7           \n"
+    "pavgb      %%xmm7,%%xmm2                  \n"
+    "movdqu     0x30(%0,%4,1),%%xmm7           \n"
+    "pavgb      %%xmm7,%%xmm6                  \n"
+    "lea        0x40(%0),%0                    \n"
+    "movdqa     %%xmm0,%%xmm7                  \n"
+    "shufps     $0x88,%%xmm1,%%xmm0            \n"
+    "shufps     $0xdd,%%xmm1,%%xmm7            \n"
+    "pavgb      %%xmm7,%%xmm0                  \n"
+    "movdqa     %%xmm2,%%xmm7                  \n"
+    "shufps     $0x88,%%xmm6,%%xmm2            \n"
+    "shufps     $0xdd,%%xmm6,%%xmm7            \n"
+    "pavgb      %%xmm7,%%xmm2                  \n"
+    "movdqa     %%xmm0,%%xmm1                  \n"
+    "movdqa     %%xmm2,%%xmm6                  \n"
+    "pmaddubsw  %%xmm4,%%xmm0                  \n"
+    "pmaddubsw  %%xmm4,%%xmm2                  \n"
+    "pmaddubsw  %%xmm3,%%xmm1                  \n"
+    "pmaddubsw  %%xmm3,%%xmm6                  \n"
+    "phaddw     %%xmm2,%%xmm0                  \n"
+    "phaddw     %%xmm6,%%xmm1                  \n"
+    "paddw      %%xmm5,%%xmm0                  \n"
+    "paddw      %%xmm5,%%xmm1                  \n"
+    "psraw      $0x8,%%xmm0                    \n"
+    "psraw      $0x8,%%xmm1                    \n"
+    "packsswb   %%xmm1,%%xmm0                  \n"
+    "sub        $0x10,%3                       \n"
+    "movlps     %%xmm0,(%1)                    \n"
+    "movhps     %%xmm0,(%1,%2,1)               \n"
+    "lea        0x8(%1),%1                     \n"
+    "jg         1b                             \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+rm"(width)      // %3
+  : "r"(static_cast<intptr_t>(src_stride_argb))
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                           int width) {
   asm volatile (
@@ -3764,15 +3905,11 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 #endif  // HAS_ARGBUNATTENUATEROW_SSE2

 #ifdef HAS_ARGBGRAYROW_SSSE3
-// Constant for ARGB color to gray scale.  0.11 * B + 0.59 * G + 0.30 * R
-CONST vec8 kARGBToGray = {
-  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
-};
-
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   asm volatile (
     "movdqa     %3,%%xmm4                      \n"
+    "movdqa     %4,%%xmm5                      \n"
     "sub        %0,%1                          \n"

     // 8 pixel loop.
@@ -3783,6 +3920,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     "pmaddubsw  %%xmm4,%%xmm0                  \n"
     "pmaddubsw  %%xmm4,%%xmm1                  \n"
     "phaddw     %%xmm1,%%xmm0                  \n"
+    "paddw      %%xmm5,%%xmm0                  \n"
     "psrlw      $0x7,%%xmm0                    \n"
     "packuswb   %%xmm0,%%xmm0                  \n"
     "movdqa     (%0),%%xmm2                    \n"
@@ -3805,10 +3943,11 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   : "+r"(src_argb),   // %0
     "+r"(dst_argb),   // %1
     "+r"(width)       // %2
-  : "m"(kARGBToGray)  // %3
+  : "m"(kARGBToYJ),   // %3
+    "m"(kAddYJ64)     // %4
   : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
   );
 }
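The gray SSSE3 path now shares kARGBToYJ and adds kAddYJ64 (presumably a vector of 64s defined alongside the other YJ constants; its definition is not shown in this diff) before the 7-bit shift. Adding 64 is +0.5 in Q7 fixed point, so truncation becomes round-to-nearest. Scalar sketch:

    int y_trunc = (15 * b + 75 * g + 38 * r) >> 7;       // old behavior: biased down
    int y_round = (15 * b + 75 * g + 38 * r + 64) >> 7;  // new: round to nearest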
......
@@ -27,7 +27,7 @@ static const vec8 kARGBToY = {

 // JPeg full range.
 static const vec8 kARGBToYJ = {
-  15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0
+  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
 };

 static const lvec8 kARGBToY_AVX = {
@@ -39,6 +39,10 @@ static const vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };

+static const vec8 kARGBToUJ = {
+  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
 // TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version.
 static const lvec8 kARGBToU_AVX = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0,
@@ -49,6 +53,10 @@ static const vec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };

+static const vec8 kARGBToVJ = {
+  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
 static const lvec8 kARGBToV_AVX = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
@@ -124,6 +132,10 @@ static const uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };

+static const uvec16 kAddUVJ128 = {
+  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+
 static const ulvec8 kAddUV128_AVX = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
@@ -1087,6 +1099,73 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   }
 }

+__declspec(naked) __declspec(align(16))
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kARGBToUJ
+    movdqa     xmm6, kARGBToVJ
+    movdqa     xmm5, kAddUVJ128
+    sub        edi, edx             // stride from u to v
+
+    align      16
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
+    paddw      xmm1, xmm5
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0        // U
+    movhps     qword ptr [edx + edi], xmm0  // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
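The paddw with kAddUVJ128 (0x8080 in every 16-bit lane) folds the chroma bias and the rounding into one add, matching the C rows' "+ 0x8080". The scalar identity (a sketch, stated for a 32-bit intermediate):

    // 0x8080 == (128 << 8) + 128, so:
    //   (v + 0x8080) >> 8  ==  ((v + 128) >> 8) + 128
    // one add gives round-to-nearest (+128 before >> 8) plus the +128 offset.
    int v = 127 * b - 84 * g - 43 * r;  // signed U intermediate
    int u = (v + 0x8080) >> 8;          // biased into 0..255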
 #ifdef HAS_ARGBTOUVROW_AVX2
 __declspec(naked) __declspec(align(32))
 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
@@ -1223,6 +1302,77 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   }
 }

+__declspec(naked) __declspec(align(16))
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                  uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kARGBToUJ
+    movdqa     xmm6, kARGBToVJ
+    movdqa     xmm5, kAddUVJ128
+    sub        edi, edx             // stride from u to v
+
+    align      16
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
+    paddw      xmm1, xmm5
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0        // U
+    movhps     qword ptr [edx + edi], xmm0  // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
 __declspec(naked) __declspec(align(16))
 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                           uint8* dst_u, uint8* dst_v, int width) {
@@ -4597,11 +4747,6 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 #endif  // HAS_ARGBATTENUATEROW_AVX2

 #ifdef HAS_ARGBGRAYROW_SSSE3
-// Constant for ARGB color to gray scale:  0.11 * B + 0.59 * G + 0.30 * R
-static const vec8 kARGBToGray = {
-  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
-};
-
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
 __declspec(naked) __declspec(align(16))
 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -4609,7 +4754,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_argb */
     mov        ecx, [esp + 12]  /* width */
-    movdqa     xmm4, kARGBToGray
+    movdqa     xmm4, kARGBToYJ
+    movdqa     xmm5, kAddYJ64
     sub        edx, eax

     align      16
@@ -4619,6 +4765,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     pmaddubsw  xmm0, xmm4
     pmaddubsw  xmm1, xmm4
     phaddw     xmm0, xmm1
+    paddw      xmm0, xmm5  // Add .5 for rounding.
     psrlw      xmm0, 7
     packuswb   xmm0, xmm0  // 8 G bytes
     movdqa     xmm2, [eax]  // A
......
@@ -689,7 +689,11 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
                benchmark_width_, DIFF, _Opt, +, 0)

 TESTATOPLANAR(ARGB, 4, I420, 2, 2, 4)
+#ifdef __arm__
 TESTATOPLANAR(ARGB, 4, J420, 2, 2, 4)
+#else
+TESTATOPLANAR(ARGB, 4, J420, 2, 2, 0)
+#endif
 TESTATOPLANAR(BGRA, 4, I420, 2, 2, 4)
 TESTATOPLANAR(ABGR, 4, I420, 2, 2, 4)
 TESTATOPLANAR(RGBA, 4, I420, 2, 2, 4)
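The #ifdef above keeps the ARGB-to-J420 tolerance at 4 on ARM but tightens it to an exact match (0) elsewhere: the C and SSSE3 UVJ paths agree bit-for-bit after this change, while the NEON path still subsamples slightly differently (the "Subsample match C code" TODO in the NEON UVJ function above).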
......
@@ -269,7 +269,6 @@ TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
 TEST_F(libyuvTest, TestARGBGray) {
   SIMD_ALIGNED(uint8 orig_pixels[256][4]);
-
   // Test blue
   orig_pixels[0][0] = 255u;
   orig_pixels[0][1] = 0u;
@@ -285,30 +284,47 @@ TEST_F(libyuvTest, TestARGBGray) {
   orig_pixels[2][1] = 0u;
   orig_pixels[2][2] = 255u;
   orig_pixels[2][3] = 255u;
+  // Test black
+  orig_pixels[3][0] = 0u;
+  orig_pixels[3][1] = 0u;
+  orig_pixels[3][2] = 0u;
+  orig_pixels[3][3] = 255u;
+  // Test white
+  orig_pixels[4][0] = 255u;
+  orig_pixels[4][1] = 255u;
+  orig_pixels[4][2] = 255u;
+  orig_pixels[4][3] = 255u;
   // Test color
-  orig_pixels[3][0] = 16u;
-  orig_pixels[3][1] = 64u;
-  orig_pixels[3][2] = 192u;
-  orig_pixels[3][3] = 224u;
+  orig_pixels[5][0] = 16u;
+  orig_pixels[5][1] = 64u;
+  orig_pixels[5][2] = 192u;
+  orig_pixels[5][3] = 224u;
   // Do 16 to test asm version.
   ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);
-  EXPECT_EQ(27u, orig_pixels[0][0]);
-  EXPECT_EQ(27u, orig_pixels[0][1]);
-  EXPECT_EQ(27u, orig_pixels[0][2]);
+  EXPECT_EQ(30u, orig_pixels[0][0]);
+  EXPECT_EQ(30u, orig_pixels[0][1]);
+  EXPECT_EQ(30u, orig_pixels[0][2]);
   EXPECT_EQ(128u, orig_pixels[0][3]);
-  EXPECT_EQ(151u, orig_pixels[1][0]);
-  EXPECT_EQ(151u, orig_pixels[1][1]);
-  EXPECT_EQ(151u, orig_pixels[1][2]);
+  EXPECT_EQ(149u, orig_pixels[1][0]);
+  EXPECT_EQ(149u, orig_pixels[1][1]);
+  EXPECT_EQ(149u, orig_pixels[1][2]);
   EXPECT_EQ(0u, orig_pixels[1][3]);
-  EXPECT_EQ(75u, orig_pixels[2][0]);
-  EXPECT_EQ(75u, orig_pixels[2][1]);
-  EXPECT_EQ(75u, orig_pixels[2][2]);
+  EXPECT_EQ(76u, orig_pixels[2][0]);
+  EXPECT_EQ(76u, orig_pixels[2][1]);
+  EXPECT_EQ(76u, orig_pixels[2][2]);
   EXPECT_EQ(255u, orig_pixels[2][3]);
-  EXPECT_EQ(96u, orig_pixels[3][0]);
-  EXPECT_EQ(96u, orig_pixels[3][1]);
-  EXPECT_EQ(96u, orig_pixels[3][2]);
-  EXPECT_EQ(224u, orig_pixels[3][3]);
+  EXPECT_EQ(0u, orig_pixels[3][0]);
+  EXPECT_EQ(0u, orig_pixels[3][1]);
+  EXPECT_EQ(0u, orig_pixels[3][2]);
+  EXPECT_EQ(255u, orig_pixels[3][3]);
+  EXPECT_EQ(255u, orig_pixels[4][0]);
+  EXPECT_EQ(255u, orig_pixels[4][1]);
+  EXPECT_EQ(255u, orig_pixels[4][2]);
+  EXPECT_EQ(255u, orig_pixels[4][3]);
+  EXPECT_EQ(96u, orig_pixels[5][0]);
+  EXPECT_EQ(96u, orig_pixels[5][1]);
+  EXPECT_EQ(96u, orig_pixels[5][2]);
+  EXPECT_EQ(224u, orig_pixels[5][3]);
   for (int i = 0; i < 256; ++i) {
     orig_pixels[i][0] = i;
     orig_pixels[i][1] = i / 2;
@@ -323,7 +339,6 @@ TEST_F(libyuvTest, TestARGBGray) {
 TEST_F(libyuvTest, TestARGBGrayTo) {
   SIMD_ALIGNED(uint8 orig_pixels[256][4]);
   SIMD_ALIGNED(uint8 gray_pixels[256][4]);
-
   // Test blue
   orig_pixels[0][0] = 255u;
   orig_pixels[0][1] = 0u;
@@ -339,30 +354,47 @@ TEST_F(libyuvTest, TestARGBGrayTo) {
   orig_pixels[2][1] = 0u;
   orig_pixels[2][2] = 255u;
   orig_pixels[2][3] = 255u;
+  // Test black
+  orig_pixels[3][0] = 0u;
+  orig_pixels[3][1] = 0u;
+  orig_pixels[3][2] = 0u;
+  orig_pixels[3][3] = 255u;
+  // Test white
+  orig_pixels[4][0] = 255u;
+  orig_pixels[4][1] = 255u;
+  orig_pixels[4][2] = 255u;
+  orig_pixels[4][3] = 255u;
   // Test color
-  orig_pixels[3][0] = 16u;
-  orig_pixels[3][1] = 64u;
-  orig_pixels[3][2] = 192u;
-  orig_pixels[3][3] = 224u;
+  orig_pixels[5][0] = 16u;
+  orig_pixels[5][1] = 64u;
+  orig_pixels[5][2] = 192u;
+  orig_pixels[5][3] = 224u;
   // Do 16 to test asm version.
   ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
-  EXPECT_EQ(27u, gray_pixels[0][0]);
-  EXPECT_EQ(27u, gray_pixels[0][1]);
-  EXPECT_EQ(27u, gray_pixels[0][2]);
+  EXPECT_EQ(30u, gray_pixels[0][0]);
+  EXPECT_EQ(30u, gray_pixels[0][1]);
+  EXPECT_EQ(30u, gray_pixels[0][2]);
   EXPECT_EQ(128u, gray_pixels[0][3]);
-  EXPECT_EQ(151u, gray_pixels[1][0]);
-  EXPECT_EQ(151u, gray_pixels[1][1]);
-  EXPECT_EQ(151u, gray_pixels[1][2]);
+  EXPECT_EQ(149u, gray_pixels[1][0]);
+  EXPECT_EQ(149u, gray_pixels[1][1]);
+  EXPECT_EQ(149u, gray_pixels[1][2]);
   EXPECT_EQ(0u, gray_pixels[1][3]);
-  EXPECT_EQ(75u, gray_pixels[2][0]);
-  EXPECT_EQ(75u, gray_pixels[2][1]);
-  EXPECT_EQ(75u, gray_pixels[2][2]);
+  EXPECT_EQ(76u, gray_pixels[2][0]);
+  EXPECT_EQ(76u, gray_pixels[2][1]);
+  EXPECT_EQ(76u, gray_pixels[2][2]);
   EXPECT_EQ(255u, gray_pixels[2][3]);
-  EXPECT_EQ(96u, gray_pixels[3][0]);
-  EXPECT_EQ(96u, gray_pixels[3][1]);
-  EXPECT_EQ(96u, gray_pixels[3][2]);
-  EXPECT_EQ(224u, gray_pixels[3][3]);
+  EXPECT_EQ(0u, gray_pixels[3][0]);
+  EXPECT_EQ(0u, gray_pixels[3][1]);
+  EXPECT_EQ(0u, gray_pixels[3][2]);
+  EXPECT_EQ(255u, gray_pixels[3][3]);
+  EXPECT_EQ(255u, gray_pixels[4][0]);
+  EXPECT_EQ(255u, gray_pixels[4][1]);
+  EXPECT_EQ(255u, gray_pixels[4][2]);
+  EXPECT_EQ(255u, gray_pixels[4][3]);
+  EXPECT_EQ(96u, gray_pixels[5][0]);
+  EXPECT_EQ(96u, gray_pixels[5][1]);
+  EXPECT_EQ(96u, gray_pixels[5][2]);
+  EXPECT_EQ(224u, gray_pixels[5][3]);
   for (int i = 0; i < 256; ++i) {
     orig_pixels[i][0] = i;
     orig_pixels[i][1] = i / 2;
......