Commit 26173eb7 authored by Frank Barchard, committed by Commit Bot

H010ToAR30 for 10 bit bt.709 YUV to 30 bit RGB

This version of H010ToAR30 performs the conversion in 3 steps:
1. Convert16To8Row_AVX2
2. H420ToARGB_AVX2
3. ARGBToAR30_AVX2

A low level function is added to convert 16 bit to 8 bit. It uses a multiply
to adjust for 10 bit or other bit depths and then saves the upper 16 bits.

Bug: libyuv:751
Test: LibYUVPlanarTest.Convert16To8Row_Opt unittest added
Change-Id: I9cc576fda8afa1003cb961d03e0e656e0b478f03
Reviewed-on: https://chromium-review.googlesource.com/783554
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
parent a98d6cdb
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1679
Version: 1680
License: BSD
License File: LICENSE
......
......@@ -321,6 +321,19 @@ int H422ToABGR(const uint8* src_y,
int width,
int height);
// Convert H010 to AR30.
// src_y, src_u and src_v are planes of 16 bit samples; their strides are
// applied to uint16 pointers. dst_ar30 and dst_stride_ar30 describe the
// destination as a uint8 buffer.
// Returns an int status code.
LIBYUV_API
int H010ToAR30(const uint16* src_y,
int src_stride_y,
const uint16* src_u,
int src_stride_u,
const uint16* src_v,
int src_stride_v,
uint8* dst_ar30,
int dst_stride_ar30,
int width,
int height);
// BGRA little endian (argb in memory) to ARGB.
LIBYUV_API
int BGRAToARGB(const uint8* src_frame,
......
......@@ -278,6 +278,7 @@ extern "C" {
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_ARGBTOAR30ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_MERGEUVROW_16_AVX2
#define HAS_MULTIPLYROW_16_AVX2
#endif
......@@ -1540,6 +1541,12 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
int width);
void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width);
// Convert a row of width 16 bit samples (src_y) to 8 bit samples (dst_y).
// scale selects the source bit depth, e.g. 16384 for 10 bit input.
// _AVX2 is the x86 AVX2 version, _C the portable version.
void Convert16To8Row_AVX2(const uint16* src_y,
uint8* dst_y,
int scale,
int width);
void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
......@@ -2419,9 +2426,7 @@ void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb,
void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb,
uint8* dst_rgb,
int width);
void ARGBToAR30Row_Any_AVX2(const uint8* src_argb,
uint8* dst_rgb,
int width);
void ARGBToAR30Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1679
#define LIBYUV_VERSION 1680
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -93,6 +93,7 @@ enum FourCC {
FOURCC_J420 = FOURCC('J', '4', '2', '0'),
FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
......@@ -154,6 +155,7 @@ enum FourCCBpp {
FOURCC_BPP_J420 = 12,
FOURCC_BPP_J400 = 8,
FOURCC_BPP_H420 = 12,
FOURCC_BPP_H010 = 24,
FOURCC_BPP_MJPG = 0, // 0 means unknown.
FOURCC_BPP_H264 = 0,
FOURCC_BPP_IYUV = 12,
......
......@@ -428,6 +428,136 @@ int H422ToABGR(const uint8* src_y,
width, height);
}
// Convert planar YUV with 16 bit samples and 2x2 subsampled chroma to AR30,
// using the supplied color matrix.
//
// Each row is converted in three steps through temporary 8 bit buffers:
//   1. Convert16To8Row: scale the 16 bit Y, U and V samples down to 8 bits.
//   2. I422ToARGBRow:   convert the 8 bit YUV row to 8 bit ARGB.
//   3. ARGBToAR30Row:   repack the ARGB row as AR30.
// Because the intermediate rows are 8 bit, the AR30 output is derived from
// 8 bit data rather than from the full precision of the source.
//
// src_stride_y/u/v are added to uint16 pointers, so they count 16 bit
// samples, not bytes. dst_stride_ar30 is added to a uint8 pointer (bytes).
// scale is forwarded to Convert16To8Row; 16384 is the value for 10 bit input.
// A negative height writes the destination rows bottom-up.
// Returns 0 on success, or -1 if any pointer is NULL, width <= 0 or
// height == 0.
static int H010ToAR30Matrix(const uint16* src_y,
int src_stride_y,
const uint16* src_u,
int src_stride_u,
const uint16* src_v,
int src_stride_v,
uint8* dst_ar30,
int dst_stride_ar30,
const struct YuvConstants* yuvconstants,
int scale, // 16384 for 10 bits
int width,
int height) {
int y;
// Chroma row width for 2x horizontal subsampling, rounded up for odd widths.
int halfwidth = (width + 1) >> 1;
// Row functions start as the portable C versions; the CPU feature checks
// below may replace them with SIMD versions.
void (*Convert16To8Row)(const uint16* src_y, uint8* dst_y, int scale,
int width) = Convert16To8Row_C;
void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToARGBRow_C;
void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToAR30Row_C;
if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
// Start at the last destination row and step backwards through the buffer.
if (height < 0) {
height = -height;
dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
dst_stride_ar30 = -dst_stride_ar30;
}
// There is no _Any_ variant yet, so the AVX2 version is only used when width
// is a multiple of 64; every other width stays on the C version.
// NOTE(review): the same function is also called with halfwidth below, so the
// multiple-of-64 requirement presumably keeps halfwidth a multiple of the
// AVX2 kernel's pixels-per-loop count - confirm against Convert16To8Row_AVX2.
#if defined(HAS_CONVERT16TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
// NOTE(review): this assignment repeats the initializer above and has no
// effect; it marks where an _Any_ variant would be selected.
Convert16To8Row = Convert16To8Row_C; // TODO(fbarchard): Any AVX2
if (IS_ALIGNED(width, 64)) {
Convert16To8Row = Convert16To8Row_AVX2;
}
}
#endif
// For the remaining row functions the _Any_ variant is chosen first and the
// plain variant replaces it when width is a multiple of the tested alignment.
#if defined(HAS_ARGBTOAR30ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToAR30Row = ARGBToAR30Row_AVX2;
}
}
#endif
// The checks below run in order and each one overwrites I422ToARGBRow, so
// when both SSSE3 and AVX2 are available the AVX2 version is the one used.
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_MSA;
}
}
#endif
// Temporary buffers for one row: 8 bit Y (width), 8 bit U and V (halfwidth
// each) and 8 bit ARGB (4 bytes per pixel).
// NOTE(review): align_buffer_64 is defined elsewhere; presumably it allocates
// 64 byte aligned storage of the given size - confirm, including how
// allocation failure is handled.
align_buffer_64(row_y, width);
align_buffer_64(row_u, halfwidth);
align_buffer_64(row_v, halfwidth);
align_buffer_64(row_argb, width * 4);
for (y = 0; y < height; ++y) {
// Step 1: 16 bit -> 8 bit for each plane.
Convert16To8Row(src_y, row_y, scale, width);
Convert16To8Row(src_u, row_u, scale, halfwidth);
Convert16To8Row(src_v, row_v, scale, halfwidth);
// Step 2: 8 bit YUV -> 8 bit ARGB. Step 3: ARGB -> AR30.
I422ToARGBRow(row_y, row_u, row_v, row_argb, yuvconstants, width);
ARGBToAR30Row(row_argb, dst_ar30, width);
dst_ar30 += dst_stride_ar30;
src_y += src_stride_y;
// Chroma is subsampled vertically: advance U and V only after odd rows so
// that each chroma row is used for two consecutive output rows.
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
free_aligned_buffer_64(row_y);
free_aligned_buffer_64(row_u);
free_aligned_buffer_64(row_v);
free_aligned_buffer_64(row_argb);
return 0;
}
// Convert H010 to AR30.
// Forwards to H010ToAR30Matrix with the kYuvH709Constants matrix and the
// scale used for 10 bit samples, and returns its result unchanged.
LIBYUV_API
int H010ToAR30(const uint16* src_y,
               int src_stride_y,
               const uint16* src_u,
               int src_stride_u,
               const uint16* src_v,
               int src_stride_v,
               uint8* dst_ar30,
               int dst_stride_ar30,
               int width,
               int height) {
  // Scale for the 16 bit to 8 bit row conversion: 16384 for 10 bits.
  const int kScale10Bit = 16384;
  const struct YuvConstants* const matrix = &kYuvH709Constants;
  return H010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
                          src_stride_v, dst_ar30, dst_stride_ar30, matrix,
                          kScale10Bit, width, height);
}
// Convert I444 to ARGB with matrix
static int I444ToARGBMatrix(const uint8* src_y,
int src_stride_y,
......
......@@ -1811,6 +1811,11 @@ void MergeRGBRow_C(const uint8* src_r,
}
}
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
void MergeUVRow_16_C(const uint16* src_u,
const uint16* src_v,
uint16* dst_uv,
......@@ -1840,6 +1845,21 @@ void MultiplyRow_16_C(const uint16* src_y,
}
}
// Convert a row of 16 bit samples to 8 bit samples.
// Each output is (sample * scale) >> 16, saturated to 255.
// Choose scale for the number of significant bits in the source:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
//
// src_y  row of width 16 bit samples.
// dst_y  receives width 8 bit samples.
// width  number of samples to convert; nothing is written when it is <= 0.
void Convert16To8Row_C(const uint16* src_y,
                       uint8* dst_y,
                       int scale,
                       int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int v = (src_y[x] * scale) >> 16;
    // Saturate rather than truncate when a sample has more significant bits
    // than scale assumes, so out-of-range input maps to 255 instead of
    // wrapping around. This matches the saturating pack used by
    // Convert16To8Row_AVX2.
    dst_y[x] = (uint8)(v > 255 ? 255 : v);
  }
}
void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count);
}
......
......@@ -702,7 +702,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_ARGBTOAR30ROW_AVX2
void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) {
asm volatile (
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0x000000ff mask
"vpsrld $0x18,%%ymm4,%%ymm4 \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // 0xc0000000 mask
......@@ -721,7 +721,7 @@ void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) {
"vpslld $20,%%ymm2,%%ymm2 \n"
"vpor %%ymm1,%%ymm3,%%ymm3 \n"
"vpor %%ymm2,%%ymm3,%%ymm3 \n"
//green
// green
"vpsrld $0x08,%%ymm0,%%ymm1 \n"
"vpand %%ymm4,%%ymm1,%%ymm1 \n"
"vpsrld $0x6,%%ymm1,%%ymm2 \n"
......@@ -729,7 +729,7 @@ void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) {
"vpslld $10,%%ymm2,%%ymm2 \n"
"vpor %%ymm1,%%ymm3,%%ymm3 \n"
"vpor %%ymm2,%%ymm3,%%ymm3 \n"
//blue
// blue
"vpand %%ymm4,%%ymm0,%%ymm1 \n"
"vpsrld $0x6,%%ymm1,%%ymm2 \n"
"vpslld $2,%%ymm1,%%ymm1 \n"
......@@ -745,9 +745,8 @@ void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
::"memory",
"cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif
......@@ -2851,6 +2850,11 @@ void MergeUVRow_16_AVX2(const uint16* src_u,
}
#endif // HAS_MERGEUVROW_AVX2
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
#ifdef HAS_MULTIPLYROW_16_AVX2
void MultiplyRow_16_AVX2(const uint16* src_y,
uint16* dst_y,
......@@ -2885,6 +2889,47 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
}
#endif // HAS_MULTIPLYROW_16_AVX2
// Convert a row of 16 bit samples to 8 bit samples using AVX2.
// Each output is the upper 16 bits of sample * scale (vpmulhuw), packed to a
// byte with unsigned saturation (vpackuswb).
// Choose scale for the number of significant bits in the source:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
//
// The loop converts 32 samples per iteration and tests width only after the
// body has run, so width is expected to be a positive multiple of 32; any
// other width reads and writes past the end of the rows.
#ifdef HAS_CONVERT16TO8ROW_AVX2
void Convert16To8Row_AVX2(const uint16* src_y,
                          uint8* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
    // Broadcast the low 16 bits of scale to every 16 bit lane of ymm3.
    "vmovd      %3,%%xmm3                      \n"
    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
    "vbroadcastss %%xmm3,%%ymm3                \n"

    // 32 pixels per loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    (%0),%%ymm0                    \n"
    "vmovdqu    0x20(%0),%%ymm1                \n"
    "vpmulhuw   %%ymm3,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    // vpackuswb packs within each 128 bit lane, which interleaves the two
    // inputs; vpermq $0xd8 puts the four 64 bit groups back in source order.
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // mutates
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0,(%1)                    \n"
    "add        $0x40,%0                       \n"
    "add        $0x20,%1                       \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm3");
  // clang-format on
}
#endif  // HAS_CONVERT16TO8ROW_AVX2
#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
......
......@@ -1963,4 +1963,67 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
}
#endif // HAS_ARGBTOAR30ROW_AVX2
// Alias to copy pixels as is
#define AR30ToAR30 ARGBToARGB
// Generates a test that converts a planar YUV image with 16 bit samples
// (FMT_PLANAR) to a packed format (FMT_B) twice: once after
// MaskCpuFlags(disable_cpu_flags_) and benchmark_iterations_ times after
// MaskCpuFlags(benchmark_cpu_info_). The two outputs must not differ by more
// than DIFF in any byte.
//   SUBSAMP_X/Y  chroma subsampling factors.
//   BPP_B        bytes per pixel of FMT_B.
//   ALIGN        alignment of the destination stride.
//   YALIGN       alignment applied to benchmark_height_.
//   W1280        image width; values <= 0 become 1.
//   N            suffix appended to the test name.
//   NEG          sign applied to the height passed to the conversion.
//   OFF          start offset into every buffer: uint16 elements for the
//                source planes, bytes for the destination.
//   BPP_C        bytes per pixel used when comparing the two outputs.
//   FMT_C        not referenced by the macro body.
// Source samples are random values masked to 10 bits (0x3ff).
// The source buffers reserve room for OFF extra uint16 elements and the
// comparison starts at OFF, so a non-zero OFF stays inside the buffers.
#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
                         ALIGN, YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, \
                         BPP_C) \
  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
    const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
    const int kBpc = 2; \
    align_buffer_page_end(src_y, (kWidth * kHeight + OFF) * kBpc); \
    align_buffer_page_end(src_u, (kSizeUV + OFF) * kBpc); \
    align_buffer_page_end(src_v, (kSizeUV + OFF) * kBpc); \
    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF); \
    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF); \
    for (int i = 0; i < kWidth * kHeight; ++i) { \
      reinterpret_cast<uint16*>(src_y)[i + OFF] = (fastrand() & 0x3ff); \
    } \
    for (int i = 0; i < kSizeUV; ++i) { \
      reinterpret_cast<uint16*>(src_u)[i + OFF] = (fastrand() & 0x3ff); \
      reinterpret_cast<uint16*>(src_v)[i + OFF] = (fastrand() & 0x3ff); \
    } \
    memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
    memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
    MaskCpuFlags(disable_cpu_flags_); \
    FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y) + OFF, kWidth, \
                          reinterpret_cast<uint16*>(src_u) + OFF, kStrideUV, \
                          reinterpret_cast<uint16*>(src_v) + OFF, kStrideUV, \
                          dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight); \
    MaskCpuFlags(benchmark_cpu_info_); \
    for (int i = 0; i < benchmark_iterations_; ++i) { \
      FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y) + OFF, kWidth, \
                            reinterpret_cast<uint16*>(src_u) + OFF, kStrideUV, \
                            reinterpret_cast<uint16*>(src_v) + OFF, kStrideUV, \
                            dst_argb_opt + OFF, kStrideB, kWidth, \
                            NEG kHeight); \
    } \
    int max_diff = 0; \
    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
      int abs_diff = abs(static_cast<int>(dst_argb_c[i + OFF]) - \
                         static_cast<int>(dst_argb_opt[i + OFF])); \
      if (abs_diff > max_diff) { \
        max_diff = abs_diff; \
      } \
    } \
    EXPECT_LE(max_diff, DIFF); \
    free_aligned_buffer_page_end(src_y); \
    free_aligned_buffer_page_end(src_u); \
    free_aligned_buffer_page_end(src_v); \
    free_aligned_buffer_page_end(dst_argb_c); \
    free_aligned_buffer_page_end(dst_argb_opt); \
  }
// Instantiates TESTPLANAR16TOBI once, as the "_Opt" test: width is
// benchmark_width_, the height is passed with a "+" sign and the buffer
// offset is 0.
#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, DIFF, FMT_C, BPP_C) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
// H010 to AR30: chroma subsampled 2x2, 4 bytes per destination pixel,
// destination stride aligned to 4, height alignment 1, and a difference of at
// most 2 per byte allowed between the two conversion runs.
TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2, AR30, 4)
} // namespace libyuv
......@@ -2661,7 +2661,7 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
}
#endif
// TODO(fbarchard): improve test for platforms and cpu detect
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_MULTIPLYROW_16_AVX2
TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
......@@ -2697,7 +2697,48 @@ TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
free_aligned_buffer_page_end(dst_pixels_y_opt);
free_aligned_buffer_page_end(dst_pixels_y_c);
}
#endif
#endif // HAS_MULTIPLYROW_16_AVX2
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_CONVERT16TO8ROW_AVX2
// Converts one buffer of random 10 bit samples with Convert16To8Row_C and with
// Convert16To8Row_AVX2 (falling back to the C version when the CPU lacks
// AVX2) and expects byte-identical output.
TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
  const int kPixels = benchmark_width_ * benchmark_height_;
  align_buffer_page_end(src_pixels_y, kPixels * 2);
  align_buffer_page_end(dst_pixels_y_opt, kPixels);
  align_buffer_page_end(dst_pixels_y_c, kPixels);

  MemRandomize(src_pixels_y, kPixels * 2);
  // Limit the source range to 10 bits (0..1023) so that, with a scale of
  // 16384, every converted value fits in 8 bits.
  uint16* src16 = reinterpret_cast<uint16*>(src_pixels_y);
  for (int p = 0; p < kPixels; ++p) {
    src16[p] &= 1023;
  }

  // Different fill values so an unwritten output byte shows up as a mismatch.
  memset(dst_pixels_y_opt, 0, kPixels);
  memset(dst_pixels_y_c, 1, kPixels);

  // Reference result.
  Convert16To8Row_C(src16, dst_pixels_y_c, 16384, kPixels);

  // Pick the row function once, then run it benchmark_iterations_ times.
  void (*Convert16To8Row)(const uint16* src_y, uint8* dst_y, int scale,
                          int width) = Convert16To8Row_C;
  if (TestCpuFlag(kCpuHasAVX2)) {
    Convert16To8Row = Convert16To8Row_AVX2;
  }
  for (int iter = 0; iter < benchmark_iterations_; ++iter) {
    Convert16To8Row(src16, dst_pixels_y_opt, 16384, kPixels);
  }

  for (int p = 0; p < kPixels; ++p) {
    EXPECT_EQ(dst_pixels_y_opt[p], dst_pixels_y_c[p]);
  }

  free_aligned_buffer_page_end(src_pixels_y);
  free_aligned_buffer_page_end(dst_pixels_y_opt);
  free_aligned_buffer_page_end(dst_pixels_y_c);
}
#endif  // HAS_CONVERT16TO8ROW_AVX2
float TestScaleMaxSamples(int benchmark_width,
int benchmark_height,
......
......@@ -80,6 +80,8 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
EXPECT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444));
EXPECT_TRUE(TestValidFourCC(FOURCC_H420, FOURCC_BPP_H420));
EXPECT_TRUE(TestValidFourCC(FOURCC_H010, FOURCC_BPP_H010));
EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment