Commit 7fc932dd authored by Frank Barchard

Add low level support for 12 bit 420, 422 and 444 YUV video frame conversion.

BUG=libyuv:560,chromium:445071
TEST=untested
R=hubbe@chromium.org

Review URL: https://codereview.chromium.org/2371293002 .
parent c11e9b7f
......@@ -281,6 +281,14 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
const float* poly,
int width, int height);
// Convert plane of 16 bit shorts to half floats.
// Source values are multiplied by scale before storing as half float.
// src_stride_y and dst_stride_y are byte strides; the implementation halves
// them to step through the planes in 16 bit elements.
// A negative height makes the source be read from its last row upwards, so
// the destination receives a vertically inverted image.
// Returns 0 on success, or -1 if src_y or dst_y is NULL, width <= 0 or
// height == 0.
LIBYUV_API
int HalfFloatPlane(const uint16* src_y, int src_stride_y,
uint16* dst_y, int dst_stride_y,
float scale,
int width, int height);
// Quantize a rectangle of ARGB. Alpha unaffected.
// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
// interval_size should be a value between 1 and 255.
......
......@@ -231,6 +231,7 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
#define HAS_HALFFLOATROW_AVX2
// Effects:
#define HAS_ARGBADDROW_AVX2
......@@ -252,7 +253,6 @@ extern "C" {
#define HAS_ARGBTORGB565ROW_AVX2
#define HAS_J400TOARGBROW_AVX2
#define HAS_RGB565TOARGBROW_AVX2
#define HAS_SHORTTOF16ROW_AVX2
#endif
// The following are also available on x64 Visual C.
......@@ -1934,8 +1934,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
int width);
// Scale and convert to half float.
void ShortToF16Row_C(const uint16* src, int16* dst, float scale, int width);
void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width);
// Portable C version: converts width samples, one per loop iteration.
void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width);
// AVX2 version: the loop consumes 16 samples per iteration.
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width);
// Any-width wrapper around HalfFloatRow_AVX2: the leftover (width & 15)
// samples are staged through a temporary buffer.
void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
int width);
void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
const uint8* luma, uint32 lumacoeff);
......
......@@ -83,6 +83,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
}
// TODO(fbarchard): Consider support for negative height.
// TODO(fbarchard): Consider stride measured in bytes.
LIBYUV_API
void CopyPlane_16(const uint16* src_y, int src_stride_y,
uint16* dst_y, int dst_stride_y,
......@@ -2441,6 +2442,51 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
return 0;
}
// Convert a plane of 16 bit shorts to half floats.
// Every source value is multiplied by scale before it is stored as a half
// float.  Strides are passed in bytes.  Returns 0 on success and -1 when a
// pointer is NULL, width is not positive, or height is zero.
LIBYUV_API
int HalfFloatPlane(const uint16* src_y, int src_stride_y,
                   uint16* dst_y, int dst_stride_y,
                   float scale,
                   int width, int height) {
  void (*convert_row)(const uint16* src, uint16* dst, float scale,
                      int width);
  int rows_left;

  if (!src_y || !dst_y || width <= 0 || height == 0) {
    return -1;
  }

  // Strides arrive in bytes; the planes are walked in 16 bit elements.
  src_stride_y = src_stride_y >> 1;
  dst_stride_y = dst_stride_y >> 1;

  if (height < 0) {
    // Negative height means invert the image: begin at the last source row
    // and step backwards through the source.
    height = -height;
    src_y += (height - 1) * src_stride_y;
    src_stride_y = -src_stride_y;
  }

  // Coalesce rows: a plane without padding is handled as one long row.
  if (src_stride_y == width && dst_stride_y == width) {
    width *= height;
    height = 1;
    src_stride_y = 0;
    dst_stride_y = 0;
  }

  // Pick the row function: C by default, AVX2 when the CPU supports it.
  convert_row = HalfFloatRow_C;
#if defined(HAS_HALFFLOATROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    // The plain AVX2 kernel needs a multiple of 16; otherwise use the
    // any-width wrapper.
    convert_row = IS_ALIGNED(width, 16) ? HalfFloatRow_AVX2
                                        : HalfFloatRow_Any_AVX2;
  }
#endif

  for (rows_left = height; rows_left > 0; --rows_left) {
    convert_row(src_y, dst_y, scale, width);
    src_y += src_stride_y;
    dst_y += dst_stride_y;
  }
  return 0;
}
// Apply a lumacolortable to each ARGB pixel.
LIBYUV_API
int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
......
......@@ -546,6 +546,28 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
#endif
#undef ANY11P
// Any 1 to 1 with parameter and shorts.  SBPP and BPP measure in shorts.
// Generates NAMEANY, a wrapper that lets ANY_SIMD handle any width:
//   - the first (width & ~MASK) elements are passed straight to ANY_SIMD;
//   - the remaining (width & MASK) elements are copied into a scratch buffer,
//     converted there with one full SIMD step of MASK + 1 elements, and only
//     the valid results are copied back out.
// Scratch layout: shorts [0, 32) hold the staged source, shorts [32, 64) the
// SIMD output.  The tail step runs even when there is no remainder; in that
// case both copies are zero length.
#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                        \
    void NAMEANY(const uint16* src_ptr, uint16* dst_ptr,                       \
                 T shuffler, int width) {                                      \
      SIMD_ALIGNED(uint16 temp[32 * 2]);                                       \
      int r = width & (MASK);                                                  \
      int n = width & ~(MASK);                                                 \
      memset(temp, 0, 32 * sizeof(uint16));  /* for msan */                    \
      if (n > 0) {                                                             \
        ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                               \
      }                                                                        \
      /* memcpy counts bytes, so scale the short counts by sizeof(uint16). */  \
      memcpy(temp, src_ptr + n * (SBPP), r * (SBPP) * sizeof(uint16));         \
      /* Output goes to the second half of temp; temp + 64 would be past */    \
      /* the end of the 64 element array. */                                   \
      ANY_SIMD(temp, temp + 32, shuffler, (MASK) + 1);                         \
      memcpy(dst_ptr + n * (BPP), temp + 32, r * (BPP) * sizeof(uint16));      \
    }

#ifdef HAS_HALFFLOATROW_AVX2
ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
#endif
#undef ANY11P16
// Any 1 to 1 with yuvconstants
#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
......
......@@ -2333,6 +2333,25 @@ void ARGBPolynomialRow_C(const uint8* src_argb,
}
}
// Convert a row of 16 bit samples to half floats.
// Samples are assumed to be unsigned in the low 9, 10 or 12 bits.  The scale
// factor adjusts the source integer range to the half float range desired.
// The magic constant is 2^-112.  Multiplying by it is the same as subtracting
// 112 from the exponent, which is the difference in exponent bias between
// 32-bit and 16-bit floats.  Once that subtraction is done, the half float is
// simply the low bits of the exponent followed by the high bits of the
// mantissa, i.e. the float's bit pattern shifted right by 13.
//   src    row of width 16 bit samples.
//   dst    receives width half floats.
//   scale  multiplier applied to every sample before conversion.
//   width  number of samples to convert.
void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width) {
  int i;
  float mult = 1.9259299444e-34f * scale;
  for (i = 0; i < width; ++i) {
    // Read the float's bit pattern through a union.  Dereferencing a
    // uint32_t* that points at a float violates the strict aliasing rule.
    union {
      float f;
      uint32_t u;
    } bits;
    bits.f = src[i] * mult;
    dst[i] = (uint16)(bits.u >> 13);
  }
}
void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
const uint8* luma, uint32 lumacoeff) {
uint32 bc = lumacoeff & 0xff;
......
......@@ -5366,6 +5366,39 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
#ifdef HAS_HALFFLOATROW_AVX2
// Convert a row of 16 bit samples to half floats using AVX2.
// Each pass loads 16 shorts, zero extends them to 32 bit ints, converts them
// to floats, multiplies by scale and stores 16 half floats produced by
// vcvtps2ph with immediate 3.
// The loop subtracts 16 from width per pass and repeats while the result is
// positive, so a width that is not a multiple of 16 is effectively rounded
// up to the next multiple.
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    // Broadcast scale to all 8 float lanes of ymm4.
    "vbroadcastss  %3, %%ymm4                  \n"

    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vpmovzxwd   " MEMACCESS(0) ",%%ymm0       \n"  // 8 shorts -> 8 ints
    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm1 \n"  // 8 more
    "lea         " MEMLEA(0x20,0) ",%0         \n"
    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"
    "vcvtdq2ps   %%ymm1,%%ymm1                 \n"
    "vmulps      %%ymm0,%%ymm4,%%ymm0          \n"
    "vmulps      %%ymm1,%%ymm4,%%ymm1          \n"
    "vcvtps2ph   $3, %%ymm0, %%xmm0            \n"
    "vcvtps2ph   $3, %%ymm1, %%xmm1            \n"
    "vmovdqu     %%xmm0," MEMACCESS(1) "       \n"
    "vmovdqu     %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea         " MEMLEA(0x20,1) ",%1         \n"
    "sub         $0x10,%2                      \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "x"(scale)    // %3
  // xmm1 is written by the loop as well, so it must be listed; otherwise the
  // compiler may keep a live value in it across this asm statement.
  : "memory", "cc",
    "xmm0", "xmm1", "xmm4"
  );
}
#endif  // HAS_HALFFLOATROW_AVX2
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
......
......@@ -6095,13 +6095,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor
// adjust the sample range to 0 to 1 using a float multiply.
// e.g. 9 bit scale is 1.0f / 512.0f
// e.g. 10 bit scale is 1.0f / 1024.0f
#ifdef HAS_SHORTTOHALFFLOAT_AVX2
#ifdef HAS_HALFFLOATROW_AVX2
__declspec(naked)
void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) {
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
__asm {
mov eax, [esp + 4] /* src */
mov edx, [esp + 8] /* dst */
......@@ -6111,19 +6107,24 @@ void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) {
// 16 pixel loop.
convertloop:
vpmovzxwd ymm0, xmmword ptr [eax] // 8 shorts -> 8 ints
lea eax, [eax + 16]
vpmovzxwd ymm1, xmmword ptr [eax + 16] // 8 more shorts
lea eax, [eax + 32]
vcvtdq2ps ymm0, ymm0 // convert 8 ints to floats
vcvtdq2ps ymm1, ymm1
vmulps ymm0, ymm0, ymm4 // scale to normalized range 0 to 1
vcvtps2ph xmm0, ymm0, 0 // float conver to 8 half floats round even
vmulps ymm1, ymm1, ymm4
vcvtps2ph xmm0, ymm0, 3 // float convert to 8 half floats truncate
vcvtps2ph xmm1, ymm1, 3
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 8
vmovdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 16
jg convertloop
vzeroupper
ret
}
}
#endif // HAS_SHORTTOHALFFLOAT_AVX2
#endif // HAS_HALFFLOATROW_AVX2
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
......
......@@ -2081,6 +2081,46 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
}
}
// Runs HalfFloatPlane over the same random 16 bit plane twice, once with CPU
// optimizations masked off and once with them enabled, timing each pass, and
// expects both outputs to match byte for byte.
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane) {
  const int y_plane_size = benchmark_width_ * benchmark_height_ * 2;
  const int stride_bytes = benchmark_width_ * 2;
  const float kScale = 1.0f / 4096.0f;

  align_buffer_page_end(orig_y, y_plane_size);
  align_buffer_page_end(dst_c, y_plane_size);
  align_buffer_page_end(dst_opt, y_plane_size);
  MemRandomize(orig_y, y_plane_size);
  // Different fill values make an unwritten output show up as a mismatch.
  memset(dst_c, 0, y_plane_size);
  memset(dst_opt, 1, y_plane_size);

  // Reference pass: disable all optimizations.
  MaskCpuFlags(disable_cpu_flags_);
  double c_time = get_time();
  for (int iter = 0; iter < benchmark_iterations_; ++iter) {
    HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), stride_bytes,
                   reinterpret_cast<uint16*>(dst_c), stride_bytes,
                   kScale, benchmark_width_, benchmark_height_);
  }
  c_time = (get_time() - c_time) / benchmark_iterations_;

  // Optimized pass: enable optimizations.
  MaskCpuFlags(benchmark_cpu_info_);
  double opt_time = get_time();
  for (int iter = 0; iter < benchmark_iterations_; ++iter) {
    HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), stride_bytes,
                   reinterpret_cast<uint16*>(dst_opt), stride_bytes,
                   kScale, benchmark_width_, benchmark_height_);
  }
  opt_time = (get_time() - opt_time) / benchmark_iterations_;

  for (int pos = 0; pos < y_plane_size; ++pos) {
    EXPECT_EQ(dst_c[pos], dst_opt[pos]);
  }

  free_aligned_buffer_page_end(orig_y);
  free_aligned_buffer_page_end(dst_c);
  free_aligned_buffer_page_end(dst_opt);
}
TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment