Commit 85722f5d authored by Frank Barchard, committed by Commit Bot

ByteToFloatRow_NEON to convert and scale bytes to floats

Each byte is converted to float (0.0 to 255.0) and then multiplied
by a scale parameter.

Bug: None
Test: arm 64 build passes.
Change-Id: I04736798540b8d985f60abdf0388e24a209d075b
Reviewed-on: https://chromium-review.googlesource.com/930226
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Ian Field <ianfield@google.com>
parent 0ea50cbc
......@@ -498,6 +498,10 @@ int HalfFloatPlane(const uint16_t* src_y,
int width,
int height);
// Convert a buffer of bytes to floats, scale the values and store as floats.
// Reads width bytes from src_y and writes width floats to dst_y, each byte
// value multiplied by scale.
// Returns 0 on success, or -1 if src_y or dst_y is NULL or width <= 0.
LIBYUV_API
int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width);
// Quantize a rectangle of ARGB. Alpha unaffected.
// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
// interval_size should be a value between 1 and 255.
......
......@@ -308,6 +308,7 @@ extern "C" {
#define HAS_ARGBTOYROW_NEON
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_BYTETOFLOATROW_NEON
#define HAS_COPYROW_NEON
#define HAS_HALFFLOATROW_NEON
#define HAS_I400TOARGBROW_NEON
......@@ -3352,6 +3353,15 @@ void HalfFloatRow_Any_MSA(const uint16_t* src_ptr,
uint16_t* dst_ptr,
float param,
int width);
// Row functions that convert width bytes at src to floats, multiply each by
// scale and store the results at dst.
// Portable scalar version.
void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width);
// NEON version; each loop iteration converts a group of 8 bytes.
void ByteToFloatRow_NEON(const uint8_t* src,
float* dst,
float scale,
int width);
// Wrapper generated by ANY11P16 around ByteToFloatRow_NEON: whole groups are
// passed straight through and the leftover elements go via scratch buffers.
void ByteToFloatRow_Any_NEON(const uint8_t* src,
float* dst,
float scale,
int width);
void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
......
......@@ -3123,6 +3123,27 @@ int HalfFloatPlane(const uint16_t* src_y,
return 0;
}
// Convert a buffer of bytes to floats, scale the values and store as floats.
LIBYUV_API
int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) {
void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale,
int width) = ByteToFloatRow_C;
if (!src_y || !dst_y || width <= 0) {
return -1;
}
#if defined(HAS_BYTETOFLOATROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ByteToFloatRow = ByteToFloatRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ByteToFloatRow = ByteToFloatRow_NEON;
}
}
#endif
ByteToFloatRow(src_y, dst_y, scale, width);
return 0;
}
// Apply a lumacolortable to each ARGB pixel.
LIBYUV_API
int ARGBLumaColorTable(const uint8_t* src_argb,
......
......@@ -807,37 +807,52 @@ ANY11C(Convert8To16Row_Any_AVX2,
#undef ANY11C
// Any 1 to 1 with a float parameter. SBPP and BPP are the source and
// destination element sizes in bytes.
#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(const uint16_t* src_ptr, uint16_t* dst_ptr, T param, \
int width) { \
SIMD_ALIGNED(uint16_t temp[32 * 2]); \
memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, param, n); \
} \
memcpy(temp, src_ptr + n, r * SBPP); \
ANY_SIMD(temp, temp + 16, param, MASK + 1); \
memcpy(dst_ptr + n, temp + 16, r * BPP); \
#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \
void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
SIMD_ALIGNED(ST temp[32]); \
SIMD_ALIGNED(T out[32]); \
memset(temp, 0, SBPP * 32); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, param, n); \
} \
memcpy(temp, src_ptr + n, r * SBPP); \
ANY_SIMD(temp, out, param, MASK + 1); \
memcpy(dst_ptr + n, out, r * BPP); \
}
#ifdef HAS_HALFFLOATROW_SSE2
ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 2, 2, 7)
ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
#endif
#ifdef HAS_HALFFLOATROW_AVX2
ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 2, 2, 15)
ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
#endif
#ifdef HAS_HALFFLOATROW_F16C
ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 2, 2, 15)
ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 2, 2, 15)
ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
ANY11P16(HalfFloat1Row_Any_F16C,
HalfFloat1Row_F16C,
uint16_t,
uint16_t,
2,
2,
15)
#endif
#ifdef HAS_HALFFLOATROW_NEON
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 2, 2, 7)
ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 2, 2, 7)
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
ANY11P16(HalfFloat1Row_Any_NEON,
HalfFloat1Row_NEON,
uint16_t,
uint16_t,
2,
2,
7)
#endif
#ifdef HAS_HALFFLOATROW_MSA
ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, float, 2, 2, 31)
ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
#endif
#ifdef HAS_BYTETOFLOATROW_NEON
// Source elements are uint8_t (SBPP = 1) and destination elements are float
// (BPP = 4). ANY11P16 copies r * BPP bytes of leftover output back to dst, so
// BPP must equal sizeof(float); a smaller value leaves the tail of the
// destination only partially written when width is not a multiple of 8.
ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 4, 7)
#endif
#undef ANY11P16
......
......@@ -2774,6 +2774,14 @@ void HalfFloatRow_C(const uint16_t* src,
}
}
// Scalar row conversion: for each of the first width bytes of src, stores the
// byte value multiplied by scale into the matching slot of dst.
// A width of zero or less writes nothing.
void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const float scaled = *src * scale;
    *dst = scaled;
    ++src;
    ++dst;
  }
}
void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
int width,
......
......@@ -2659,6 +2659,32 @@ void HalfFloatRow_NEON(const uint16_t* src,
: "cc", "memory", "q0", "q1", "q2", "q3");
}
// 32-bit ARM NEON row conversion: turns bytes at src into floats, multiplies
// each by scale and stores them at dst, 8 elements per loop iteration.
// The loop repeats while the decremented width is still greater than zero,
// so every pass reads 8 bytes and writes 8 floats regardless of how many
// elements remain.
void ByteToFloatRow_NEON(const uint8_t* src,
float* dst,
float scale,
int width) {
asm volatile(
"vdup.32 q0, %3 \n" // copy scale into every lane of q0
"1: \n"
"vld1.8 {d2}, [%0]! \n" // load 8 bytes
"subs %2, %2, #8 \n" // 8 pixels per loop
"vmovl.u8 q1, d2 \n" // 8 shorts
"vmovl.u16 q2, d2 \n" // 8 ints
"vmovl.u16 q3, d3 \n" // (q1 is the d2/d3 pair, so d3 is the upper 4 shorts)
"vcvt.f32.u32 q2, q2 \n" // 8 floats
"vcvt.f32.u32 q3, q3 \n"
"vmul.f32 q2, q2, d0[0] \n" // scale
"vmul.f32 q3, q3, d0[0] \n"
"vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
"bgt 1b \n" // loop while the count left by subs is > 0
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "cc", "memory", "q0", "q1", "q2", "q3");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
#ifdef __cplusplus
......
......@@ -2700,6 +2700,30 @@ void HalfFloatRow_NEON(const uint16_t* src,
: "cc", "memory", "v1", "v2", "v3");
}
// AArch64 NEON row conversion: turns bytes at src into floats, multiplies
// each by scale and stores them at dst, 8 elements per loop iteration.
// The loop repeats while the decremented width is still greater than zero,
// so every pass reads 8 bytes and writes 8 floats regardless of how many
// elements remain.
void ByteToFloatRow_NEON(const uint8_t* src,
float* dst,
float scale,
int width) {
asm volatile(
"1: \n"
"ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v1.8h, v1.8b \n" // 8 shorts
"uxtl v2.4s, v1.4h \n" // 8 ints
"uxtl2 v3.4s, v1.8h \n"
// The inputs were zero-extended from bytes, so the signed conversion below
// only ever sees non-negative values.
"scvtf v2.4s, v2.4s \n" // 8 floats
"scvtf v3.4s, v3.4s \n"
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"fmul v3.4s, v3.4s, %3.s[0] \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
"b.gt 1b \n" // loop while the count left by subs is > 0
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale) // %3
: "cc", "memory", "v1", "v2", "v3");
}
float ScaleMaxSamples_NEON(const float* src,
float* dst,
float scale,
......
......@@ -2168,6 +2168,52 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_One) {
EXPECT_LE(diff, 1);
}
// Runs ByteToFloat over a randomized byte buffer twice: once with the CPU
// flags set to disable_cpu_flags and benchmark_iterations times with them set
// to benchmark_cpu_info. Returns the largest absolute difference between the
// two float outputs.
float TestByteToFloat(int benchmark_width,
                      int benchmark_height,
                      int benchmark_iterations,
                      int disable_cpu_flags,
                      int benchmark_cpu_info,
                      float scale) {
  const int y_plane_size = benchmark_width * benchmark_height;
  // One allocation holds the byte input followed by the two float outputs
  // (4 bytes per sample each).
  align_buffer_page_end(orig_y, y_plane_size * (1 + 4 + 4));
  float* dst_opt = reinterpret_cast<float*>(orig_y + y_plane_size);
  float* dst_c = reinterpret_cast<float*>(orig_y + y_plane_size * 5);

  MemRandomize(orig_y, y_plane_size);
  // Fill the two outputs with different byte patterns before the runs.
  memset(dst_c, 0, y_plane_size * 4);
  memset(dst_opt, 1, y_plane_size * 4);

  // Reference pass with optimizations masked off.
  MaskCpuFlags(disable_cpu_flags);
  ByteToFloat(orig_y, dst_c, scale, y_plane_size);

  // Timed passes with optimizations enabled.
  MaskCpuFlags(benchmark_cpu_info);
  for (int iter = 0; iter < benchmark_iterations; ++iter) {
    ByteToFloat(orig_y, dst_opt, scale, y_plane_size);
  }

  float max_diff = 0;
  for (int k = 0; k < y_plane_size; ++k) {
    const float abs_diff = fabs(dst_c[k] - dst_opt[k]);
    max_diff = (abs_diff > max_diff) ? abs_diff : max_diff;
  }

  free_aligned_buffer_page_end(orig_y);
  return max_diff;
}
// With a scale of 1.0 the two ByteToFloat runs compared by the helper must
// produce identical output.
TEST_F(LibYUVPlanarTest, TestByteToFloat) {
  const float max_diff =
      TestByteToFloat(benchmark_width_, benchmark_height_, benchmark_iterations_,
                      disable_cpu_flags_, benchmark_cpu_info_, 1.0f);
  EXPECT_EQ(0.f, max_diff);
}
TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment