Commit 451af5e9 authored by Frank Barchard's avatar Frank Barchard

scale by 1 for neon implemented

// Converts a row of unsigned 16-bit integers to 16-bit half floats with no
// scaling (the scale == 1 case), using AArch64 NEON.
//
//   src   - input row of 16-bit values; read 16 bytes at a time.
//   dst   - output row of half floats; written 16 bytes at a time.
//   float - unnamed and unused; kept so the signature matches the scaled
//           HalfFloatRow_* functions.
//   width - number of elements to convert.
//
// The loop consumes 8 elements per iteration, always executes at least once
// and has no remainder handling, so width is expected to be a positive
// multiple of 8.
// NOTE(review): MEMACCESS is a project macro defined elsewhere; it is assumed
// here to annotate the memory operand that follows it.
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts, src += 16 bytes
    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop; sets flags for b.gt
    "uxtl       v2.4s, v1.4h                   \n"  // zero-extend low 4 shorts to 32-bit ints
    "uxtl2      v1.4s, v1.8h                   \n"  // zero-extend high 4 shorts (overwrites v1)
    "scvtf      v2.4s, v2.4s                   \n"  // 8 ints -> 8 single-precision floats
    "scvtf      v1.4s, v1.4s                   \n"
    "fcvtn      v4.4h, v2.4s                   \n"  // narrow to 8 half floats (low half)
    "fcvtn2     v4.8h, v1.4s                   \n"  // narrow into the high half of v4
   MEMACCESS(1)
    "st1        {v4.16b}, [%1], #16            \n"  // store 8 shorts, dst += 16 bytes
    "b.gt       1b                             \n"  // loop while remaining width > 0
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "cc", "memory", "v1", "v2", "v4"
  );
}

// Converts a row of unsigned 16-bit integers to 16-bit half floats, applying
// a caller-supplied scale, using AArch64 NEON.
//
//   src   - input row of 16-bit values; read 16 bytes at a time.
//   dst   - output row of half floats; written 16 bytes at a time.
//   scale - multiplier applied to every element before conversion.
//   width - number of elements to convert.
//
// The loop consumes 8 elements per iteration, always executes at least once
// and has no remainder handling, so width is expected to be a positive
// multiple of 8.
//
// Instead of a float->half convert instruction, the value is multiplied by
// scale * 1.9259299444e-34f and the resulting float bit pattern is shifted
// right by 13 with unsigned saturation (no rounding) to form the 16-bit result.
// NOTE(review): 1.9259299444e-34f appears to be 2^-112, i.e. the difference
// between the float (127) and half (15) exponent biases, and 13 the
// difference in mantissa widths (23 - 10) - confirm.
// NOTE(review): MEMACCESS is a project macro defined elsewhere; it is assumed
// here to annotate the memory operand that follows it.
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts, src += 16 bytes
    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop; sets flags for b.gt
    "uxtl       v2.4s, v1.4h                   \n"  // zero-extend low 4 shorts to 32-bit ints
    "uxtl2      v1.4s, v1.8h                   \n"  // zero-extend high 4 shorts (overwrites v1)
    "scvtf      v2.4s, v2.4s                   \n"  // 8 ints -> 8 single-precision floats
    "scvtf      v1.4s, v1.4s                   \n"
    "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // adjust exponent (and apply scale)
    "fmul       v1.4s, v1.4s, %3.s[0]          \n"
    "uqshrn     v4.4h, v2.4s, #13              \n"  // isolate halffloat: shift float bits right, narrow
    "uqshrn2    v4.8h, v1.4s, #13              \n"  // same for the high half of v4
   MEMACCESS(1)
    "st1        {v4.16b}, [%1], #16            \n"  // store 8 shorts, dst += 16 bytes
    "b.gt       1b                             \n"  // loop while remaining width > 0
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "w"(scale * 1.9259299444e-34f)    // %3
  : "cc", "memory", "v1", "v2", "v4"
  );
}

TEST=LibYUVPlanarTest.TestHalfFloatPlane_One
BUG=libyuv:560
R=hubbe@chromium.org

Review URL: https://codereview.chromium.org/2430313008 .
parent 550cf829
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1629 Version: 1630
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -1959,9 +1959,15 @@ void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale, ...@@ -1959,9 +1959,15 @@ void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width); void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale, void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale,
int width); int width);
void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float scale, int width);
void HalfFloat1Row_Any_F16C(const uint16* src, uint16* dst, float scale,
int width);
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width); void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width);
void HalfFloatRow_Any_NEON(const uint16* src, uint16* dst, float scale, void HalfFloatRow_Any_NEON(const uint16* src, uint16* dst, float scale,
int width); int width);
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float scale, int width);
void HalfFloat1Row_Any_NEON(const uint16* src, uint16* dst, float scale,
int width);
void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
const uint8* luma, uint32 lumacoeff); const uint8* luma, uint32 lumacoeff);
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1629 #define LIBYUV_VERSION 1630
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_
...@@ -2579,17 +2579,19 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y, ...@@ -2579,17 +2579,19 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
#endif #endif
#if defined(HAS_HALFFLOATROW_F16C) #if defined(HAS_HALFFLOATROW_F16C)
if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
HalfFloatRow = HalfFloatRow_Any_F16C; HalfFloatRow = (scale == 1.0f) ?
HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
HalfFloatRow = HalfFloatRow_F16C; HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C;
} }
} }
#endif #endif
#if defined(HAS_HALFFLOATROW_NEON) #if defined(HAS_HALFFLOATROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
HalfFloatRow = HalfFloatRow_Any_NEON; HalfFloatRow = (scale == 1.0f) ?
HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
HalfFloatRow = HalfFloatRow_NEON; HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
} }
} }
#endif #endif
......
...@@ -577,16 +577,18 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3) ...@@ -577,16 +577,18 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
} }
#ifdef HAS_HALFFLOATROW_SSE2 #ifdef HAS_HALFFLOATROW_SSE2
ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 15) ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 7)
#endif #endif
#ifdef HAS_HALFFLOATROW_AVX2 #ifdef HAS_HALFFLOATROW_AVX2
ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15) ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
#endif #endif
#ifdef HAS_HALFFLOATROW_F16C #ifdef HAS_HALFFLOATROW_F16C
ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15) ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15)
ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 1, 1, 15)
#endif #endif
#ifdef HAS_HALFFLOATROW_NEON #ifdef HAS_HALFFLOATROW_NEON
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7) ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7)
ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 1, 1, 7)
#endif #endif
#undef ANY11P16 #undef ANY11P16
......
...@@ -5410,6 +5410,36 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { ...@@ -5410,6 +5410,36 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
} }
#endif // HAS_HALFFLOATROW_F16C #endif // HAS_HALFFLOATROW_F16C
#ifdef HAS_HALFFLOATROW_F16C
// Converts a row of unsigned 16-bit integers to 16-bit half floats with no
// scaling (the scale == 1 case), using x86 AVX2 + F16C instructions.
//
//   src   - input row of 16-bit values; read 32 bytes per iteration.
//   dst   - output row of half floats; written 32 bytes per iteration.
//   float - unnamed and unused; kept so the signature matches
//           HalfFloatRow_F16C.
//   width - number of elements to convert.
//
// The loop consumes 16 elements per iteration, always executes at least once
// and has no remainder handling, so width is expected to be a positive
// multiple of 16.
// NOTE(review): LABELALIGN, MEMACCESS, MEMACCESS2 and MEMLEA are project
// macros defined elsewhere; they are assumed to expand to an alignment
// directive and to memory-operand strings of the form offset(%reg).
void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
asm volatile (
// 16 pixel loop.
LABELALIGN
"1: \n"
"vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
"vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n" // src += 32 bytes
"vcvtdq2ps %%ymm2,%%ymm2 \n" // 16 ints -> 16 floats
"vcvtdq2ps %%ymm3,%%ymm3 \n"
// Immediate 3 selects the rounding control encoded in the instruction
// (round toward zero per the VCVTPS2PH encoding).
"vcvtps2ph $3, %%ymm2, %%xmm2 \n" // 16 floats -> 16 half floats
"vcvtps2ph $3, %%ymm3, %%xmm3 \n"
"vmovdqu %%xmm2," MEMACCESS(1) " \n" // store 16 half floats
"vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n" // dst += 32 bytes
"sub $0x10,%2 \n" // 16 pixels per loop
"jg 1b \n" // loop while remaining width > 0
"vzeroupper \n" // clear upper ymm halves before returning
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc",
"xmm2", "xmm3"
);
}
#endif // HAS_HALFFLOATROW_F16C
#ifdef HAS_ARGBCOLORTABLEROW_X86 #ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table. // Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
......
...@@ -2711,6 +2711,55 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ...@@ -2711,6 +2711,55 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
); );
} }
// Converts a row of unsigned 16-bit integers to 16-bit half floats with no
// scaling (the scale == 1 case), using AArch64 NEON.
//
//   src   - input row of 16-bit values; read 16 bytes at a time.
//   dst   - output row of half floats; written 16 bytes at a time.
//   float - unnamed and unused; kept so the signature matches
//           HalfFloatRow_NEON.
//   width - number of elements to convert.
//
// The loop consumes 8 elements per iteration, always executes at least once
// and has no remainder handling, so width is expected to be a positive
// multiple of 8.
// NOTE(review): MEMACCESS is a project macro defined elsewhere; it is assumed
// here to annotate the memory operand that follows it.
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts, src += 16 bytes
"subs %w2, %w2, #8 \n" // 8 pixels per loop; sets flags for b.gt
"uxtl v2.4s, v1.4h \n" // zero-extend low 4 shorts to 32-bit ints
"uxtl2 v1.4s, v1.8h \n" // zero-extend high 4 shorts (overwrites v1)
"scvtf v2.4s, v2.4s \n" // 8 ints -> 8 single-precision floats
"scvtf v1.4s, v1.4s \n"
"fcvtn v4.4h, v2.4s \n" // narrow to 8 half floats (low half)
"fcvtn2 v4.8h, v1.4s \n" // narrow into the high half of v4
MEMACCESS(1)
"st1 {v4.16b}, [%1], #16 \n" // store 8 shorts, dst += 16 bytes
"b.gt 1b \n" // loop while remaining width > 0
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v4"
);
}
// Converts a row of unsigned 16-bit integers to 16-bit half floats, applying
// a caller-supplied scale, using AArch64 NEON.
//
//   src   - input row of 16-bit values; read 16 bytes at a time.
//   dst   - output row of half floats; written 16 bytes at a time.
//   scale - multiplier applied to every element before conversion.
//   width - number of elements to convert.
//
// The loop consumes 8 elements per iteration, always executes at least once
// and has no remainder handling, so width is expected to be a positive
// multiple of 8.
//
// The value is multiplied by scale * 1.9259299444e-34f and the resulting float
// bit pattern is shifted right by 13 with unsigned saturation (no rounding) to
// form the 16-bit result.
// NOTE(review): 1.9259299444e-34f appears to be 2^-112, i.e. the difference
// between the float (127) and half (15) exponent biases, and 13 the
// difference in mantissa widths (23 - 10) - confirm.
// NOTE(review): no dispatch code visible in this change references
// HalfFloatRow_NEON2; it looks like an alternate copy of HalfFloatRow_NEON -
// confirm whether it is still needed.
void HalfFloatRow_NEON2(const uint16* src, uint16* dst, float scale, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts, src += 16 bytes
"subs %w2, %w2, #8 \n" // 8 pixels per loop; sets flags for b.gt
"uxtl v2.4s, v1.4h \n" // zero-extend low 4 shorts to 32-bit ints
"uxtl2 v1.4s, v1.8h \n" // zero-extend high 4 shorts (overwrites v1)
"scvtf v2.4s, v2.4s \n" // 8 ints -> 8 single-precision floats
"scvtf v1.4s, v1.4s \n"
"fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent (and apply scale)
"fmul v1.4s, v1.4s, %3.s[0] \n"
"uqshrn v4.4h, v2.4s, #13 \n" // isolate halffloat: shift float bits right, narrow
"uqshrn2 v4.8h, v1.4s, #13 \n" // same for the high half of v4
MEMACCESS(1)
"st1 {v4.16b}, [%1], #16 \n" // store 8 shorts, dst += 16 bytes
"b.gt 1b \n" // loop while remaining width > 0
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale * 1.9259299444e-34f) // %3
: "cc", "memory", "v1", "v2", "v4"
);
}
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
asm volatile ( asm volatile (
"1: \n" "1: \n"
......
...@@ -2084,17 +2084,22 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) { ...@@ -2084,17 +2084,22 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
int TestHalfFloatPlane(int benchmark_width, int benchmark_height, int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
int benchmark_iterations, int benchmark_iterations,
int disable_cpu_flags, int benchmark_cpu_info, int disable_cpu_flags, int benchmark_cpu_info,
float scale) { float scale, int mask) {
int i, j; int i, j;
const int y_plane_size = benchmark_width * benchmark_height * 2; const int y_plane_size = benchmark_width * benchmark_height * 2;
align_buffer_page_end(orig_y, y_plane_size); align_buffer_page_end(orig_y, y_plane_size * 3);
align_buffer_page_end(dst_c, y_plane_size); uint8* dst_opt = orig_y + y_plane_size;
align_buffer_page_end(dst_opt, y_plane_size); uint8* dst_c = orig_y + y_plane_size * 2;
MemRandomize(orig_y, y_plane_size); MemRandomize(orig_y, y_plane_size);
memset(dst_c, 0, y_plane_size); memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size); memset(dst_opt, 1, y_plane_size);
for (i = 0; i < y_plane_size / 2; ++i) {
reinterpret_cast<uint16*>(orig_y)[i] = static_cast<uint16>(i & mask);
}
// Disable all optimizations. // Disable all optimizations.
MaskCpuFlags(disable_cpu_flags); MaskCpuFlags(disable_cpu_flags);
double c_time = get_time(); double c_time = get_time();
...@@ -2122,38 +2127,62 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height, ...@@ -2122,38 +2127,62 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
} }
free_aligned_buffer_page_end(orig_y); free_aligned_buffer_page_end(orig_y);
free_aligned_buffer_page_end(dst_c);
free_aligned_buffer_page_end(dst_opt);
return diff; return diff;
} }
// 5 bit exponent with bias of 15 will underflow to a denormal if scale causes // 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
// exponent to be less than 0. 15 - log2(65536) = -1. This shouldn't normally // exponent to be less than 0. 15 - log2(65536) = -1. This shouldn't normally
// happen since scale is 1/(1<<bits) where bits is 9, 10 or 12. // happen since scale is 1/(1<<bits) where bits is 9, 10 or 12.
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_denormal) { #define MAXHALFDIFF 0
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
  // Scale of 1/65536 with the input mask set to 65535; the difference reported
  // by TestHalfFloatPlane must not exceed MAXHALFDIFF.
  const float kScale = 1.0f / 65536.0f;
  const int kMask = 65535;
  int diff = TestHalfFloatPlane(
      benchmark_width_, benchmark_height_, benchmark_iterations_,
      disable_cpu_flags_, benchmark_cpu_info_, kScale, kMask);
  EXPECT_LE(diff, MAXHALFDIFF);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
  // Scale of 1/1024 with the input mask set to 1023; the difference reported
  // by TestHalfFloatPlane must not exceed MAXHALFDIFF.
  const float kScale = 1.0f / 1024.0f;
  const int kMask = 1023;
  int diff = TestHalfFloatPlane(
      benchmark_width_, benchmark_height_, benchmark_iterations_,
      disable_cpu_flags_, benchmark_cpu_info_, kScale, kMask);
  EXPECT_LE(diff, MAXHALFDIFF);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, disable_cpu_flags_, benchmark_cpu_info_,
1.0f / 65536.0f); 1.0f / 512.0f, 511);
EXPECT_EQ(diff, 0); EXPECT_LE(diff, MAXHALFDIFF);
} }
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, disable_cpu_flags_, benchmark_cpu_info_,
1.0f / 4096.0f); 1.0f / 4096.0f, 4095);
EXPECT_EQ(diff, 0); EXPECT_LE(diff, MAXHALFDIFF);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_One) {
  // Scale of exactly 1.0 with the input mask set to 4095; the difference
  // reported by TestHalfFloatPlane must not exceed MAXHALFDIFF.
  const float kScale = 1.0f;
  const int kMask = 4095;
  int diff = TestHalfFloatPlane(
      benchmark_width_, benchmark_height_, benchmark_iterations_,
      disable_cpu_flags_, benchmark_cpu_info_, kScale, kMask);
  EXPECT_LE(diff, MAXHALFDIFF);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, disable_cpu_flags_, benchmark_cpu_info_,
1.0f / 1023.0f); 1.0f / 4095.0f, 4095);
EXPECT_EQ(diff, 0); EXPECT_LE(diff, MAXHALFDIFF);
} }
TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) { TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
SIMD_ALIGNED(uint8 orig_pixels[1280][4]); SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]); SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment