Commit 22f8aad8 authored by Frank Barchard, committed by Commit Bot

RAWToRGBA for 3 channel OCR

Replace ARM64 only row function with high level function
that implements SSSE3, 32 bit Neon and C.

Compared to 2 step RAWToARGB + ARGBToRGBA on row level:
3.1x faster on ARM
6.2% faster on Intel

BUG=b/140748379

Change-Id: Ia8636d9e4fcdbe10b8c2e81610a54728e29845cd
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1860914
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
parent 98a4882d
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1737
Version: 1738
License: BSD
License File: LICENSE
......
......@@ -576,6 +576,15 @@ int RAWToARGB(const uint8_t* src_raw,
int width,
int height);
// RGB big endian (rgb in memory) to RGBA.
// src_raw / src_stride_raw: source pixels (3 bytes each) and the byte step
// from one source row to the next.
// dst_rgba / dst_stride_rgba: destination pixels (4 bytes each) and the byte
// step from one destination row to the next.
// width, height: image size in pixels. The implementation in this change
// treats a negative height as a request to invert the image.
// Returns 0 on success, or -1 when either pointer is NULL, width <= 0 or
// height == 0.
LIBYUV_API
int RAWToRGBA(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_rgba,
int dst_stride_rgba,
int width,
int height);
// RGB16 (RGBP fourcc) little endian to ARGB.
LIBYUV_API
int RGB565ToARGB(const uint8_t* src_rgb565,
......
......@@ -274,6 +274,7 @@ extern "C" {
#define HAS_I210TOARGBROW_SSSE3
#define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3
#define HAS_RAWTORGBAROW_SSSE3
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
......@@ -369,6 +370,7 @@ extern "C" {
#define HAS_NV21TORGB24ROW_NEON
#define HAS_NV21TOYUV24ROW_NEON
#define HAS_RAWTOARGBROW_NEON
#define HAS_RAWTORGBAROW_NEON
#define HAS_RAWTORGB24ROW_NEON
#define HAS_RAWTOUVROW_NEON
#define HAS_RAWTOYROW_NEON
......@@ -1941,6 +1943,7 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width);
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
......@@ -1961,6 +1964,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
......@@ -1995,6 +1999,7 @@ void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
int width);
void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width);
void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
......@@ -2014,6 +2019,9 @@ void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
......@@ -2047,6 +2055,7 @@ void RGB24ToARGBRow_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToRGBARow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr,
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1737
#define LIBYUV_VERSION 1738
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -1349,6 +1349,57 @@ int RAWToARGB(const uint8_t* src_raw,
return 0;
}
// Convert RAW to RGBA.
LIBYUV_API
int RAWToRGBA(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_rgba,
int dst_stride_rgba,
int width,
int height) {
int y;
void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) =
RAWToRGBARow_C;
if (!src_raw || !dst_rgba || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_raw = src_raw + (height - 1) * src_stride_raw;
src_stride_raw = -src_stride_raw;
}
// Coalesce rows.
if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) {
width *= height;
height = 1;
src_stride_raw = dst_stride_rgba = 0;
}
#if defined(HAS_RAWTORGBAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToRGBARow = RAWToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RAWToRGBARow = RAWToRGBARow_SSSE3;
}
}
#endif
#if defined(HAS_RAWTORGBAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToRGBARow = RAWToRGBARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RAWToRGBARow = RAWToRGBARow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
RAWToRGBARow(src_raw, dst_rgba, width);
src_raw += src_stride_raw;
dst_rgba += dst_stride_rgba;
}
return 0;
}
// Convert RGB565 to ARGB.
LIBYUV_API
int RGB565ToARGB(const uint8_t* src_rgb565,
......
......@@ -559,6 +559,9 @@ ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
#endif
#if defined(HAS_RAWTORGBAROW_SSSE3)
ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)
#endif
#if defined(HAS_RAWTORGB24ROW_SSSE3)
ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
#endif
......@@ -773,6 +776,9 @@ ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3)
#ifdef HAS_RAWTOARGBROW_NEON
ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
#endif
#ifdef HAS_RAWTORGBAROW_NEON
ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7)
#endif
#ifdef HAS_RAWTOARGBROW_MSA
ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
#endif
......
......@@ -123,6 +123,21 @@ void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
}
}
// Portable row converter: expands each 3-byte RAW pixel (R, G, B in memory)
// into a 4-byte pixel stored as A, B, G, R with A fixed at 255.
// Does nothing when width is zero or negative.
void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  int remaining = width;
  while (remaining > 0) {
    // Read the complete source pixel before writing any output byte.
    const uint8_t red = src_raw[0];
    const uint8_t green = src_raw[1];
    const uint8_t blue = src_raw[2];
    src_raw += 3;

    dst_rgba[0] = 255u;
    dst_rgba[1] = blue;
    dst_rgba[2] = green;
    dst_rgba[3] = red;
    dst_rgba += 4;

    --remaining;
  }
}
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
int x;
for (x = 0; x < width; ++x) {
......
......@@ -98,6 +98,10 @@ static const uvec8 kShuffleMaskRGB24ToARGB = {
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
// Shuffle table for converting RAW to RGBA.
// Each group of four indices produces one output pixel. The first index of a
// group (12..15) is only a placeholder: RAWToRGBARow_SSSE3 ORs 0xff into that
// byte afterwards to form alpha. The remaining three indices select the
// pixel's three source bytes in reverse order.
static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};
// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
......@@ -260,6 +264,45 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
// Same code as RAWToARGB with different shuffler and A in low bits.
// Converts 16 pixels per loop iteration: 48 source bytes are read and 64
// destination bytes are written, all with unaligned loads/stores.
// The loop body runs before width is tested, so one full iteration always
// executes; RAWToRGBA selects this function directly only when width is a
// multiple of 16.
// NOTE(review): movdqa on %3 requires kShuffleMaskRAWToRGBA to be 16-byte
// aligned - presumably guaranteed by the uvec8 type; confirm.
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones
"psrld $0x18,%%xmm5 \n" // xmm5 = 0x000000ff per dword (alpha in low byte)
"movdqa %3,%%xmm4 \n" // xmm4 = RAW to RGBA shuffle mask
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // source bytes 0..15
"movdqu 0x10(%0),%%xmm1 \n" // source bytes 16..31
"movdqu 0x20(%0),%%xmm3 \n" // source bytes 32..47
"lea 0x30(%0),%0 \n" // src_raw += 48
"movdqa %%xmm3,%%xmm2 \n"
"palignr $0x8,%%xmm1,%%xmm2 \n" // xmm2 = source bytes 24..39 (pixels 8..11)
"pshufb %%xmm4,%%xmm2 \n" // reorder bytes for pixels 8..11
"por %%xmm5,%%xmm2 \n" // force alpha byte to 0xff
"palignr $0xc,%%xmm0,%%xmm1 \n" // xmm1 = source bytes 12..27 (pixels 4..7)
"pshufb %%xmm4,%%xmm0 \n" // reorder bytes for pixels 0..3
"movdqu %%xmm2,0x20(%1) \n" // store pixels 8..11
"por %%xmm5,%%xmm0 \n" // force alpha byte to 0xff
"pshufb %%xmm4,%%xmm1 \n" // reorder bytes for pixels 4..7
"movdqu %%xmm0,(%1) \n" // store pixels 0..3
"por %%xmm5,%%xmm1 \n" // force alpha byte to 0xff
"palignr $0x4,%%xmm3,%%xmm3 \n" // rotate so source bytes 36..47 come first (pixels 12..15)
"pshufb %%xmm4,%%xmm3 \n" // reorder bytes for pixels 12..15
"movdqu %%xmm1,0x10(%1) \n" // store pixels 4..7
"por %%xmm5,%%xmm3 \n" // force alpha byte to 0xff
"movdqu %%xmm3,0x30(%1) \n" // store pixels 12..15
"lea 0x40(%1),%1 \n" // dst_rgba += 64
"sub $0x10,%2 \n" // 16 pixels processed per loop
"jg 1b \n" // repeat while width > 0
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
"+r"(width) // %2
: "m"(kShuffleMaskRAWToRGBA) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
uint8_t* dst_rgb24,
int width) {
......
......@@ -781,6 +781,22 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
);
}
// 32-bit NEON row converter: 8 pixels per loop iteration, reading 24 source
// bytes and writing 32 destination bytes. The loop body runs before width is
// tested, so one full iteration always executes; RAWToRGBA selects this
// function directly only when width is a multiple of 8.
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile(
"vmov.u8 d0, #255 \n" // Alpha
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW: d1 = R, d2 = G, d3 = B.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B so d1 = B, d3 = R.
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA as A, B, G, R.
"bgt 1b \n" // repeat while width > 0.
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
"+r"(width) // %2
:
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
);
}
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
......
......@@ -821,6 +821,24 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
);
}
// AArch64 NEON row converter: 8 pixels per loop iteration, reading 24 source
// bytes and writing 32 destination bytes. The loop body runs before width is
// tested, so one full iteration always executes; RAWToRGBA selects this
// function directly only when width is a multiple of 8.
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile(
"movi v0.8b, #255 \n" // Alpha
"1: \n"
"ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b: v3 = r, v4 = g, v5 = b
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v2.8b, v4.8b, v4.8b \n" // move g
"orr v1.8b, v5.8b, v5.8b \n" // move b (v5 is the third channel loaded)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
"b.gt 1b \n" // repeat while width > 0.
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
......
......@@ -1232,6 +1232,7 @@ TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0)
TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0)
TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
TESTATOB(RAW, 3, 3, 1, RGBA, 4, 4, 1, 0)
TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0)
TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1, 0)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment