Commit 1e16cb5c authored by Frank Barchard

SplitRGBPlane and MergeRGBPlane functions added

Converts packed RGB to planar and back.

TBR=kjellander@chromium.org
BUG=libyuv:728
TEST=MergeRGBPlane_Opt and SplitRGBPlane_Opt unittests added

Change-Id: Ida59af940afcb1fc4a48bbf62c714f592665c3cc
Reviewed-on: https://chromium-review.googlesource.com/658069
Reviewed-by: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent 367c0d8f
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1667
+Version: 1668
License: BSD
License File: LICENSE
@@ -69,6 +69,32 @@ void MergeUVPlane(const uint8* src_u,
int width,
int height);
// Split interleaved RGB plane into separate R, G and B planes.
LIBYUV_API
void SplitRGBPlane(const uint8* src_rgb,
int src_stride_rgb,
uint8* dst_r,
int dst_stride_r,
uint8* dst_g,
int dst_stride_g,
uint8* dst_b,
int dst_stride_b,
int width,
int height);
// Merge separate R, G and B planes into one interleaved RGB plane.
LIBYUV_API
void MergeRGBPlane(const uint8* src_r,
int src_stride_r,
const uint8* src_g,
int src_stride_g,
const uint8* src_b,
int src_stride_b,
uint8* dst_rgb,
int dst_stride_rgb,
int width,
int height);
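For orientation, here is a minimal round-trip sketch of the two new entry points. The buffer handling and the function name RoundTripRGB24 are illustrative, not part of the commit; uint8 is libyuv's byte typedef.

#include <stdlib.h>  // malloc/free for the illustrative buffers

// Split a tightly packed RGB24 image into planes, then re-interleave it.
void RoundTripRGB24(const uint8* rgb, int width, int height) {
  uint8* r = (uint8*)malloc(width * height);
  uint8* g = (uint8*)malloc(width * height);
  uint8* b = (uint8*)malloc(width * height);
  uint8* out = (uint8*)malloc(width * height * 3);
  SplitRGBPlane(rgb, width * 3, r, width, g, width, b, width, width, height);
  MergeRGBPlane(r, width, g, width, b, width, out, width * 3, width, height);
  // out should now match rgb byte-for-byte.
  free(r);
  free(g);
  free(b);
  free(out);
}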
// Copy I400. Supports inverting.
LIBYUV_API
int I400ToI400(const uint8* src_y,
@@ -271,6 +271,14 @@ extern "C" {
#define HAS_I422TOARGBROW_SSSE3
#endif
// The following are available for gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_MERGERGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#endif
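These defines are only the compile-time half of the gate: the plane-level functions added in this change still check CPU features at run time before installing a SIMD row function. A condensed sketch of that two-level dispatch, taken from the SplitRGBPlane body later in this commit:

#if defined(HAS_SPLITRGBROW_SSSE3)        // compile-time: toolchain/arch
  if (TestCpuFlag(kCpuHasSSSE3)) {        // run-time: CPU reports SSSE3
    SplitRGBRow = SplitRGBRow_Any_SSSE3;  // safe for any width
    if (IS_ALIGNED(width, 16)) {
      SplitRGBRow = SplitRGBRow_SSSE3;    // fast path, width a multiple of 16
    }
  }
#endif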
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -330,6 +338,7 @@ extern "C" {
#define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITRGBROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
@@ -1462,6 +1471,58 @@ void MergeUVRow_Any_MSA(const uint8* src_u,
uint8* dst_uv,
int width);
void SplitRGBRow_C(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void SplitRGBRow_SSSE3(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void SplitRGBRow_NEON(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void SplitRGBRow_Any_SSSE3(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void SplitRGBRow_Any_NEON(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width);
void MergeRGBRow_C(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeRGBRow_SSSE3(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeRGBRow_NEON(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeRGBRow_Any_SSSE3(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void MergeRGBRow_Any_NEON(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1667
+#define LIBYUV_VERSION 1668
#endif // INCLUDE_LIBYUV_VERSION_H_
@@ -407,6 +407,122 @@ void MergeUVPlane(const uint8* src_u,
}
}
// Split an interleaved RGB plane into separate R, G and B planes.
// Width and height are in pixels of the source RGB plane.
LIBYUV_API
void SplitRGBPlane(const uint8* src_rgb,
int src_stride_rgb,
uint8* dst_r,
int dst_stride_r,
uint8* dst_g,
int dst_stride_g,
uint8* dst_b,
int dst_stride_b,
int width,
int height) {
int y;
void (*SplitRGBRow)(const uint8* src_rgb, uint8* dst_r, uint8* dst_g,
uint8* dst_b, int width) = SplitRGBRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_r = dst_r + (height - 1) * dst_stride_r;
dst_g = dst_g + (height - 1) * dst_stride_g;
dst_b = dst_b + (height - 1) * dst_stride_b;
dst_stride_r = -dst_stride_r;
dst_stride_g = -dst_stride_g;
dst_stride_b = -dst_stride_b;
}
// Coalesce rows.
if (src_stride_rgb == width * 3 && dst_stride_r == width &&
dst_stride_g == width && dst_stride_b == width) {
width *= height;
height = 1;
src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
}
#if defined(HAS_SPLITRGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
SplitRGBRow = SplitRGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
SplitRGBRow = SplitRGBRow_SSSE3;
}
}
#endif
#if defined(HAS_SPLITRGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitRGBRow = SplitRGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitRGBRow = SplitRGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
// Split a row of RGB into rows of R, G and B.
SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width);
dst_r += dst_stride_r;
dst_g += dst_stride_g;
dst_b += dst_stride_b;
src_rgb += src_stride_rgb;
}
}
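A quick worked example of the coalesce step above, with illustrative numbers: a tightly packed 64x4 image has src_stride_rgb equal to 64 * 3 and every destination stride equal to 64, so the rows collapse into one long row.

// width = 64, height = 4, src_stride_rgb = 192, all dst strides = 64.
// Every stride matches its row width, so the buffers are contiguous:
//   width *= height;   // width becomes 256
//   height = 1;        // the y loop runs once
//   src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
// SplitRGBRow is then called once on 256 pixels instead of 4 times on 64.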
LIBYUV_API
void MergeRGBPlane(const uint8* src_r,
int src_stride_r,
const uint8* src_g,
int src_stride_g,
const uint8* src_b,
int src_stride_b,
uint8* dst_rgb,
int dst_stride_rgb,
int width,
int height) {
int y;
void (*MergeRGBRow)(const uint8* src_r, const uint8* src_g,
const uint8* src_b, uint8* dst_rgb, int width) =
MergeRGBRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
dst_stride_rgb = -dst_stride_rgb;
}
// Coalesce rows.
if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
dst_stride_rgb == width * 3) {
width *= height;
height = 1;
src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0;
}
#if defined(HAS_MERGERGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MergeRGBRow = MergeRGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
MergeRGBRow = MergeRGBRow_SSSE3;
}
}
#endif
#if defined(HAS_MERGERGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeRGBRow = MergeRGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MergeRGBRow = MergeRGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
// Merge a row of R, G and B into a row of RGB.
MergeRGBRow(src_r, src_g, src_b, dst_rgb, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
dst_rgb += dst_stride_rgb;
}
}
// Mirror a plane of data.
void MirrorPlane(const uint8* src_y,
int src_stride_y,
@@ -84,6 +84,14 @@ ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
// Merge functions.
#ifdef HAS_MERGERGBROW_SSSE3
ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
#endif
#ifdef HAS_MERGERGBROW_NEON
ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
#endif
#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
@@ -943,6 +951,31 @@ ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
#endif
#undef ANY12
// Any 1 to 3. Outputs RGB planes.
#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_r, uint8* dst_g, uint8* dst_b, \
int width) { \
SIMD_ALIGNED(uint8 temp[16 * 6]); \
memset(temp, 0, 16 * 3); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \
} \
memcpy(temp, src_ptr + n * BPP, r * BPP); \
ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \
memcpy(dst_r + n, temp + 16 * 3, r); \
memcpy(dst_g + n, temp + 16 * 4, r); \
memcpy(dst_b + n, temp + 16 * 5, r); \
}
#ifdef HAS_SPLITRGBROW_SSSE3
ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
#endif
#ifdef HAS_SPLITRGBROW_NEON
ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#endif
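The masking arithmetic in ANY13 is easiest to follow with concrete numbers (illustrative):

// With MASK = 15 (16-pixel SIMD rows) and width = 100:
//   n = width & ~MASK = 96   // pixels handed straight to ANY_SIMD
//   r = width & MASK  = 4    // leftover pixels bounced through temp
// The tail call still runs ANY_SIMD on MASK + 1 = 16 pixels of the zeroed,
// aligned temp buffer, then only the first r = 4 bytes of each output plane
// are copied back, so the SIMD kernel never touches memory past the
// caller's buffers.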
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
@@ -1770,6 +1770,34 @@ void MergeUVRow_C(const uint8* src_u,
}
}
void SplitRGBRow_C(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_r[x] = src_rgb[0];
dst_g[x] = src_rgb[1];
dst_b[x] = src_rgb[2];
src_rgb += 3;
}
}
void MergeRGBRow_C(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_rgb[0] = src_r[x];
dst_rgb[1] = src_g[x];
dst_rgb[2] = src_b[x];
dst_rgb += 3;
}
}
void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count);
}
@@ -38,9 +38,8 @@ static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
127, -84, -43, 0, 127, -84, -43, 0};
-static vec8 kARGBToV = {
-    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
+static vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
+                        -18, -94, 112, 0, -18, -94, 112, 0};
static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
-20, -107, 127, 0, -20, -107, 127, 0};
@@ -2754,6 +2753,199 @@ void MergeUVRow_SSE2(const uint8* src_u,
}
#endif // HAS_MERGEUVROW_SSE2
#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
static uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
2u, 5u, 8u, 11u, 14u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 1u,
4u, 7u, 10u, 13u};
static uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
3u, 6u, 9u, 12u, 15u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 2u,
5u, 8u, 11u, 14u};
static uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
4u, 7u, 10u, 13u, 128u, 128u,
128u, 128u, 128u, 128u};
static uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 0u, 3u,
6u, 9u, 12u, 15u};
void SplitRGBRow_SSSE3(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width) {
asm volatile (
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"pshufb %5, %%xmm0 \n"
"pshufb %6, %%xmm1 \n"
"pshufb %7, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"pshufb %8, %%xmm0 \n"
"pshufb %9, %%xmm1 \n"
"pshufb %10, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"pshufb %11, %%xmm0 \n"
"pshufb %12, %%xmm1 \n"
"pshufb %13, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(3) " \n"
"lea " MEMLEA(0x10,3) ",%3 \n"
"lea " MEMLEA(0x30,0) ",%0 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskRGBToR0), // %5
"m"(kShuffleMaskRGBToR1), // %6
"m"(kShuffleMaskRGBToR2), // %7
"m"(kShuffleMaskRGBToG0), // %8
"m"(kShuffleMaskRGBToG1), // %9
"m"(kShuffleMaskRGBToG2), // %10
"m"(kShuffleMaskRGBToB0), // %11
"m"(kShuffleMaskRGBToB1), // %12
"m"(kShuffleMaskRGBToB2) // %13
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2"
);
}
#endif // HAS_SPLITRGBROW_SSSE3
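The tables above lean on the pshufb convention that an index byte with its high bit set (128 here) produces zero in that lane; each output plane is then three shuffles OR'd together. A scalar model of one 16-byte R-plane round, under that assumption:

// Scalar model of the pshufb/por sequence for the R plane (illustrative).
static void PshufbModel(const uint8* in, const uint8* mask, uint8* out) {
  for (int i = 0; i < 16; ++i) {
    out[i] = (mask[i] & 0x80) ? 0 : in[mask[i]];  // 128u lanes become 0
  }
}
// One R-plane round ORs three shuffles of the 48 input bytes:
//   r0 = shuffle(src_rgb +  0, kShuffleMaskRGBToR0)  -> R of pixels 0-5
//   r1 = shuffle(src_rgb + 16, kShuffleMaskRGBToR1)  -> R of pixels 6-10
//   r2 = shuffle(src_rgb + 32, kShuffleMaskRGBToR2)  -> R of pixels 11-15
//   dst_r[0..15] = r0 | r1 | r2 bytewise, i.e. the two por instructions.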
#ifdef HAS_MERGERGBROW_SSSE3
// Shuffle table for converting Planar to RGB.
static uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
2u, 128u, 128u, 3u, 128u, 128u,
4u, 128u, 128u, 5u};
static uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
128u, 2u, 128u, 128u, 3u, 128u,
128u, 4u, 128u, 128u};
static uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
128u, 128u, 2u, 128u, 128u, 3u,
128u, 128u, 4u, 128u};
static uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
7u, 128u, 128u, 8u, 128u, 128u,
9u, 128u, 128u, 10u};
static uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
128u, 7u, 128u, 128u, 8u, 128u,
128u, 9u, 128u, 128u};
static uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
128u, 128u, 8u, 128u, 128u, 9u,
128u, 128u, 10u, 128u};
static uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
12u, 128u, 128u, 13u, 128u, 128u,
14u, 128u, 128u, 15u};
static uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
128u, 13u, 128u, 128u, 14u, 128u,
128u, 15u, 128u, 128u};
static uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
128u, 128u, 13u, 128u, 128u, 14u,
128u, 128u, 15u, 128u};
void MergeRGBRow_SSSE3(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width) {
asm volatile (
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
"pshufb %5, %%xmm0 \n"
"pshufb %6, %%xmm1 \n"
"pshufb %7, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(3) " \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
"pshufb %8, %%xmm0 \n"
"pshufb %9, %%xmm1 \n"
"pshufb %10, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS2(16, 3) " \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
"pshufb %11, %%xmm0 \n"
"pshufb %12, %%xmm1 \n"
"pshufb %13, %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS2(32, 3) " \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"lea " MEMLEA(0x30,3) ",%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
: "m"(kShuffleMaskRToRGB0), // %5
"m"(kShuffleMaskGToRGB0), // %6
"m"(kShuffleMaskBToRGB0), // %7
"m"(kShuffleMaskRToRGB1), // %8
"m"(kShuffleMaskGToRGB1), // %9
"m"(kShuffleMaskBToRGB1), // %10
"m"(kShuffleMaskRToRGB2), // %11
"m"(kShuffleMaskGToRGB2), // %12
"m"(kShuffleMaskBToRGB2) // %13
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2"
);
}
#endif // HAS_MERGERGBROW_SSSE3
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
asm volatile (
@@ -526,7 +526,7 @@ void MergeUVRow_NEON(const uint8* src_u,
"vld1.8 {q0}, [%0]! \n" // load U
"vld1.8 {q1}, [%1]! \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
"vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
"vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
"bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@@ -537,6 +537,56 @@
);
}
// Reads 16 packed RGB pixels and writes to planar dst_r, dst_g and dst_b.
void SplitRGBRow_NEON(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width) {
asm volatile(
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
"vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
"subs %4, %4, #16 \n" // 16 processed per loop
"vst1.8 {q0}, [%1]! \n" // store R
"vst1.8 {q1}, [%2]! \n" // store G
"vst1.8 {q2}, [%3]! \n" // store B
"bgt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "d0", "d1", "d2" // Clobber List
);
}
// Reads 16 planar R, G and B values and writes 16 packed RGB pixels at a time.
void MergeRGBRow_NEON(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width) {
asm volatile(
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load R
"vld1.8 {q1}, [%1]! \n" // load G
"vld1.8 {q2}, [%2]! \n" // load B
"subs %4, %4, #16 \n" // 16 processed per loop
"vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
"vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "q0", "q1", "q2" // Clobber List
);
}
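The d-register pairing in these two loops is the non-obvious detail: on AArch32, q0 aliases d0:d1, q1 aliases d2:d3 and q2 aliases d4:d5, so the two vld3.8 loads de-interleave 16 pixels into whole q registers:

// Register layout sketch for SplitRGBRow_NEON (illustrative):
//   vld3.8 {d0, d2, d4}  -> pixels 0-7 : R in d0, G in d2, B in d4
//   vld3.8 {d1, d3, d5}  -> pixels 8-15: R in d1, G in d3, B in d5
// q0 = d0:d1 now holds 16 R bytes, q1 = d2:d3 the 16 G bytes and
// q2 = d4:d5 the 16 B bytes, ready for the three vst1.8 stores;
// MergeRGBRow_NEON runs the same pairing in reverse with vst3.8.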
// Copy multiples of 32. vld4.8 allows unaligned access and is fastest on A15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile(
@@ -580,6 +580,54 @@ void MergeUVRow_NEON(const uint8* src_u,
);
}
// Reads 16 packed RGB pixels and writes to planar dst_r, dst_g and dst_b.
void SplitRGBRow_NEON(const uint8* src_rgb,
uint8* dst_r,
uint8* dst_g,
uint8* dst_b,
int width) {
asm volatile(
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%1], #16 \n" // store R
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%3], #16 \n" // store B
"b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
// Reads 16 planar R, G and B values and writes 16 packed RGB pixels at a time.
void MergeRGBRow_NEON(const uint8* src_r,
const uint8* src_g,
const uint8* src_b,
uint8* dst_rgb,
int width) {
asm volatile(
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load R
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v2.16b}, [%2], #16 \n" // load B
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
// Copy multiples of 32. vld4.8 allows unaligned access and is fastest on A15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile(
@@ -1054,7 +1054,7 @@ void ScaleRowUp2_16_NEON(const uint16* src_ptr,
"+r"(dst_width) // %3
: "r"(2LL), // %4
"r"(14LL) // %5
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
"v19" // Clobber List
);
}
@@ -2521,6 +2521,101 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
align_buffer_page_end(tmp_pixels_b, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 3);
align_buffer_page_end(dst_pixels_c, kPixels * 3);
MemRandomize(src_pixels, kPixels * 3);
MemRandomize(tmp_pixels_r, kPixels);
MemRandomize(tmp_pixels_g, kPixels);
MemRandomize(tmp_pixels_b, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 3);
MemRandomize(dst_pixels_c, kPixels * 3);
MaskCpuFlags(disable_cpu_flags_);
SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, benchmark_width_, benchmark_height_);
MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, dst_pixels_c,
benchmark_width_ * 3, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, benchmark_width_, benchmark_height_);
for (int i = 0; i < benchmark_iterations_; ++i) {
MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g,
benchmark_width_, tmp_pixels_b, benchmark_width_,
dst_pixels_opt, benchmark_width_ * 3, benchmark_width_,
benchmark_height_);
}
for (int i = 0; i < kPixels * 3; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_r);
free_aligned_buffer_page_end(tmp_pixels_g);
free_aligned_buffer_page_end(tmp_pixels_b);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
align_buffer_page_end(tmp_pixels_b, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 3);
align_buffer_page_end(dst_pixels_c, kPixels * 3);
MemRandomize(src_pixels, kPixels * 3);
MemRandomize(tmp_pixels_r, kPixels);
MemRandomize(tmp_pixels_g, kPixels);
MemRandomize(tmp_pixels_b, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 3);
MemRandomize(dst_pixels_c, kPixels * 3);
MaskCpuFlags(disable_cpu_flags_);
SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, benchmark_width_, benchmark_height_);
MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, dst_pixels_c,
benchmark_width_ * 3, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, benchmark_width_,
benchmark_height_);
}
MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, dst_pixels_opt,
benchmark_width_ * 3, benchmark_width_, benchmark_height_);
for (int i = 0; i < kPixels * 3; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_r);
free_aligned_buffer_page_end(tmp_pixels_g);
free_aligned_buffer_page_end(tmp_pixels_b);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
float TestScaleMaxSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
......