Commit c56a55fc authored by fbarchard@google.com

Sobel and SobelXY Neon port. Improved Bayer - did 8 at time version, and…

Sobel and SobelXY NEON port. Improved Bayer: added an 8-pixels-at-a-time version and a specialized G-channel version.
BUG=201
TEST=libyuvTest.TestSobel and libyuvTest.TestSobelXY
Review URL: https://webrtc-codereview.appspot.com/1279006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@642 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 9b4c00b9
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 641
Version: 642
License: BSD
License File: LICENSE
......
......@@ -197,6 +197,7 @@ extern "C" {
#define HAS_ARGBTOARGB1555ROW_NEON
#define HAS_ARGBTOARGB4444ROW_NEON
#define HAS_ARGBTOBAYERROW_NEON
#define HAS_ARGBTOBAYERGGROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORGB565ROW_NEON
......@@ -269,6 +270,10 @@ extern "C" {
#define HAS_ARGBSEPIAROW_NEON
#define HAS_ARGBSHADEROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON
#define HAS_SOBELROW_NEON
#define HAS_SOBELXYROW_NEON
#define HAS_SOBELXROW_NEON
#define HAS_SOBELYROW_NEON
#endif
// The following are available on Mips platforms
......@@ -1315,16 +1320,18 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix);
// Bayer row functions. Each takes a source ARGB row, a destination row, a
// 32-bit selector and a pixel count.
// NOTE(review): the first five declarations below are repeated further down
// with different line wrapping; this looks like the before/after of a
// reformat shown together - confirm against the repository.
void ARGBToBayerRow_C(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix);
void ARGBToBayerRow_SSSE3(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix);
void ARGBToBayerRow_NEON(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix);
void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix);
void ARGBToBayerRow_Any_NEON(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix);
// Portable C version.
void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix);
// SIMD versions. Callers in this change pick the NEON version only when the
// width is a multiple of 8.
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix);
void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix);
// "Any" versions. Callers in this change pick the NEON one when width >= 8
// but is not a multiple of 8.
void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix);
void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix);
// Same parameter list, but the selector parameter is left unnamed.
// NOTE(review): presumably the specialized G-channel version mentioned in the
// commit message; its definition is not visible here - confirm.
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /* selector */, int pix);
void I422ToYUY2Row_C(const uint8* src_y,
const uint8* src_u,
......@@ -1459,18 +1466,26 @@ void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
uint8* dst_sobelx, int width);
// SobelX row functions: read three source rows (src_y0, src_y1, src_y2) and
// write one row of 8-bit results to dst_sobelx.
void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width);
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width);
// SobelY row functions: read two source rows (src_y0, src_y1) and write one
// row of 8-bit results to dst_sobely.
void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width);
void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width);
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width);
// Sobel row functions: combine a Sobel X row and a Sobel Y row into ARGB.
void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
// SobelXY row functions: same inputs as the Sobel row functions, ARGB output.
// NOTE(review): the NEON version puts X, Y and their sum in separate
// channels; the C and SSE2 versions are not shown here - presumably the same.
void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
#ifdef __cplusplus
} // extern "C"
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 641
#define LIBYUV_VERSION 642
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -80,9 +80,9 @@ int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
}
}
#elif defined(HAS_ARGBTOBAYERROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_NEON;
}
}
......@@ -437,9 +437,9 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
}
}
#elif defined(HAS_ARGBTOBAYERROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_NEON;
}
}
......
......@@ -1769,9 +1769,9 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
}
}
#elif defined(HAS_ARGBTOBAYERROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_NEON;
}
}
......@@ -1782,6 +1782,11 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3)) {
SobelYRow = SobelYRow_SSSE3;
}
#endif
#if defined(HAS_SOBELYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SobelYRow = SobelYRow_NEON;
}
#endif
void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobely, int width) =
......@@ -1790,6 +1795,11 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3)) {
SobelXRow = SobelXRow_SSSE3;
}
#endif
#if defined(HAS_SOBELXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SobelXRow = SobelXRow_NEON;
}
#endif
void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelRow_C;
......@@ -1799,6 +1809,11 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
SobelRow = SobelRow_SSE2;
}
#endif
#if defined(HAS_SOBELROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
SobelRow = SobelRow_NEON;
}
#endif
const int kEdge = 16; // Extra pixels at start of row for extrude/align.
SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]);
......@@ -1868,9 +1883,9 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
}
}
#elif defined(HAS_ARGBTOBAYERROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_NEON;
}
}
......@@ -1881,6 +1896,11 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3)) {
SobelYRow = SobelYRow_SSSE3;
}
#endif
#if defined(HAS_SOBELYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SobelYRow = SobelYRow_NEON;
}
#endif
void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobely, int width) =
......@@ -1889,6 +1909,11 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3)) {
SobelXRow = SobelXRow_SSSE3;
}
#endif
#if defined(HAS_SOBELXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SobelXRow = SobelXRow_NEON;
}
#endif
void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelXYRow_C;
......@@ -1898,6 +1923,11 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
SobelXYRow = SobelXYRow_SSE2;
}
#endif
#if defined(HAS_SOBELXYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
SobelXYRow = SobelXYRow_NEON;
}
#endif
const int kEdge = 16; // Extra pixels at start of row for extrude/align.
SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]);
......
......@@ -190,7 +190,7 @@ BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
#endif
#if defined(HAS_ARGBTOBAYERROW_NEON)
BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
3, 4, 1)
7, 4, 1)
#endif
#undef BAYERANY
......
......@@ -1176,18 +1176,20 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
// Picks output bytes from a row of ARGB pixels with a vtbl table lookup whose
// index bytes come from the 32-bit selector.
// NOTE(review): this listing appears to show the removed 4-pixel-per-loop
// lines and the added 8-pixel-per-loop lines of the change together (two
// selector moves, two loop bodies, two clobber lists). Confirm against the
// repository before treating it as a single compilable body.
void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
asm volatile (
// Selector goes into the low 32 bits of the index register
// (d2 in the 4-pixel lines, d6 in the 8-pixel lines).
"vmov.u32 d2[0], %3 \n" // selector
"vmov.u32 d6[0], %3 \n" // selector
"1: \n"
// 4-pixel variant: 16 source bytes in, 4 bytes out.
"vld1.u8 {q0}, [%0]! \n" // load row 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop
"vtbl.8 d3, {d0, d1}, d2 \n" // look up 4 pixels
"vst1.u32 {d3[0]}, [%1]! \n" // store 4.
// 8-pixel variant: 32 source bytes in, 8 bytes out.
"vld1.u8 {q0, q1}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
"vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
"vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
// Only the low 32 bits of d4 and d5 hold selected bytes; vtrn.u32 moves the
// low word of d5 into the high word of d4.
"vtrn.u32 d4, d5 \n" // combine 8 pixels
"vst1.u8 {d4}, [%1]! \n" // store 8.
// Loop while the remaining count is still positive.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
: "r"(selector) // %3
: "cc", "memory", "q0", "q1" // Clobber List
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
......@@ -2595,6 +2597,134 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
);
}
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
//
// Each loop iteration reads 8 bytes from src_sobelx and 8 from src_sobely and
// writes 8 four-byte pixels (32 bytes) to dst_argb. The counter is reduced by
// 8 and tested only at the bottom of the loop, so at least 8 pixels are
// always processed; width is expected to be a positive multiple of 8.
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
asm volatile (
"vmov.u8 d3, #255 \n" // alpha, set once outside the loop
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx, advance pointer by 8.
"vld1.8 {d1}, [%1]! \n" // load 8 sobely, advance pointer by 8.
"subs %3, %3, #8 \n" // 8 processed per loop; sets flags for bgt.
"vqadd.u8 d0, d0, d1 \n" // saturating add: sums above 255 clamp to 255
"vmov.u8 d1, d0 \n" // replicate the sum into the second channel
"vmov.u8 d2, d0 \n" // replicate the sum into the third channel
// vst4 interleaves d0, d1, d2, d3: each pixel is sum, sum, sum, 255.
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" // loop while width is still > 0
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
: "cc", "memory", "q0", "q1" // d0-d3 are the halves of q0 and q1
);
}
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
//
// Each loop iteration reads 8 bytes from src_sobelx and 8 from src_sobely and
// writes 8 four-byte pixels (32 bytes) to dst_argb. The counter is reduced by
// 8 and tested only at the bottom of the loop, so at least 8 pixels are
// always processed; width is expected to be a positive multiple of 8.
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
asm volatile (
"vmov.u8 d3, #255 \n" // alpha, set once outside the loop
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx into the third channel.
"vld1.8 {d0}, [%1]! \n" // load 8 sobely into the first channel.
"subs %3, %3, #8 \n" // 8 processed per loop; sets flags for bgt.
"vqadd.u8 d1, d0, d2 \n" // saturating add into the second channel
// vst4 interleaves d0, d1, d2, d3: each pixel is sobely, sum, sobelx, 255.
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n" // loop while width is still > 0
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
: "cc", "memory", "q0", "q1" // d0-d3 are the halves of q0 and q1
);
}
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
//
// For each of the three rows this computes src[i] - src[i + 2], adds the
// center-row difference twice, takes the absolute value and clamps to 255.
// The subtraction order is the reverse of the matrix signs; the absolute
// value makes the result the same.
// Each loop iteration produces 8 outputs. The second load of every row starts
// 2 bytes in, so each row is read up to 2 bytes past the 8-pixel group.
// At least 8 pixels are always processed because the counter is tested only
// at the bottom of the loop.
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width) {
asm volatile (
".p2align 2 \n"
"1: \n"
// Each row pointer is post-incremented by 2 and then by 6: net 8 per loop.
"vld1.u8 {d0}, [%0],%5 \n" // top
"vld1.u8 {d1}, [%0],%6 \n" // top, 2 columns to the right
"vsubl.u8 q0, d0, d1 \n" // widen to 16 bit and subtract
"vld1.u8 {d2}, [%1],%5 \n" // center * 2
"vld1.u8 {d3}, [%1],%6 \n" // center, 2 columns to the right
"vsubl.u8 q1, d2, d3 \n" // widen to 16 bit and subtract
"vadd.s16 q0, q0, q1 \n" // add center difference once
"vadd.s16 q0, q0, q1 \n" // add it again for the weight of 2
"vld1.u8 {d2}, [%2],%5 \n" // bottom
"vld1.u8 {d3}, [%2],%6 \n" // bottom, 2 columns to the right
"subs %4, %4, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n" // widen to 16 bit and subtract
"vadd.s16 q0, q0, q1 \n" // add bottom difference
"vabs.s16 q0, q0 \n" // absolute value of the signed sum
"vqmovn.u16 d0, q0 \n" // saturating narrow to 8 bit (clamp to 255)
"vst1.u8 {d0}, [%3]! \n" // store 8 sobelx
"bgt 1b \n" // loop while width is still > 0
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
"+r"(dst_sobelx), // %3
"+r"(width) // %4
: "r"(2), // %5 first post-increment: step to column i + 2
"r"(6) // %6 second post-increment: completes the advance of 8
: "cc", "memory", "q0", "q1" // Clobber List
);
}
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
//
// For columns i, i + 1 and i + 2 this computes src_y0[col] - src_y1[col],
// adds the middle-column difference twice, takes the absolute value and
// clamps to 255.
// Each loop iteration produces 8 outputs. The third load of each row starts
// 2 bytes in, so each row is read up to 2 bytes past the 8-pixel group.
// At least 8 pixels are always processed because the counter is tested only
// at the bottom of the loop.
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) {
asm volatile (
".p2align 2 \n"
"1: \n"
// Each row pointer is post-incremented by 1, 1 and then 6: net 8 per loop.
"vld1.u8 {d0}, [%0],%4 \n" // left
"vld1.u8 {d1}, [%1],%4 \n" // left, second row
"vsubl.u8 q0, d0, d1 \n" // widen to 16 bit and subtract
"vld1.u8 {d2}, [%0],%4 \n" // center * 2
"vld1.u8 {d3}, [%1],%4 \n" // center, second row
"vsubl.u8 q1, d2, d3 \n" // widen to 16 bit and subtract
"vadd.s16 q0, q0, q1 \n" // add center difference once
"vadd.s16 q0, q0, q1 \n" // add it again for the weight of 2
"vld1.u8 {d2}, [%0],%5 \n" // right
"vld1.u8 {d3}, [%1],%5 \n" // right, second row
"subs %3, %3, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n" // widen to 16 bit and subtract
"vadd.s16 q0, q0, q1 \n" // add right difference
"vabs.s16 q0, q0 \n" // absolute value of the signed sum
"vqmovn.u16 d0, q0 \n" // saturating narrow to 8 bit (clamp to 255)
"vst1.u8 {d0}, [%2]! \n" // store 8 sobely
"bgt 1b \n" // loop while width is still > 0
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
"+r"(width) // %3
: "r"(1), // %4 post-increment used to step one column at a time
"r"(6) // %5 final post-increment: completes the advance of 8
: "cc", "memory", "q0", "q1" // Clobber List
);
}
#endif // __ARM_NEON__
#ifdef __cplusplus
......
......@@ -977,6 +977,11 @@ TEST_F(libyuvTest, TestSobelX) {
if (TestCpuFlag(kCpuHasSSSE3)) {
SobelXRow = SobelXRow_SSSE3;
}
#endif
#if defined(HAS_SOBELXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SobelXRow = SobelXRow_NEON;
}
#endif
for (int i = 0; i < benchmark_pixels_div256_; ++i) {
SobelXRow(orig_pixels_0, orig_pixels_1, orig_pixels_2,
......@@ -1009,6 +1014,11 @@ TEST_F(libyuvTest, TestSobelY) {
if (TestCpuFlag(kCpuHasSSSE3)) {
SobelYRow = SobelYRow_SSSE3;
}
#endif
#if defined(HAS_SOBELYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SobelYRow = SobelYRow_NEON;
}
#endif
for (int i = 0; i < benchmark_pixels_div256_; ++i) {
SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256);
......@@ -1048,6 +1058,11 @@ TEST_F(libyuvTest, TestSobel) {
if (TestCpuFlag(kCpuHasSSE2)) {
SobelRow = SobelRow_SSE2;
}
#endif
#if defined(HAS_SOBELROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SobelRow = SobelRow_NEON;
}
#endif
for (int i = 0; i < benchmark_pixels_div256_; ++i) {
SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
......@@ -1083,6 +1098,11 @@ TEST_F(libyuvTest, TestSobelXY) {
if (TestCpuFlag(kCpuHasSSE2)) {
SobelXYRow = SobelXYRow_SSE2;
}
#endif
#if defined(HAS_SOBELXYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SobelXYRow = SobelXYRow_NEON;
}
#endif
for (int i = 0; i < benchmark_pixels_div256_; ++i) {
SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment