Commit 78020389 authored by fbarchard@google.com

rotate for x86 and bayer refactored - 3x faster.

BUG=1
TEST=tested with talk unittests.
Review URL: http://webrtc-codereview.appspot.com/250004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@42 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 3f4c056b
...@@ -25,6 +25,14 @@ int I420Copy(const uint8* src_y, int src_stride_y, ...@@ -25,6 +25,14 @@ int I420Copy(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v, uint8* dst_v, int dst_stride_v,
int width, int height); int width, int height);
// Draw a rectangle into I420
// Fills a width x height rectangle starting at luma position (x, y) with the
// constant values value_y, value_u and value_v. The U and V planes are
// addressed at (x / 2, y / 2) and filled with a half-size rectangle
// (dimensions rounded up).
// A negative height makes the rows get written bottom-up.
// Returns 0 on success, or -1 if any plane pointer is NULL, width <= 0,
// height == 0, x or y is negative, or any value is outside 0..255.
int I420Rect(uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int x, int y,
int width, int height,
int value_y, int value_u, int value_v);
// Convert I422 to I420. Used by MJPG. // Convert I422 to I420. Used by MJPG.
int I422ToI420(const uint8* src_y, int src_stride_y, int I422ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
...@@ -146,7 +154,7 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, ...@@ -146,7 +154,7 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
// Convert ARGB to I400. // Convert ARGB to I400.
int ARGBToI400(const uint8* src_argb, int src_stride_argb, int ARGBToI400(const uint8* src_argb, int src_stride_argb,
const uint8* dst_y, int dst_stride_y, uint8* dst_y, int dst_stride_y,
int width, int height); int width, int height);
} // namespace libyuv } // namespace libyuv
......
...@@ -11,39 +11,41 @@ ...@@ -11,39 +11,41 @@
#ifndef INCLUDE_LIBYUV_ROTATE_H_ #ifndef INCLUDE_LIBYUV_ROTATE_H_
#define INCLUDE_LIBYUV_ROTATE_H_ #define INCLUDE_LIBYUV_ROTATE_H_
#include "basic_types.h" #include "libyuv/basic_types.h"
namespace libyuv { namespace libyuv {
// Supported rotation // Supported rotation
enum RotationMode { enum RotationMode {
kRotate0 = 0, // No rotation
kRotate90 = 90, // Rotate 90 degrees clockwise
kRotate180 = 180, // Rotate 180 degrees
kRotate270 = 270, // Rotate 270 degrees clockwise
// Deprecated
kRotateNone = 0, kRotateNone = 0,
kRotateClockwise = 90, kRotateClockwise = 90,
kRotateCounterClockwise = 270, kRotateCounterClockwise = 270,
kRotate180 = 180,
}; };
// Rotate I420 frame // Rotate I420 frame
int int I420Rotate(const uint8* src_y, int src_stride_y,
I420Rotate(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u,
const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v,
const uint8* src_v, int src_stride_v, uint8* dst_y, int dst_stride_y,
uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u,
uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v,
uint8* dst_v, int dst_stride_v, int width, int height,
int width, int height, RotationMode mode);
RotationMode mode);
// Rotate NV12 input and store in I420
// Split a NV12 input buffer into Y, U, V buffers and int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
// then rotate the buffers. const uint8* src_uv, int src_stride_uv,
int uint8* dst_y, int dst_stride_y,
NV12ToI420Rotate(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u,
const uint8* src_uv, int src_stride_uv, uint8* dst_v, int dst_stride_v,
uint8* dst_y, int dst_stride_y, int width, int height,
uint8* dst_u, int dst_stride_u, RotationMode mode);
uint8* dst_v, int dst_stride_v,
int width, int height,
RotationMode mode);
} // namespace libyuv } // namespace libyuv
......
This diff is collapsed.
...@@ -49,28 +49,33 @@ static void SplitUV_NEON(const uint8* src_uv, ...@@ -49,28 +49,33 @@ static void SplitUV_NEON(const uint8* src_uv,
#endif #endif
// Shuffle table for converting ABGR to ARGB. // Shuffle table for converting ABGR to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
{ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u }; 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};
// Shuffle table for converting BGRA to ARGB. // Shuffle table for converting BGRA to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
{ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u }; 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
// Shuffle table for converting BG24 to ARGB. // Shuffle table for converting BG24 to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
{ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u }; 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB. // Shuffle table for converting RAW to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
{ 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
// Constant multiplication table for converting ARGB to I400. // Constant multiplication table for converting ARGB to I400.
extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
{ 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u }; };
extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400_2[16]) = extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400_2[16]) = {
{ 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u }; 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
};
#if defined(WIN32) && !defined(COVERAGE_ENABLED) #if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_SSE2 #define HAS_SPLITUV_SSE2
...@@ -169,28 +174,7 @@ static void I420CopyPlane(const uint8* src_y, int src_stride_y, ...@@ -169,28 +174,7 @@ static void I420CopyPlane(const uint8* src_y, int src_stride_y,
} }
} }
static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, // Copy I420 with optional flipping
uint8* dst, int dst_stride,
int width, int height) {
// Copy plane
for (int y = 0; y < height; y += 2) {
memcpy(dst, src, width);
src += src_stride_0;
dst += dst_stride;
memcpy(dst, src, width);
src += src_stride_1;
dst += dst_stride;
}
}
// TODO(fbarchard): For biplanar formats (ie NV21), the Y plane is the same
// as I420, and only the chroma plane varies. Copy the Y plane by reference,
// and just convert the UV. This method can be used for NV21, NV12, I420,
// I422, M422. 8 of the 12 bits is Y, so this would copy 3 times less data,
// which is approximately how much faster it would be.
// Helper function to copy yuv data without scaling. Used
// by our jpeg conversion callbacks to incrementally fill a yuv image.
int I420Copy(const uint8* src_y, int src_stride_y, int I420Copy(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v, const uint8* src_v, int src_stride_v,
...@@ -198,6 +182,12 @@ int I420Copy(const uint8* src_y, int src_stride_y, ...@@ -198,6 +182,12 @@ int I420Copy(const uint8* src_y, int src_stride_y,
uint8* dst_u, int dst_stride_u, uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v, uint8* dst_v, int dst_stride_v,
int width, int height) { int width, int height) {
if (!src_y || !src_u || !src_v ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image. // Negative height means invert the image.
if (height < 0) { if (height < 0) {
height = -height; height = -height;
...@@ -218,6 +208,137 @@ int I420Copy(const uint8* src_y, int src_stride_y, ...@@ -218,6 +208,137 @@ int I420Copy(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// SetRows32 writes 'count' bytes using a 32 bit value repeated
#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
#define HAS_SETROW_NEON
// Fills memory at dst with the 32 bit value v32 repeated, 16 bytes per
// iteration. The store happens before the counter is tested, so at least
// 16 bytes are always written and count is expected to be a positive
// multiple of 16.
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
  __asm__ volatile
  (
    // vdup takes a plain register operand; braces are only valid for the
    // register lists of load/store instructions.
    "vdup.u32   q0, %2            \n"  // duplicate 4 ints
    "1:                           \n"
    "vst1.u32   {q0}, [%0]!       \n"  // store 16 bytes, advance dst
    "subs       %1, %1, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    : "+r"(dst),   // %0
      "+r"(count)  // %1
    : "r"(v32)     // %2
    : "q0", "memory", "cc"  // subs updates the condition flags
  );
}
#elif defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SETROW_SSE2
// Fills memory at dst with the 32 bit value v32 repeated (Visual C++ inline
// assembly version).
// The loop stores 16 bytes with movdqa before testing the counter, so it
// always writes at least 16 bytes; to fill exactly 'count' bytes, count must
// be a positive multiple of 16. movdqa is an aligned store, so dst must be
// 16 byte aligned.
// The function is naked (no compiler generated prologue/epilogue), so the
// arguments are read directly from the stack relative to esp.
__declspec(naked)
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
__asm {
mov eax, [esp + 4] // dst
movd xmm7, [esp + 8] // v32
mov ecx, [esp + 12] // count
// Broadcast the low 32 bits of xmm7 into all four lanes.
pshufd xmm7, xmm7, 0
wloop:
// Store 16 bytes, advance dst, and loop while count - 16 is above zero.
movdqa [eax], xmm7
lea eax, [eax + 16]
sub ecx, 16
ja wloop
ret
}
}
#elif (defined(__x86_64__) || defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_SETROW_SSE2
// Fills memory at dst with the 32 bit value v32 repeated (GCC inline assembly
// version).
// The loop stores 16 bytes with movdqa before testing the counter, so it
// always writes at least 16 bytes; to fill exactly 'count' bytes, count must
// be a positive multiple of 16. movdqa is an aligned store, so dst must be
// 16 byte aligned.
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
  asm volatile(
    "movd       %2, %%xmm7          \n"
    "pshufd     $0x0,%%xmm7,%%xmm7  \n"  // broadcast v32 to all 4 lanes
    "1:                             \n"
    "movdqa     %%xmm7,(%0)         \n"  // store 16 bytes
    "lea        0x10(%0),%0         \n"
    "sub        $0x10,%1            \n"  // 16 processed per loop
    "ja         1b                  \n"
    : "+r"(dst),   // %0
      "+r"(count)  // %1
    : "r"(v32)     // %2
    // The asm overwrites xmm7 and the flags; declare both so the compiler
    // does not keep live values there across the statement.
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm7"
#endif
  );
}
#endif
// Portable row fill. memset() only uses the low 8 bits of the fill value, so
// a value with the same byte repeated in every lane works unchanged.
static void SetRow8_C(uint8* dst, uint32 v8, int count) {
  const int fill = static_cast<int>(v8);
  memset(dst, fill, count);
}
// Fills a width x height region of a single plane with a constant byte.
//   dst_y / dst_stride_y  first row of the region and the byte distance
//                         between consecutive rows.
//   value                 fill value; callers are expected to pass 0..255.
// A SIMD row function is used only when the width, the start pointer and the
// stride are all multiples of 16, because the SIMD rows store whole aligned
// 16 byte blocks. Otherwise the memset based row function is used.
static void I420SetPlane(uint8* dst_y, int dst_stride_y,
                         int width, int height,
                         int value) {
  void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow8_C;
#if defined(HAS_SETROW_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    SetRow = SetRow32_NEON;
  }
#elif defined(HAS_SETROW_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    SetRow = SetRow32_SSE2;
  }
#endif

  // Replicate the byte into all four lanes of a 32 bit word. The shifts are
  // done on an unsigned value: shifting a signed int holding 128..255 left by
  // 24 would overflow, which is undefined behaviour.
  const uint32 v8 = static_cast<uint32>(value);
  const uint32 v32 = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24);

  // Set plane
  for (int y = 0; y < height; ++y) {
    SetRow(dst_y, v32, width);
    dst_y += dst_stride_y;
  }
}
// Draw a rectangle into I420.
// Fills a width x height rectangle starting at luma position (x, y) with the
// constant values value_y / value_u / value_v. The chroma planes are
// addressed at (x / 2, y / 2) and filled with a rectangle of half the
// dimensions, rounded up.
// Returns 0 on success, or -1 if a plane pointer is NULL, width <= 0,
// height == 0, x or y is negative, or a value is outside 0..255.
int I420Rect(uint8* dst_y, int dst_stride_y,
             uint8* dst_u, int dst_stride_u,
             uint8* dst_v, int dst_stride_v,
             int x, int y,
             int width, int height,
             int value_y, int value_u, int value_v) {
  const bool missing_plane = !dst_y || !dst_u || !dst_v;
  const bool bad_geometry = width <= 0 || height == 0 || x < 0 || y < 0;
  const bool bad_value = value_y < 0 || value_y > 255 ||
                         value_u < 0 || value_u > 255 ||
                         value_v < 0 || value_v > 255;
  if (missing_plane || bad_geometry || bad_value) {
    return -1;
  }

  // Negative height means invert the image: move each plane pointer forward
  // by (rows - 1) strides and walk backwards through memory from there.
  if (height < 0) {
    height = -height;
    const int chroma_rows = (height + 1) >> 1;
    dst_y += (height - 1) * dst_stride_y;
    dst_u += (chroma_rows - 1) * dst_stride_u;
    dst_v += (chroma_rows - 1) * dst_stride_v;
    dst_stride_y = -dst_stride_y;
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }

  // Chroma is subsampled 2x in both directions; sizes round up.
  const int chroma_width = (width + 1) >> 1;
  const int chroma_height = (height + 1) >> 1;
  const int chroma_x = x / 2;
  const int chroma_y = y / 2;

  I420SetPlane(dst_y + y * dst_stride_y + x, dst_stride_y,
               width, height, value_y);
  I420SetPlane(dst_u + chroma_y * dst_stride_u + chroma_x, dst_stride_u,
               chroma_width, chroma_height, value_u);
  I420SetPlane(dst_v + chroma_y * dst_stride_v + chroma_x, dst_stride_v,
               chroma_width, chroma_height, value_v);
  return 0;
}
// Helper function to copy yuv data without scaling. Used // Helper function to copy yuv data without scaling. Used
// by our jpeg conversion callbacks to incrementally fill a yuv image. // by our jpeg conversion callbacks to incrementally fill a yuv image.
int I422ToI420(const uint8* src_y, int src_stride_y, int I422ToI420(const uint8* src_y, int src_stride_y,
...@@ -271,6 +392,20 @@ int I422ToI420(const uint8* src_y, int src_stride_y, ...@@ -271,6 +392,20 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Copies 'height' rows of 'width' bytes from src to dst when the source rows
// are not evenly spaced: the source advances by src_stride_0 after each
// even-numbered row and by src_stride_1 after each odd-numbered row. The
// destination always advances by dst_stride.
static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
                           uint8* dst, int dst_stride,
                           int width, int height) {
  // Copy plane, two rows per iteration.
  for (int y = 0; y < height; y += 2) {
    memcpy(dst, src, width);
    src += src_stride_0;
    dst += dst_stride;
    // With an odd height the final pair has only one row; copying a second
    // one would read and write past the end of the plane.
    if (height - y < 2) {
      break;
    }
    memcpy(dst, src, width);
    src += src_stride_1;
    dst += dst_stride;
  }
}
// Support converting from FOURCC_M420 // Support converting from FOURCC_M420
// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for // Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
// easy conversion to I420. // easy conversion to I420.
...@@ -1238,8 +1373,7 @@ __asm { ...@@ -1238,8 +1373,7 @@ __asm {
#define HAS_ARGBTOI400ROW_SSSE3 #define HAS_ARGBTOI400ROW_SSSE3
__declspec(naked) __declspec(naked)
static void ARGBToI400Row_SSSE3(const uint8* src_argb, uint8* dst_y, static void ARGBToI400Row_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_y mov edx, [esp + 8] // dst_y
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment