Commit 78020389 authored by fbarchard@google.com

rotate for x86 and bayer refactored - 3x faster.

BUG=1
TEST=tested with talk unittests.
Review URL: http://webrtc-codereview.appspot.com/250004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@42 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 3f4c056b
...@@ -25,6 +25,14 @@ int I420Copy(const uint8* src_y, int src_stride_y, ...@@ -25,6 +25,14 @@ int I420Copy(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v, uint8* dst_v, int dst_stride_v,
int width, int height); int width, int height);
// Draw a rectangle into I420.
// Fills a width x height region of the Y plane, positioned at (x, y), with
// value_y, and the matching half-resolution regions of the U and V planes
// with value_u and value_v. A negative height selects the vertically
// inverted addressing used elsewhere in this library.
// Returns 0 on success, or -1 when a plane pointer is NULL, width <= 0,
// height == 0, x or y is negative, or any value lies outside 0..255.
int I420Rect(uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int x, int y,
int width, int height,
int value_y, int value_u, int value_v);
// Convert I422 to I420. Used by MJPG. // Convert I422 to I420. Used by MJPG.
int I422ToI420(const uint8* src_y, int src_stride_y, int I422ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
...@@ -146,7 +154,7 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, ...@@ -146,7 +154,7 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
// Convert ARGB to I400. // Convert ARGB to I400.
int ARGBToI400(const uint8* src_argb, int src_stride_argb, int ARGBToI400(const uint8* src_argb, int src_stride_argb,
const uint8* dst_y, int dst_stride_y, uint8* dst_y, int dst_stride_y,
int width, int height); int width, int height);
} // namespace libyuv } // namespace libyuv
......
...@@ -11,21 +11,25 @@ ...@@ -11,21 +11,25 @@
#ifndef INCLUDE_LIBYUV_ROTATE_H_ #ifndef INCLUDE_LIBYUV_ROTATE_H_
#define INCLUDE_LIBYUV_ROTATE_H_ #define INCLUDE_LIBYUV_ROTATE_H_
#include "basic_types.h" #include "libyuv/basic_types.h"
namespace libyuv { namespace libyuv {
// Supported rotation // Supported rotation
enum RotationMode { enum RotationMode {
kRotate0 = 0, // No rotation
kRotate90 = 90, // Rotate 90 degrees clockwise
kRotate180 = 180, // Rotate 180 degrees
kRotate270 = 270, // Rotate 270 degrees clockwise
// Deprecated
kRotateNone = 0, kRotateNone = 0,
kRotateClockwise = 90, kRotateClockwise = 90,
kRotateCounterClockwise = 270, kRotateCounterClockwise = 270,
kRotate180 = 180,
}; };
// Rotate I420 frame // Rotate I420 frame
int int I420Rotate(const uint8* src_y, int src_stride_y,
I420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v, const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y, uint8* dst_y, int dst_stride_y,
...@@ -34,10 +38,8 @@ I420Rotate(const uint8* src_y, int src_stride_y, ...@@ -34,10 +38,8 @@ I420Rotate(const uint8* src_y, int src_stride_y,
int width, int height, int width, int height,
RotationMode mode); RotationMode mode);
// Split a NV12 input buffer into Y, U, V buffers and // Rotate NV12 input and store in I420
// then rotate the buffers. int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
int
NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv, const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y, uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u, uint8* dst_u, int dst_stride_u,
......
...@@ -27,396 +27,6 @@ namespace libyuv { ...@@ -27,396 +27,6 @@ namespace libyuv {
#define FORCE_INLINE #define FORCE_INLINE
#endif #endif
// Colour of a single Bayer sensor site. The two greens are told apart by the
// colour of their same-row neighbours. RED and BLUE must stay the two
// smallest values: IsRedBlue() tests "colour <= BLUE".
enum {
  RED = 0,
  BLUE,               // 1
  GREEN_BETWEEN_RED,  // 2
  GREEN_BETWEEN_BLUE, // 3
};
// Where a pixel sits relative to the image border.
enum Position {
  // Horizontal component.
  LEFT = 0,
  RIGHT = 1,
  // Vertical component.
  TOP = 2,
  BOTTOM = 4,
  // "Not on a border" component, usable on either axis.
  CENTER = 6,
  // A pixel's position type is the sum of its horizontal and vertical
  // component. With the values chosen above every sum below is unique, the
  // four corners occupy 2..5 and everything from LEFT_EDGE upwards is an edge
  // or the interior, so one addition replaces a 3x3 nested switch.
  TOP_LEFT = LEFT + TOP,          // 2
  TOP_RIGHT = RIGHT + TOP,        // 3
  BOTTOM_LEFT = LEFT + BOTTOM,    // 4
  BOTTOM_RIGHT = RIGHT + BOTTOM,  // 5
  LEFT_EDGE = LEFT + CENTER,      // 6
  RIGHT_EDGE = RIGHT + CENTER,    // 7
  TOP_EDGE = CENTER + TOP,        // 8
  BOTTOM_EDGE = CENTER + BOTTOM,  // 10
  MIDDLE = CENTER + CENTER,       // 12
};
// Classifies pixel (x, y) of a width x height image as a corner, an edge or
// the interior by adding its horizontal and vertical Position components.
// The first column/row test takes priority, so a one-pixel-wide image reports
// LEFT and a one-pixel-high image reports TOP.
static FORCE_INLINE Position GetPosition(int x, int y, int width, int height) {
  const int horizontal =
      (x == 0) ? LEFT : ((x == width - 1) ? RIGHT : CENTER);
  const int vertical =
      (y == 0) ? TOP : ((y == height - 1) ? BOTTOM : CENTER);
  return static_cast<Position>(horizontal + vertical);
}
// True for the RED and BLUE colour codes, false for either green.
static FORCE_INLINE bool IsRedBlue(uint8 colour) {
  return colour == RED || colour == BLUE;
}
// Maps a Bayer fourcc to its colour map: four colour codes packed with
// FOURCC(), one per pixel of a 2x2 cell, listed left-to-right then
// top-to-bottom. An unrecognised fourcc asserts in debug builds and is
// otherwise treated as RGGB.
static FORCE_INLINE uint32 FourCcToBayerPixelColourMap(uint32 fourcc) {
  switch (fourcc) {
    case FOURCC_BGGR:
      return FOURCC(BLUE, GREEN_BETWEEN_BLUE, GREEN_BETWEEN_RED, RED);
    case FOURCC_GRBG:
      return FOURCC(GREEN_BETWEEN_RED, RED, BLUE, GREEN_BETWEEN_BLUE);
    case FOURCC_GBRG:
      return FOURCC(GREEN_BETWEEN_BLUE, BLUE, RED, GREEN_BETWEEN_RED);
    default:
      assert(false);
      // Deliberate fall-through: unknown layouts are handled as RGGB.
    case FOURCC_RGGB:
      return FOURCC(RED, GREEN_BETWEEN_RED, GREEN_BETWEEN_BLUE, BLUE);
  }
}
// Converts one RGB pixel to Y, U and V using 8-bit fixed-point weights
// (coefficients from http://en.wikipedia.org/wiki/YUV). The +128 rounds
// before the >> 8; the results are stored through y, u and v.
static FORCE_INLINE void RGBToYUV(uint8 r, uint8 g, uint8 b,
                                  uint8* y, uint8* u, uint8* v) {
  const int luma_sum = 66 * r + 129 * g + 25 * b + 128;
  const int u_sum = -38 * r - 74 * g + 112 * b + 128;
  const int v_sum = 112 * r - 94 * g - 18 * b + 128;
  *y = (luma_sum >> 8) + 16;
  *u = (u_sum >> 8) + 128;
  *v = (v_sum >> 8) + 128;
}
// Reconstructs R, G and B for a pixel in one of the four image corners.
// src points at the pixel itself; only the three neighbours that lie inside
// the image (one horizontal, one vertical, one diagonal) are read. Any pos
// other than the three named corners is treated as BOTTOM_RIGHT.
static FORCE_INLINE void InterpolateBayerRGBCorner(uint8* r,
                                                   uint8* g,
                                                   uint8* b,
                                                   const uint8* src,
                                                   int src_stride,
                                                   Position pos,
                                                   uint8 colour) {
  // Offsets that step from the corner towards the inside of the image.
  const bool on_top = (pos == TOP_LEFT || pos == TOP_RIGHT);
  const bool on_left = (pos == TOP_LEFT || pos == BOTTOM_LEFT);
  const int row_step = on_top ? src_stride : -src_stride;
  const int col_step = on_left ? 1 : -1;

  if (!IsRedBlue(colour)) {
    // Green site: green is exact, red and blue each come from the single
    // available neighbour on the matching axis.
    *g = src[0];
    const uint8 same_row = src[col_step];
    const uint8 same_column = src[row_step];
    if (colour == GREEN_BETWEEN_RED) {
      *r = same_row;
      *b = same_column;
    } else {  // GREEN_BETWEEN_BLUE
      *b = same_row;
      *r = same_column;
    }
    return;
  }

  // Red or blue site: own colour is exact, green is the mean of the two
  // adjacent greens and the opposite colour is the lone diagonal neighbour.
  const uint8 own = src[0];
  *g = (src[col_step] + src[row_step]) / 2;
  const uint8 diagonal = src[row_step + col_step];
  if (colour == RED) {
    *r = own;
    *b = diagonal;
  } else {  // BLUE
    *b = own;
    *r = diagonal;
  }
}
// Reconstructs R, G and B for a pixel on an image edge (not a corner).
// src points at the pixel itself. Only neighbours inside the image are read:
// the one "inner" pixel towards the image centre and the two "side" pixels
// along the edge. Any pos other than TOP/RIGHT/BOTTOM_EDGE is treated as
// LEFT_EDGE.
static FORCE_INLINE void InterpolateBayerRGBEdge(uint8* r,
                                                 uint8* g,
                                                 uint8* b,
                                                 const uint8* src,
                                                 int src_stride,
                                                 Position pos,
                                                 uint8 colour) {
  // Offset that goes one pixel "in" to the image (towards the centre).
  int inner;
  // Offset that goes one pixel along the edge; its negation goes the other
  // way along the edge.
  int side;
  // True when the edge runs along a row (top/bottom), i.e. the side
  // neighbours share the pixel's row and the inner neighbour its column.
  bool side_is_same_row;
  switch (pos) {
    case TOP_EDGE:
      inner = src_stride;
      side = 1;
      side_is_same_row = true;
      break;
    case RIGHT_EDGE:
      inner = -1;
      side = src_stride;
      side_is_same_row = false;
      break;
    case BOTTOM_EDGE:
      inner = -src_stride;
      side = 1;
      side_is_same_row = true;
      break;
    case LEFT_EDGE:
    default:
      inner = 1;
      side = src_stride;
      side_is_same_row = false;
      break;
  }
  if (IsRedBlue(colour)) {
    uint8 current_pixel = src[0];
    // Average of the adjacent green pixels (there are only three).
    *g = (src[inner] + src[side] + src[-side]) / 3;
    // Average of the oppositely-coloured diagonal pixels (only two).
    uint8 corner_average = (src[inner + side] + src[inner - side]) / 2;
    if (colour == RED) {
      *r = current_pixel;
      *b = corner_average;
    } else {  // i.e., BLUE
      *b = current_pixel;
      *r = corner_average;
    }
  } else {  // i.e., GREEN_BETWEEN_*
    *g = src[0];
    // Average of the two neighbours along the edge.
    uint8 side_average = (src[side] + src[-side]) / 2;
    // The single neighbour towards the image centre.
    uint8 inner_pixel = src[inner];
    // GREEN_BETWEEN_RED has red on its row and blue on its column;
    // GREEN_BETWEEN_BLUE is the opposite. The side pixels carry the row
    // colour on top/bottom edges and the column colour on left/right edges,
    // so red comes from the side pixels exactly when "green between red" and
    // "side is same row" agree. The earlier "&& side == 1" test transposed
    // only GREEN_BETWEEN_RED and swapped red/blue for GREEN_BETWEEN_BLUE on
    // the left and right edges.
    if ((colour == GREEN_BETWEEN_RED) == side_is_same_row) {
      *r = side_average;
      *b = inner_pixel;
    } else {
      *b = side_average;
      *r = inner_pixel;
    }
  }
}
// Reconstructs R, G and B for an interior pixel, where all eight neighbours
// exist. This is the path taken for nearly every pixel, hence the forced
// inlining.
static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r,
                                                   uint8* g,
                                                   uint8* b,
                                                   const uint8* src,
                                                   int src_stride,
                                                   uint8 colour) {
  if (!IsRedBlue(colour)) {
    // Green site: green is exact; the row neighbours give one colour and the
    // column neighbours the other.
    *g = src[0];
    const uint8 row_mean = (src[1] + src[-1]) / 2;
    const uint8 column_mean = (src[src_stride] + src[-src_stride]) / 2;
    if (colour == GREEN_BETWEEN_RED) {
      *r = row_mean;
      *b = column_mean;
    } else {  // GREEN_BETWEEN_BLUE
      *b = row_mean;
      *r = column_mean;
    }
    return;
  }

  // Red or blue site.
  const uint8 own = src[0];
  // Green is the mean of the four adjacent greens.
  // NOTE(tschmelcher): http://www.siliconimaging.com/RGB%20Bayer.htm describes
  // a higher-quality variant that picks two of the four greens based on
  // correlation with nearby red/blue pixels; it is slower and adds edge cases,
  // so it is not used here.
  *g = (src[1] + src[-1] + src[src_stride] + src[-src_stride]) / 4;
  // The opposite colour is the mean of the four diagonal neighbours.
  const uint8 diagonal_mean = (src[src_stride + 1] +
                               src[src_stride - 1] +
                               src[-src_stride + 1] +
                               src[-src_stride - 1]) / 4;
  if (colour == RED) {
    *r = own;
    *b = diagonal_mean;
  } else {  // BLUE
    *b = own;
    *r = diagonal_mean;
  }
}
// Converts any Bayer RGB format to ARGB.
int BayerRGBToARGB(const uint8* src, int src_stride, uint32 src_fourcc,
uint8* dst, int dst_stride,
int width, int height) {
assert(width % 2 == 0);
assert(height % 2 == 0);
uint32 colour_map = FourCcToBayerPixelColourMap(src_fourcc);
int src_row_inc = src_stride * 2 - width;
int dst_row_inc = dst_stride * 2 - width * 4;
// Iterate over the 2x2 grids.
for (int y1 = 0; y1 < height; y1 += 2) {
for (int x1 = 0; x1 < width; x1 += 2) {
uint32 colours = colour_map;
// Iterate over the four pixels within them.
for (int y2 = 0; y2 < 2; ++y2) {
for (int x2 = 0; x2 < 2; ++x2) {
uint8 r, g, b;
// The low-order byte of the colour map is the current colour.
uint8 current_colour = static_cast<uint8>(colours);
colours >>= 8;
Position pos = GetPosition(x1 + x2, y1 + y2, width, height);
const uint8* src_pixel = &src[y2 * src_stride + x2];
uint8* dst_pixel = &dst[y2 * dst_stride + x2 * 4];
// Convert from Bayer RGB to regular RGB.
if (pos == MIDDLE) {
// 99% of the image is the middle.
InterpolateBayerRGBCenter(&r, &g, &b,
src_pixel, src_stride,
current_colour);
} else if (pos >= LEFT_EDGE) {
// Next most frequent is edges.
InterpolateBayerRGBEdge(&r, &g, &b,
src_pixel, src_stride, pos,
current_colour);
} else {
// Last is the corners. There are only 4.
InterpolateBayerRGBCorner(&r, &g, &b,
src_pixel, src_stride, pos,
current_colour);
}
// Store ARGB
dst_pixel[0] = b;
dst_pixel[1] = g;
dst_pixel[2] = r;
dst_pixel[3] = 255u;
}
}
src += 2;
dst += 2 * 4;
}
src += src_row_inc;
dst += dst_row_inc;
}
return 0;
}
// Converts any Bayer RGB format to I420.
int BayerRGBToI420(const uint8* src, int src_stride, uint32 src_fourcc,
uint8* y, int y_stride,
uint8* u, int u_stride,
uint8* v, int v_stride,
int width, int height) {
assert(width % 2 == 0);
assert(height % 2 == 0);
uint32 colour_map = FourCcToBayerPixelColourMap(src_fourcc);
int src_row_inc = src_stride * 2 - width;
int y_row_inc = y_stride * 2 - width;
int u_row_inc = u_stride - width / 2;
int v_row_inc = v_stride - width / 2;
// Iterate over the 2x2 grids.
for (int y1 = 0; y1 < height; y1 += 2) {
for (int x1 = 0; x1 < width; x1 += 2) {
uint32 colours = colour_map;
int total_u = 0;
int total_v = 0;
// Iterate over the four pixels within them.
for (int y2 = 0; y2 < 2; ++y2) {
for (int x2 = 0; x2 < 2; ++x2) {
uint8 r, g, b;
// The low-order byte of the colour map is the current colour.
uint8 current_colour = static_cast<uint8>(colours);
colours >>= 8;
Position pos = GetPosition(x1 + x2, y1 + y2, width, height);
const uint8* src_pixel = &src[y2 * src_stride + x2];
uint8* y_pixel = &y[y2 * y_stride + x2];
// Convert from Bayer RGB to regular RGB.
if (pos == MIDDLE) {
// 99% of the image is the middle.
InterpolateBayerRGBCenter(&r, &g, &b,
src_pixel, src_stride,
current_colour);
} else if (pos >= LEFT_EDGE) {
// Next most frequent is edges.
InterpolateBayerRGBEdge(&r, &g, &b,
src_pixel, src_stride, pos,
current_colour);
} else {
// Last is the corners. There are only 4.
InterpolateBayerRGBCorner(&r, &g, &b,
src_pixel, src_stride, pos,
current_colour);
}
// Convert from RGB to YUV.
uint8 tmp_u, tmp_v;
RGBToYUV(r, g, b, y_pixel, &tmp_u, &tmp_v);
total_u += tmp_u;
total_v += tmp_v;
}
}
src += 2;
y += 2;
*u = total_u / 4;
*v = total_v / 4;
++u;
++v;
}
src += src_row_inc;
y += y_row_inc;
u += u_row_inc;
v += v_row_inc;
}
return 0;
}
// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
// and vst would select which 2 components to write. The low level would need // and vst would select which 2 components to write. The low level would need
// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR // to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
...@@ -429,15 +39,15 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, ...@@ -429,15 +39,15 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
__asm { __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_bayer mov edx, [esp + 8] // dst_bayer
movd xmm0, [esp + 12] // selector movd xmm7, [esp + 12] // selector
mov ecx, [esp + 16] // pix mov ecx, [esp + 16] // pix
pshufd xmm0, xmm0, 0 pshufd xmm7, xmm7, 0
wloop: wloop:
movdqa xmm1, [eax] movdqa xmm0, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
pshufb xmm1, xmm0 pshufb xmm0, xmm7
movd [edx], xmm1 movd [edx], xmm0
lea edx, [edx + 4] lea edx, [edx + 4]
sub ecx, 4 sub ecx, 4
ja wloop ja wloop
...@@ -445,37 +55,30 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, ...@@ -445,37 +55,30 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
} }
} }
#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \ #elif (defined(__x86_64__) || defined(__i386__)) && \
!TARGET_IPHONE_SIMULATOR !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_ARGBTOBAYERROW_SSSE3 #define HAS_ARGBTOBAYERROW_SSSE3
extern "C" void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix); uint32 selector, int pix) {
asm( asm volatile(
".text\n" "movd %3,%%xmm7\n"
#if defined(OSX) "pshufd $0x0,%%xmm7,%%xmm7\n"
".globl _ARGBToBayerRow_SSSE3\n"
"_ARGBToBayerRow_SSSE3:\n"
#else
".global ARGBToBayerRow_SSSE3\n"
"ARGBToBayerRow_SSSE3:\n"
#endif
"mov 0x4(%esp),%eax\n"
"mov 0x8(%esp),%edx\n"
"movd 0xc(%esp),%xmm0\n"
"mov 0x10(%esp),%ecx\n"
"pshufd $0x0,%xmm0,%xmm0\n"
"1:" "1:"
"movdqa (%eax),%xmm1\n" "movdqa (%0),%%xmm0\n"
"lea 0x10(%eax),%eax\n" "lea 0x10(%0),%0\n"
"pshufb %xmm0,%xmm1\n" "pshufb %%xmm7,%%xmm0\n"
"movd %xmm1,(%edx)\n" "movd %%xmm0,(%1)\n"
"lea 0x4(%edx),%edx\n" "lea 0x4(%1),%1\n"
"sub $0x4,%ecx\n" "sub $0x4,%2\n"
"ja 1b\n" "ja 1b\n"
"ret\n" : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
: "r"(selector) // %3
: "memory"
); );
}
#endif #endif
static void ARGBToBayerRow_C(const uint8* src_argb, static void ARGBToBayerRow_C(const uint8* src_argb,
...@@ -483,12 +86,15 @@ static void ARGBToBayerRow_C(const uint8* src_argb, ...@@ -483,12 +86,15 @@ static void ARGBToBayerRow_C(const uint8* src_argb,
int index0 = selector & 0xff; int index0 = selector & 0xff;
int index1 = (selector >> 8) & 0xff; int index1 = (selector >> 8) & 0xff;
// Copy a row of Bayer. // Copy a row of Bayer.
for (int x = 0; x < pix; x += 2) { for (int x = 0; x < (pix - 1); x += 2) {
dst_bayer[0] = src_argb[index0]; dst_bayer[0] = src_argb[index0];
dst_bayer[1] = src_argb[index1]; dst_bayer[1] = src_argb[index1];
src_argb += 8; src_argb += 8;
dst_bayer += 2; dst_bayer += 2;
} }
if (pix & 1) {
dst_bayer[0] = src_argb[index0];
}
} }
// generate a selector mask useful for pshufb // generate a selector mask useful for pshufb
...@@ -504,7 +110,11 @@ int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb, ...@@ -504,7 +110,11 @@ int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
uint8* dst_bayer, int dst_stride_bayer, uint8* dst_bayer, int dst_stride_bayer,
uint32 dst_fourcc_bayer, uint32 dst_fourcc_bayer,
int width, int height) { int width, int height) {
assert(width % 2 == 0); if (height < 0) {
height = -height;
src_rgb = src_rgb + (height - 1) * src_stride_rgb;
src_stride_rgb = -src_stride_rgb;
}
void (*ARGBToBayerRow)(const uint8* src_argb, void (*ARGBToBayerRow)(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix); uint8* dst_bayer, uint32 selector, int pix);
#if defined(HAS_ARGBTOBAYERROW_SSSE3) #if defined(HAS_ARGBTOBAYERROW_SSSE3)
...@@ -556,4 +166,277 @@ int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb, ...@@ -556,4 +166,277 @@ int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
return 0; return 0;
} }
#define AVG(a,b) (((a) + (b)) >> 1)
static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
uint8* dst_rgb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 g = src_bayer0[1];
uint8 r = src_bayer1[1];
for (int x = 0; x < (pix - 2); x += 2) {
dst_rgb[0] = src_bayer0[0];
dst_rgb[1] = AVG(g, src_bayer0[1]);
dst_rgb[2] = AVG(r, src_bayer1[1]);
dst_rgb[3] = 255U;
dst_rgb[4] = AVG(src_bayer0[0], src_bayer0[2]);
dst_rgb[5] = src_bayer0[1];
dst_rgb[6] = src_bayer1[1];
dst_rgb[7] = 255U;
g = src_bayer0[1];
r = src_bayer1[1];
src_bayer0 += 2;
src_bayer1 += 2;
dst_rgb += 8;
}
dst_rgb[0] = src_bayer0[0];
dst_rgb[1] = AVG(g, src_bayer0[1]);
dst_rgb[2] = AVG(r, src_bayer1[1]);
dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer0[0];
dst_rgb[5] = src_bayer0[1];
dst_rgb[6] = src_bayer1[1];
dst_rgb[7] = 255U;
}
static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
uint8* dst_rgb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 g = src_bayer0[1];
uint8 b = src_bayer1[1];
for (int x = 0; x < (pix - 2); x += 2) {
dst_rgb[0] = AVG(b, src_bayer1[1]);
dst_rgb[1] = AVG(g, src_bayer0[1]);
dst_rgb[2] = src_bayer0[0];
dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer1[1];
dst_rgb[5] = src_bayer0[1];
dst_rgb[6] = AVG(src_bayer0[0], src_bayer0[2]);
dst_rgb[7] = 255U;
g = src_bayer0[1];
b = src_bayer1[1];
src_bayer0 += 2;
src_bayer1 += 2;
dst_rgb += 8;
}
dst_rgb[0] = AVG(b, src_bayer1[1]);
dst_rgb[1] = AVG(g, src_bayer0[1]);
dst_rgb[2] = src_bayer0[0];
dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer1[1];
dst_rgb[5] = src_bayer0[1];
dst_rgb[6] = src_bayer0[0];
dst_rgb[7] = 255U;
}
static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
uint8* dst_rgb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 b = src_bayer0[1];
for (int x = 0; x < (pix - 2); x += 2) {
dst_rgb[0] = AVG(b, src_bayer0[1]);
dst_rgb[1] = src_bayer0[0];
dst_rgb[2] = src_bayer1[0];
dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer0[1];
dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]);
dst_rgb[6] = AVG(src_bayer1[0], src_bayer1[2]);
dst_rgb[7] = 255U;
b = src_bayer0[1];
src_bayer0 += 2;
src_bayer1 += 2;
dst_rgb += 8;
}
dst_rgb[0] = AVG(b, src_bayer0[1]);
dst_rgb[1] = src_bayer0[0];
dst_rgb[2] = src_bayer1[0];
dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer0[1];
dst_rgb[5] = src_bayer0[0];
dst_rgb[6] = src_bayer1[0];
dst_rgb[7] = 255U;
}
static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
uint8* dst_rgb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 r = src_bayer0[1];
for (int x = 0; x < (pix - 2); x += 2) {
dst_rgb[0] = src_bayer1[0];
dst_rgb[1] = src_bayer0[0];
dst_rgb[2] = AVG(r, src_bayer0[1]);
dst_rgb[3] = 255U;
dst_rgb[4] = AVG(src_bayer1[0], src_bayer1[2]);
dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]);
dst_rgb[6] = src_bayer0[1];
dst_rgb[7] = 255U;
r = src_bayer0[1];
src_bayer0 += 2;
src_bayer1 += 2;
dst_rgb += 8;
}
dst_rgb[0] = src_bayer1[0];
dst_rgb[1] = src_bayer0[0];
dst_rgb[2] = AVG(r, src_bayer0[1]);
dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer1[0];
dst_rgb[5] = src_bayer0[0];
dst_rgb[6] = src_bayer0[1];
dst_rgb[7] = 255U;
}
// Converts any Bayer RGB format to ARGB.
// Rows are processed in pairs: each row of a pair uses the other row of the
// pair as its source for the colour it lacks. A negative height writes the
// output bottom-up (vertically flipped). An unrecognised fourcc asserts in
// debug builds and is otherwise treated as RGGB. Always returns 0.
int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
                   uint32 src_fourcc_bayer,
                   uint8* dst_rgb, int dst_stride_rgb,
                   int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
    dst_stride_rgb = -dst_stride_rgb;
  }
  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
                    uint8* dst_rgb, int pix);
  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
                    uint8* dst_rgb, int pix);

  // Pick the row converters for the even (Row0) and odd (Row1) lines.
  switch (src_fourcc_bayer) {
    default:
      assert(false);
      // Deliberate fall-through: unknown layouts are handled as RGGB.
    case FOURCC_RGGB:
      BayerRow0 = BayerRowRG;
      BayerRow1 = BayerRowGB;
      break;
    case FOURCC_BGGR:
      BayerRow0 = BayerRowBG;
      BayerRow1 = BayerRowGR;
      break;
    case FOURCC_GRBG:
      BayerRow0 = BayerRowGR;
      BayerRow1 = BayerRowBG;
      break;
    case FOURCC_GBRG:
      BayerRow0 = BayerRowGB;
      BayerRow1 = BayerRowRG;
      break;
  }

  for (int y = 0; y < (height - 1); y += 2) {
    BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width);
    // The second row of the pair looks upwards for its neighbour.
    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
              dst_rgb + dst_stride_rgb, width);
    src_bayer += src_stride_bayer * 2;
    dst_rgb += dst_stride_rgb * 2;
  }
  if (height & 1) {
    // The trailing odd row has no row below it, so it borrows the row above.
    // A single-row image has no row above either; a zero stride then makes
    // the row act as its own neighbour instead of reading before the buffer.
    const int neighbour_stride = (height > 1) ? -src_stride_bayer : 0;
    BayerRow0(src_bayer, neighbour_stride, dst_rgb, width);
  }
  return 0;
}
// RGB to Y with 8-bit fixed-point weights; +128 rounds before the shift.
// Coefficients taken from http://en.wikipedia.org/wiki/YUV
static FORCE_INLINE int RGBToY(uint8 r, uint8 g, uint8 b) {
  const int weighted = 66 * r + 129 * g + 25 * b + 128;
  return (weighted >> 8) + 16;
}
// RGB to U with 8-bit fixed-point weights; +128 rounds before the shift and
// the final +128 centres the result.
static FORCE_INLINE int RGBToU(uint8 r, uint8 g, uint8 b) {
  const int weighted = -38 * r - 74 * g + 112 * b + 128;
  return (weighted >> 8) + 128;
}
// RGB to V with 8-bit fixed-point weights; +128 rounds before the shift and
// the final +128 centres the result.
static FORCE_INLINE int RGBToV(uint8 r, uint8 g, uint8 b) {
  const int weighted = 112 * r - 94 * g - 18 * b + 128;
  return (weighted >> 8) + 128;
}
static void ARGBtoYRow(const uint8* src_argb0,
uint8* dst_y, int width) {
for (int x = 0; x < width; ++x) {
dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
src_argb0 += 4;
dst_y += 1;
}
}
static void ARGBtoUVRow(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u,
uint8* dst_v,
int width) {
const uint8* src_argb1 = src_argb0 + src_stride_argb;
for (int x = 0; x < width - 1; x += 2) {
uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
src_argb0 += 8;
src_argb1 += 8;
dst_u += 1;
dst_v += 1;
}
}
// Converts any Bayer RGB format to I420.
// Each pair of Bayer rows is first expanded into two temporary rows of
// 4-byte pixels on the stack, which are then converted to Y and subsampled
// U/V. A negative height writes the output bottom-up (vertically flipped).
// An unrecognised fourcc asserts in debug builds and is otherwise treated as
// RGGB.
// Returns 0 on success, or -1 when width exceeds what the temporary row
// buffer can hold (kMaxStride / 4 pixels).
int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
                   uint32 src_fourcc_bayer,
                   uint8* dst_y, int dst_stride_y,
                   uint8* dst_u, int dst_stride_u,
                   uint8* dst_v, int dst_stride_v,
                   int width, int height) {
// Bytes per temporary row. Parenthesized so the macro is safe in any
// expression.
#define kMaxStride (2048 * 4)
  // The row converters write 4 bytes per pixel, rounded up to an even pixel
  // count; anything wider than kMaxStride / 4 would overrun the stack buffer.
  if (width > kMaxStride / 4) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
    dst_stride_y = -dst_stride_y;
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }
  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
                    uint8* dst_rgb, int pix);
  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
                    uint8* dst_rgb, int pix);

  // Pick the row converters for the even (Row0) and odd (Row1) lines.
  switch (src_fourcc_bayer) {
    default:
      assert(false);
      // Deliberate fall-through: unknown layouts are handled as RGGB.
    case FOURCC_RGGB:
      BayerRow0 = BayerRowRG;
      BayerRow1 = BayerRowGB;
      break;
    case FOURCC_BGGR:
      BayerRow0 = BayerRowBG;
      BayerRow1 = BayerRowGR;
      break;
    case FOURCC_GRBG:
      BayerRow0 = BayerRowGR;
      BayerRow1 = BayerRowBG;
      break;
    case FOURCC_GBRG:
      BayerRow0 = BayerRowGB;
      BayerRow1 = BayerRowRG;
      break;
  }

  // Two temporary rows of 4-byte pixels, kMaxStride bytes apart.
  uint8 row[kMaxStride * 2];

  for (int y = 0; y < (height - 1); y += 2) {
    BayerRow0(src_bayer, src_stride_bayer, row, width);
    // The second row of the pair looks upwards for its neighbour.
    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
              row + kMaxStride, width);
    ARGBtoYRow(row, dst_y, width);
    ARGBtoYRow(row + kMaxStride, dst_y + dst_stride_y, width);
    ARGBtoUVRow(row, kMaxStride, dst_u, dst_v, width);
    src_bayer += src_stride_bayer * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    // The trailing odd row has no row below it, so it borrows the row above
    // (as BayerRGBToARGB does) rather than reading past the end of the
    // source. A single-row image has no row above either; a zero stride then
    // makes the row act as its own neighbour.
    const int neighbour_stride = (height > 1) ? -src_stride_bayer : 0;
    BayerRow0(src_bayer, neighbour_stride, row, width);
    ARGBtoYRow(row, dst_y, width);
    // Stride 0: chroma is averaged from this single row only.
    ARGBtoUVRow(row, 0, dst_u, dst_v, width);
  }
  return 0;
}
} // namespace libyuv } // namespace libyuv
...@@ -49,28 +49,33 @@ static void SplitUV_NEON(const uint8* src_uv, ...@@ -49,28 +49,33 @@ static void SplitUV_NEON(const uint8* src_uv,
#endif #endif
// Shuffle table for converting ABGR to ARGB. // Shuffle table for converting ABGR to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
{ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u }; 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};
// Shuffle table for converting BGRA to ARGB. // Shuffle table for converting BGRA to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
{ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u }; 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
// Shuffle table for converting BG24 to ARGB. // Shuffle table for converting BG24 to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
{ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u }; 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB. // Shuffle table for converting RAW to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
{ 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
// Constant multiplication table for converting ARGB to I400. // Constant multiplication table for converting ARGB to I400.
extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
{ 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u }; };
extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400_2[16]) = extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400_2[16]) = {
{ 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u }; 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
};
#if defined(WIN32) && !defined(COVERAGE_ENABLED) #if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_SSE2 #define HAS_SPLITUV_SSE2
...@@ -169,28 +174,7 @@ static void I420CopyPlane(const uint8* src_y, int src_stride_y, ...@@ -169,28 +174,7 @@ static void I420CopyPlane(const uint8* src_y, int src_stride_y,
} }
} }
static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, // Copy I420 with optional flipping
uint8* dst, int dst_stride,
int width, int height) {
// Copy plane
for (int y = 0; y < height; y += 2) {
memcpy(dst, src, width);
src += src_stride_0;
dst += dst_stride;
memcpy(dst, src, width);
src += src_stride_1;
dst += dst_stride;
}
}
// TODO(fbarchard): For biplanar formats (ie NV21), the Y plane is the same
// as I420, and only the chroma plane varies. Copy the Y plane by reference,
// and just convert the UV. This method can be used for NV21, NV12, I420,
// I422, M422. 8 of the 12 bits is Y, so this would copy 3 times less data,
// which is approximately how much faster it would be.
// Helper function to copy yuv data without scaling. Used
// by our jpeg conversion callbacks to incrementally fill a yuv image.
int I420Copy(const uint8* src_y, int src_stride_y, int I420Copy(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v, const uint8* src_v, int src_stride_v,
...@@ -198,6 +182,12 @@ int I420Copy(const uint8* src_y, int src_stride_y, ...@@ -198,6 +182,12 @@ int I420Copy(const uint8* src_y, int src_stride_y,
uint8* dst_u, int dst_stride_u, uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v, uint8* dst_v, int dst_stride_v,
int width, int height) { int width, int height) {
if (!src_y || !src_u || !src_v ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image. // Negative height means invert the image.
if (height < 0) { if (height < 0) {
height = -height; height = -height;
...@@ -218,6 +208,137 @@ int I420Copy(const uint8* src_y, int src_stride_y, ...@@ -218,6 +208,137 @@ int I420Copy(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// SetRows32 writes 'count' bytes using a 32 bit value repeated
#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
#define HAS_SETROW_NEON
// Fills memory starting at dst with the 32-bit pattern v32, 16 bytes per
// iteration. The store happens before count is tested, so at least 16 bytes
// are always written and the total is count rounded up to a multiple of 16.
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
  __asm__ volatile
  (
    // vdup takes a plain register operand (no register-list braces).
    "vdup.u32   q0, %2            \n"  // duplicate v32 into 4 lanes
  "1:\n"
    "vst1.u32   {q0}, [%0]!       \n"  // store 16 bytes, advance dst
    "subs       %1, %1, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
  : "+r"(dst),    // %0
    "+r"(count)   // %1
  : "r"(v32)      // %2
  : "q0", "memory", "cc"  // subs updates the condition flags
  );
}
#elif defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SETROW_SSE2
// Fills memory starting at dst with the 32-bit pattern v32, 16 bytes per
// iteration. Naked function: there is no compiler-generated prologue, so the
// arguments are read directly from the stack relative to esp.
// The store happens before count is tested, so at least 16 bytes are always
// written. movdqa is the aligned 16-byte store form.
__declspec(naked)
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
__asm {
mov eax, [esp + 4] // dst
movd xmm7, [esp + 8] // v32
mov ecx, [esp + 12] // count
// Broadcast the low 32 bits of xmm7 to all four lanes.
pshufd xmm7, xmm7, 0
wloop:
// Store 16 bytes of the pattern, then advance dst.
movdqa [eax], xmm7
lea eax, [eax + 16]
// 16 bytes processed per iteration; loop while the subtraction neither
// borrows nor reaches zero.
sub ecx, 16
ja wloop
ret
}
}
#elif (defined(__x86_64__) || defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_SETROW_SSE2
// Fills memory starting at dst with the 32-bit pattern v32, 16 bytes per
// iteration, using aligned 16-byte stores (movdqa). The store happens before
// count is tested, so at least 16 bytes are always written.
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
  asm volatile(
    "movd       %2, %%xmm7\n"
    "pshufd     $0x0,%%xmm7,%%xmm7\n"  // broadcast v32 to all four lanes
  "1:"
    "movdqa     %%xmm7,(%0)\n"         // store 16 bytes
    "lea        0x10(%0),%0\n"
    "sub        $0x10,%1\n"            // 16 processed per loop
    "ja         1b\n"
  : "+r"(dst),    // %0
    "+r"(count)   // %1
  : "r"(v32)      // %2
  // xmm7 and the flags are modified by the code above and must be declared,
  // otherwise the compiler may keep a live value in xmm7 across the asm.
  // The register name is only known to the compiler when SSE2 is enabled.
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm7"
#endif
  );
}
#endif
// Portable row fill: sets count bytes at dst to the low byte of v8 (memset
// converts its value argument to unsigned char).
static void SetRow8_C(uint8* dst, uint32 v8, int count) {
  const int fill = static_cast<int>(v8);
  const size_t bytes = static_cast<size_t>(count);
  memset(dst, fill, bytes);
}
// Fills a width x height region of one plane with a constant byte value.
// dst_y points at the region's first byte and dst_stride_y is the distance
// in bytes between rows. A SIMD row filler is chosen when the CPU supports
// it and the width, pointer and stride are all multiples of 16; otherwise
// the memset-based filler is used.
static void I420SetPlane(uint8* dst_y, int dst_stride_y,
                         int width, int height,
                         int value) {
  void (*SetRow)(uint8* dst, uint32 value, int pix);
#if defined(HAS_SETROW_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    SetRow = SetRow32_NEON;
  } else
#elif defined(HAS_SETROW_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    SetRow = SetRow32_SSE2;
  } else
#endif
  {
    SetRow = SetRow8_C;
  }

  // Replicate the byte into all four bytes of a 32-bit word. The shifts are
  // done on an unsigned value: shifting a signed int holding 128..255 left by
  // 24 overflows, which is undefined behaviour.
  const uint32 v = static_cast<uint32>(value);
  const uint32 v32 = v | (v << 8) | (v << 16) | (v << 24);
  // Set plane
  for (int y = 0; y < height; ++y) {
    SetRow(dst_y, v32, width);
    dst_y += dst_stride_y;
  }
}
// Draw a rectangle into I420.
// Fills a width x height region of the Y plane at (x, y) with value_y and
// the corresponding half-resolution regions of the U and V planes, at
// (x / 2, y / 2), with value_u and value_v.
// Returns -1 when a plane pointer is NULL, width <= 0, height == 0, x or y is
// negative, or a value is outside 0..255; returns 0 otherwise.
int I420Rect(uint8* dst_y, int dst_stride_y,
             uint8* dst_u, int dst_stride_u,
             uint8* dst_v, int dst_stride_v,
             int x, int y,
             int width, int height,
             int value_y, int value_u, int value_v) {
  const bool missing_plane = !dst_y || !dst_u || !dst_v;
  const bool bad_geometry = width <= 0 || height == 0 || x < 0 || y < 0;
  const bool bad_value = value_y < 0 || value_y > 255 ||
                         value_u < 0 || value_u > 255 ||
                         value_v < 0 || value_v > 255;
  if (missing_plane || bad_geometry || bad_value) {
    return -1;
  }

  // Negative height means invert the image: start from the last row of each
  // plane and walk upwards.
  if (height < 0) {
    height = -height;
    const int flipped_chroma_rows = (height + 1) >> 1;
    dst_y += (height - 1) * dst_stride_y;
    dst_u += (flipped_chroma_rows - 1) * dst_stride_u;
    dst_v += (flipped_chroma_rows - 1) * dst_stride_v;
    dst_stride_y = -dst_stride_y;
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }

  // Chroma planes are half size in both directions, rounded up.
  const int chroma_width = (width + 1) >> 1;
  const int chroma_height = (height + 1) >> 1;

  uint8* const first_y = dst_y + y * dst_stride_y + x;
  uint8* const first_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
  uint8* const first_v = dst_v + (y / 2) * dst_stride_v + (x / 2);

  I420SetPlane(first_y, dst_stride_y, width, height, value_y);
  I420SetPlane(first_u, dst_stride_u, chroma_width, chroma_height, value_u);
  I420SetPlane(first_v, dst_stride_v, chroma_width, chroma_height, value_v);
  return 0;
}
// Helper function to copy yuv data without scaling. Used // Helper function to copy yuv data without scaling. Used
// by our jpeg conversion callbacks to incrementally fill a yuv image. // by our jpeg conversion callbacks to incrementally fill a yuv image.
int I422ToI420(const uint8* src_y, int src_stride_y, int I422ToI420(const uint8* src_y, int src_stride_y,
...@@ -271,6 +392,20 @@ int I422ToI420(const uint8* src_y, int src_stride_y, ...@@ -271,6 +392,20 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Copies a plane whose source rows are laid out in pairs with two alternating
// strides: src_stride_0 is the step from the first to the second row of a
// pair, and src_stride_1 is the step from the second row to the next pair.
// The destination uses a single stride.
//   width  - bytes copied per row.
//   height - number of rows to copy. An odd height copies the final row on
//            its own, so no row past |height| is read or written.
static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
                           uint8* dst, int dst_stride,
                           int width, int height) {
  int y = 0;
  // Copy full row pairs.
  for (; y + 1 < height; y += 2) {
    memcpy(dst, src, width);
    src += src_stride_0;
    dst += dst_stride;
    memcpy(dst, src, width);
    src += src_stride_1;
    dst += dst_stride;
  }
  // Copy the unpaired last row when height is odd.
  if (y < height) {
    memcpy(dst, src, width);
  }
}
// Support converting from FOURCC_M420 // Support converting from FOURCC_M420
// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for // Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
// easy conversion to I420. // easy conversion to I420.
...@@ -1238,8 +1373,7 @@ __asm { ...@@ -1238,8 +1373,7 @@ __asm {
#define HAS_ARGBTOI400ROW_SSSE3 #define HAS_ARGBTOI400ROW_SSSE3
__declspec(naked) __declspec(naked)
static void ARGBToI400Row_SSSE3(const uint8* src_argb, uint8* dst_y, static void ARGBToI400Row_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
int pix) {
__asm { __asm {
mov eax, [esp + 4] // src_argb mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_y mov edx, [esp + 8] // dst_y
......
...@@ -154,6 +154,133 @@ __asm { ...@@ -154,6 +154,133 @@ __asm {
} }
} }
#define HAS_TRANSPOSE_UVWX8_SSE2
// Transposes an 8-row strip of interleaved two-channel data and splits the
// channels into two planes (32-bit MSVC inline-assembly variant).
//   src, src_stride        - source rows; each loop iteration loads 16 bytes
//                            from each of 8 rows with movdqa, so every row
//                            address must be 16-byte aligned.
//   dst_a, dst_stride_a    - receives the low 8 bytes of each transposed
//                            vector (movlpd).
//   dst_b, dst_stride_b    - receives the high 8 bytes of each transposed
//                            vector (movhpd).
//   w                      - loop count; decremented by 8 per iteration.
// Each iteration writes 8 rows of 8 bytes to both destinations.
// The function is naked: it saves/restores ebx, esi, edi and ebp itself and
// realigns the stack to get a 16-byte aligned spill slot, because the
// transpose needs more than the eight xmm registers available here.
__declspec(naked)
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
__asm {
push ebx
push esi
push edi
push ebp
// Four pushes moved esp by 16, hence the "+ 16" in the argument offsets.
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov edx, [esp + 16 + 12] // dst_a
mov esi, [esp + 16 + 16] // dst_stride_a
mov ebx, [esp + 16 + 20] // dst_b
mov ebp, [esp + 16 + 24] // dst_stride_b
// Reserve a 16-byte aligned scratch slot at [esp] and keep the original esp
// at [esp + 16] so it can be restored before the pops.
mov ecx, esp
sub esp, 4 + 16
and esp, ~15
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
convertloop :
// Read in the data from the source pointer.
// First round of bit swap.
movdqa xmm0, [eax]
movdqa xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
movdqa xmm2, [eax]
movdqa xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
movdqa xmm4, [eax]
movdqa xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
movdqa xmm6, [eax]
movdqa xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
punpckhbw xmm5, xmm7
movdqa xmm7, xmm5
// edi is negated here, so this rewinds the 8 rows just read and advances
// 16 bytes to the next column block.
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
movdqa xmm2, xmm5
movdqa xmm5, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm5, xmm3
movdqa xmm3, xmm5
movdqa xmm5, xmm4
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
movdqa xmm5, [esp] // restore xmm5
movdqa [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
// Each finished vector is split: low qword to dst_a, high qword to dst_b.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
movdqa xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm4
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm2 // use xmm0 as the temp register.
punpckldq xmm2, xmm6
movlpd qword ptr [edx], xmm2
movhpd qword ptr [ebx], xmm2
punpckhdq xmm0, xmm6
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm1 // use xmm0 as the temp register.
punpckldq xmm1, xmm5
movlpd qword ptr [edx], xmm1
movhpd qword ptr [ebx], xmm1
punpckhdq xmm0, xmm5
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm3 // use xmm0 as the temp register.
punpckldq xmm3, xmm7
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
punpckhdq xmm0, xmm7
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
sub ecx, 8
ja convertloop
// Undo the stack realignment, then restore the saved registers.
mov esp, [esp + 16]
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
#elif (defined(__i386__) || defined(__x86_64__)) && \ #elif (defined(__i386__) || defined(__x86_64__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_TRANSPOSE_WX8_SSSE3 #define HAS_TRANSPOSE_WX8_SSSE3
...@@ -240,15 +367,134 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ...@@ -240,15 +367,134 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
); );
} }
// TODO(fbarchard): Port to 32 bit
#if defined (__x86_64__)
#define HAS_TRANSPOSE_UVWX8_SSE2
// Transposes an 8-row strip of interleaved two-channel data and splits the
// channels into two planes (x86_64 GCC-style inline-assembly variant).
//   src, src_stride        - source rows; each loop iteration loads 16 bytes
//                            from each of 8 rows with movdqa, so every row
//                            address must be 16-byte aligned.
//   dst_a, dst_stride_a    - receives the low 8 bytes of each transposed
//                            vector (movlpd).
//   dst_b, dst_stride_b    - receives the high 8 bytes of each transposed
//                            vector (movhpd).
//   w                      - loop count; decremented by 8 per iteration.
// Each iteration writes 8 rows of 8 bytes to both destinations.
// xmm8 and xmm9 serve as temporaries, so no stack spill is needed.
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  asm volatile(
"1:"
  // Read in the data from the source pointer.
  // First round of bit swap.
  "movdqa (%0),%%xmm0\n"
  "movdqa (%0,%4),%%xmm1\n"
  "lea (%0,%4,2),%0\n"
  "movdqa %%xmm0,%%xmm8\n"
  "punpcklbw %%xmm1,%%xmm0\n"
  "punpckhbw %%xmm1,%%xmm8\n"
  "movdqa %%xmm8,%%xmm1\n"
  "movdqa (%0),%%xmm2\n"
  "movdqa (%0,%4),%%xmm3\n"
  "lea (%0,%4,2),%0\n"
  "movdqa %%xmm2,%%xmm8\n"
  "punpcklbw %%xmm3,%%xmm2\n"
  "punpckhbw %%xmm3,%%xmm8\n"
  "movdqa %%xmm8,%%xmm3\n"
  "movdqa (%0),%%xmm4\n"
  "movdqa (%0,%4),%%xmm5\n"
  "lea (%0,%4,2),%0\n"
  "movdqa %%xmm4,%%xmm8\n"
  "punpcklbw %%xmm5,%%xmm4\n"
  "punpckhbw %%xmm5,%%xmm8\n"
  "movdqa %%xmm8,%%xmm5\n"
  "movdqa (%0),%%xmm6\n"
  "movdqa (%0,%4),%%xmm7\n"
  "lea (%0,%4,2),%0\n"
  "movdqa %%xmm6,%%xmm8\n"
  "punpcklbw %%xmm7,%%xmm6\n"
  // With the stride negated, this rewinds the 8 rows just read and advances
  // 16 bytes to the next column block.
  "neg %4\n"
  "lea 0x10(%0,%4,8),%0\n"
  "punpckhbw %%xmm7,%%xmm8\n"
  "movdqa %%xmm8,%%xmm7\n"
  "neg %4\n"
  // Second round of bit swap.
  "movdqa %%xmm0,%%xmm8\n"
  "movdqa %%xmm1,%%xmm9\n"
  "punpckhwd %%xmm2,%%xmm8\n"
  "punpckhwd %%xmm3,%%xmm9\n"
  "punpcklwd %%xmm2,%%xmm0\n"
  "punpcklwd %%xmm3,%%xmm1\n"
  "movdqa %%xmm8,%%xmm2\n"
  "movdqa %%xmm9,%%xmm3\n"
  "movdqa %%xmm4,%%xmm8\n"
  "movdqa %%xmm5,%%xmm9\n"
  "punpckhwd %%xmm6,%%xmm8\n"
  "punpckhwd %%xmm7,%%xmm9\n"
  "punpcklwd %%xmm6,%%xmm4\n"
  "punpcklwd %%xmm7,%%xmm5\n"
  "movdqa %%xmm8,%%xmm6\n"
  "movdqa %%xmm9,%%xmm7\n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "movdqa %%xmm0,%%xmm8\n"
  "punpckldq %%xmm4,%%xmm0\n"
  "movlpd %%xmm0,(%1)\n"  // Write back U channel
  "movhpd %%xmm0,(%2)\n"  // Write back V channel
  "punpckhdq %%xmm4,%%xmm8\n"
  "movlpd %%xmm8,(%1,%5)\n"
  "lea (%1,%5,2),%1\n"
  "movhpd %%xmm8,(%2,%6)\n"
  "lea (%2,%6,2),%2\n"
  "movdqa %%xmm2,%%xmm8\n"
  "punpckldq %%xmm6,%%xmm2\n"
  "movlpd %%xmm2,(%1)\n"
  "movhpd %%xmm2,(%2)\n"
  "punpckhdq %%xmm6,%%xmm8\n"
  "movlpd %%xmm8,(%1,%5)\n"
  "lea (%1,%5,2),%1\n"
  "movhpd %%xmm8,(%2,%6)\n"
  "lea (%2,%6,2),%2\n"
  "movdqa %%xmm1,%%xmm8\n"
  "punpckldq %%xmm5,%%xmm1\n"
  "movlpd %%xmm1,(%1)\n"
  "movhpd %%xmm1,(%2)\n"
  "punpckhdq %%xmm5,%%xmm8\n"
  "movlpd %%xmm8,(%1,%5)\n"
  "lea (%1,%5,2),%1\n"
  "movhpd %%xmm8,(%2,%6)\n"
  "lea (%2,%6,2),%2\n"
  "movdqa %%xmm3,%%xmm8\n"
  "punpckldq %%xmm7,%%xmm3\n"
  "movlpd %%xmm3,(%1)\n"
  "movhpd %%xmm3,(%2)\n"
  "punpckhdq %%xmm7,%%xmm8\n"
  "movlpd %%xmm8,(%1,%5)\n"
  "lea (%1,%5,2),%1\n"
  "movhpd %%xmm8,(%2,%6)\n"
  "lea (%2,%6,2),%2\n"
  "sub $0x8,%3\n"
  "ja 1b\n"
  : "+r"(src),    // %0
    "+r"(dst_a),  // %1
    "+r"(dst_b),  // %2
    "+r"(w)       // %3
  : "r"(static_cast<intptr_t>(src_stride)),    // %4
    "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
    "r"(static_cast<intptr_t>(dst_stride_b))   // %6
  // The asm modifies EFLAGS (neg/sub) and overwrites xmm0-xmm9; declare them
  // so the compiler does not keep live values there across this statement.
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
    "xmm5", "xmm6", "xmm7", "xmm8", "xmm9"
  );
}
#endif
#endif #endif
static void TransposeWx8_C(const uint8* src, int src_stride, static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride,
int w) { int w) {
int i, j; int i;
for (i = 0; i < w; ++i) for (i = 0; i < w; ++i) {
for (j = 0; j < 8; ++j) dst[0] = src[0 * src_stride];
dst[i * dst_stride + j] = src[j * src_stride + i]; dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
dst[3] = src[3 * src_stride];
dst[4] = src[4 * src_stride];
dst[5] = src[5 * src_stride];
dst[6] = src[6 * src_stride];
dst[7] = src[7 * src_stride];
++src;
dst += dst_stride;
}
} }
static void TransposeWxH_C(const uint8* src, int src_stride, static void TransposeWxH_C(const uint8* src, int src_stride,
...@@ -328,10 +574,10 @@ void RotatePlane270(const uint8* src, int src_stride, ...@@ -328,10 +574,10 @@ void RotatePlane270(const uint8* src, int src_stride,
static void ReverseLine_C(const uint8* src, uint8* dst, int width) { static void ReverseLine_C(const uint8* src, uint8* dst, int width) {
int i; int i;
src += width; src += width - 1;
for (i = 0; i < width; ++i) { for (i = 0; i < width; ++i) {
--src;
dst[i] = src[0]; dst[i] = src[0];
--src;
} }
} }
...@@ -407,15 +653,13 @@ void RotatePlane180(const uint8* src, int src_stride, ...@@ -407,15 +653,13 @@ void RotatePlane180(const uint8* src, int src_stride,
{ {
ReverseLine = ReverseLine_C; ReverseLine = ReverseLine_C;
} }
// Rotate by 180 is a mirror with the destination // Rotate by 180 is a mirror and vertical flip
// written in reverse. src += src_stride * (height - 1);
dst += dst_stride * (height - 1);
for (i = 0; i < height; ++i) { for (i = 0; i < height; ++i) {
ReverseLine(src, dst, width); ReverseLine(src, dst, width);
src -= src_stride;
src += src_stride; dst += dst_stride;
dst -= dst_stride;
} }
} }
...@@ -423,11 +667,27 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride, ...@@ -423,11 +667,27 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int w) { int w) {
int i, j; int i;
for (i = 0; i < w * 2; i += 2) for (i = 0; i < w; ++i) {
for (j = 0; j < 8; ++j) { dst_a[0] = src[0 * src_stride + 0];
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; dst_b[0] = src[0 * src_stride + 1];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; dst_a[1] = src[1 * src_stride + 0];
dst_b[1] = src[1 * src_stride + 1];
dst_a[2] = src[2 * src_stride + 0];
dst_b[2] = src[2 * src_stride + 1];
dst_a[3] = src[3 * src_stride + 0];
dst_b[3] = src[3 * src_stride + 1];
dst_a[4] = src[4 * src_stride + 0];
dst_b[4] = src[4 * src_stride + 1];
dst_a[5] = src[5 * src_stride + 0];
dst_b[5] = src[5 * src_stride + 1];
dst_a[6] = src[6 * src_stride + 0];
dst_b[6] = src[6 * src_stride + 1];
dst_a[7] = src[7 * src_stride + 0];
dst_b[7] = src[7 * src_stride + 1];
src += 2;
dst_a += dst_stride_a;
dst_b += dst_stride_b;
} }
} }
...@@ -436,7 +696,7 @@ static void TransposeUVWxH_C(const uint8* src, int src_stride, ...@@ -436,7 +696,7 @@ static void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int w, int h) { int w, int h) {
int i, j; int i, j;
for (i = 0; i < w*2; i += 2) for (i = 0; i < w * 2; i += 2)
for (j = 0; j < h; ++j) { for (j = 0; j < h; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
...@@ -452,12 +712,8 @@ void TransposeUV(const uint8* src, int src_stride, ...@@ -452,12 +712,8 @@ void TransposeUV(const uint8* src, int src_stride,
rotate_uv_wxh_func TransposeWxH; rotate_uv_wxh_func TransposeWxH;
#if defined(HAS_TRANSPOSE_UVWX8_NEON) #if defined(HAS_TRANSPOSE_UVWX8_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
(width % 8 == 0) &&
IS_ALIGNED(src, 16) && (src_stride % 8 == 0) &&
IS_ALIGNED(dst_a, 16) && (dst_stride_a % 8 == 0) &&
IS_ALIGNED(dst_b, 16) && (dst_stride_b % 8 == 0)) {
unsigned long long store_reg[8]; unsigned long long store_reg[8];
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
SaveRegisters_NEON(store_reg); SaveRegisters_NEON(store_reg);
TransposeWx8 = TransposeUVWx8_NEON; TransposeWx8 = TransposeUVWx8_NEON;
TransposeWxH = TransposeUVWxH_C; TransposeWxH = TransposeUVWxH_C;
...@@ -466,9 +722,9 @@ void TransposeUV(const uint8* src, int src_stride, ...@@ -466,9 +722,9 @@ void TransposeUV(const uint8* src, int src_stride,
#if defined(HAS_TRANSPOSE_UVWX8_SSE2) #if defined(HAS_TRANSPOSE_UVWX8_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
(width % 8 == 0) && (width % 8 == 0) &&
IS_ALIGNED(src, 16) && (src_stride % 8 == 0) && IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
IS_ALIGNED(dst_a, 16) && (dst_stride_a % 8 == 0) && IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
IS_ALIGNED(dst_b, 16) && (dst_stride_b % 8 == 0)) { IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
TransposeWx8 = TransposeUVWx8_SSE2; TransposeWx8 = TransposeUVWx8_SSE2;
TransposeWxH = TransposeUVWxH_C; TransposeWxH = TransposeUVWxH_C;
} else } else
...@@ -544,7 +800,7 @@ __asm { ...@@ -544,7 +800,7 @@ __asm {
mov edi, [esp + 4 + 12] // dst_b mov edi, [esp + 4 + 12] // dst_b
mov ecx, [esp + 4 + 16] // width mov ecx, [esp + 4 + 16] // width
movdqa xmm7, _kShuffleReverseUV movdqa xmm7, _kShuffleReverseUV
lea eax, [eax + 2 * ecx - 16] lea eax, [eax + ecx * 2 - 16]
convertloop : convertloop :
movdqa xmm0, [eax] movdqa xmm0, [eax]
...@@ -610,13 +866,12 @@ void RotateUV180(const uint8* src, int src_stride, ...@@ -610,13 +866,12 @@ void RotateUV180(const uint8* src, int src_stride,
int i; int i;
reverse_uv_func ReverseLine; reverse_uv_func ReverseLine;
// TODO(frkoenig) : do processor detection here.
#if defined(HAS_REVERSE_LINE_UV_NEON) #if defined(HAS_REVERSE_LINE_UV_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
IS_ALIGNED(dst_a, 16) && (dst_stride_a % 8 == 0) && IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
IS_ALIGNED(dst_b, 16) && (dst_stride_b % 8 == 0) ) { IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) {
ReverseLine = ReverseLineUV_NEON; ReverseLine = ReverseLineUV_NEON;
} else } else
#endif #endif
...@@ -624,8 +879,8 @@ void RotateUV180(const uint8* src, int src_stride, ...@@ -624,8 +879,8 @@ void RotateUV180(const uint8* src, int src_stride,
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
IS_ALIGNED(dst_a, 16) && (dst_stride_a % 8 == 0) && IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
IS_ALIGNED(dst_b, 16) && (dst_stride_b % 8 == 0) ) { IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) {
ReverseLine = ReverseLineUV_SSSE3; ReverseLine = ReverseLineUV_SSSE3;
} else } else
#endif #endif
...@@ -669,7 +924,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y, ...@@ -669,7 +924,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
} }
switch (mode) { switch (mode) {
case kRotateNone: case kRotate0:
// copy frame // copy frame
return I420Copy(src_y, src_stride_y, return I420Copy(src_y, src_stride_y,
src_u, src_stride_u, src_u, src_stride_u,
...@@ -678,7 +933,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y, ...@@ -678,7 +933,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
dst_u, dst_stride_u, dst_u, dst_stride_u,
dst_v, dst_stride_v, dst_v, dst_stride_v,
width, height); width, height);
case kRotateClockwise: case kRotate90:
RotatePlane90(src_y, src_stride_y, RotatePlane90(src_y, src_stride_y,
dst_y, dst_stride_y, dst_y, dst_stride_y,
width, height); width, height);
...@@ -689,7 +944,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y, ...@@ -689,7 +944,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
dst_v, dst_stride_v, dst_v, dst_stride_v,
halfwidth, halfheight); halfwidth, halfheight);
return 0; return 0;
case kRotateCounterClockwise: case kRotate270:
RotatePlane270(src_y, src_stride_y, RotatePlane270(src_y, src_stride_y,
dst_y, dst_stride_y, dst_y, dst_stride_y,
width, height); width, height);
...@@ -738,14 +993,14 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, ...@@ -738,14 +993,14 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
} }
switch (mode) { switch (mode) {
case kRotateNone: case kRotate0:
// copy frame // copy frame
return NV12ToI420(src_y, src_uv, src_stride_y, return NV12ToI420(src_y, src_uv, src_stride_y,
dst_y, dst_stride_y, dst_y, dst_stride_y,
dst_u, dst_stride_u, dst_u, dst_stride_u,
dst_v, dst_stride_v, dst_v, dst_stride_v,
width, height); width, height);
case kRotateClockwise: case kRotate90:
RotatePlane90(src_y, src_stride_y, RotatePlane90(src_y, src_stride_y,
dst_y, dst_stride_y, dst_y, dst_stride_y,
width, height); width, height);
...@@ -754,7 +1009,7 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, ...@@ -754,7 +1009,7 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
dst_v, dst_stride_v, dst_v, dst_stride_v,
halfwidth, halfheight); halfwidth, halfheight);
return 0; return 0;
case kRotateCounterClockwise: case kRotate270:
RotatePlane270(src_y, src_stride_y, RotatePlane270(src_y, src_stride_y,
dst_y, dst_stride_y, dst_y, dst_stride_y,
width, height); width, height);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment