Commit 9394ed99 authored by fbarchard@google.com's avatar fbarchard@google.com

ARGB To I420 and variations using row functions

BUG=none
TEST=media_unittests from talk used to benchmark
Review URL: http://webrtc-codereview.appspot.com/254001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@51 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 7472021e
......@@ -20,6 +20,9 @@ static const int kCpuHasSSSE3 = 2;
// These flags are only valid on ARM processors
static const int kCpuHasNEON = 4;
// Internal flag to indicate cpuid is initialized.
static const int kCpuInitialized = 8;
// Detect CPU has SSE2 etc.
bool TestCpuFlag(int flag);
......
......@@ -636,147 +636,110 @@ int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
return 0;
}
// ARGBToI420Row_C etc row functions use the following macro, generating
// code with RGB offsets/strides different for each version. Less error
// prone than duplicating the code.
// template could be used, but macro method works for C and asm and this is
// performance critical code.
// MAKEROWRGBTOI420(NAME, R, G, B, BPP) expands to a static row function
// NAME that converts two adjacent RGB rows into I420: one Y byte per source
// pixel (written to dst_yplane0/dst_yplane1) and one U and one V byte per
// 2x2 pixel block (written to dst_u/dst_v, averaging 4 source pixels).
// R/G/B are byte offsets of the red/green/blue channels within one pixel;
// BPP is bytes per pixel (4 for ARGB/BGRA/ABGR, 3 for RGB24/RAW).
// The fixed-point weights (Y: 66/129/25 with +16 bias; U: -38/-74/112 and
// V: 112/-94/-18, both with +128 bias) match the usual BT.601 studio-swing
// constants -- NOTE(review): confirm these stay bit-identical to the SSSE3
// row functions so C and SIMD paths agree.
// The trailing `if (width & 1)` clause handles an odd final column: only the
// two vertically adjacent pixels are averaged, hence +256 >> 9 rounding
// instead of +512 >> 10 (2 samples summed instead of 4).
#define MAKEROWRGBTOI420(NAME,R,G,B,BPP) \
static void \
NAME(const uint8* src_row0, const uint8* src_row1, \
uint8* dst_yplane0, uint8* dst_yplane1, \
uint8* dst_u, \
uint8* dst_v, \
int width) { \
for (int x = 0; x < width - 1; x += 2) { \
dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \
src_row0[G] * 129 + \
src_row0[B] * 25 + 128) >> 8) + 16; \
dst_yplane0[1] = (uint8)((src_row0[R + BPP] * 66 + \
src_row0[G + BPP] * 129 + \
src_row0[B + BPP] * 25 + 128) >> 8) + 16; \
dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \
src_row1[G] * 129 + \
src_row1[B] * 25 + 128) >> 8) + 16; \
dst_yplane1[1] = (uint8)((src_row1[R + BPP] * 66 + \
src_row1[G + BPP] * 129 + \
src_row1[B + BPP] * 25 + 128) >> 8) + 16; \
dst_u[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
src_row1[R] + src_row1[R + BPP]) * -38 + \
(src_row0[G] + src_row0[G + BPP] + \
src_row1[G] + src_row1[G + BPP]) * -74 + \
(src_row0[B] + src_row0[B + BPP] + \
src_row1[B] + src_row1[B + BPP]) * 112 + \
+ 512) >> 10) + 128; \
dst_v[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
src_row1[R] + src_row1[R + BPP]) * 112 + \
(src_row0[G] + src_row0[G + BPP] + \
src_row1[G] + src_row1[G + BPP]) * -94 + \
(src_row0[B] + src_row0[B + BPP] + \
src_row1[B] + src_row1[B + BPP]) * -18 + \
+ 512) >> 10) + 128; \
dst_yplane0 += 2; \
dst_yplane1 += 2; \
++dst_u; \
++dst_v; \
src_row0 += BPP * 2; \
src_row1 += BPP * 2; \
} \
if (width & 1) { \
dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \
src_row0[G] * 129 + \
src_row0[B] * 25 + 128) >> 8) + 16; \
dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \
src_row1[G] * 129 + \
src_row1[B] * 25 + 128) >> 8) + 16; \
dst_u[0] = (uint8)(((src_row0[R] + \
src_row1[R]) * -38 + \
(src_row0[G] + \
src_row1[G]) * -74 + \
(src_row0[B] + \
src_row1[B]) * 112 + \
+ 256) >> 9) + 128; \
dst_v[0] = (uint8)(((src_row0[R] + \
src_row1[R]) * 112 + \
(src_row0[G] + \
src_row1[G]) * -94 + \
(src_row0[B] + \
src_row1[B]) * -18 + \
+ 256) >> 9) + 128; \
} \
}
// Generate variations of RGBToI420. Parameters are r,g,b offsets within a
// pixel, and number of bytes per pixel.
MAKEROWRGBTOI420(ARGBToI420Row_C, 2, 1, 0, 4)
MAKEROWRGBTOI420(BGRAToI420Row_C, 1, 2, 3, 4)
MAKEROWRGBTOI420(ABGRToI420Row_C, 0, 1, 2, 4)
MAKEROWRGBTOI420(RGB24ToI420Row_C, 2, 1, 0, 3)
MAKEROWRGBTOI420(RAWToI420Row_C, 0, 1, 2, 3)
static int RGBToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height,
void (*RGBToI420Row)(const uint8* src_row0,
const uint8* src_row1,
uint8* dst_yplane0,
uint8* dst_yplane1,
uint8* dst_u,
uint8* dst_v,
int width)) {
if (src_frame == NULL || dst_y == NULL ||
dst_v == NULL || dst_v == NULL)
return -1;
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
if (height < 0) {
height = -height;
src_frame = src_frame + src_stride_frame * (height -1);
src_frame = src_frame + (height - 1) * src_stride_frame;
src_stride_frame = -src_stride_frame;
}
for (int y = 0; y < height - 1; y += 2) {
RGBToI420Row(src_frame, src_frame + src_stride_frame,
dst_y, dst_y + dst_stride_y,
dst_u, dst_v,
width);
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ARGBTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
ARGBToYRow = ARGBToYRow_SSSE3;
} else
#endif
{
ARGBToYRow = ARGBToYRow_C;
}
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
} else
#endif
{
ARGBToUVRow = ARGBToUVRow_C;
}
for (int y = 0; y < (height - 1); y += 2) {
ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
src_frame += src_stride_frame * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
RGBToI420Row(src_frame, src_frame,
dst_y, dst_y,
dst_u, dst_v,
width);
ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
}
return 0;
}
int ARGBToI420_Reference(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
return RGBToI420(src_frame, src_stride_frame,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
width, height, ARGBToI420Row_C);
}
int BGRAToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
return RGBToI420(src_frame, src_stride_frame,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
width, height, BGRAToI420Row_C);
if (height < 0) {
height = -height;
src_frame = src_frame + (height - 1) * src_stride_frame;
src_stride_frame = -src_stride_frame;
}
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_BGRATOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
ARGBToYRow = BGRAToYRow_SSSE3;
} else
#endif
{
ARGBToYRow = BGRAToYRow_C;
}
#if defined(HAS_BGRATOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
ARGBToUVRow = BGRAToUVRow_SSSE3;
} else
#endif
{
ARGBToUVRow = BGRAToUVRow_C;
}
for (int y = 0; y < (height - 1); y += 2) {
ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
src_frame += src_stride_frame * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
}
return 0;
}
int ABGRToI420(const uint8* src_frame, int src_stride_frame,
......@@ -784,11 +747,52 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
return RGBToI420(src_frame, src_stride_frame,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
width, height, ABGRToI420Row_C);
if (height < 0) {
height = -height;
src_frame = src_frame + (height - 1) * src_stride_frame;
src_stride_frame = -src_stride_frame;
}
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ABGRTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
ARGBToYRow = ABGRToYRow_SSSE3;
} else
#endif
{
ARGBToYRow = ABGRToYRow_C;
}
#if defined(HAS_ABGRTOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
ARGBToUVRow = ABGRToUVRow_SSSE3;
} else
#endif
{
ARGBToUVRow = ABGRToUVRow_C;
}
for (int y = 0; y < (height - 1); y += 2) {
ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
src_frame += src_stride_frame * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
}
return 0;
}
int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
......@@ -796,30 +800,59 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
return RGBToI420(src_frame, src_stride_frame,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
width, height, RGB24ToI420Row_C);
}
if (height < 0) {
height = -height;
src_frame = src_frame + (height - 1) * src_stride_frame;
src_stride_frame = -src_stride_frame;
}
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RGB24TOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
ARGBToYRow = RGB24ToYRow_SSSE3;
} else
#endif
{
ARGBToYRow = RGB24ToYRow_C;
}
#if defined(HAS_RGB24TOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
ARGBToUVRow = RGB24ToUVRow_SSSE3;
} else
#endif
{
ARGBToUVRow = RGB24ToUVRow_C;
}
int RAWToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
return RGBToI420(src_frame, src_stride_frame,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
width, height, RAWToI420Row_C);
for (int y = 0; y < (height - 1); y += 2) {
ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
src_frame += src_stride_frame * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
}
return 0;
}
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
int RAWToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
if (height < 0) {
height = -height;
src_frame = src_frame + (height - 1) * src_stride_frame;
......@@ -828,42 +861,42 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ARGBTOYROW_SSSE3)
#if defined(HAS_RAWTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 8 == 0) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) {
ARGBToYRow = ARGBToYRow_SSSE3;
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
ARGBToYRow = RAWToYRow_SSSE3;
} else
#endif
{
ARGBToYRow = ARGBToYRow_C;
ARGBToYRow = RAWToYRow_C;
}
#if defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_RAWTOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 8 == 0) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 4) && (dst_stride_u % 4 == 0) &&
IS_ALIGNED(dst_v, 4) && (dst_stride_v % 4 == 0)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
ARGBToUVRow = RAWToUVRow_SSSE3;
} else
#endif
{
ARGBToUVRow = ARGBToUVRow_C;
ARGBToUVRow = RAWToUVRow_C;
}
for (int y = 0; y < (height - 1); y += 2) {
ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
src_frame += src_stride_frame * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
ARGBToYRow(src_frame, dst_y, width);
ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
}
return 0;
}
......
......@@ -15,9 +15,6 @@
#include <intrin.h>
#endif
// Internal flag to indicate cpuid is initialized.
static const int kCpuInitialized = 16;
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static inline void __cpuid(int cpu_info[4], int info_type) {
......@@ -64,11 +61,11 @@ static void InitCpuFlags() {
void MaskCpuFlags(int enable_flags) {
InitCpuFlags();
cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
cpu_info_ &= enable_flags;
}
bool TestCpuFlag(int flag) {
if (!cpu_info_) {
if (0 == cpu_info_) {
InitCpuFlags();
}
return cpu_info_ & flag ? true : false;
......
......@@ -14,6 +14,8 @@
#include "video_common.h"
#include "row.h"
#define kMaxStride (2048 * 4)
namespace libyuv {
// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
......@@ -329,6 +331,9 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
if (width * 4 > kMaxStride) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
......@@ -347,23 +352,29 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#define kMaxStride (2048 * 4)
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
#if defined(HAS_ARGBTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 8 == 0) &&
(width % 16 == 0) &&
IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) {
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
ARGBToYRow = ARGBToYRow_SSSE3;
} else
#endif
{
ARGBToYRow = ARGBToYRow_C;
}
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
#else
ARGBToUVRow = ARGBToUVRow_C;
#endif
} else
#endif
{
ARGBToYRow = ARGBToYRow_C;
ARGBToUVRow = ARGBToUVRow_C;
}
......@@ -392,9 +403,9 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
BayerRow0(src_bayer, src_stride_bayer, row, width);
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
row + kMaxStride, width);
ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
ARGBToYRow(row, dst_y, width);
ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
src_bayer += src_stride_bayer * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
......@@ -403,8 +414,8 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
// TODO(fbarchard): Make sure this filters properly
if (height & 1) {
BayerRow0(src_bayer, src_stride_bayer, row, width);
ARGBToYRow(row, dst_y, width);
ARGBToUVRow(row, 0, dst_u, dst_v, width);
ARGBToYRow(row, dst_y, width);
}
return 0;
}
......
......@@ -58,16 +58,6 @@ extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
// Shuffle table for converting BG24 to ARGB.
// pshufb mask: expands four 3-byte BG24 pixels (12 bytes) into four 4-byte
// ARGB pixels. Entries 12/13/14/15 land in the alpha positions, which the
// asm then overwrites by ORing with the 0xff000000 mask (opaque alpha).
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
// Same layout as above but with the 2/1/0 ordering per pixel, i.e. the
// R and B channels swapped relative to BG24.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_SSE2
__declspec(naked)
......@@ -206,7 +196,7 @@ int I420Copy(const uint8* src_y, int src_stride_y,
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
__asm__ volatile
(
"vdup.u32 {q0}, %2 \n" // duplicate 4 ints
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1:\n"
"vst1.u32 {q0}, [%0]! \n" // store
"subs %1, %1, #16 \n" // 16 processed per loop
......@@ -1282,85 +1272,6 @@ __asm {
}
}
#define HAS_BG24TOARGBROW_SSSE3
// MSVC naked-asm row converter: 16 BG24 pixels (48 bytes) in, 16 ARGB
// pixels (64 bytes) out per loop iteration. palignr realigns the packed
// 3-byte pixels across the three input vectors, pshufb with
// _kShuffleMaskBG24ToARGB expands them to 4 bytes, and por with the
// 0xff000000 mask in xmm7 sets opaque alpha.
// NOTE(review): movdqa loads/stores require 16-byte aligned src/dst and
// pix to be a multiple of 16 -- callers gate this with IS_ALIGNED checks.
__declspec(naked)
static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_bg24
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskBG24ToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
#define HAS_RAWTOARGBROW_SSSE3
// MSVC naked-asm row converter: identical structure to BG24ToARGBRow_SSSE3
// but uses _kShuffleMaskRAWToARGB, which swaps the R and B channel order
// for RAW (R,G,B in memory) input. 16 pixels per iteration; opaque alpha
// ORed in from the 0xff000000 mask in xmm7.
// NOTE(review): movdqa requires 16-byte aligned src/dst and pix a multiple
// of 16 -- callers gate this with IS_ALIGNED checks.
__declspec(naked)
static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_raw
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskRAWToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
#elif (defined(__x86_64__) || defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
......@@ -1435,84 +1346,6 @@ static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
);
}
#define HAS_BG24TOARGBROW_SSSE3
// GCC inline-asm version of the BG24 -> ARGB row converter: 16 pixels
// (48 input bytes, 64 output bytes) per loop iteration. pshufb with
// kShuffleMaskBG24ToARGB expands the packed 3-byte pixels; por with the
// 0xff000000 mask in xmm7 sets opaque alpha.
// NOTE(review): movdqa implies 16-byte aligned src/dst and pix a multiple
// of 16 -- callers gate this with IS_ALIGNED checks.
static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
"pslld $0x18,%%xmm7\n"
"movdqa (%3),%%xmm6\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm3\n"
"lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm6,%%xmm2\n"
"por %%xmm7,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm6,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n"
"por %%xmm7,%%xmm0\n"
"pshufb %%xmm6,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"por %%xmm7,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm6,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n"
"por %%xmm7,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
: "+r"(src_bg24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(kShuffleMaskBG24ToARGB) // %3
: "memory"
);
}
#define HAS_RAWTOARGBROW_SSSE3
// GCC inline-asm version of the RAW -> ARGB row converter. Identical
// instruction sequence to BG24ToARGBRow_SSSE3 above, differing only in the
// shuffle table (kShuffleMaskRAWToARGB swaps the R and B channel order).
// NOTE(review): movdqa implies 16-byte aligned src/dst and pix a multiple
// of 16 -- callers gate this with IS_ALIGNED checks.
static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
"pslld $0x18,%%xmm7\n"
"movdqa (%3),%%xmm6\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm3\n"
"lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm6,%%xmm2\n"
"por %%xmm7,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm6,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n"
"por %%xmm7,%%xmm0\n"
"pshufb %%xmm6,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"por %%xmm7,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm6,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n"
"por %%xmm7,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(kShuffleMaskRAWToARGB) // %3
: "memory"
);
}
#endif
static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
......@@ -1556,97 +1389,6 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
return 0;
}
static void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
uint8 r = src_raw[0];
uint8 g = src_raw[1];
uint8 b = src_raw[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = 255u;
dst_argb += 4;
src_raw += 3;
}
}
// Convert RAW (3 bytes/pixel) to ARGB (4 bytes/pixel).
// A negative height inverts the image vertically. Always returns 0.
int RAWToARGB(const uint8* src_raw, int src_stride_raw,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
// Negative height: start at the last source row and walk upward by
// negating the stride.
if (height < 0) {
height = -height;
src_raw = src_raw + (height - 1) * src_stride_raw;
src_stride_raw = -src_stride_raw;
}
void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
#if defined(HAS_RAWTOARGBROW_SSSE3)
// Select the SSSE3 row function only when width is a multiple of 16 and
// both buffers and strides are 16-byte aligned (the asm uses movdqa);
// otherwise fall back to the portable C row function.
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
} else
#endif
{
RAWToARGBRow = RAWToARGBRow_C;
}
// Convert one row per iteration.
for (int y = 0; y < height; ++y) {
RAWToARGBRow(src_raw, dst_argb, width);
src_raw += src_stride_raw;
dst_argb += dst_stride_argb;
}
return 0;
}
// Converts one row of BG24 pixels (3 bytes each, B,G,R order in memory)
// to ARGB (4 bytes each, stored B,G,R,A) with fully opaque alpha.
static void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
    uint8 b = src_bg24[0];
    uint8 g = src_bg24[1];
    uint8 r = src_bg24[2];
    dst_argb[0] = b;
    dst_argb[1] = g;
    dst_argb[2] = r;
    // Fix: the alpha byte was assigned twice in a row; write it once.
    dst_argb[3] = 255u;
    dst_argb += 4;
    src_bg24 += 3;
  }
}
// Convert BG24 (3 bytes/pixel, B,G,R order) to ARGB (4 bytes/pixel).
// A negative height inverts the image vertically. Always returns 0.
int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
// Negative height: start at the last source row and walk upward by
// negating the stride.
if (height < 0) {
height = -height;
src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
src_stride_bg24 = -src_stride_bg24;
}
void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(HAS_BG24TOARGBROW_SSSE3)
// Select the SSSE3 row function only when width is a multiple of 16 and
// both buffers and strides are 16-byte aligned (the asm uses movdqa);
// otherwise fall back to the portable C row function.
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
BG24ToARGBRow = BG24ToARGBRow_SSSE3;
} else
#endif
{
BG24ToARGBRow = BG24ToARGBRow_C;
}
// Convert one row per iteration.
for (int y = 0; y < height; ++y) {
BG24ToARGBRow(src_bg24, dst_argb, width);
src_bg24 += src_stride_bg24;
dst_argb += dst_stride_argb;
}
return 0;
}
static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
// To support in-place conversion.
......@@ -1768,5 +1510,66 @@ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
return 0;
}
// Convert RAW (3 bytes/pixel) to ARGB (4 bytes/pixel).
// A negative height inverts the image vertically. Always returns 0.
int RAWToARGB(const uint8* src_raw, int src_stride_raw,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
// Negative height: start at the last source row and walk upward by
// negating the stride.
if (height < 0) {
height = -height;
src_raw = src_raw + (height - 1) * src_stride_raw;
src_stride_raw = -src_stride_raw;
}
void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
#if defined(HAS_RAWTOARGBROW_SSSE3)
// Select the SSSE3 row function only when width is a multiple of 16 and
// both buffers and strides are 16-byte aligned (the asm uses movdqa);
// otherwise fall back to the portable C row function.
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
} else
#endif
{
RAWToARGBRow = RAWToARGBRow_C;
}
// Convert one row per iteration.
for (int y = 0; y < height; ++y) {
RAWToARGBRow(src_raw, dst_argb, width);
src_raw += src_stride_raw;
dst_argb += dst_stride_argb;
}
return 0;
}
// Convert BG24 (3 bytes/pixel, B,G,R order) to ARGB (4 bytes/pixel).
// A negative height inverts the image vertically. Always returns 0.
int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
// Negative height: start at the last source row and walk upward by
// negating the stride.
if (height < 0) {
height = -height;
src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
src_stride_bg24 = -src_stride_bg24;
}
void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(HAS_BG24TOARGBROW_SSSE3)
// Select the SSSE3 row function only when width is a multiple of 16 and
// both buffers and strides are 16-byte aligned (the asm uses movdqa);
// otherwise fall back to the portable C row function.
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
BG24ToARGBRow = BG24ToARGBRow_SSSE3;
} else
#endif
{
BG24ToARGBRow = BG24ToARGBRow_C;
}
// Convert one row per iteration.
for (int y = 0; y < height; ++y) {
BG24ToARGBRow(src_bg24, dst_argb, width);
src_bg24 += src_stride_bg24;
dst_argb += dst_stride_argb;
}
return 0;
}
} // namespace libyuv
......@@ -497,6 +497,143 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
);
#if defined (__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
// 64-bit-only byte transpose: uses all 16 XMM registers (xmm0-xmm15) to
// process a 16-wide by 8-high tile per loop iteration (16x8 -> 8x16),
// twice the throughput of the 8x8 SSSE3 path.
// Operand map: %0 = src, %1 = dst, %2 = width (decremented by 16 per
// iteration), %3 = src_stride, %4 = dst_stride.
// Structure: three rounds of interleaves (punpcklbw / punpcklwd /
// punpckldq) with palignr used to extract high halves, then 8-byte movq
// stores of each transposed column. The neg/lea on %3 rewinds src to the
// top of the next 16-column tile.
// NOTE(review): movdqa loads assume src is 16-byte aligned -- callers
// gate this via the IS_ALIGNED(src, 16) check in TransposePlane.
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile(
"1:"
// Read in the data from the source pointer.
// First round of bit swap.
"movdqa (%0),%%xmm0\n"
"movdqa (%0,%3),%%xmm1\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm0,%%xmm8\n"
"punpcklbw %%xmm1,%%xmm0\n"
"punpckhbw %%xmm1,%%xmm8\n"
"movdqa (%0),%%xmm2\n"
"movdqa %%xmm0,%%xmm1\n"
"movdqa %%xmm8,%%xmm9\n"
"palignr $0x8,%%xmm1,%%xmm1\n"
"palignr $0x8,%%xmm9,%%xmm9\n"
"movdqa (%0,%3),%%xmm3\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm2,%%xmm10\n"
"punpcklbw %%xmm3,%%xmm2\n"
"punpckhbw %%xmm3,%%xmm10\n"
"movdqa %%xmm2,%%xmm3\n"
"movdqa %%xmm10,%%xmm11\n"
"movdqa (%0),%%xmm4\n"
"palignr $0x8,%%xmm3,%%xmm3\n"
"palignr $0x8,%%xmm11,%%xmm11\n"
"movdqa (%0,%3),%%xmm5\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm4,%%xmm12\n"
"punpcklbw %%xmm5,%%xmm4\n"
"punpckhbw %%xmm5,%%xmm12\n"
"movdqa %%xmm4,%%xmm5\n"
"movdqa %%xmm12,%%xmm13\n"
"movdqa (%0),%%xmm6\n"
"palignr $0x8,%%xmm5,%%xmm5\n"
"palignr $0x8,%%xmm13,%%xmm13\n"
"movdqa (%0,%3),%%xmm7\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm6,%%xmm14\n"
"punpcklbw %%xmm7,%%xmm6\n"
"punpckhbw %%xmm7,%%xmm14\n"
"neg %3\n"
"movdqa %%xmm6,%%xmm7\n"
"movdqa %%xmm14,%%xmm15\n"
"lea 0x10(%0,%3,8),%0\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"palignr $0x8,%%xmm15,%%xmm15\n"
"neg %3\n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0\n"
"punpcklwd %%xmm3,%%xmm1\n"
"movdqa %%xmm0,%%xmm2\n"
"movdqa %%xmm1,%%xmm3\n"
"palignr $0x8,%%xmm2,%%xmm2\n"
"palignr $0x8,%%xmm3,%%xmm3\n"
"punpcklwd %%xmm6,%%xmm4\n"
"punpcklwd %%xmm7,%%xmm5\n"
"movdqa %%xmm4,%%xmm6\n"
"movdqa %%xmm5,%%xmm7\n"
"palignr $0x8,%%xmm6,%%xmm6\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"punpcklwd %%xmm10,%%xmm8\n"
"punpcklwd %%xmm11,%%xmm9\n"
"movdqa %%xmm8,%%xmm10\n"
"movdqa %%xmm9,%%xmm11\n"
"palignr $0x8,%%xmm10,%%xmm10\n"
"palignr $0x8,%%xmm11,%%xmm11\n"
"punpcklwd %%xmm14,%%xmm12\n"
"punpcklwd %%xmm15,%%xmm13\n"
"movdqa %%xmm12,%%xmm14\n"
"movdqa %%xmm13,%%xmm15\n"
"palignr $0x8,%%xmm14,%%xmm14\n"
"palignr $0x8,%%xmm15,%%xmm15\n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0\n"
"movq %%xmm0,(%1)\n"
"movdqa %%xmm0,%%xmm4\n"
"palignr $0x8,%%xmm4,%%xmm4\n"
"movq %%xmm4,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm6,%%xmm2\n"
"movdqa %%xmm2,%%xmm6\n"
"movq %%xmm2,(%1)\n"
"palignr $0x8,%%xmm6,%%xmm6\n"
"punpckldq %%xmm5,%%xmm1\n"
"movq %%xmm6,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"movdqa %%xmm1,%%xmm5\n"
"movq %%xmm1,(%1)\n"
"palignr $0x8,%%xmm5,%%xmm5\n"
"movq %%xmm5,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm7,%%xmm3\n"
"movq %%xmm3,(%1)\n"
"movdqa %%xmm3,%%xmm7\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"movq %%xmm7,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm12,%%xmm8\n"
"movq %%xmm8,(%1)\n"
"movdqa %%xmm8,%%xmm12\n"
"palignr $0x8,%%xmm12,%%xmm12\n"
"movq %%xmm12,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm14,%%xmm10\n"
"movdqa %%xmm10,%%xmm14\n"
"movq %%xmm10,(%1)\n"
"palignr $0x8,%%xmm14,%%xmm14\n"
"punpckldq %%xmm13,%%xmm9\n"
"movq %%xmm14,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"movdqa %%xmm9,%%xmm13\n"
"movq %%xmm9,(%1)\n"
"palignr $0x8,%%xmm13,%%xmm13\n"
"movq %%xmm13,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm15,%%xmm11\n"
"movq %%xmm11,(%1)\n"
"movdqa %%xmm11,%%xmm15\n"
"palignr $0x8,%%xmm15,%%xmm15\n"
"movq %%xmm15,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(static_cast<intptr_t>(src_stride)), // %3
"r"(static_cast<intptr_t>(dst_stride)) // %4
: "memory"
);
}
#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
......@@ -644,17 +781,26 @@ void TransposePlane(const uint8* src, int src_stride,
#if defined(HAS_TRANSPOSE_WX8_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
(width % 8 == 0) &&
IS_ALIGNED(src, 16) && (src_stride % 8 == 0) &&
IS_ALIGNED(dst, 16) && (dst_stride % 8 == 0)) {
IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
TransposeWx8 = TransposeWx8_NEON;
TransposeWxH = TransposeWxH_C;
} else
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
TransposeWx8 = TransposeWx8_FAST_SSSE3;
TransposeWxH = TransposeWxH_C;
} else
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 8 == 0) &&
IS_ALIGNED(src, 16) && (src_stride % 8 == 0) &&
IS_ALIGNED(dst, 16) && (dst_stride % 8 == 0)) {
IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
TransposeWx8 = TransposeWx8_SSSE3;
TransposeWxH = TransposeWxH_C;
} else
......
......@@ -13,17 +13,91 @@
#include "libyuv/basic_types.h"
// The following are available on all x86 platforms
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
&& !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BG24TOARGBROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOUVROW_SSSE3
#define HAS_RAWTOUVROW_SSSE3
#endif
// The following are available only on Windows
#if defined(WIN32) \
&& !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_BGRATOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#endif
extern "C" {
#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#endif
#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
#define HASRGB24TOYROW_SSSE3
#endif
#ifdef HASRGB24TOYROW_SSSE3
void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#endif
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#ifdef HAS_BG24TOARGBROW_SSSE3
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
#endif
void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#endif
#ifdef OSX
extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
#else
extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
#endif
void FastConvertYUVToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
......@@ -52,34 +126,6 @@ void FastConvertYToRGB32Row(const uint8* y_buf,
uint8* rgb_buf,
int width);
#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#endif
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#endif
#ifdef OSX
extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
#else
extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
#endif
// Method to force C version.
//#define USE_MMX 0
//#define USE_SSE2 0
......
......@@ -23,6 +23,16 @@ extern "C" TALIGN16(const uint8, kAdd16[16]) = {
1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
};
// Shuffle table for converting BG24 to ARGB.
// Each 4-byte output pixel copies 3 packed source bytes (B,G,R order);
// the 4th slot comes from source bytes 12-15 and is a placeholder that the
// consumer overwrites by OR'ing in an opaque-alpha mask (0xff000000).
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
// Same layout as above but with the first and third channel bytes swapped
// (2,1,0 vs 0,1,2), i.e. RAW stores R,G,B per pixel.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile(
"movdqa (%3),%%xmm7\n"
......@@ -55,47 +65,81 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
#endif
// Studio-swing BT.601 luma: Y = ((66*R + 129*G + 25*B + 128) >> 8) + 16.
static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
int luma = 66 * r + 129 * g + 25 * b + 128;  // fixed-point, +128 rounds
return 16 + (luma >> 8);
}
// Chroma U (Cb): biased to 128 so the result is an unsigned 8-bit value.
static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
int cb = 112 * b - 38 * r - 74 * g + 128;  // fixed-point, +128 rounds
return 128 + (cb >> 8);
}
// Chroma V (Cr): biased to 128 so the result is an unsigned 8-bit value.
static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
int cr = 112 * r - 94 * g - 18 * b + 128;  // fixed-point, +128 rounds
return 128 + (cr >> 8);
}
void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
for (int x = 0; x < width; ++x) {
dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
src_argb0 += 4;
dst_y += 1;
}
#ifdef HAS_BG24TOARGBROW_SSSE3
// Expand a row of BG24 pixels (3 bytes/pixel) to ARGB (4 bytes/pixel) with
// opaque alpha. Each loop iteration consumes 48 source bytes and produces
// 64 destination bytes, i.e. 16 pixels, and decrements pix by 16.
// xmm6 holds the pshufb expansion table (%3 = kShuffleMaskBG24ToARGB);
// xmm7 = 0xff000000 per-pixel mask OR'ed in as the alpha byte.
// Uses movdqa throughout, so src_bg24 and dst_argb must be 16-byte aligned;
// assumes pix is a positive multiple of 16 -- TODO(review): confirm callers
// guarantee this.
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
"pslld $0x18,%%xmm7\n"
"movdqa (%3),%%xmm6\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm3\n"
"lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm6,%%xmm2\n"
"por %%xmm7,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm6,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n"
"por %%xmm7,%%xmm0\n"
"pshufb %%xmm6,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"por %%xmm7,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm6,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n"
"por %%xmm7,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
: "+r"(src_bg24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(kShuffleMaskBG24ToARGB) // %3
: "memory"
);
}
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
const uint8* src_argb1 = src_argb0 + src_stride_argb;
for (int x = 0; x < width - 1; x += 2) {
uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
src_argb0 += 8;
src_argb1 += 8;
dst_u += 1;
dst_v += 1;
}
if (width & 1) {
uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1;
uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1;
uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1;
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
}
// Expand a row of RAW pixels (3 bytes/pixel, R,G,B order per the
// kShuffleMaskRAWToARGB table) to ARGB with opaque alpha. Identical
// structure to BG24ToARGBRow_SSSE3 except for the shuffle table (%3):
// 48 bytes in / 64 bytes out / pix -= 16 per iteration.
// movdqa requires 16-byte aligned src_raw and dst_argb; assumes pix is a
// positive multiple of 16 -- TODO(review): confirm callers guarantee this.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
"pslld $0x18,%%xmm7\n"
"movdqa (%3),%%xmm6\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm3\n"
"lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm6,%%xmm2\n"
"por %%xmm7,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm6,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n"
"por %%xmm7,%%xmm0\n"
"pshufb %%xmm6,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"por %%xmm7,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm6,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n"
"por %%xmm7,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(kShuffleMaskRAWToARGB) // %3
: "memory"
);
}
#endif
#if defined(__x86_64__)
......@@ -611,4 +655,5 @@ void FastConvertYToRGB32Row(const uint8* y_buf,
}
#endif
} // extern "C"
......@@ -10,6 +10,8 @@
#include "row.h"
#define kMaxStride (2048 * 4)
extern "C" {
#define MAKETABLE(NAME) \
......@@ -301,4 +303,167 @@ MAKETABLE(kCoefficientsAbgrY)
MAKETABLE(_kCoefficientsAbgrY)
#endif
// Expand one row of RAW pixels (R,G,B byte order, 3 bytes/pixel) into ARGB
// (B,G,R,A in memory, 4 bytes/pixel) with opaque alpha.
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
for (int i = 0; i < pix; ++i, src_raw += 3, dst_argb += 4) {
dst_argb[0] = src_raw[2];  // B
dst_argb[1] = src_raw[1];  // G
dst_argb[2] = src_raw[0];  // R
dst_argb[3] = 255u;        // A (opaque)
}
}
// Expand one row of BG24 pixels (B,G,R byte order, 3 bytes/pixel) into ARGB
// (B,G,R,A in memory, 4 bytes/pixel) with opaque alpha.
// Fix: the original assigned dst_argb[3] = 255u twice in a row; the
// duplicated statement is removed (no behavior change, dead store).
void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
uint8 b = src_bg24[0];
uint8 g = src_bg24[1];
uint8 r = src_bg24[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = 255u;  // A (opaque)
dst_argb += 4;
src_bg24 += 3;
}
}
// C versions do the same
void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
BG24ToARGBRow_C(src_argb, row, pix);
ARGBToYRow_C(row, dst_y, pix);
}
void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
RAWToARGBRow_C(src_argb, row, pix);
ARGBToYRow_C(row, dst_y, pix);
}
void RGB24ToUVRow_C(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
BG24ToARGBRow_C(src_argb, row, pix);
BG24ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
}
void RAWToUVRow_C(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
RAWToARGBRow_C(src_argb, row, pix);
RAWToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
}
// Studio-swing BT.601 luma: Y = ((66*R + 129*G + 25*B + 128) >> 8) + 16.
static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
int y = 66 * r + 129 * g + 25 * b + 128;  // +128 rounds before >> 8
return (y >> 8) + 16;
}
// Chroma U (Cb), biased by 128 into unsigned 8-bit range.
static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
int u = 112 * b - 38 * r - 74 * g + 128;  // +128 rounds before >> 8
return (u >> 8) + 128;
}
// Chroma V (Cr), biased by 128 into unsigned 8-bit range.
static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
int v = 112 * r - 94 * g - 18 * b + 128;  // +128 rounds before >> 8
return (v >> 8) + 128;
}
// MAKEROWY(NAME, R, G, B) expands to two C row functions for a 4-byte-per-
// pixel format whose red/green/blue channels sit at byte offsets R, G, B:
//   NAMEToYRow_C  - one Y value per pixel (RGBToY above).
//   NAMEToUVRow_C - one U and one V per 2x2 pixel block, averaging the two
//                   input rows (src_rgb0 and src_rgb0 + src_stride_rgb) and
//                   horizontal pixel pairs; a trailing odd column averages
//                   vertically only.
// A macro rather than a template is used so the same text works for C and
// mirrors the assembly versions (see commit message: performance critical,
// less error prone than duplicating the code).
#define MAKEROWY(NAME,R,G,B) \
void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
for (int x = 0; x < width; ++x) { \
dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
src_argb0 += 4; \
dst_y += 1; \
} \
} \
void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
uint8* dst_u, uint8* dst_v, int width) { \
const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
for (int x = 0; x < width - 1; x += 2) { \
uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \
src_rgb1[B] + src_rgb1[B + 4]) >> 2; \
uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \
src_rgb1[G] + src_rgb1[G + 4]) >> 2; \
uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \
src_rgb1[R] + src_rgb1[R + 4]) >> 2; \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
src_rgb0 += 8; \
src_rgb1 += 8; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
} \
}
// Instantiate for ARGB (R=2,G=1,B=0), BGRA (R=1,G=2,B=3), ABGR (R=0,G=1,B=2).
MAKEROWY(ARGB,2,1,0)
MAKEROWY(BGRA,1,2,3)
MAKEROWY(ABGR,0,1,2)
#if defined(HAS_RAWTOYROW_SSSE3)
// RGB24 -> Y, SSSE3 path: widen one row to ARGB in an aligned scratch
// buffer with the SSSE3 expander, then run the SSSE3 ARGB luma row.
void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 argb_row[kMaxStride]);
BG24ToARGBRow_SSSE3(src_argb, argb_row, pix);
ARGBToYRow_SSSE3(argb_row, dst_y, pix);
}
// RAW -> Y, SSSE3 path: same structure with the RAW expander.
void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 argb_row[kMaxStride]);
RAWToARGBRow_SSSE3(src_argb, argb_row, pix);
ARGBToYRow_SSSE3(argb_row, dst_y, pix);
}
#endif
#if defined(HAS_RAWTOUVROW_SSSE3)
#if defined(HAS_ARGBTOUVROW_SSSE3)
// Fully SSSE3 path: widen two rows to ARGB (second at offset kMaxStride),
// then subsample to U/V with the SSSE3 ARGB UV row.
void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 argb_rows[kMaxStride * 2]);
BG24ToARGBRow_SSSE3(src_argb, argb_rows, pix);
BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, argb_rows + kMaxStride, pix);
ARGBToUVRow_SSSE3(argb_rows, kMaxStride, dst_u, dst_v, pix);
}
void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 argb_rows[kMaxStride * 2]);
RAWToARGBRow_SSSE3(src_argb, argb_rows, pix);
RAWToARGBRow_SSSE3(src_argb + src_stride_argb, argb_rows + kMaxStride, pix);
ARGBToUVRow_SSSE3(argb_rows, kMaxStride, dst_u, dst_v, pix);
}
#else
// Mixed path: SSSE3 widening but C subsampling, for targets where
// ARGBToUVRow_SSSE3 is unavailable (see HAS_ARGBTOUVROW_SSSE3 in row.h).
void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 argb_rows[kMaxStride * 2]);
BG24ToARGBRow_SSSE3(src_argb, argb_rows, pix);
BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, argb_rows + kMaxStride, pix);
ARGBToUVRow_C(argb_rows, kMaxStride, dst_u, dst_v, pix);
}
void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 argb_rows[kMaxStride * 2]);
RAWToARGBRow_SSSE3(src_argb, argb_rows, pix);
RAWToARGBRow_SSSE3(src_argb + src_stride_argb, argb_rows + kMaxStride, pix);
ARGBToUVRow_C(argb_rows, kMaxStride, dst_u, dst_v, pix);
}
#endif
#endif
} // extern "C"
......@@ -16,59 +16,160 @@ extern "C" {
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
// Constant multiplication table for converting ARGB to I400.
extern "C" TALIGN16(const int8, kRGBToY[16]) = {
extern "C" TALIGN16(const int8, kARGBToY[16]) = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
extern "C" TALIGN16(const int8, kRGBToU[16]) = {
extern "C" TALIGN16(const int8, kARGBToU[16]) = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
extern "C" TALIGN16(const int8, kRGBToV[16]) = {
extern "C" TALIGN16(const int8, kARGBToV[16]) = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
// Constants for BGRA
extern "C" TALIGN16(const int8, kBGRAToY[16]) = {
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};
extern "C" TALIGN16(const int8, kBGRAToU[16]) = {
0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};
extern "C" TALIGN16(const int8, kBGRAToV[16]) = {
0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};
// Constants for ABGR
extern "C" TALIGN16(const int8, kABGRToY[16]) = {
33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};
extern "C" TALIGN16(const int8, kABGRToU[16]) = {
-38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};
extern "C" TALIGN16(const int8, kABGRToV[16]) = {
112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};
extern "C" TALIGN16(const uint8, kAddY16[16]) = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
};
extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u,
128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting BG24 to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
// Convert 16 ARGB pixels (64 bytes) to 16 Y values
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
movdqa xmm7, _kRGBToY
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm7, _kARGBToY
movdqa xmm6, _kAddY16
pcmpeqb xmm5, xmm5 // Generate mask 0x0000ffff
psrld xmm5, 16
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
pmaddubsw xmm0, xmm7
lea eax, [eax + 32]
pmaddubsw xmm1, xmm7 // BG ra BG ra BG ra BG ra
palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx
paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx
pand xmm2, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
palignr xmm3, xmm1, 2
paddw xmm3, xmm1
pand xmm3, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
packssdw xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
psrlw xmm2, 7 // 0B xx 0B xx 0B xx 0B xx
packuswb xmm2, xmm2
paddb xmm2, xmm6
movq qword ptr [edx], xmm2
lea edx, [edx + 8]
sub ecx, 8
ja convertloop
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pmaddubsw xmm0, xmm7
pmaddubsw xmm1, xmm7
pmaddubsw xmm2, xmm7
pmaddubsw xmm3, xmm7
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm6
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
// Convert 16 BGRA pixels (64 bytes) to 16 Y values per loop iteration.
// xmm7 = _kBGRAToY per-byte coefficients for pmaddubsw (alpha slot is 0);
// xmm6 = _kAddY16 adds the +16 studio-swing bias. phaddw combines the
// per-pixel partial sums and psrlw 7 finishes the fixed-point divide.
// movdqa requires 16-byte aligned src/dst; assumes pix is a positive
// multiple of 16 -- TODO(review): confirm callers guarantee this.
__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm7, _kBGRAToY
movdqa xmm6, _kAddY16
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pmaddubsw xmm0, xmm7
pmaddubsw xmm1, xmm7
pmaddubsw xmm2, xmm7
pmaddubsw xmm3, xmm7
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm6
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
// Convert 16 ABGR pixels (64 bytes) to 16 Y values per loop iteration.
// Identical structure to BGRAToYRow_SSSE3, differing only in the
// coefficient table (_kABGRToY) that matches the ABGR channel layout.
// movdqa requires 16-byte aligned src/dst; assumes pix is a positive
// multiple of 16 -- TODO(review): confirm callers guarantee this.
__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm7, _kABGRToY
movdqa xmm6, _kAddY16
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pmaddubsw xmm0, xmm7
pmaddubsw xmm1, xmm7
pmaddubsw xmm2, xmm7
pmaddubsw xmm3, xmm7
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm6
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
......@@ -84,55 +185,52 @@ __asm {
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, _kRGBToU
movdqa xmm6, _kRGBToV
movdqa xmm7, _kARGBToU
movdqa xmm6, _kARGBToV
movdqa xmm5, _kAddUV128
pcmpeqb xmm4, xmm4 // Generate mask 0x0000ffff
psrld xmm4, 16
sub edi, edx // stride from u to v
convertloop :
// step 1 - subsample 8x2 argb pixels to 4x1
movdqa xmm0, [eax] // 32x2 -> 32x1
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
movdqa xmm2, xmm0 // 32x1 -> 16x1
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm2, xmm1, 0xdd
pavgb xmm0, xmm2
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 8 different pixels, its 4 pixels of U and 4 of V
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm1, xmm6 // V
palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx
paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx
pand xmm2, xmm4 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
palignr xmm3, xmm1, 2
paddw xmm3, xmm1
pand xmm3, xmm4 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
psraw xmm2, 8
psraw xmm3, 8
packsswb xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
paddb xmm2, xmm5 // -> unsigned
packuswb xmm2, xmm2 // 8 bytes. 4 U, 4 V
// step 3 - store 4 U and 4 V values
movd dword ptr [edx], xmm2 // U
lea edx, [edx + 4]
pshufd xmm0, xmm2, 0x55 // V
movd dword ptr [edi], xmm0
lea edi, [edi + 4]
sub ecx, 8
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
......@@ -140,45 +238,208 @@ __asm {
}
}
static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
}
__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, _kBGRAToU
movdqa xmm6, _kBGRAToV
movdqa xmm5, _kAddUV128
sub edi, edx // stride from u to v
static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
}
static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
convertloop :
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
for (int x = 0; x < width; ++x) {
dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
src_argb0 += 4;
dst_y += 1;
// Compute 8 U and 8 V values from a 16x2 block of ABGR pixels per loop
// iteration (width decremented by 16). Step 1 averages the two input rows
// (pavgb against [eax + esi]) and then horizontal pixel pairs
// (shufps 0x88/0xdd + pavgb) to subsample 16x2 -> 8x1. Step 2 applies the
// _kABGRToU/_kABGRToV coefficient tables (pmaddubsw + phaddw + psraw 8)
// and adds the _kAddUV128 bias. Step 3 stores 8 U bytes at [edx] and 8 V
// bytes at [edx + edi]; edi was pre-subtracted so it holds the U->V offset.
// movdqa requires 16-byte aligned rows; assumes width is a positive
// multiple of 16 -- TODO(review): confirm callers guarantee this.
__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, _kABGRToU
movdqa xmm6, _kABGRToV
movdqa xmm5, _kAddUV128
sub edi, edx // stride from u to v
convertloop :
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
const uint8* src_argb1 = src_argb0 + src_stride_argb;
for (int x = 0; x < width - 1; x += 2) {
uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
src_argb0 += 8;
src_argb1 += 8;
dst_u += 1;
dst_v += 1;
__declspec(naked)
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_bg24
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskBG24ToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
if (width & 1) {
uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1;
uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1;
uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1;
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
}
// MSVC version: expand a row of RAW pixels (3 bytes/pixel) to ARGB with
// opaque alpha. Each iteration reads 48 bytes, writes 64 bytes, and
// decrements pix by 16. xmm6 = _kShuffleMaskRAWToARGB pshufb table;
// xmm7 = 0xff000000 alpha mask OR'ed into every output pixel.
// movdqa requires 16-byte aligned src/dst; assumes pix is a positive
// multiple of 16 -- TODO(review): confirm callers guarantee this.
__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_raw
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskRAWToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment