Commit 9394ed99 authored by fbarchard@google.com

ARGB To I420 and variations using row functions

BUG=none
TEST=media_unittests from talk used to benchmark
Review URL: http://webrtc-codereview.appspot.com/254001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@51 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 7472021e
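The pattern applied throughout this change: each RGB-to-I420 conversion selects a row function pointer once, preferring the SSSE3 implementation when the CPU flag is set and the width and buffer alignment allow it, and falling back to the portable C row otherwise. A minimal sketch of that dispatch, using names that appear in this diff (the exact width and alignment thresholds vary per function):

  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0)) {
    ARGBToYRow = ARGBToYRow_SSSE3;  // 16 pixels per iteration, aligned loads
  } else
#endif
  {
    ARGBToYRow = ARGBToYRow_C;      // portable fallback, no alignment requirement
  }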
...@@ -20,6 +20,9 @@ static const int kCpuHasSSSE3 = 2;
// These flags are only valid on ARM processors
static const int kCpuHasNEON = 4;
// Internal flag to indicate cpuid is initialized.
static const int kCpuInitialized = 8;
// Detect CPU has SSE2 etc.
bool TestCpuFlag(int flag);
......
...@@ -636,147 +636,110 @@ int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
  return 0;
}
// ARGBToI420Row_C etc row functions use the following macro, generating
// code with RGB offsets/strides different for each version. Less error
// prone than duplicating the code. A template could be used, but the
// macro method works for both C and asm, and this is performance-critical
// code.
#define MAKEROWRGBTOI420(NAME,R,G,B,BPP) \
static void \
NAME(const uint8* src_row0, const uint8* src_row1, \
uint8* dst_yplane0, uint8* dst_yplane1, \
uint8* dst_u, \
uint8* dst_v, \
int width) { \
for (int x = 0; x < width - 1; x += 2) { \
dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \
src_row0[G] * 129 + \
src_row0[B] * 25 + 128) >> 8) + 16; \
dst_yplane0[1] = (uint8)((src_row0[R + BPP] * 66 + \
src_row0[G + BPP] * 129 + \
src_row0[B + BPP] * 25 + 128) >> 8) + 16; \
dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \
src_row1[G] * 129 + \
src_row1[B] * 25 + 128) >> 8) + 16; \
dst_yplane1[1] = (uint8)((src_row1[R + BPP] * 66 + \
src_row1[G + BPP] * 129 + \
src_row1[B + BPP] * 25 + 128) >> 8) + 16; \
dst_u[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
src_row1[R] + src_row1[R + BPP]) * -38 + \
(src_row0[G] + src_row0[G + BPP] + \
src_row1[G] + src_row1[G + BPP]) * -74 + \
(src_row0[B] + src_row0[B + BPP] + \
src_row1[B] + src_row1[B + BPP]) * 112 + \
                   512) >> 10) + 128; \
dst_v[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
src_row1[R] + src_row1[R + BPP]) * 112 + \
(src_row0[G] + src_row0[G + BPP] + \
src_row1[G] + src_row1[G + BPP]) * -94 + \
(src_row0[B] + src_row0[B + BPP] + \
src_row1[B] + src_row1[B + BPP]) * -18 + \
                   512) >> 10) + 128; \
dst_yplane0 += 2; \
dst_yplane1 += 2; \
++dst_u; \
++dst_v; \
src_row0 += BPP * 2; \
src_row1 += BPP * 2; \
} \
if (width & 1) { \
dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \
src_row0[G] * 129 + \
src_row0[B] * 25 + 128) >> 8) + 16; \
dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \
src_row1[G] * 129 + \
src_row1[B] * 25 + 128) >> 8) + 16; \
dst_u[0] = (uint8)(((src_row0[R] + \
src_row1[R]) * -38 + \
(src_row0[G] + \
src_row1[G]) * -74 + \
(src_row0[B] + \
src_row1[B]) * 112 + \
                   256) >> 9) + 128; \
dst_v[0] = (uint8)(((src_row0[R] + \
src_row1[R]) * 112 + \
(src_row0[G] + \
src_row1[G]) * -94 + \
(src_row0[B] + \
src_row1[B]) * -18 + \
                   256) >> 9) + 128; \
} \
}
// Generate variations of RGBToI420. Parameters are r,g,b offsets within a
// pixel, and number of bytes per pixel.
MAKEROWRGBTOI420(ARGBToI420Row_C, 2, 1, 0, 4)
MAKEROWRGBTOI420(BGRAToI420Row_C, 1, 2, 3, 4)
MAKEROWRGBTOI420(ABGRToI420Row_C, 0, 1, 2, 4)
MAKEROWRGBTOI420(RGB24ToI420Row_C, 2, 1, 0, 3)
MAKEROWRGBTOI420(RAWToI420Row_C, 0, 1, 2, 3)
static int RGBToI420(const uint8* src_frame, int src_stride_frame,
                     uint8* dst_y, int dst_stride_y,
                     uint8* dst_u, int dst_stride_u,
                     uint8* dst_v, int dst_stride_v,
                     int width, int height,
                     void (*RGBToI420Row)(const uint8* src_row0,
                                          const uint8* src_row1,
                                          uint8* dst_yplane0,
                                          uint8* dst_yplane1,
                                          uint8* dst_u,
                                          uint8* dst_v,
                                          int width)) {
  if (src_frame == NULL || dst_y == NULL ||
      dst_u == NULL || dst_v == NULL)
    return -1;
  if (height < 0) {
    height = -height;
    src_frame = src_frame + src_stride_frame * (height - 1);
    src_stride_frame = -src_stride_frame;
  }
  for (int y = 0; y < height - 1; y += 2) {
    RGBToI420Row(src_frame, src_frame + src_stride_frame,
                 dst_y, dst_y + dst_stride_y,
                 dst_u, dst_v,
                 width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    RGBToI420Row(src_frame, src_frame,
                 dst_y, dst_y,
                 dst_u, dst_v,
                 width);
  }
  return 0;
}

int ARGBToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = ARGBToYRow_SSSE3;
  } else
#endif
  {
    ARGBToYRow = ARGBToYRow_C;
  }
#if defined(HAS_ARGBTOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = ARGBToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = ARGBToUVRow_C;
  }
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
  }
  return 0;
}
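A usage sketch for the new entry point (the wrapper function and buffer sizing are illustrative, not part of this commit; assumes the public header declaring ARGBToI420 is included):

#include <vector>

void ConvertFrameExample() {
  const int w = 640;
  const int h = 480;
  std::vector<uint8> argb(w * h * 4);       // source frame, filled by caller
  std::vector<uint8> y(w * h);              // full-resolution luma
  std::vector<uint8> u((w / 2) * (h / 2));  // 2x2-subsampled chroma
  std::vector<uint8> v((w / 2) * (h / 2));
  ARGBToI420(&argb[0], w * 4,
             &y[0], w,
             &u[0], w / 2,
             &v[0], w / 2,
             w, h);
}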
int ARGBToI420_Reference(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
return RGBToI420(src_frame, src_stride_frame,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
width, height, ARGBToI420Row_C);
}
int BGRAToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_BGRATOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = BGRAToYRow_SSSE3;
  } else
#endif
  {
    ARGBToYRow = BGRAToYRow_C;
  }
#if defined(HAS_BGRATOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = BGRAToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = BGRAToUVRow_C;
  }
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
  }
  return 0;
}
int ABGRToI420(const uint8* src_frame, int src_stride_frame,
...@@ -784,11 +747,52 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ABGRTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = ABGRToYRow_SSSE3;
  } else
#endif
  {
    ARGBToYRow = ABGRToYRow_C;
  }
#if defined(HAS_ABGRTOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = ABGRToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = ABGRToUVRow_C;
  }
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
  }
  return 0;
}
int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
...@@ -796,26 +800,55 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RGB24TOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = RGB24ToYRow_SSSE3;
  } else
#endif
  {
    ARGBToYRow = RGB24ToYRow_C;
  }
#if defined(HAS_RGB24TOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = RGB24ToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = RGB24ToUVRow_C;
  }
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
  }
  return 0;
}
int RAWToI420(const uint8* src_frame, int src_stride_frame,
              uint8* dst_y, int dst_stride_y,
              uint8* dst_u, int dst_stride_u,
              uint8* dst_v, int dst_stride_v,
...@@ -828,42 +861,42 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RAWTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = RAWToYRow_SSSE3;
  } else
#endif
  {
    ARGBToYRow = RAWToYRow_C;
  }
#if defined(HAS_RAWTOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = RAWToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = RAWToUVRow_C;
  }
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
  }
  return 0;
}
......
...@@ -15,9 +15,6 @@
#include <intrin.h>
#endif
// Internal flag to indicate cpuid is initialized.
static const int kCpuInitialized = 16;
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static inline void __cpuid(int cpu_info[4], int info_type) {
...@@ -64,11 +61,11 @@ static void InitCpuFlags() {
void MaskCpuFlags(int enable_flags) {
  InitCpuFlags();
  cpu_info_ &= enable_flags;
}

bool TestCpuFlag(int flag) {
  if (0 == cpu_info_) {
    InitCpuFlags();
  }
  return cpu_info_ & flag ? true : false;
......
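Read together, the two hunks above rely on InitCpuFlags ORing kCpuInitialized into cpu_info_, so the value is never zero once cpuid has run; that is what lets TestCpuFlag treat zero as "not yet detected". A sketch of the resulting contract (the InitCpuFlags body is not shown in this diff, so its behavior here is an inference):

static int cpu_info_ = 0;  // 0 means cpuid has not run yet

bool TestCpuFlag(int flag) {
  if (0 == cpu_info_) {
    InitCpuFlags();  // assumed to set the feature bits | kCpuInitialized
  }
  return cpu_info_ & flag ? true : false;
}

Note that MaskCpuFlags now plain-ANDs the flags, so masking with a value that clears kCpuInitialized would re-arm detection on the next TestCpuFlag call.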
...@@ -14,6 +14,8 @@
#include "video_common.h"
#include "row.h"
#define kMaxStride (2048 * 4)
namespace libyuv {

// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
...@@ -329,6 +331,9 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
                   uint8* dst_u, int dst_stride_u,
                   uint8* dst_v, int dst_stride_v,
                   int width, int height) {
if (width * 4 > kMaxStride) {
return -1;
}
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
...@@ -347,23 +352,29 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = ARGBToYRow_SSSE3;
  } else
#endif
  {
    ARGBToYRow = ARGBToYRow_C;
  }
#if defined(HAS_ARGBTOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = ARGBToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = ARGBToUVRow_C;
  }
...@@ -392,9 +403,9 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
    BayerRow0(src_bayer, src_stride_bayer, row, width);
    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
              row + kMaxStride, width);
    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
    ARGBToYRow(row, dst_y, width);
    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
    src_bayer += src_stride_bayer * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
...@@ -403,8 +414,8 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
  // TODO(fbarchard): Make sure this filters properly
  if (height & 1) {
    BayerRow0(src_bayer, src_stride_bayer, row, width);
    ARGBToUVRow(row, 0, dst_u, dst_v, width);
    ARGBToYRow(row, dst_y, width);
  }
  return 0;
}
......
...@@ -58,16 +58,6 @@ extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
// Shuffle table for converting BG24 to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_SSE2
__declspec(naked)
...@@ -206,7 +196,7 @@ int I420Copy(const uint8* src_y, int src_stride_y,
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
  __asm__ volatile
  (
    "vdup.u32 q0, %2       \n"  // duplicate 4 ints
    "1:\n"
    "vst1.u32 {q0}, [%0]!  \n"  // store
    "subs %1, %1, #16      \n"  // 16 processed per loop
...@@ -1282,85 +1272,6 @@ __asm {
  }
}
#define HAS_BG24TOARGBROW_SSSE3
__declspec(naked)
static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_bg24
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskBG24ToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
#define HAS_RAWTOARGBROW_SSSE3
__declspec(naked)
static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_raw
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskRAWToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
...@@ -1435,84 +1346,6 @@ static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
  );
}
#define HAS_BG24TOARGBROW_SSSE3
static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
"pslld $0x18,%%xmm7\n"
"movdqa (%3),%%xmm6\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm3\n"
"lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm6,%%xmm2\n"
"por %%xmm7,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm6,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n"
"por %%xmm7,%%xmm0\n"
"pshufb %%xmm6,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"por %%xmm7,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm6,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n"
"por %%xmm7,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
: "+r"(src_bg24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(kShuffleMaskBG24ToARGB) // %3
: "memory"
);
}
#define HAS_RAWTOARGBROW_SSSE3
static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
"pslld $0x18,%%xmm7\n"
"movdqa (%3),%%xmm6\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm3\n"
"lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm6,%%xmm2\n"
"por %%xmm7,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm6,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n"
"por %%xmm7,%%xmm0\n"
"pshufb %%xmm6,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"por %%xmm7,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm6,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n"
"por %%xmm7,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(kShuffleMaskRAWToARGB) // %3
: "memory"
);
}
#endif

static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
...@@ -1556,97 +1389,6 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
  return 0;
}
static void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
uint8 r = src_raw[0];
uint8 g = src_raw[1];
uint8 b = src_raw[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = 255u;
dst_argb += 4;
src_raw += 3;
}
}
// Convert RAW to ARGB.
int RAWToARGB(const uint8* src_raw, int src_stride_raw,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (height < 0) {
height = -height;
src_raw = src_raw + (height - 1) * src_stride_raw;
src_stride_raw = -src_stride_raw;
}
void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
} else
#endif
{
RAWToARGBRow = RAWToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
RAWToARGBRow(src_raw, dst_argb, width);
src_raw += src_stride_raw;
dst_argb += dst_stride_argb;
}
return 0;
}
static void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
uint8 b = src_bg24[0];
uint8 g = src_bg24[1];
uint8 r = src_bg24[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
    dst_argb[3] = 255u;
dst_argb += 4;
src_bg24 += 3;
}
}
// Convert BG24 to ARGB.
int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (height < 0) {
height = -height;
src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
src_stride_bg24 = -src_stride_bg24;
}
void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(HAS_BG24TOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
BG24ToARGBRow = BG24ToARGBRow_SSSE3;
} else
#endif
{
BG24ToARGBRow = BG24ToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
BG24ToARGBRow(src_bg24, dst_argb, width);
src_bg24 += src_stride_bg24;
dst_argb += dst_stride_argb;
}
return 0;
}
static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
    // To support in-place conversion.
...@@ -1768,5 +1510,66 @@ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  return 0;
}
// Convert RAW to ARGB.
int RAWToARGB(const uint8* src_raw, int src_stride_raw,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (height < 0) {
height = -height;
src_raw = src_raw + (height - 1) * src_stride_raw;
src_stride_raw = -src_stride_raw;
}
void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
} else
#endif
{
RAWToARGBRow = RAWToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
RAWToARGBRow(src_raw, dst_argb, width);
src_raw += src_stride_raw;
dst_argb += dst_stride_argb;
}
return 0;
}
// Convert BG24 to ARGB.
int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (height < 0) {
height = -height;
src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
src_stride_bg24 = -src_stride_bg24;
}
void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(HAS_BG24TOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
BG24ToARGBRow = BG24ToARGBRow_SSSE3;
} else
#endif
{
BG24ToARGBRow = BG24ToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
BG24ToARGBRow(src_bg24, dst_argb, width);
src_bg24 += src_stride_bg24;
dst_argb += dst_stride_argb;
}
return 0;
}
} // namespace libyuv
...@@ -497,6 +497,143 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
);

#if defined (__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile(
"1:"
// Read in the data from the source pointer.
// First round of bit swap.
"movdqa (%0),%%xmm0\n"
"movdqa (%0,%3),%%xmm1\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm0,%%xmm8\n"
"punpcklbw %%xmm1,%%xmm0\n"
"punpckhbw %%xmm1,%%xmm8\n"
"movdqa (%0),%%xmm2\n"
"movdqa %%xmm0,%%xmm1\n"
"movdqa %%xmm8,%%xmm9\n"
"palignr $0x8,%%xmm1,%%xmm1\n"
"palignr $0x8,%%xmm9,%%xmm9\n"
"movdqa (%0,%3),%%xmm3\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm2,%%xmm10\n"
"punpcklbw %%xmm3,%%xmm2\n"
"punpckhbw %%xmm3,%%xmm10\n"
"movdqa %%xmm2,%%xmm3\n"
"movdqa %%xmm10,%%xmm11\n"
"movdqa (%0),%%xmm4\n"
"palignr $0x8,%%xmm3,%%xmm3\n"
"palignr $0x8,%%xmm11,%%xmm11\n"
"movdqa (%0,%3),%%xmm5\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm4,%%xmm12\n"
"punpcklbw %%xmm5,%%xmm4\n"
"punpckhbw %%xmm5,%%xmm12\n"
"movdqa %%xmm4,%%xmm5\n"
"movdqa %%xmm12,%%xmm13\n"
"movdqa (%0),%%xmm6\n"
"palignr $0x8,%%xmm5,%%xmm5\n"
"palignr $0x8,%%xmm13,%%xmm13\n"
"movdqa (%0,%3),%%xmm7\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm6,%%xmm14\n"
"punpcklbw %%xmm7,%%xmm6\n"
"punpckhbw %%xmm7,%%xmm14\n"
"neg %3\n"
"movdqa %%xmm6,%%xmm7\n"
"movdqa %%xmm14,%%xmm15\n"
"lea 0x10(%0,%3,8),%0\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"palignr $0x8,%%xmm15,%%xmm15\n"
"neg %3\n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0\n"
"punpcklwd %%xmm3,%%xmm1\n"
"movdqa %%xmm0,%%xmm2\n"
"movdqa %%xmm1,%%xmm3\n"
"palignr $0x8,%%xmm2,%%xmm2\n"
"palignr $0x8,%%xmm3,%%xmm3\n"
"punpcklwd %%xmm6,%%xmm4\n"
"punpcklwd %%xmm7,%%xmm5\n"
"movdqa %%xmm4,%%xmm6\n"
"movdqa %%xmm5,%%xmm7\n"
"palignr $0x8,%%xmm6,%%xmm6\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"punpcklwd %%xmm10,%%xmm8\n"
"punpcklwd %%xmm11,%%xmm9\n"
"movdqa %%xmm8,%%xmm10\n"
"movdqa %%xmm9,%%xmm11\n"
"palignr $0x8,%%xmm10,%%xmm10\n"
"palignr $0x8,%%xmm11,%%xmm11\n"
"punpcklwd %%xmm14,%%xmm12\n"
"punpcklwd %%xmm15,%%xmm13\n"
"movdqa %%xmm12,%%xmm14\n"
"movdqa %%xmm13,%%xmm15\n"
"palignr $0x8,%%xmm14,%%xmm14\n"
"palignr $0x8,%%xmm15,%%xmm15\n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0\n"
"movq %%xmm0,(%1)\n"
"movdqa %%xmm0,%%xmm4\n"
"palignr $0x8,%%xmm4,%%xmm4\n"
"movq %%xmm4,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm6,%%xmm2\n"
"movdqa %%xmm2,%%xmm6\n"
"movq %%xmm2,(%1)\n"
"palignr $0x8,%%xmm6,%%xmm6\n"
"punpckldq %%xmm5,%%xmm1\n"
"movq %%xmm6,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"movdqa %%xmm1,%%xmm5\n"
"movq %%xmm1,(%1)\n"
"palignr $0x8,%%xmm5,%%xmm5\n"
"movq %%xmm5,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm7,%%xmm3\n"
"movq %%xmm3,(%1)\n"
"movdqa %%xmm3,%%xmm7\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"movq %%xmm7,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm12,%%xmm8\n"
"movq %%xmm8,(%1)\n"
"movdqa %%xmm8,%%xmm12\n"
"palignr $0x8,%%xmm12,%%xmm12\n"
"movq %%xmm12,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm14,%%xmm10\n"
"movdqa %%xmm10,%%xmm14\n"
"movq %%xmm10,(%1)\n"
"palignr $0x8,%%xmm14,%%xmm14\n"
"punpckldq %%xmm13,%%xmm9\n"
"movq %%xmm14,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"movdqa %%xmm9,%%xmm13\n"
"movq %%xmm9,(%1)\n"
"palignr $0x8,%%xmm13,%%xmm13\n"
"movq %%xmm13,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm15,%%xmm11\n"
"movq %%xmm11,(%1)\n"
"movdqa %%xmm11,%%xmm15\n"
"palignr $0x8,%%xmm15,%%xmm15\n"
"movq %%xmm15,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(static_cast<intptr_t>(src_stride)), // %3
"r"(static_cast<intptr_t>(dst_stride)) // %4
: "memory"
);
}
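For reference, a scalar statement of the contract the TransposeWx8 variants satisfy (my sketch, not code from this change): an 8-row-tall, width-column block of bytes is written out transposed, so each source column becomes an 8-byte destination row.

static void TransposeWx8_Reference(const uint8* src, int src_stride,
                                   uint8* dst, int dst_stride, int width) {
  for (int i = 0; i < width; ++i) {  // each source column...
    for (int j = 0; j < 8; ++j) {    // ...becomes one 8-byte dst row
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}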
#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
...@@ -644,17 +781,26 @@ void TransposePlane(const uint8* src, int src_stride,
#if defined(HAS_TRANSPOSE_WX8_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_NEON;
    TransposeWxH = TransposeWxH_C;
  } else
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
TransposeWx8 = TransposeWx8_FAST_SSSE3;
TransposeWxH = TransposeWxH_C;
} else
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_SSSE3;
    TransposeWxH = TransposeWxH_C;
  } else
......
...@@ -13,54 +13,73 @@
#include "libyuv/basic_types.h"
// The following are available on all x86 platforms
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BG24TOARGBROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOUVROW_SSSE3
#define HAS_RAWTOUVROW_SSSE3
#endif
// The following are available only on Windows
#if defined(WIN32) \
    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_BGRATOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#endif

extern "C" {
void FastConvertYUVToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYToRGB32Row(const uint8* y_buf,
uint8* rgb_buf,
int width);
#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#endif
#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
#define HASRGB24TOYROW_SSSE3
#endif
#ifdef HASRGB24TOYROW_SSSE3
void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#endif
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
                   uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#ifdef HAS_BG24TOARGBROW_SSSE3
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
#endif
void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
...@@ -79,6 +98,33 @@ extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
#endif
void FastConvertYUVToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYToRGB32Row(const uint8* y_buf,
uint8* rgb_buf,
int width);
// Method to force C version.
//#define USE_MMX 0
......
...@@ -23,6 +23,16 @@ extern "C" TALIGN16(const uint8, kAdd16[16]) = {
  1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
};
// Shuffle table for converting BG24 to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile(
  "movdqa (%3),%%xmm7\n"
...@@ -55,47 +65,81 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
#endif
#ifdef HAS_BG24TOARGBROW_SSSE3
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
  asm volatile(
  "pcmpeqb %%xmm7,%%xmm7\n"  // generate mask 0xff000000
  "pslld $0x18,%%xmm7\n"
  "movdqa (%3),%%xmm6\n"
"1:"
  "movdqa (%0),%%xmm0\n"
  "movdqa 0x10(%0),%%xmm1\n"
  "movdqa 0x20(%0),%%xmm3\n"
  "lea 0x30(%0),%0\n"
  "movdqa %%xmm3,%%xmm2\n"
  "palignr $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
  "pshufb %%xmm6,%%xmm2\n"
  "por %%xmm7,%%xmm2\n"
  "palignr $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
  "pshufb %%xmm6,%%xmm0\n"
  "movdqa %%xmm2,0x20(%1)\n"
  "por %%xmm7,%%xmm0\n"
  "pshufb %%xmm6,%%xmm1\n"
  "movdqa %%xmm0,(%1)\n"
  "por %%xmm7,%%xmm1\n"
  "palignr $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
  "pshufb %%xmm6,%%xmm3\n"
  "movdqa %%xmm1,0x10(%1)\n"
  "por %%xmm7,%%xmm3\n"
  "movdqa %%xmm3,0x30(%1)\n"
  "lea 0x40(%1),%1\n"
  "sub $0x10,%2\n"
  "ja 1b\n"
  : "+r"(src_bg24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(kShuffleMaskBG24ToARGB)  // %3
  : "memory"
);
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile(
  "pcmpeqb %%xmm7,%%xmm7\n"  // generate mask 0xff000000
  "pslld $0x18,%%xmm7\n"
  "movdqa (%3),%%xmm6\n"
"1:"
  "movdqa (%0),%%xmm0\n"
  "movdqa 0x10(%0),%%xmm1\n"
  "movdqa 0x20(%0),%%xmm3\n"
  "lea 0x30(%0),%0\n"
  "movdqa %%xmm3,%%xmm2\n"
  "palignr $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
  "pshufb %%xmm6,%%xmm2\n"
  "por %%xmm7,%%xmm2\n"
  "palignr $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
  "pshufb %%xmm6,%%xmm0\n"
  "movdqa %%xmm2,0x20(%1)\n"
  "por %%xmm7,%%xmm0\n"
  "pshufb %%xmm6,%%xmm1\n"
  "movdqa %%xmm0,(%1)\n"
  "por %%xmm7,%%xmm1\n"
  "palignr $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
  "pshufb %%xmm6,%%xmm3\n"
  "movdqa %%xmm1,0x10(%1)\n"
  "por %%xmm7,%%xmm3\n"
  "movdqa %%xmm3,0x30(%1)\n"
  "lea 0x40(%1),%1\n"
  "sub $0x10,%2\n"
  "ja 1b\n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(kShuffleMaskRAWToARGB)  // %3
  : "memory"
);
}
#endif
#if defined(__x86_64__)
...@@ -611,4 +655,5 @@ void FastConvertYToRGB32Row(const uint8* y_buf,
}
#endif
} // extern "C"
...@@ -10,6 +10,8 @@
#include "row.h"
#define kMaxStride (2048 * 4)
extern "C" { extern "C" {
#define MAKETABLE(NAME) \ #define MAKETABLE(NAME) \
...@@ -301,4 +303,167 @@ MAKETABLE(kCoefficientsAbgrY) ...@@ -301,4 +303,167 @@ MAKETABLE(kCoefficientsAbgrY)
MAKETABLE(_kCoefficientsAbgrY) MAKETABLE(_kCoefficientsAbgrY)
#endif #endif
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
uint8 r = src_raw[0];
uint8 g = src_raw[1];
uint8 b = src_raw[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = 255u;
dst_argb += 4;
src_raw += 3;
}
}
void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
uint8 b = src_bg24[0];
uint8 g = src_bg24[1];
uint8 r = src_bg24[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
    dst_argb[3] = 255u;
dst_argb += 4;
src_bg24 += 3;
}
}
// The C wrappers below stage RGB24/RAW rows through an ARGB buffer, mirroring the SSSE3 versions.
void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
BG24ToARGBRow_C(src_argb, row, pix);
ARGBToYRow_C(row, dst_y, pix);
}
void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
RAWToARGBRow_C(src_argb, row, pix);
ARGBToYRow_C(row, dst_y, pix);
}
void RGB24ToUVRow_C(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
BG24ToARGBRow_C(src_argb, row, pix);
BG24ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
}
void RAWToUVRow_C(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
RAWToARGBRow_C(src_argb, row, pix);
RAWToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
}
static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
}
static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
}
static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
}
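A quick numeric check of these fixed-point helpers: a white pixel (r = g = b = 255) gives
  Y = ((66*255 + 129*255 + 25*255 + 128) >> 8) + 16 = (56228 >> 8) + 16 = 235,
the top of the studio-swing range, while black gives (128 >> 8) + 16 = 16. Any neutral gray maps to U = V = 128, since the U coefficients sum to -38 - 74 + 112 = 0 and the V coefficients to 112 - 94 - 18 = 0.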
#define MAKEROWY(NAME,R,G,B) \
void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
for (int x = 0; x < width; ++x) { \
dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
src_argb0 += 4; \
dst_y += 1; \
} \
} \
void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
uint8* dst_u, uint8* dst_v, int width) { \
const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
for (int x = 0; x < width - 1; x += 2) { \
uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \
src_rgb1[B] + src_rgb1[B + 4]) >> 2; \
uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \
src_rgb1[G] + src_rgb1[G + 4]) >> 2; \
uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \
src_rgb1[R] + src_rgb1[R + 4]) >> 2; \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
src_rgb0 += 8; \
src_rgb1 += 8; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
} \
}
MAKEROWY(ARGB,2,1,0)
MAKEROWY(BGRA,1,2,3)
MAKEROWY(ABGR,0,1,2)
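For reference, the first invocation generates the ARGB variants with R at byte 2, G at byte 1 and B at byte 0 of each 4-byte pixel; the Y half of MAKEROWY(ARGB,2,1,0) expands to:

void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
    src_argb0 += 4;
    dst_y += 1;
  }
}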
#if defined(HAS_RAWTOYROW_SSSE3)
void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
BG24ToARGBRow_SSSE3(src_argb, row, pix);
ARGBToYRow_SSSE3(row, dst_y, pix);
}
void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
RAWToARGBRow_SSSE3(src_argb, row, pix);
ARGBToYRow_SSSE3(row, dst_y, pix);
}
#endif
#if defined(HAS_RAWTOUVROW_SSSE3)
#if defined(HAS_ARGBTOUVROW_SSSE3)
void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
BG24ToARGBRow_SSSE3(src_argb, row, pix);
BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
RAWToARGBRow_SSSE3(src_argb, row, pix);
RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
#else
void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
BG24ToARGBRow_SSSE3(src_argb, row, pix);
BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
}
void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
RAWToARGBRow_SSSE3(src_argb, row, pix);
RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
}
#endif
#endif
} // extern "C" } // extern "C"
...@@ -16,58 +16,159 @@ extern "C" { ...@@ -16,58 +16,159 @@ extern "C" {
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var #define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
// Constant multiplication table for converting ARGB to I400. // Constant multiplication table for converting ARGB to I400.
extern "C" TALIGN16(const int8, kRGBToY[16]) = { extern "C" TALIGN16(const int8, kARGBToY[16]) = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
}; };
extern "C" TALIGN16(const int8, kRGBToU[16]) = { extern "C" TALIGN16(const int8, kARGBToU[16]) = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
}; };
extern "C" TALIGN16(const int8, kRGBToV[16]) = { extern "C" TALIGN16(const int8, kARGBToV[16]) = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
}; };
// Constants for BGRA
extern "C" TALIGN16(const int8, kBGRAToY[16]) = {
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};
extern "C" TALIGN16(const int8, kBGRAToU[16]) = {
0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};
extern "C" TALIGN16(const int8, kBGRAToV[16]) = {
0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};
// Constants for ABGR
extern "C" TALIGN16(const int8, kABGRToY[16]) = {
33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};
extern "C" TALIGN16(const int8, kABGRToU[16]) = {
-38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};
extern "C" TALIGN16(const int8, kABGRToV[16]) = {
112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};
extern "C" TALIGN16(const uint8, kAddY16[16]) = { extern "C" TALIGN16(const uint8, kAddY16[16]) = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
}; };
extern "C" TALIGN16(const uint8, kAddUV128[16]) = { extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting BG24 to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
}; };
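How pshufb consumes these masks, as a C emulation of the instruction's byte-select semantics (illustrative, not part of this change): output byte i is taken from input byte mask[i], so kShuffleMaskBG24ToARGB spreads 12 packed BG24 bytes into four 4-byte pixels whose alpha slots are then forced to 0xff by the por with the 0xff000000 mask.

static void Pshufb16(const uint8 src[16], const uint8 mask[16],
                     uint8 dst[16]) {
  for (int i = 0; i < 16; ++i) {
    // pshufb zeroes a byte when mask[i] has its high bit set; the
    // shuffle tables above only use indices 0-15, so that case is unused.
    dst[i] = src[mask[i] & 15];
  }
}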
// Convert 16 ARGB pixels (64 bytes) to 16 Y values
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm7, _kARGBToY
    movdqa xmm6, _kAddY16
 convertloop :
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm6
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    ja convertloop
    ret
}
}
__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm7, _kBGRAToY
movdqa xmm6, _kAddY16
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pmaddubsw xmm0, xmm7
pmaddubsw xmm1, xmm7
pmaddubsw xmm2, xmm7
pmaddubsw xmm3, xmm7
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm6
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm7, _kABGRToY
movdqa xmm6, _kAddY16
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pmaddubsw xmm0, xmm7
pmaddubsw xmm1, xmm7
pmaddubsw xmm2, xmm7
pmaddubsw xmm3, xmm7
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm6
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
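// ARGBToYRow, BGRAToYRow and ABGRToYRow above are identical except for the
// coefficient table loaded into xmm7; they could be stamped out with one
// macro, at the cost of making the asm harder to read and debug.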
...@@ -84,55 +185,52 @@ __asm {
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, _kARGBToU
movdqa xmm6, _kARGBToV
movdqa xmm5, _kAddUV128
sub edi, edx // stride from u to v
 convertloop :
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
...@@ -140,45 +238,208 @@ __asm {
}
}
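// What step 1 above computes, as a scalar sketch (names are illustrative):
// a rounding 2x2 box average per channel, vertical pavgb of the two rows
// first, then a horizontal pavgb of even/odd pixel pairs (shufps 0x88
// gathers the even ARGB pixels, 0xdd the odd ones). The sub edi, edx up
// front turns dst_v into an offset from dst_u so a single movhps can store
// V at [edx + edi].
static inline uint8 RoundAvg(uint8 a, uint8 b) {
  return (uint8)((a + b + 1) >> 1);  // pavgb rounding semantics
}
static inline uint8 Subsample2x2(const uint8* row0, const uint8* row1,
                                 int channel) {
  // channel + 4 is the same channel in the next ARGB pixel (4 bytes/pixel).
  return RoundAvg(RoundAvg(row0[channel], row1[channel]),
                  RoundAvg(row0[channel + 4], row1[channel + 4]));
}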
__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, _kBGRAToU
movdqa xmm6, _kBGRAToV
movdqa xmm5, _kAddUV128
sub edi, edx // stride from u to v
 convertloop :
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, _kABGRToU
movdqa xmm6, _kABGRToV
movdqa xmm5, _kAddUV128
sub edi, edx // stride from u to v
 convertloop :
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_bg24
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskBG24ToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:7] xmm1[8:15]}
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:11] xmm0[12:15]}
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
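// Scalar equivalent of one BG24 -> ARGB pixel (a sketch): pshufb places the
// B, G and R bytes and the por with the 0xff000000 mask forces alpha, 16
// pixels at a time.
static inline void BG24PixelToARGB(const uint8* bg24, uint8* argb) {
  argb[0] = bg24[0];  // B
  argb[1] = bg24[1];  // G
  argb[2] = bg24[2];  // R
  argb[3] = 255;      // A, as por with 0xff000000 does above
}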
__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_raw
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskRAWToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:7] xmm1[8:15]}
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:11] xmm0[12:15]}
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
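// RAWToARGBRow above is the same routine as BG24ToARGBRow but loads
// kShuffleMaskRAWToARGB, whose 2,1,0 byte order swaps R and B during the
// shuffle because RAW stores RGB rather than BGR byte order.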
......