Commit 9394ed99 authored by fbarchard@google.com's avatar fbarchard@google.com

ARGB To I420 and variations using row functions

BUG=none
TEST=media_unittests from talk, used to benchmark
Review URL: http://webrtc-codereview.appspot.com/254001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@51 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 7472021e
@@ -20,6 +20,9 @@ static const int kCpuHasSSSE3 = 2;
// These flags are only valid on ARM processors
static const int kCpuHasNEON = 4;

// Internal flag to indicate cpuid is initialized.
static const int kCpuInitialized = 8;

// Detect CPU has SSE2 etc.
bool TestCpuFlag(int flag);

...
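For orientation (my annotation, not part of the diff): the flags are bits in a single int, so a caller gates a SIMD path with one test. kCpuInitialized never corresponds to real hardware; it only records that detection has run, which keeps cpu_info_ nonzero afterwards. A minimal sketch, assuming the usual libyuv namespace:

// Hedged usage sketch: gate a SIMD code path on a detected feature.
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3)) {
  // safe to call the *_SSSE3 row functions declared in row.h
}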
@@ -636,147 +636,110 @@ int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
  return 0;
}

// ARGBToI420Row_C etc row functions use the following macro, generating
// code with RGB offsets/strides different for each version. Less error
// prone than duplicating the code.
// A template could be used, but the macro method works for C and asm, and
// this is performance critical code.
#define MAKEROWRGBTOI420(NAME,R,G,B,BPP) \
static void \
NAME(const uint8* src_row0, const uint8* src_row1, \
uint8* dst_yplane0, uint8* dst_yplane1, \
uint8* dst_u, \
uint8* dst_v, \
int width) { \
for (int x = 0; x < width - 1; x += 2) { \
dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \
src_row0[G] * 129 + \
src_row0[B] * 25 + 128) >> 8) + 16; \
dst_yplane0[1] = (uint8)((src_row0[R + BPP] * 66 + \
src_row0[G + BPP] * 129 + \
src_row0[B + BPP] * 25 + 128) >> 8) + 16; \
dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \
src_row1[G] * 129 + \
src_row1[B] * 25 + 128) >> 8) + 16; \
dst_yplane1[1] = (uint8)((src_row1[R + BPP] * 66 + \
src_row1[G + BPP] * 129 + \
src_row1[B + BPP] * 25 + 128) >> 8) + 16; \
dst_u[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
src_row1[R] + src_row1[R + BPP]) * -38 + \
(src_row0[G] + src_row0[G + BPP] + \
src_row1[G] + src_row1[G + BPP]) * -74 + \
(src_row0[B] + src_row0[B + BPP] + \
src_row1[B] + src_row1[B + BPP]) * 112 + \
512) >> 10) + 128; \
dst_v[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
src_row1[R] + src_row1[R + BPP]) * 112 + \
(src_row0[G] + src_row0[G + BPP] + \
src_row1[G] + src_row1[G + BPP]) * -94 + \
(src_row0[B] + src_row0[B + BPP] + \
src_row1[B] + src_row1[B + BPP]) * -18 + \
512) >> 10) + 128; \
dst_yplane0 += 2; \
dst_yplane1 += 2; \
++dst_u; \
++dst_v; \
src_row0 += BPP * 2; \
src_row1 += BPP * 2; \
} \
if (width & 1) { \
dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \
src_row0[G] * 129 + \
src_row0[B] * 25 + 128) >> 8) + 16; \
dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \
src_row1[G] * 129 + \
src_row1[B] * 25 + 128) >> 8) + 16; \
dst_u[0] = (uint8)(((src_row0[R] + \
src_row1[R]) * -38 + \
(src_row0[G] + \
src_row1[G]) * -74 + \
(src_row0[B] + \
src_row1[B]) * 112 + \
256) >> 9) + 128; \
dst_v[0] = (uint8)(((src_row0[R] + \
src_row1[R]) * 112 + \
(src_row0[G] + \
src_row1[G]) * -94 + \
(src_row0[B] + \
src_row1[B]) * -18 + \
256) >> 9) + 128; \
} \
}
// Generate variations of RGBToI420. Parameters are r,g,b offsets within a
// pixel, and number of bytes per pixel.
MAKEROWRGBTOI420(ARGBToI420Row_C, 2, 1, 0, 4)
MAKEROWRGBTOI420(BGRAToI420Row_C, 1, 2, 3, 4)
MAKEROWRGBTOI420(ABGRToI420Row_C, 0, 1, 2, 4)
MAKEROWRGBTOI420(RGB24ToI420Row_C, 2, 1, 0, 3)
MAKEROWRGBTOI420(RAWToI420Row_C, 0, 1, 2, 3)
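For reference (my annotation, standard BT.601 studio-swing math): with the offsets above, each generated row function computes, in 8-bit fixed point,

\[ Y = 16 + \frac{66R + 129G + 25B + 128}{256} \]
\[ U = 128 + \frac{-38\,\Sigma R - 74\,\Sigma G + 112\,\Sigma B + 512}{1024} \]
\[ V = 128 + \frac{112\,\Sigma R - 94\,\Sigma G - 18\,\Sigma B + 512}{1024} \]

where \( \Sigma R, \Sigma G, \Sigma B \) sum the 2x2 pixel block, so the shift by 10 divides by 4 (the average) times 256 (the coefficient scale). The weights 66/256, 129/256 and 25/256 are approximately 0.257, 0.504 and 0.098, the usual BT.601 luma coefficients, and the +128/+512 terms round the fixed-point divide.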
static int RGBToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height,
void (*RGBToI420Row)(const uint8* src_row0,
const uint8* src_row1,
uint8* dst_yplane0,
uint8* dst_yplane1,
uint8* dst_u,
uint8* dst_v,
int width)) {
  if (src_frame == NULL || dst_y == NULL ||
      dst_u == NULL || dst_v == NULL)
    return -1;
  if (height < 0) {
    height = -height;
    src_frame = src_frame + src_stride_frame * (height - 1);
    src_stride_frame = -src_stride_frame;
  }
  for (int y = 0; y < height - 1; y += 2) {
    RGBToI420Row(src_frame, src_frame + src_stride_frame,
                 dst_y, dst_y + dst_stride_y,
                 dst_u, dst_v,
                 width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    RGBToI420Row(src_frame, src_frame,
                 dst_y, dst_y,
                 dst_u, dst_v,
                 width);
  }
  return 0;
}

int ARGBToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = ARGBToYRow_SSSE3;
  } else
#endif
  {
    ARGBToYRow = ARGBToYRow_C;
  }
#if defined(HAS_ARGBTOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = ARGBToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = ARGBToUVRow_C;
  }
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
  }
  return 0;
}
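A hedged usage sketch (mine, not from the commit). IS_ALIGNED is assumed to test pointer alignment, e.g. ((uintptr_t)(p) & (n - 1)) == 0, so the SSSE3 paths engage only when buffers are 16-byte aligned, strides are multiples of 16 and width is a multiple of 16; otherwise the C rows run and produce the same result.

// Hedged example: convert one ARGB frame to I420. Plane sizes and strides
// here are my assumptions, not from this diff.
void ConvertFrame(const uint8* argb, uint8* y, uint8* u, uint8* v,
                  int width, int height) {
  int half = (width + 1) / 2;
  libyuv::ARGBToI420(argb, width * 4,  // ARGB stride in bytes
                     y, width,         // full-resolution Y plane
                     u, half,          // 2x2-subsampled U plane
                     v, half,
                     width, height);
}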
int ARGBToI420_Reference(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
return RGBToI420(src_frame, src_stride_frame,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
width, height, ARGBToI420Row_C);
}
int BGRAToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  return RGBToI420(src_frame, src_stride_frame,
                   dst_y, dst_stride_y,
                   dst_u, dst_stride_u,
                   dst_v, dst_stride_v,
                   width, height, BGRAToI420Row_C);
}

int BGRAToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_BGRATOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
ARGBToYRow = BGRAToYRow_SSSE3;
} else
#endif
{
ARGBToYRow = BGRAToYRow_C;
}
#if defined(HAS_BGRATOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
ARGBToUVRow = BGRAToUVRow_SSSE3;
} else
#endif
{
ARGBToUVRow = BGRAToUVRow_C;
}
for (int y = 0; y < (height - 1); y += 2) {
ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
src_frame += src_stride_frame * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
}
return 0;
}
@@ -784,11 +747,52 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
int ABGRToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  return RGBToI420(src_frame, src_stride_frame,
                   dst_y, dst_stride_y,
                   dst_u, dst_stride_u,
                   dst_v, dst_stride_v,
                   width, height, ABGRToI420Row_C);
}

int ABGRToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ABGRTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
ARGBToYRow = ABGRToYRow_SSSE3;
} else
#endif
{
ARGBToYRow = ABGRToYRow_C;
}
#if defined(HAS_ABGRTOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
ARGBToUVRow = ABGRToUVRow_SSSE3;
} else
#endif
{
ARGBToUVRow = ABGRToUVRow_C;
}
for (int y = 0; y < (height - 1); y += 2) {
ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
src_frame += src_stride_frame * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
}
return 0;
}
@@ -796,30 +800,59 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
  return RGBToI420(src_frame, src_stride_frame,
                   dst_y, dst_stride_y,
                   dst_u, dst_stride_u,
                   dst_v, dst_stride_v,
                   width, height, RGB24ToI420Row_C);
}

int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RGB24TOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
ARGBToYRow = RGB24ToYRow_SSSE3;
} else
#endif
{
ARGBToYRow = RGB24ToYRow_C;
}
#if defined(HAS_RGB24TOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
ARGBToUVRow = RGB24ToUVRow_SSSE3;
} else
#endif
{
ARGBToUVRow = RGB24ToUVRow_C;
}
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
  }
  return 0;
}

int RAWToI420(const uint8* src_frame, int src_stride_frame,
              uint8* dst_y, int dst_stride_y,
              uint8* dst_u, int dst_stride_u,
              uint8* dst_v, int dst_stride_v,
              int width, int height) {
  return RGBToI420(src_frame, src_stride_frame,
                   dst_y, dst_stride_y,
                   dst_u, dst_stride_u,
                   dst_v, dst_stride_v,
                   width, height, RAWToI420Row_C);
}

@@ -828,42 +861,42 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) {
    ARGBToYRow = ARGBToYRow_SSSE3;
  } else
#endif
  {
    ARGBToYRow = ARGBToYRow_C;
  }
#if defined(HAS_ARGBTOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 4) && (dst_stride_u % 4 == 0) &&
      IS_ALIGNED(dst_v, 4) && (dst_stride_v % 4 == 0)) {
    ARGBToUVRow = ARGBToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = ARGBToUVRow_C;
  }
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
  }
  return 0;
}

int RAWToI420(const uint8* src_frame, int src_stride_frame,
              uint8* dst_y, int dst_stride_y,
              uint8* dst_u, int dst_stride_u,
              uint8* dst_v, int dst_stride_v,
              int width, int height) {
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RAWTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = RAWToYRow_SSSE3;
  } else
#endif
  {
    ARGBToYRow = RAWToYRow_C;
  }
#if defined(HAS_RAWTOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = RAWToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = RAWToUVRow_C;
  }
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
  }
  return 0;
}

...
@@ -15,9 +15,6 @@
#include <intrin.h>
#endif

// Internal flag to indicate cpuid is initialized.
static const int kCpuInitialized = 16;

// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static inline void __cpuid(int cpu_info[4], int info_type) {

@@ -64,11 +61,11 @@ static void InitCpuFlags() {
void MaskCpuFlags(int enable_flags) {
  InitCpuFlags();
  cpu_info_ &= enable_flags;
}

bool TestCpuFlag(int flag) {
  if (0 == cpu_info_) {
    InitCpuFlags();
  }
  return cpu_info_ & flag ? true : false;

...
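Why the init bit matters here (my reading of the change): TestCpuFlag re-runs detection whenever cpu_info_ is zero, so a mask that cleared every bit would silently undo itself on the next query. Keeping kCpuInitialized set makes a mask stick; a hedged sketch:

// Hedged sketch, not from the commit: force the C paths for benchmarking.
// MaskCpuFlags(0) would zero cpu_info_, and the next TestCpuFlag call
// would re-run InitCpuFlags and restore the feature bits.
libyuv::MaskCpuFlags(libyuv::kCpuInitialized);              // features off
bool has_ssse3 = libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3); // now false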
@@ -14,6 +14,8 @@
#include "video_common.h"
#include "row.h"

#define kMaxStride (2048 * 4)

namespace libyuv {

// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers

@@ -329,6 +331,9 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
                   uint8* dst_u, int dst_stride_u,
                   uint8* dst_v, int dst_stride_v,
                   int width, int height) {
  if (width * 4 > kMaxStride) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
@@ -347,23 +352,29 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = ARGBToYRow_SSSE3;
  } else
#endif
  {
    ARGBToYRow = ARGBToYRow_C;
  }
#if defined(HAS_ARGBTOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = ARGBToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = ARGBToUVRow_C;
  }
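For orientation (my annotation): the Bayer path expands each Bayer row to ARGB in a stack scratch buffer and then reuses the ARGB row functions, which is why kMaxStride (2048 * 4 bytes per row) bounds the supported width and the function bails out early when width * 4 > kMaxStride, i.e. when width exceeds 2048 pixels.

// Assumed scratch layout for the SIMD_ALIGNED row[kMaxStride * 2] buffer:
//   row[0 .. kMaxStride)              ARGB for the even source row
//   row[kMaxStride .. 2 * kMaxStride) ARGB for the odd source row
// ARGBToUVRow(row, kMaxStride, ...) then treats kMaxStride as the stride
// between the two expanded rows.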
@@ -392,9 +403,9 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
    BayerRow0(src_bayer, src_stride_bayer, row, width);
    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
              row + kMaxStride, width);
    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
    ARGBToYRow(row, dst_y, width);
    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
    src_bayer += src_stride_bayer * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
@@ -403,8 +414,8 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
  // TODO(fbarchard): Make sure this filters properly
  if (height & 1) {
    BayerRow0(src_bayer, src_stride_bayer, row, width);
    ARGBToUVRow(row, 0, dst_u, dst_v, width);
    ARGBToYRow(row, dst_y, width);
  }
  return 0;
}

...
@@ -58,16 +58,6 @@ extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
// Shuffle table for converting BG24 to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_SSE2
__declspec(naked)

@@ -206,7 +196,7 @@ int I420Copy(const uint8* src_y, int src_stride_y,
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
  __asm__ volatile
  (
    "vdup.u32 q0, %2 \n" // duplicate 4 ints
    "1:\n"
    "vst1.u32 {q0}, [%0]! \n" // store
    "subs %1, %1, #16 \n" // 16 processed per loop
@@ -1282,85 +1272,6 @@ __asm {
  }
}
#define HAS_BG24TOARGBROW_SSSE3
__declspec(naked)
static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_bg24
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskBG24ToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
#define HAS_RAWTOARGBROW_SSSE3
__declspec(naked)
static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_raw
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskRAWToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)

@@ -1435,84 +1346,6 @@ static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
  );
}
#define HAS_BG24TOARGBROW_SSSE3
static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
"pslld $0x18,%%xmm7\n"
"movdqa (%3),%%xmm6\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm3\n"
"lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm6,%%xmm2\n"
"por %%xmm7,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm6,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n"
"por %%xmm7,%%xmm0\n"
"pshufb %%xmm6,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"por %%xmm7,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm6,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n"
"por %%xmm7,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
: "+r"(src_bg24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(kShuffleMaskBG24ToARGB) // %3
: "memory"
);
}
#define HAS_RAWTOARGBROW_SSSE3
static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
"pslld $0x18,%%xmm7\n"
"movdqa (%3),%%xmm6\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm3\n"
"lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm6,%%xmm2\n"
"por %%xmm7,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm6,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n"
"por %%xmm7,%%xmm0\n"
"pshufb %%xmm6,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"por %%xmm7,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm6,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n"
"por %%xmm7,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(kShuffleMaskRAWToARGB) // %3
: "memory"
);
}
#endif

static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {

@@ -1556,97 +1389,6 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
  return 0;
}
static void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
uint8 r = src_raw[0];
uint8 g = src_raw[1];
uint8 b = src_raw[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = 255u;
dst_argb += 4;
src_raw += 3;
}
}
// Convert RAW to ARGB.
int RAWToARGB(const uint8* src_raw, int src_stride_raw,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (height < 0) {
height = -height;
src_raw = src_raw + (height - 1) * src_stride_raw;
src_stride_raw = -src_stride_raw;
}
void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
} else
#endif
{
RAWToARGBRow = RAWToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
RAWToARGBRow(src_raw, dst_argb, width);
src_raw += src_stride_raw;
dst_argb += dst_stride_argb;
}
return 0;
}
static void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
uint8 b = src_bg24[0];
uint8 g = src_bg24[1];
uint8 r = src_bg24[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
    dst_argb[3] = 255u;
dst_argb += 4;
src_bg24 += 3;
}
}
// Convert BG24 to ARGB.
int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (height < 0) {
height = -height;
src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
src_stride_bg24 = -src_stride_bg24;
}
void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(HAS_BG24TOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
BG24ToARGBRow = BG24ToARGBRow_SSSE3;
} else
#endif
{
BG24ToARGBRow = BG24ToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
BG24ToARGBRow(src_bg24, dst_argb, width);
src_bg24 += src_stride_bg24;
dst_argb += dst_stride_argb;
}
return 0;
}
static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
    // To support in-place conversion.
@@ -1768,5 +1510,66 @@ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  return 0;
}
// Convert RAW to ARGB.
int RAWToARGB(const uint8* src_raw, int src_stride_raw,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (height < 0) {
height = -height;
src_raw = src_raw + (height - 1) * src_stride_raw;
src_stride_raw = -src_stride_raw;
}
void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
} else
#endif
{
RAWToARGBRow = RAWToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
RAWToARGBRow(src_raw, dst_argb, width);
src_raw += src_stride_raw;
dst_argb += dst_stride_argb;
}
return 0;
}
// Convert BG24 to ARGB.
int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (height < 0) {
height = -height;
src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
src_stride_bg24 = -src_stride_bg24;
}
void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(HAS_BG24TOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
BG24ToARGBRow = BG24ToARGBRow_SSSE3;
} else
#endif
{
BG24ToARGBRow = BG24ToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
BG24ToARGBRow(src_bg24, dst_argb, width);
src_bg24 += src_stride_bg24;
dst_argb += dst_stride_argb;
}
return 0;
}
} // namespace libyuv

@@ -497,6 +497,143 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
  );

#if defined (__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile(
"1:"
// Read in the data from the source pointer.
// First round of bit swap.
"movdqa (%0),%%xmm0\n"
"movdqa (%0,%3),%%xmm1\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm0,%%xmm8\n"
"punpcklbw %%xmm1,%%xmm0\n"
"punpckhbw %%xmm1,%%xmm8\n"
"movdqa (%0),%%xmm2\n"
"movdqa %%xmm0,%%xmm1\n"
"movdqa %%xmm8,%%xmm9\n"
"palignr $0x8,%%xmm1,%%xmm1\n"
"palignr $0x8,%%xmm9,%%xmm9\n"
"movdqa (%0,%3),%%xmm3\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm2,%%xmm10\n"
"punpcklbw %%xmm3,%%xmm2\n"
"punpckhbw %%xmm3,%%xmm10\n"
"movdqa %%xmm2,%%xmm3\n"
"movdqa %%xmm10,%%xmm11\n"
"movdqa (%0),%%xmm4\n"
"palignr $0x8,%%xmm3,%%xmm3\n"
"palignr $0x8,%%xmm11,%%xmm11\n"
"movdqa (%0,%3),%%xmm5\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm4,%%xmm12\n"
"punpcklbw %%xmm5,%%xmm4\n"
"punpckhbw %%xmm5,%%xmm12\n"
"movdqa %%xmm4,%%xmm5\n"
"movdqa %%xmm12,%%xmm13\n"
"movdqa (%0),%%xmm6\n"
"palignr $0x8,%%xmm5,%%xmm5\n"
"palignr $0x8,%%xmm13,%%xmm13\n"
"movdqa (%0,%3),%%xmm7\n"
"lea (%0,%3,2),%0\n"
"movdqa %%xmm6,%%xmm14\n"
"punpcklbw %%xmm7,%%xmm6\n"
"punpckhbw %%xmm7,%%xmm14\n"
"neg %3\n"
"movdqa %%xmm6,%%xmm7\n"
"movdqa %%xmm14,%%xmm15\n"
"lea 0x10(%0,%3,8),%0\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"palignr $0x8,%%xmm15,%%xmm15\n"
"neg %3\n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0\n"
"punpcklwd %%xmm3,%%xmm1\n"
"movdqa %%xmm0,%%xmm2\n"
"movdqa %%xmm1,%%xmm3\n"
"palignr $0x8,%%xmm2,%%xmm2\n"
"palignr $0x8,%%xmm3,%%xmm3\n"
"punpcklwd %%xmm6,%%xmm4\n"
"punpcklwd %%xmm7,%%xmm5\n"
"movdqa %%xmm4,%%xmm6\n"
"movdqa %%xmm5,%%xmm7\n"
"palignr $0x8,%%xmm6,%%xmm6\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"punpcklwd %%xmm10,%%xmm8\n"
"punpcklwd %%xmm11,%%xmm9\n"
"movdqa %%xmm8,%%xmm10\n"
"movdqa %%xmm9,%%xmm11\n"
"palignr $0x8,%%xmm10,%%xmm10\n"
"palignr $0x8,%%xmm11,%%xmm11\n"
"punpcklwd %%xmm14,%%xmm12\n"
"punpcklwd %%xmm15,%%xmm13\n"
"movdqa %%xmm12,%%xmm14\n"
"movdqa %%xmm13,%%xmm15\n"
"palignr $0x8,%%xmm14,%%xmm14\n"
"palignr $0x8,%%xmm15,%%xmm15\n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0\n"
"movq %%xmm0,(%1)\n"
"movdqa %%xmm0,%%xmm4\n"
"palignr $0x8,%%xmm4,%%xmm4\n"
"movq %%xmm4,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm6,%%xmm2\n"
"movdqa %%xmm2,%%xmm6\n"
"movq %%xmm2,(%1)\n"
"palignr $0x8,%%xmm6,%%xmm6\n"
"punpckldq %%xmm5,%%xmm1\n"
"movq %%xmm6,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"movdqa %%xmm1,%%xmm5\n"
"movq %%xmm1,(%1)\n"
"palignr $0x8,%%xmm5,%%xmm5\n"
"movq %%xmm5,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm7,%%xmm3\n"
"movq %%xmm3,(%1)\n"
"movdqa %%xmm3,%%xmm7\n"
"palignr $0x8,%%xmm7,%%xmm7\n"
"movq %%xmm7,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm12,%%xmm8\n"
"movq %%xmm8,(%1)\n"
"movdqa %%xmm8,%%xmm12\n"
"palignr $0x8,%%xmm12,%%xmm12\n"
"movq %%xmm12,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm14,%%xmm10\n"
"movdqa %%xmm10,%%xmm14\n"
"movq %%xmm10,(%1)\n"
"palignr $0x8,%%xmm14,%%xmm14\n"
"punpckldq %%xmm13,%%xmm9\n"
"movq %%xmm14,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"movdqa %%xmm9,%%xmm13\n"
"movq %%xmm9,(%1)\n"
"palignr $0x8,%%xmm13,%%xmm13\n"
"movq %%xmm13,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"punpckldq %%xmm15,%%xmm11\n"
"movq %%xmm11,(%1)\n"
"movdqa %%xmm11,%%xmm15\n"
"palignr $0x8,%%xmm15,%%xmm15\n"
"movq %%xmm15,(%1,%4)\n"
"lea (%1,%4,2),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(static_cast<intptr_t>(src_stride)), // %3
"r"(static_cast<intptr_t>(dst_stride)) // %4
: "memory"
);
}
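What the fast path computes (my annotation): each pass reads 8 rows of 16 bytes, runs three rounds of punpck interleaves (bytes, then words, then dwords) across xmm0-xmm15, and stores sixteen 8-byte columns, so the `sub $0x10,%2` retires 16 output rows per iteration; the extra xmm8-xmm15 registers are why this variant is 64-bit only. The scalar effect, for reference:

// Hedged scalar model of the transpose the SIMD code performs tile-by-tile.
static void TransposeTile_C(const unsigned char* src, int src_stride,
                            unsigned char* dst, int dst_stride,
                            int w, int h) {
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j)
      dst[j * dst_stride + i] = src[i * src_stride + j];
}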
#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
@@ -644,17 +781,26 @@ void TransposePlane(const uint8* src, int src_stride,
#if defined(HAS_TRANSPOSE_WX8_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_NEON;
    TransposeWxH = TransposeWxH_C;
  } else
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
TransposeWx8 = TransposeWx8_FAST_SSSE3;
TransposeWxH = TransposeWxH_C;
} else
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_SSSE3;
    TransposeWxH = TransposeWxH_C;
  } else
#endif

...
@@ -13,17 +13,91 @@
#include "libyuv/basic_types.h"

// The following are available on all x86 platforms
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BG24TOARGBROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOUVROW_SSSE3
#define HAS_RAWTOUVROW_SSSE3
#endif

// The following are available only on Windows
#if defined(WIN32) \
    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_BGRATOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#endif

extern "C" {
#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#endif
#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
#define HASRGB24TOYROW_SSSE3
#endif
#ifdef HASRGB24TOYROW_SSSE3
void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#endif
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#ifdef HAS_BG24TOARGBROW_SSSE3
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
#endif
void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#endif
#ifdef OSX
extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
#else
extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
#endif
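The header exposes matched SSSE3/C pairs so converters can bind a row function once per call; a condensed sketch of the dispatch idiom used throughout this commit (alignment checks omitted here for brevity):

// Hedged sketch of the runtime-dispatch idiom:
void (*ToYRow)(const uint8* src, uint8* dst, int pix) = ARGBToYRow_C;
#if defined(HAS_ARGBTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && (width % 16 == 0)) {
  ToYRow = ARGBToYRow_SSSE3;  // real callers also require 16-byte alignment
}
#endif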
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,

@@ -52,34 +126,6 @@ void FastConvertYToRGB32Row(const uint8* y_buf,
                            uint8* rgb_buf,
                            int width);
#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#endif
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#endif
#ifdef OSX
extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
#else
extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
#endif
// Method to force C version.
//#define USE_MMX 0
//#define USE_SSE2 0

...
@@ -23,6 +23,16 @@ extern "C" TALIGN16(const uint8, kAdd16[16]) = {
  1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
};
// Shuffle table for converting BG24 to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
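How the shuffle masks work (my annotation): pshufb computes out[i] = in[mask[i]], so each 16-byte mask turns 12 packed RGB bytes into four ARGB pixels; lanes 3, 7, 11 and 15 read leftover bytes 12-15 as placeholders, and the asm then forces them to 0xff with the 0xff000000 mask built in xmm7. A scalar model:

// Hedged scalar model of one pshufb step with kShuffleMaskBG24ToARGB.
void ShuffleBG24Block(const unsigned char in[16], unsigned char out[16]) {
  static const unsigned char kMask[16] =
      { 0, 1, 2, 12, 3, 4, 5, 13, 6, 7, 8, 14, 9, 10, 11, 15 };
  for (int i = 0; i < 16; ++i)
    out[i] = in[kMask[i]];          // pshufb
  for (int i = 3; i < 16; i += 4)
    out[i] = 0xff;                  // por with 0xff000000 per pixel
}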
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile(
  "movdqa (%3),%%xmm7\n"

@@ -55,47 +65,81 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
#endif
static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
  return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
}

static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
  return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
}

static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
  return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
}

void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
    src_argb0 += 4;
    dst_y += 1;
  }
}

void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
                   uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_argb1 = src_argb0 + src_stride_argb;
  for (int x = 0; x < width - 1; x += 2) {
    uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
    uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
    uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
    dst_u[0] = RGBToU(ar, ag, ab);
    dst_v[0] = RGBToV(ar, ag, ab);
    src_argb0 += 8;
    src_argb1 += 8;
    dst_u += 1;
    dst_v += 1;
  }
  if (width & 1) {
    uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1;
    uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1;
    uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1;
    dst_u[0] = RGBToU(ar, ag, ab);
    dst_v[0] = RGBToV(ar, ag, ab);
  }
}

#ifdef HAS_BG24TOARGBROW_SSSE3
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
  asm volatile(
  "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
  "pslld $0x18,%%xmm7\n"
  "movdqa (%3),%%xmm6\n"
"1:"
  "movdqa (%0),%%xmm0\n"
  "movdqa 0x10(%0),%%xmm1\n"
  "movdqa 0x20(%0),%%xmm3\n"
  "lea 0x30(%0),%0\n"
  "movdqa %%xmm3,%%xmm2\n"
  "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
  "pshufb %%xmm6,%%xmm2\n"
  "por %%xmm7,%%xmm2\n"
  "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
  "pshufb %%xmm6,%%xmm0\n"
  "movdqa %%xmm2,0x20(%1)\n"
  "por %%xmm7,%%xmm0\n"
  "pshufb %%xmm6,%%xmm1\n"
  "movdqa %%xmm0,(%1)\n"
  "por %%xmm7,%%xmm1\n"
  "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
  "pshufb %%xmm6,%%xmm3\n"
  "movdqa %%xmm1,0x10(%1)\n"
  "por %%xmm7,%%xmm3\n"
  "movdqa %%xmm3,0x30(%1)\n"
  "lea 0x40(%1),%1\n"
  "sub $0x10,%2\n"
  "ja 1b\n"
  : "+r"(src_bg24), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  : "r"(kShuffleMaskBG24ToARGB) // %3
  : "memory"
);
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile(
  "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
  "pslld $0x18,%%xmm7\n"
  "movdqa (%3),%%xmm6\n"
"1:"
  "movdqa (%0),%%xmm0\n"
  "movdqa 0x10(%0),%%xmm1\n"
  "movdqa 0x20(%0),%%xmm3\n"
  "lea 0x30(%0),%0\n"
  "movdqa %%xmm3,%%xmm2\n"
  "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
  "pshufb %%xmm6,%%xmm2\n"
  "por %%xmm7,%%xmm2\n"
  "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
  "pshufb %%xmm6,%%xmm0\n"
  "movdqa %%xmm2,0x20(%1)\n"
  "por %%xmm7,%%xmm0\n"
  "pshufb %%xmm6,%%xmm1\n"
  "movdqa %%xmm0,(%1)\n"
  "por %%xmm7,%%xmm1\n"
  "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
  "pshufb %%xmm6,%%xmm3\n"
  "movdqa %%xmm1,0x10(%1)\n"
  "por %%xmm7,%%xmm3\n"
  "movdqa %%xmm3,0x30(%1)\n"
  "lea 0x40(%1),%1\n"
  "sub $0x10,%2\n"
  "ja 1b\n"
  : "+r"(src_raw), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  : "r"(kShuffleMaskRAWToARGB) // %3
  : "memory"
);
}
#endif
#if defined(__x86_64__)

@@ -611,4 +655,5 @@ void FastConvertYToRGB32Row(const uint8* y_buf,
}
#endif

} // extern "C"

@@ -10,6 +10,8 @@
#include "row.h"

#define kMaxStride (2048 * 4)

extern "C" {

#define MAKETABLE(NAME) \

@@ -301,4 +303,167 @@ MAKETABLE(kCoefficientsAbgrY)
MAKETABLE(_kCoefficientsAbgrY)
#endif
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
uint8 r = src_raw[0];
uint8 g = src_raw[1];
uint8 b = src_raw[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = 255u;
dst_argb += 4;
src_raw += 3;
}
}
void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
uint8 b = src_bg24[0];
uint8 g = src_bg24[1];
uint8 r = src_bg24[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
    dst_argb[3] = 255u;
dst_argb += 4;
src_bg24 += 3;
}
}
// The C versions use the same two-step approach as the SSSE3 wrappers below:
// expand the row to ARGB, then run the ARGB row function on the scratch row.
void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
BG24ToARGBRow_C(src_argb, row, pix);
ARGBToYRow_C(row, dst_y, pix);
}
void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
RAWToARGBRow_C(src_argb, row, pix);
ARGBToYRow_C(row, dst_y, pix);
}
void RGB24ToUVRow_C(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
BG24ToARGBRow_C(src_argb, row, pix);
BG24ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
}
void RAWToUVRow_C(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
RAWToARGBRow_C(src_argb, row, pix);
RAWToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
}
static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
}
static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
}
static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
}
#define MAKEROWY(NAME,R,G,B) \
void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
for (int x = 0; x < width; ++x) { \
dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
src_argb0 += 4; \
dst_y += 1; \
} \
} \
void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
uint8* dst_u, uint8* dst_v, int width) { \
const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
for (int x = 0; x < width - 1; x += 2) { \
uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \
src_rgb1[B] + src_rgb1[B + 4]) >> 2; \
uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \
src_rgb1[G] + src_rgb1[G + 4]) >> 2; \
uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \
src_rgb1[R] + src_rgb1[R + 4]) >> 2; \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
src_rgb0 += 8; \
src_rgb1 += 8; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
} \
}
MAKEROWY(ARGB,2,1,0)
MAKEROWY(BGRA,1,2,3)
MAKEROWY(ABGR,0,1,2)
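For reference (mechanical macro expansion, not new code in the diff): MAKEROWY(ARGB,2,1,0) instantiates the C row pair with R, G, B at bytes 2, 1, 0 of each 4-byte pixel, e.g. the Y half expands to:

void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
    src_argb0 += 4;
    dst_y += 1;
  }
}

The UV half follows the same pattern with the 2x2 averaging shown in the macro body.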
#if defined(HAS_RAWTOYROW_SSSE3)
void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
BG24ToARGBRow_SSSE3(src_argb, row, pix);
ARGBToYRow_SSSE3(row, dst_y, pix);
}
void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride]);
RAWToARGBRow_SSSE3(src_argb, row, pix);
ARGBToYRow_SSSE3(row, dst_y, pix);
}
#endif
#if defined(HAS_RAWTOUVROW_SSSE3)
#if defined(HAS_ARGBTOUVROW_SSSE3)
void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
BG24ToARGBRow_SSSE3(src_argb, row, pix);
BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
RAWToARGBRow_SSSE3(src_argb, row, pix);
RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
#else
void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
BG24ToARGBRow_SSSE3(src_argb, row, pix);
BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
}
void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
RAWToARGBRow_SSSE3(src_argb, row, pix);
RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
}
#endif
#endif
} // extern "C" } // extern "C"
...@@ -16,59 +16,160 @@ extern "C" { ...@@ -16,59 +16,160 @@ extern "C" {
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var #define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
// Constant multiplication table for converting ARGB to I400. // Constant multiplication table for converting ARGB to I400.
extern "C" TALIGN16(const int8, kRGBToY[16]) = { extern "C" TALIGN16(const int8, kARGBToY[16]) = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
}; };
extern "C" TALIGN16(const int8, kRGBToU[16]) = { extern "C" TALIGN16(const int8, kARGBToU[16]) = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
}; };
extern "C" TALIGN16(const int8, kRGBToV[16]) = { extern "C" TALIGN16(const int8, kARGBToV[16]) = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
}; };
// Constants for BGRA
extern "C" TALIGN16(const int8, kBGRAToY[16]) = {
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};
extern "C" TALIGN16(const int8, kBGRAToU[16]) = {
0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};
extern "C" TALIGN16(const int8, kBGRAToV[16]) = {
0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};
// Constants for ABGR
extern "C" TALIGN16(const int8, kABGRToY[16]) = {
33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};
extern "C" TALIGN16(const int8, kABGRToU[16]) = {
-38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};
extern "C" TALIGN16(const int8, kABGRToV[16]) = {
112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};
extern "C" TALIGN16(const uint8, kAddY16[16]) = { extern "C" TALIGN16(const uint8, kAddY16[16]) = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
}; };
extern "C" TALIGN16(const uint8, kAddUV128[16]) = { extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting BG24 to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
}; };
// Shuffle table for converting RAW to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_y
    mov ecx, [esp + 12] // pix
    movdqa xmm7, _kRGBToY
    movdqa xmm6, _kAddY16
    pcmpeqb xmm5, xmm5 // Generate mask 0x0000ffff
    psrld xmm5, 16
 convertloop :
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm0, xmm7
    lea eax, [eax + 32]
    pmaddubsw xmm1, xmm7 // BG ra BG ra BG ra BG ra
    palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx
    paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx
    pand xmm2, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
    palignr xmm3, xmm1, 2
    paddw xmm3, xmm1
    pand xmm3, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
    packssdw xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
    psrlw xmm2, 7 // 0B xx 0B xx 0B xx 0B xx
    packuswb xmm2, xmm2
    paddb xmm2, xmm6
    movq qword ptr [edx], xmm2
    lea edx, [edx + 8]
    sub ecx, 8
    ja convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm7, _kARGBToY
    movdqa xmm6, _kAddY16
 convertloop :
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm6
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    ja convertloop
    ret
  }
}
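How the new 16-pixel loop reduces ARGB to Y (my walkthrough): pmaddubsw multiplies each unsigned byte by its signed coefficient and sums adjacent pairs, so with kARGBToY = {13, 65, 33, 0} a pixel's words become 13B + 65G and 33R + 0; phaddw then adds those into one word per pixel, psrlw 7 divides by 128, and paddb adds the 16 offset. A scalar equivalent for one pixel:

// Hedged scalar equivalent of the SSSE3 Y computation (note: truncating
// shift, no rounding term, exactly as in the asm).
unsigned char ArgbPixelToY(unsigned char b, unsigned char g, unsigned char r) {
  int y = (13 * b + 65 * g + 33 * r) >> 7;  // pmaddubsw + phaddw + psrlw 7
  return (unsigned char)(y + 16);           // paddb _kAddY16
}

13/128, 65/128 and 33/128 are the same 0.098/0.504/0.257 BT.601 weights the C code uses, just scaled to 7 fractional bits so all 16 results fit one register.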
__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm7, _kBGRAToY
movdqa xmm6, _kAddY16
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pmaddubsw xmm0, xmm7
pmaddubsw xmm1, xmm7
pmaddubsw xmm2, xmm7
pmaddubsw xmm3, xmm7
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm6
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
movdqa xmm7, _kABGRToY
movdqa xmm6, _kAddY16
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pmaddubsw xmm0, xmm7
pmaddubsw xmm1, xmm7
pmaddubsw xmm2, xmm7
pmaddubsw xmm3, xmm7
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm6
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
ja convertloop
ret
}
}
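These three Y rows are drop-in variants selected at runtime. A sketch of how a caller might dispatch on TestCpuFlag (the scalar fallback name and the guard conditions are my assumptions; movdqa and the 'sub ecx, 16' loop require 16-byte-aligned pointers and widths that are multiples of 16):

#include <stdint.h>

// Assumes the file's uint8 is unsigned char; ARGBToYRow_C is hypothetical.
typedef void (*ToYRowFn)(const unsigned char*, unsigned char*, int);

ToYRowFn ChooseARGBToYRow(const unsigned char* src, int width) {
  if (TestCpuFlag(kCpuHasSSSE3) &&
      ((uintptr_t)src & 15) == 0 && (width % 16) == 0) {
    return ARGBToYRow_SSSE3;
  }
  return ARGBToYRow_C;  // assumed scalar fallback
}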
...@@ -84,55 +185,52 @@ __asm {
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, _kARGBToU
movdqa xmm6, _kARGBToV
movdqa xmm5, _kAddUV128
sub edi, edx // stride from u to v
convertloop :
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
...@@ -140,45 +238,208 @@ __asm {
}
}
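A scalar model of the UV row above (my sketch): box-average each 2x2 block of BGRA-ordered pixels across the two rows, then apply the kARGBToU/kARGBToV coefficients. The asm's pavgb rounds each average while this sketch truncates, so results can differ by one LSB; 'pix' counts source pixels, matching the asm's 'sub ecx, 16'.

void ARGBToUVRow_Sketch(const unsigned char* src_argb, int src_stride_argb,
                        unsigned char* dst_u, unsigned char* dst_v, int pix) {
  const unsigned char* row1 = src_argb + src_stride_argb;
  for (int i = 0; i < pix; i += 2) {
    int b = (src_argb[0] + src_argb[4] + row1[0] + row1[4]) >> 2;
    int g = (src_argb[1] + src_argb[5] + row1[1] + row1[5]) >> 2;
    int r = (src_argb[2] + src_argb[6] + row1[2] + row1[6]) >> 2;
    *dst_u++ = (unsigned char)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    *dst_v++ = (unsigned char)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
    src_argb += 8;
    row1 += 8;
  }
}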
__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, _kBGRAToU
movdqa xmm6, _kBGRAToV
movdqa xmm5, _kAddUV128
sub edi, edx // stride from u to v
convertloop :
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
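The 'sub edi, edx' near the top of these UV functions is a pointer trick: once edi holds dst_v - dst_u, the loop advances only edx yet reaches both planes. A sketch of the equivalent pointer math (it mirrors the asm exactly; in portable C it presumes both planes live in one allocation, since subtracting unrelated pointers is not strictly defined):

#include <stddef.h>

void StoreUVPair_Sketch(unsigned char* dst_u, unsigned char* dst_v,
                        const unsigned char u8[8], const unsigned char v8[8]) {
  ptrdiff_t v_off = dst_v - dst_u;  // edi after 'sub edi, edx'
  for (int i = 0; i < 8; ++i) {
    dst_u[i] = u8[i];          // movlps qword ptr [edx]
    dst_u[v_off + i] = v8[i];  // movhps qword ptr [edx + edi]
  }
}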
__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, _kABGRToU
movdqa xmm6, _kABGRToV
movdqa xmm5, _kAddUV128
sub edi, edx // stride from u to v
convertloop :
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
pavgb xmm0, [eax + esi]
pavgb xmm1, [eax + esi + 16]
pavgb xmm2, [eax + esi + 32]
pavgb xmm3, [eax + esi + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm4, xmm1, 0xdd
pavgb xmm0, xmm4
movdqa xmm4, xmm2
shufps xmm2, xmm3, 0x88
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm2, xmm7
pmaddubsw xmm1, xmm6 // V
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
psraw xmm0, 8
psraw xmm1, 8
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
sub ecx, 16
ja convertloop
pop edi
pop esi
ret
}
}
__declspec(naked)
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_bg24
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskBG24ToARGB

convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm1[8:15] xmm3[0:7] }
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm0[12:15] xmm1[0:11] }
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15] xmm3[0:3] }
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}

__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_raw
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskRAWToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm1[8:15] xmm3[0:7] }
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm0[12:15] xmm1[0:11] }
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15] xmm3[0:3] }
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
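Both 24-bit rows consume 48 packed source bytes and emit 64 ARGB bytes per iteration, so pix must be a multiple of 16 with 16-byte-aligned buffers. The only difference between them is source byte order; a scalar sketch (mine) of the RAW variant, where RAW stores R,G,B in memory so B comes from the third byte:

void RAWToARGBRow_Sketch(const unsigned char* src_raw, unsigned char* dst_argb,
                         int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[4 * i + 0] = src_raw[3 * i + 2];  // B
    dst_argb[4 * i + 1] = src_raw[3 * i + 1];  // G
    dst_argb[4 * i + 2] = src_raw[3 * i + 0];  // R
    dst_argb[4 * i + 3] = 255;                 // A, forced by the 0xff000000 mask
  }
}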
...