Commit b6149763 authored by fbarchard@google.com

ARGB to and from I420 ported to x64

BUG=none
TEST=media_unittests
Review URL: http://webrtc-codereview.appspot.com/266003

git-svn-id: http://libyuv.googlecode.com/svn/trunk@61 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 755de365
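
For orientation, a minimal round-trip sketch of the two conversions this change ports (ARGB to I420 and back). Buffer names and dimensions are illustrative, the planar_functions.h include is an assumption, and the argument order follows the calls that appear in the patch below.

#include <vector>
#include "libyuv/convert.h"            // ARGBToI420
#include "libyuv/planar_functions.h"   // I420ToARGB (assumed header)

void RoundTripArgbI420() {
  const int width = 64, height = 48;   // hypothetical even dimensions
  std::vector<uint8> argb(width * height * 4), argb_out(width * height * 4);
  std::vector<uint8> y(width * height);
  std::vector<uint8> u((width / 2) * (height / 2));
  std::vector<uint8> v((width / 2) * (height / 2));

  // ARGB -> I420: pack the frame into planar Y, U and V.
  libyuv::ARGBToI420(&argb[0], width * 4,
                     &y[0], width,
                     &u[0], width / 2,
                     &v[0], width / 2,
                     width, height);

  // I420 -> ARGB: expand the planes back to a packed frame.
  libyuv::I420ToARGB(&y[0], width,
                     &u[0], width / 2,
                     &v[0], width / 2,
                     &argb_out[0], width * 4,
                     width, height);
}
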
...@@ -19,15 +19,6 @@ ...@@ -19,15 +19,6 @@
#ifndef INT_TYPES_DEFINED #ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED #define INT_TYPES_DEFINED
#ifdef COMPILER_MSVC
typedef __int64 int64;
#else
typedef long long int64;
#endif /* COMPILER_MSVC */
typedef int int32;
typedef short int16;
typedef char int8;
#ifdef COMPILER_MSVC #ifdef COMPILER_MSVC
typedef unsigned __int64 uint64; typedef unsigned __int64 uint64;
typedef __int64 int64; typedef __int64 int64;
...@@ -38,7 +29,18 @@ typedef __int64 int64; ...@@ -38,7 +29,18 @@ typedef __int64 int64;
#define UINT64_C(x) x ## UI64 #define UINT64_C(x) x ## UI64
#endif #endif
#define INT64_F "I64" #define INT64_F "I64"
#else #else // COMPILER_MSVC
#ifdef __LP64__
typedef unsigned long uint64;
typedef long int64;
#ifndef INT64_C
#define INT64_C(x) x ## L
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## UL
#endif
#define INT64_F "l"
#else // __LP64__
typedef unsigned long long uint64; typedef unsigned long long uint64;
typedef long long int64; typedef long long int64;
#ifndef INT64_C #ifndef INT64_C
...@@ -48,10 +50,14 @@ typedef long long int64; ...@@ -48,10 +50,14 @@ typedef long long int64;
#define UINT64_C(x) x ## ULL #define UINT64_C(x) x ## ULL
#endif #endif
#define INT64_F "ll" #define INT64_F "ll"
#endif /* COMPILER_MSVC */ #endif // __LP64__
#endif // COMPILER_MSVC
typedef unsigned int uint32; typedef unsigned int uint32;
typedef int int32;
typedef unsigned short uint16; typedef unsigned short uint16;
typedef short int16;
typedef unsigned char uint8; typedef unsigned char uint8;
typedef char int8;
#endif // INT_TYPES_DEFINED #endif // INT_TYPES_DEFINED
// Detect compiler is for x86 or x64. // Detect compiler is for x86 or x64.
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#define INCLUDE_LIBYUV_CONVERT_H_ #define INCLUDE_LIBYUV_CONVERT_H_
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/rotate.h"
namespace libyuv { namespace libyuv {
...@@ -92,6 +93,17 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, ...@@ -92,6 +93,17 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame, uint8* dst_frame, int dst_stride_frame,
int width, int height); int width, int height);
} // namespace libyuv // Convert camera sample to I420 with cropping, rotation and vertical flip.
int ConvertToI420(const uint8* src_frame, size_t src_size,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int horiz_crop, int vert_crop,
int w, int h,
int dw, int idh,
RotationMode rotation,
uint32 format);
} // namespace libyuv
#endif // INCLUDE_LIBYUV_CONVERT_H_ #endif // INCLUDE_LIBYUV_CONVERT_H_
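
A hedged sketch of driving the ConvertToI420() declaration added above, for a packed YUY2 camera frame with no cropping or rotation. FOURCC_YUY2 lives in the library's video_common.h and kRotate0 in rotate.h; their exact spelling and namespacing are assumed here, and the frame size is illustrative.

#include <vector>
#include "libyuv/convert.h"
#include "libyuv/rotate.h"

void CameraSampleToI420(const uint8* sample, size_t sample_size) {
  const int w = 640, h = 480;          // captured frame size (assumed)
  std::vector<uint8> y(w * h);
  std::vector<uint8> u((w / 2) * (h / 2));
  std::vector<uint8> v((w / 2) * (h / 2));

  int rv = libyuv::ConvertToI420(sample, sample_size,
                                 &y[0], w,
                                 &u[0], w / 2,
                                 &v[0], w / 2,
                                 0, 0,             // horiz_crop, vert_crop
                                 w, h,             // source width / height
                                 w, h,             // dw, idh: no crop
                                 libyuv::kRotate0,
                                 libyuv::FOURCC_YUY2);
  // 0 on success; -1 for an unsupported fourcc such as FOURCC_MJPG.
  (void)rv;
}
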
...@@ -27,7 +27,9 @@ static const int kCpuInitialized = 8; ...@@ -27,7 +27,9 @@ static const int kCpuInitialized = 8;
bool TestCpuFlag(int flag); bool TestCpuFlag(int flag);
// For testing, allow CPU flags to be disabled. // For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. -1 to enable all. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// -1 to enable all cpu specific optimizations.
// 0 to disable all cpu specific optimizations.
void MaskCpuFlags(int enable_flags); void MaskCpuFlags(int enable_flags);
} // namespace libyuv } // namespace libyuv
......
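
The MaskCpuFlags() comment above describes a testing hook; a short sketch of the intended pattern, using the flag constants declared in this header:

#include "libyuv/cpu_id.h"

void ForceCFallbacksInTest() {
  libyuv::MaskCpuFlags(~libyuv::kCpuHasSSSE3);   // disable only SSSE3 paths
  bool ssse3 = libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3);  // now false
  // ... run the conversion under test; it will take the C row functions ...
  libyuv::MaskCpuFlags(-1);                      // restore all optimizations
  (void)ssse3;
}
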
...@@ -13,7 +13,11 @@ ...@@ -13,7 +13,11 @@
#include "conversion_tables.h" #include "conversion_tables.h"
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "row.h" #include "row.h"
#include "video_common.h"
//#define SCALEOPT //Currently for windows only. June 2010 //#define SCALEOPT //Currently for windows only. June 2010
...@@ -650,7 +654,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame, ...@@ -650,7 +654,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ARGBTOYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
...@@ -661,7 +665,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame, ...@@ -661,7 +665,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
ARGBToYRow = ARGBToYRow_C; ARGBToYRow = ARGBToYRow_C;
} }
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
...@@ -703,7 +707,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame, ...@@ -703,7 +707,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_BGRATOYROW_SSSE3) #if defined(HAS_BGRATOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
...@@ -714,7 +718,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame, ...@@ -714,7 +718,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
ARGBToYRow = BGRAToYRow_C; ARGBToYRow = BGRAToYRow_C;
} }
#if defined(HAS_BGRATOUVROW_SSSE3) #if defined(HAS_BGRATOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
...@@ -756,7 +760,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame, ...@@ -756,7 +760,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ABGRTOYROW_SSSE3) #if defined(HAS_ABGRTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
...@@ -767,7 +771,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame, ...@@ -767,7 +771,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
ARGBToYRow = ABGRToYRow_C; ARGBToYRow = ABGRToYRow_C;
} }
#if defined(HAS_ABGRTOUVROW_SSSE3) #if defined(HAS_ABGRTOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
...@@ -809,7 +813,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame, ...@@ -809,7 +813,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RGB24TOYROW_SSSE3) #if defined(HAS_RGB24TOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
...@@ -820,7 +824,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame, ...@@ -820,7 +824,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
ARGBToYRow = RGB24ToYRow_C; ARGBToYRow = RGB24ToYRow_C;
} }
#if defined(HAS_RGB24TOUVROW_SSSE3) #if defined(HAS_RGB24TOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
...@@ -862,7 +866,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame, ...@@ -862,7 +866,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RAWTOYROW_SSSE3) #if defined(HAS_RAWTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
...@@ -873,7 +877,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame, ...@@ -873,7 +877,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
ARGBToYRow = RAWToYRow_C; ARGBToYRow = RAWToYRow_C;
} }
#if defined(HAS_RAWTOUVROW_SSSE3) #if defined(HAS_RAWTOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
...@@ -901,4 +905,163 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame, ...@@ -901,4 +905,163 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
return 0; return 0;
} }
// Convert camera sample to I420 with cropping, rotation and vertical flip.
int ConvertToI420(const uint8* sample, size_t sample_size,
uint8* y, int y_stride,
uint8* u, int u_stride,
uint8* v, int v_stride,
int horiz_crop, int vert_crop,
int w, int h,
int dw, int idh,
RotationMode rotation,
uint32 format) {
int aw = (w + 1) & ~1;
const uint8* src;
const uint8* src_uv;
int abs_h = (h < 0) ? -h : h;
switch (format) {
// Single plane formats
case FOURCC_YUY2:
src = sample + (aw * vert_crop + horiz_crop) * 2;
YUY2ToI420(src, aw * 2,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_UYVY:
src = sample + (aw * vert_crop + horiz_crop) * 2;
UYVYToI420(src, aw * 2,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_24BG:
src = sample + (w * vert_crop + horiz_crop) * 3;
RGB24ToI420(src, w * 3,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_RAW:
src = sample + (w * vert_crop + horiz_crop) * 3;
RAWToI420(src, w * 3,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_ARGB:
src = sample + (w * vert_crop + horiz_crop) * 4;
ARGBToI420(src, w * 4,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_BGRA:
src = sample + (w * vert_crop + horiz_crop) * 4;
BGRAToI420(src, w * 4,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_ABGR:
src = sample + (w * vert_crop + horiz_crop) * 4;
ABGRToI420(src, w * 4,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_BGGR:
case FOURCC_RGGB:
case FOURCC_GRBG:
case FOURCC_GBRG:
// TODO(fbarchard): We could support cropping by odd numbers by
// adjusting fourcc.
src = sample + (w * vert_crop + horiz_crop);
BayerRGBToI420(src, w, format,
y, y_stride, u, u_stride, v, v_stride,
dw, idh);
break;
// Biplanar formats
case FOURCC_M420:
src = sample + (w * vert_crop) * 12 / 8 + horiz_crop;
M420ToI420(src, w,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_NV12:
src = sample + (w * vert_crop + horiz_crop);
src_uv = sample + aw * (h + vert_crop / 2) + horiz_crop;
NV12ToI420Rotate(src, w,
src_uv, aw,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh, rotation);
break;
case FOURCC_NV21:
src = sample + (w * vert_crop + horiz_crop);
src_uv = sample + aw * (h + vert_crop / 2) + horiz_crop;
// Call NV12 but with u and v parameters swapped.
NV12ToI420Rotate(src, w,
src_uv, aw,
y, y_stride,
v, v_stride,
u, u_stride,
dw, idh, rotation);
break;
case FOURCC_Q420:
src = sample + (w + aw * 2) * vert_crop + horiz_crop;
src_uv = sample + (w + aw * 2) * vert_crop + w + horiz_crop * 2;
Q420ToI420(src, w * 3,
src_uv, w * 3,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
const uint8* src_y = sample + (w * vert_crop + horiz_crop);
const uint8* src_u;
const uint8* src_v;
int halfwidth = (w + 1) / 2;
int halfheight = (abs_h + 1) / 2;
if (format == FOURCC_I420) {
src_u = sample + w * abs_h +
(halfwidth * vert_crop + horiz_crop) / 2;
src_v = sample + w * abs_h +
halfwidth * (halfheight + vert_crop / 2) + horiz_crop / 2;
} else {
src_v = sample + w * abs_h +
(halfwidth * vert_crop + horiz_crop) / 2;
src_u = sample + w * abs_h +
halfwidth * (halfheight + vert_crop / 2) + horiz_crop / 2;
}
I420Rotate(src_y, w,
src_u, halfwidth,
src_v, halfwidth,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh, rotation);
break;
}
// Formats not supported
case FOURCC_MJPG:
default:
return -1; // unknown fourcc - return failure code.
}
return 0;
}
} // namespace libyuv } // namespace libyuv
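
A worked example of the cropping parameters handled by ConvertToI420() above: a centered 320x240 crop out of a 640x480 ARGB sample with no rotation. The crop origin math mirrors the FOURCC_ARGB branch (vert_crop rows and horiz_crop pixels are skipped before conversion); names, sizes, and the fourcc/rotation constant spellings are assumptions as in the earlier sketch.

#include "libyuv/convert.h"
#include "libyuv/rotate.h"

void CenterCropArgbToI420(const uint8* sample, size_t sample_size,
                          uint8* y, uint8* u, uint8* v) {
  const int w = 640, h = 480;           // source dimensions
  const int dw = 320, idh = 240;        // destination (cropped) dimensions
  const int horiz_crop = (w - dw) / 2;  // 160 pixels skipped on the left
  const int vert_crop = (h - idh) / 2;  // 120 rows skipped on top
  libyuv::ConvertToI420(sample, sample_size,
                        y, dw,
                        u, dw / 2,
                        v, dw / 2,
                        horiz_crop, vert_crop,
                        w, h,
                        dw, idh,
                        libyuv::kRotate0,
                        libyuv::FOURCC_ARGB);
}
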
...@@ -14,11 +14,14 @@ ...@@ -14,11 +14,14 @@
#ifdef _MSC_VER #ifdef _MSC_VER
#include <intrin.h> #include <intrin.h>
#endif #endif
#ifdef __ANDROID__
#include <cpu-features.h>
#endif
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. // TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static inline void __cpuid(int cpu_info[4], int info_type) { static inline void __cpuid(int cpu_info[4], int info_type) {
__asm__ volatile ( asm volatile (
"mov %%ebx, %%edi\n" "mov %%ebx, %%edi\n"
"cpuid\n" "cpuid\n"
"xchg %%edi, %%ebx\n" "xchg %%edi, %%ebx\n"
...@@ -28,7 +31,7 @@ static inline void __cpuid(int cpu_info[4], int info_type) { ...@@ -28,7 +31,7 @@ static inline void __cpuid(int cpu_info[4], int info_type) {
} }
#elif defined(__i386__) || defined(__x86_64__) #elif defined(__i386__) || defined(__x86_64__)
static inline void __cpuid(int cpu_info[4], int info_type) { static inline void __cpuid(int cpu_info[4], int info_type) {
__asm__ volatile ( asm volatile (
"cpuid\n" "cpuid\n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type) : "a"(info_type)
...@@ -49,6 +52,10 @@ static void InitCpuFlags() { ...@@ -49,6 +52,10 @@ static void InitCpuFlags() {
cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) | cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
(cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) | (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
kCpuInitialized; kCpuInitialized;
#elif defined(__ANDROID__) && defined(__ARM_NEON__)
uint64_t features = android_getCpuFeatures();
cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
kCpuInitialized;
#elif defined(__ARM_NEON__) #elif defined(__ARM_NEON__)
// gcc -mfpu=neon defines __ARM_NEON__ // gcc -mfpu=neon defines __ARM_NEON__
// Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
...@@ -61,14 +68,14 @@ static void InitCpuFlags() { ...@@ -61,14 +68,14 @@ static void InitCpuFlags() {
void MaskCpuFlags(int enable_flags) { void MaskCpuFlags(int enable_flags) {
InitCpuFlags(); InitCpuFlags();
cpu_info_ &= enable_flags; cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
} }
bool TestCpuFlag(int flag) { bool TestCpuFlag(int flag) {
if (0 == cpu_info_) { if (0 == cpu_info_) {
InitCpuFlags(); InitCpuFlags();
} }
return cpu_info_ & flag ? true : false; return (cpu_info_ & flag) ? true : false;
} }
} // namespace libyuv } // namespace libyuv
...@@ -14,8 +14,6 @@ ...@@ -14,8 +14,6 @@
#include "video_common.h" #include "video_common.h"
#include "row.h" #include "row.h"
#define kMaxStride (2048 * 4)
namespace libyuv { namespace libyuv {
// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
...@@ -168,7 +166,7 @@ static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer, ...@@ -168,7 +166,7 @@ static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 g = src_bayer0[1]; uint8 g = src_bayer0[1];
uint8 r = src_bayer1[1]; uint8 r = src_bayer1[1];
for (int x = 0; x < (pix - 2); x += 2) { for (int x = 0; x < pix - 3; x += 2) {
dst_rgb[0] = src_bayer0[0]; dst_rgb[0] = src_bayer0[0];
dst_rgb[1] = AVG(g, src_bayer0[1]); dst_rgb[1] = AVG(g, src_bayer0[1]);
dst_rgb[2] = AVG(r, src_bayer1[1]); dst_rgb[2] = AVG(r, src_bayer1[1]);
...@@ -187,10 +185,12 @@ static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer, ...@@ -187,10 +185,12 @@ static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
dst_rgb[1] = AVG(g, src_bayer0[1]); dst_rgb[1] = AVG(g, src_bayer0[1]);
dst_rgb[2] = AVG(r, src_bayer1[1]); dst_rgb[2] = AVG(r, src_bayer1[1]);
dst_rgb[3] = 255U; dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer0[0]; if (pix & 1) {
dst_rgb[5] = src_bayer0[1]; dst_rgb[4] = src_bayer0[0];
dst_rgb[6] = src_bayer1[1]; dst_rgb[5] = src_bayer0[1];
dst_rgb[7] = 255U; dst_rgb[6] = src_bayer1[1];
dst_rgb[7] = 255U;
}
} }
static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer, static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
...@@ -198,7 +198,7 @@ static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer, ...@@ -198,7 +198,7 @@ static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 g = src_bayer0[1]; uint8 g = src_bayer0[1];
uint8 b = src_bayer1[1]; uint8 b = src_bayer1[1];
for (int x = 0; x < (pix - 2); x += 2) { for (int x = 0; x < pix - 3; x += 2) {
dst_rgb[0] = AVG(b, src_bayer1[1]); dst_rgb[0] = AVG(b, src_bayer1[1]);
dst_rgb[1] = AVG(g, src_bayer0[1]); dst_rgb[1] = AVG(g, src_bayer0[1]);
dst_rgb[2] = src_bayer0[0]; dst_rgb[2] = src_bayer0[0];
...@@ -217,17 +217,19 @@ static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer, ...@@ -217,17 +217,19 @@ static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
dst_rgb[1] = AVG(g, src_bayer0[1]); dst_rgb[1] = AVG(g, src_bayer0[1]);
dst_rgb[2] = src_bayer0[0]; dst_rgb[2] = src_bayer0[0];
dst_rgb[3] = 255U; dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer1[1]; if (pix & 1) {
dst_rgb[5] = src_bayer0[1]; dst_rgb[4] = src_bayer1[1];
dst_rgb[6] = src_bayer0[0]; dst_rgb[5] = src_bayer0[1];
dst_rgb[7] = 255U; dst_rgb[6] = src_bayer0[0];
dst_rgb[7] = 255U;
}
} }
static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer, static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
uint8* dst_rgb, int pix) { uint8* dst_rgb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 b = src_bayer0[1]; uint8 b = src_bayer0[1];
for (int x = 0; x < (pix - 2); x += 2) { for (int x = 0; x < pix - 3; x += 2) {
dst_rgb[0] = AVG(b, src_bayer0[1]); dst_rgb[0] = AVG(b, src_bayer0[1]);
dst_rgb[1] = src_bayer0[0]; dst_rgb[1] = src_bayer0[0];
dst_rgb[2] = src_bayer1[0]; dst_rgb[2] = src_bayer1[0];
...@@ -245,17 +247,19 @@ static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer, ...@@ -245,17 +247,19 @@ static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
dst_rgb[1] = src_bayer0[0]; dst_rgb[1] = src_bayer0[0];
dst_rgb[2] = src_bayer1[0]; dst_rgb[2] = src_bayer1[0];
dst_rgb[3] = 255U; dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer0[1]; if (pix & 1) {
dst_rgb[5] = src_bayer0[0]; dst_rgb[4] = src_bayer0[1];
dst_rgb[6] = src_bayer1[0]; dst_rgb[5] = src_bayer0[0];
dst_rgb[7] = 255U; dst_rgb[6] = src_bayer1[0];
dst_rgb[7] = 255U;
}
} }
static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer, static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
uint8* dst_rgb, int pix) { uint8* dst_rgb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 r = src_bayer0[1]; uint8 r = src_bayer0[1];
for (int x = 0; x < (pix - 2); x += 2) { for (int x = 0; x < pix - 3; x += 2) {
dst_rgb[0] = src_bayer1[0]; dst_rgb[0] = src_bayer1[0];
dst_rgb[1] = src_bayer0[0]; dst_rgb[1] = src_bayer0[0];
dst_rgb[2] = AVG(r, src_bayer0[1]); dst_rgb[2] = AVG(r, src_bayer0[1]);
...@@ -273,10 +277,12 @@ static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer, ...@@ -273,10 +277,12 @@ static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
dst_rgb[1] = src_bayer0[0]; dst_rgb[1] = src_bayer0[0];
dst_rgb[2] = AVG(r, src_bayer0[1]); dst_rgb[2] = AVG(r, src_bayer0[1]);
dst_rgb[3] = 255U; dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer1[0]; if (pix & 1) {
dst_rgb[5] = src_bayer0[0]; dst_rgb[4] = src_bayer1[0];
dst_rgb[6] = src_bayer0[1]; dst_rgb[5] = src_bayer0[0];
dst_rgb[7] = 255U; dst_rgb[6] = src_bayer0[1];
dst_rgb[7] = 255U;
}
} }
// Converts any Bayer RGB format to ARGB. // Converts any Bayer RGB format to ARGB.
...@@ -315,7 +321,7 @@ int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer, ...@@ -315,7 +321,7 @@ int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
break; break;
} }
for (int y = 0; y < (height - 1); y += 2) { for (int y = 0; y < height - 1; y += 2) {
BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width); BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width);
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
dst_rgb + dst_stride_rgb, width); dst_rgb + dst_stride_rgb, width);
...@@ -403,7 +409,7 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, ...@@ -403,7 +409,7 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
break; break;
} }
for (int y = 0; y < (height - 1); y += 2) { for (int y = 0; y < height - 1; y += 2) {
BayerRow0(src_bayer, src_stride_bayer, row, width); BayerRow0(src_bayer, src_stride_bayer, row, width);
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
row + kMaxStride, width); row + kMaxStride, width);
......
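
A hedged sketch of the BayerRGBToI420() path patched above, demosaicing a BGGR sensor frame straight to I420. The format_conversion.h include and the FOURCC_BGGR spelling are assumptions; width and height should be even, as the row loops above expect.

#include "libyuv/format_conversion.h"   // BayerRGBToI420 (assumed header)

void BggrToI420(const uint8* bayer, int w, int h,
                uint8* y, uint8* u, uint8* v) {
  // One byte per pixel in the Bayer mosaic; stride equals width here.
  libyuv::BayerRGBToI420(bayer, w, libyuv::FOURCC_BGGR,
                         y, w,
                         u, w / 2,
                         v, w / 2,
                         w, h);
}
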
...@@ -26,11 +26,11 @@ static void SplitUV_NEON(const uint8* src_uv, ...@@ -26,11 +26,11 @@ static void SplitUV_NEON(const uint8* src_uv,
__asm__ volatile __asm__ volatile
( (
"1:\n" "1:\n"
"vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV "vld2.u8 {q0,q1}, [%0]!\n" // load 16 pairs of UV
"vst1.u8 {q0}, [%1]! \n" // store U "vst1.u8 {q0}, [%1]!\n" // store U
"vst1.u8 {q1}, [%2]! \n" // Store V "vst1.u8 {q1}, [%2]!\n" // Store V
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16\n" // 16 processed per loop
"bhi 1b \n" "bhi 1b\n"
: "+r"(src_uv), : "+r"(src_uv),
"+r"(dst_u), "+r"(dst_u),
"+r"(dst_v), "+r"(dst_v),
...@@ -48,16 +48,6 @@ static void SplitUV_NEON(const uint8* src_uv, ...@@ -48,16 +48,6 @@ static void SplitUV_NEON(const uint8* src_uv,
#define TALIGN16(t, var) t var __attribute__((aligned(16))) #define TALIGN16(t, var) t var __attribute__((aligned(16)))
#endif #endif
// Shuffle table for converting ABGR to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};
// Shuffle table for converting BGRA to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
#if defined(WIN32) && !defined(COVERAGE_ENABLED) #if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_SSE2 #define HAS_SPLITUV_SSE2
__declspec(naked) __declspec(naked)
...@@ -69,8 +59,8 @@ static void SplitUV_SSE2(const uint8* src_uv, ...@@ -69,8 +59,8 @@ static void SplitUV_SSE2(const uint8* src_uv,
mov edx, [esp + 4 + 8] // dst_u mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix mov ecx, [esp + 4 + 16] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm7, 8 psrlw xmm5, 8
wloop: wloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
...@@ -78,8 +68,8 @@ static void SplitUV_SSE2(const uint8* src_uv, ...@@ -78,8 +68,8 @@ static void SplitUV_SSE2(const uint8* src_uv,
lea eax, [eax + 32] lea eax, [eax + 32]
movdqa xmm2, xmm0 movdqa xmm2, xmm0
movdqa xmm3, xmm1 movdqa xmm3, xmm1
pand xmm0, xmm7 // even bytes pand xmm0, xmm5 // even bytes
pand xmm1, xmm7 pand xmm1, xmm5
packuswb xmm0, xmm1 packuswb xmm0, xmm1
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
...@@ -101,16 +91,16 @@ static void SplitUV_SSE2(const uint8* src_uv, ...@@ -101,16 +91,16 @@ static void SplitUV_SSE2(const uint8* src_uv,
static void SplitUV_SSE2(const uint8* src_uv, static void SplitUV_SSE2(const uint8* src_uv,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
asm volatile( asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" "pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm7\n" "psrlw $0x8,%%xmm5\n"
"1:" "1:"
"movdqa (%0),%%xmm0\n" "movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n" "movdqa 0x10(%0),%%xmm1\n"
"lea 0x20(%0),%0\n" "lea 0x20(%0),%0\n"
"movdqa %%xmm0,%%xmm2\n" "movdqa %%xmm0,%%xmm2\n"
"movdqa %%xmm1,%%xmm3\n" "movdqa %%xmm1,%%xmm3\n"
"pand %%xmm7,%%xmm0\n" "pand %%xmm5,%%xmm0\n"
"pand %%xmm7,%%xmm1\n" "pand %%xmm5,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n" "packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,(%1)\n" "movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n" "lea 0x10(%1),%1\n"
...@@ -126,7 +116,10 @@ static void SplitUV_SSE2(const uint8* src_uv, ...@@ -126,7 +116,10 @@ static void SplitUV_SSE2(const uint8* src_uv,
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+r"(pix) // %3 "+r"(pix) // %3
: :
: "memory" : "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
); );
} }
#endif #endif
...@@ -196,15 +189,15 @@ int I420Copy(const uint8* src_y, int src_stride_y, ...@@ -196,15 +189,15 @@ int I420Copy(const uint8* src_y, int src_stride_y,
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) { static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
__asm__ volatile __asm__ volatile
( (
"vdup.u32 q0, %2 \n" // duplicate 4 ints "vdup.u32 q0, %2\n" // duplicate 4 ints
"1:\n" "1:\n"
"vst1.u32 {q0}, [%0]! \n" // store "vst1.u32 {q0}, [%0]!\n" // store
"subs %1, %1, #16 \n" // 16 processed per loop "subs %1, %1, #16\n" // 16 processed per loop
"bhi 1b \n" "bhi 1b\n"
: "+r"(dst), // %0 : "+r"(dst), // %0
"+r"(count) // %1 "+r"(count) // %1
: "r"(v32) // %2 : "r"(v32) // %2
: "q0", "memory" : "q0", "memory", "cc"
); );
} }
...@@ -214,12 +207,12 @@ __declspec(naked) ...@@ -214,12 +207,12 @@ __declspec(naked)
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
__asm { __asm {
mov eax, [esp + 4] // dst mov eax, [esp + 4] // dst
movd xmm7, [esp + 8] // v32 movd xmm5, [esp + 8] // v32
mov ecx, [esp + 12] // count mov ecx, [esp + 12] // count
pshufd xmm7, xmm7, 0 pshufd xmm5, xmm5, 0
wloop: wloop:
movdqa [eax], xmm7 movdqa [eax], xmm5
lea eax, [eax + 16] lea eax, [eax + 16]
sub ecx, 16 sub ecx, 16
ja wloop ja wloop
...@@ -233,17 +226,20 @@ static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { ...@@ -233,17 +226,20 @@ static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
#define HAS_SETROW_SSE2 #define HAS_SETROW_SSE2
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
asm volatile( asm volatile(
"movd %2, %%xmm7\n" "movd %2, %%xmm5\n"
"pshufd $0x0,%%xmm7,%%xmm7\n" "pshufd $0x0,%%xmm5,%%xmm5\n"
"1:" "1:"
"movdqa %%xmm7,(%0)\n" "movdqa %%xmm5,(%0)\n"
"lea 0x10(%0),%0\n" "lea 0x10(%0),%0\n"
"sub $0x10,%1\n" "sub $0x10,%1\n"
"ja 1b\n" "ja 1b\n"
: "+r"(dst), // %0 : "+r"(dst), // %0
"+r"(count) // %1 "+r"(count) // %1
: "r"(v32) // %2 : "r"(v32) // %2
: "memory" : "memory", "cc"
#if defined(__SSE2__)
, "xmm5"
#endif
); );
} }
#endif #endif
...@@ -257,13 +253,13 @@ static void I420SetPlane(uint8* dst_y, int dst_stride_y, ...@@ -257,13 +253,13 @@ static void I420SetPlane(uint8* dst_y, int dst_stride_y,
int value) { int value) {
void (*SetRow)(uint8* dst, uint32 value, int pix); void (*SetRow)(uint8* dst, uint32 value, int pix);
#if defined(HAS_SETROW_NEON) #if defined(HAS_SETROW_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && if (TestCpuFlag(kCpuHasNEON) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
SetRow = SetRow32_NEON; SetRow = SetRow32_NEON;
} else } else
#elif defined(HAS_SETROW_SSE2) #elif defined(HAS_SETROW_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
SetRow = SetRow32_SSE2; SetRow = SetRow32_SSE2;
...@@ -418,7 +414,7 @@ static int X420ToI420(const uint8* src_y, ...@@ -418,7 +414,7 @@ static int X420ToI420(const uint8* src_y,
int halfwidth = (width + 1) >> 1; int halfwidth = (width + 1) >> 1;
void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITUV_NEON) #if defined(HAS_SPLITUV_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && if (TestCpuFlag(kCpuHasNEON) &&
(halfwidth % 16 == 0) && (halfwidth % 16 == 0) &&
IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) && IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) && IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
...@@ -426,7 +422,7 @@ static int X420ToI420(const uint8* src_y, ...@@ -426,7 +422,7 @@ static int X420ToI420(const uint8* src_y,
SplitUV = SplitUV_NEON; SplitUV = SplitUV_NEON;
} else } else
#elif defined(HAS_SPLITUV_SSE2) #elif defined(HAS_SPLITUV_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
(halfwidth % 16 == 0) && (halfwidth % 16 == 0) &&
IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) && IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) && IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
...@@ -510,8 +506,8 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, ...@@ -510,8 +506,8 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
mov esi, [esp + 8 + 12] // dst_u mov esi, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm7, 8 psrlw xmm5, 8
wloop: wloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
...@@ -519,8 +515,8 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, ...@@ -519,8 +515,8 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
lea eax, [eax + 32] lea eax, [eax + 32]
movdqa xmm2, xmm0 movdqa xmm2, xmm0
movdqa xmm3, xmm1 movdqa xmm3, xmm1
pand xmm2, xmm7 // even bytes are Y pand xmm2, xmm5 // even bytes are Y
pand xmm3, xmm7 pand xmm3, xmm5
packuswb xmm2, xmm3 packuswb xmm2, xmm3
movdqa [edx], xmm2 movdqa [edx], xmm2
lea edx, [edx + 16] lea edx, [edx + 16]
...@@ -528,7 +524,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, ...@@ -528,7 +524,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
psrlw xmm1, 8 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
movdqa xmm1, xmm0 movdqa xmm1, xmm0
pand xmm0, xmm7 // U pand xmm0, xmm5 // U
packuswb xmm0, xmm0 packuswb xmm0, xmm0
movq qword ptr [esi], xmm0 movq qword ptr [esi], xmm0
lea esi, [esi + 8] lea esi, [esi + 8]
...@@ -551,16 +547,16 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, ...@@ -551,16 +547,16 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
asm volatile( asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" "pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm7\n" "psrlw $0x8,%%xmm5\n"
"1:" "1:"
"movdqa (%0),%%xmm0\n" "movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n" "movdqa 0x10(%0),%%xmm1\n"
"lea 0x20(%0),%0\n" "lea 0x20(%0),%0\n"
"movdqa %%xmm0,%%xmm2\n" "movdqa %%xmm0,%%xmm2\n"
"movdqa %%xmm1,%%xmm3\n" "movdqa %%xmm1,%%xmm3\n"
"pand %%xmm7,%%xmm2\n" "pand %%xmm5,%%xmm2\n"
"pand %%xmm7,%%xmm3\n" "pand %%xmm5,%%xmm3\n"
"packuswb %%xmm3,%%xmm2\n" "packuswb %%xmm3,%%xmm2\n"
"movdqa %%xmm2,(%1)\n" "movdqa %%xmm2,(%1)\n"
"lea 0x10(%1),%1\n" "lea 0x10(%1),%1\n"
...@@ -568,7 +564,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, ...@@ -568,7 +564,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
"psrlw $0x8,%%xmm1\n" "psrlw $0x8,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n" "packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,%%xmm1\n" "movdqa %%xmm0,%%xmm1\n"
"pand %%xmm7,%%xmm0\n" "pand %%xmm5,%%xmm0\n"
"packuswb %%xmm0,%%xmm0\n" "packuswb %%xmm0,%%xmm0\n"
"movq %%xmm0,(%2)\n" "movq %%xmm0,(%2)\n"
"lea 0x8(%2),%2\n" "lea 0x8(%2),%2\n"
...@@ -584,7 +580,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, ...@@ -584,7 +580,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "memory" : "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
); );
} }
#endif #endif
...@@ -626,7 +625,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, ...@@ -626,7 +625,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
void (*SplitYUY2)(const uint8* src_yuy2, void (*SplitYUY2)(const uint8* src_yuy2,
uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix); uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITYUY2_SSE2) #if defined(HAS_SPLITYUY2_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) && IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
...@@ -662,15 +661,15 @@ void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, ...@@ -662,15 +661,15 @@ void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
mov eax, [esp + 4] // src_yuy2 mov eax, [esp + 4] // src_yuy2
mov edx, [esp + 8] // dst_y mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm7, 8 psrlw xmm5, 8
wloop: wloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
movdqa xmm1, [eax + 16] movdqa xmm1, [eax + 16]
lea eax, [eax + 32] lea eax, [eax + 32]
pand xmm0, xmm7 // even bytes are Y pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm7 pand xmm1, xmm5
packuswb xmm0, xmm1 packuswb xmm0, xmm1
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
...@@ -691,8 +690,8 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, ...@@ -691,8 +690,8 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
mov edx, [esp + 8 + 12] // dst_u mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm7, 8 psrlw xmm5, 8
wloop: wloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
...@@ -706,7 +705,7 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, ...@@ -706,7 +705,7 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
psrlw xmm1, 8 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
movdqa xmm1, xmm0 movdqa xmm1, xmm0
pand xmm0, xmm7 // U pand xmm0, xmm5 // U
packuswb xmm0, xmm0 packuswb xmm0, xmm0
movq qword ptr [edx], xmm0 movq qword ptr [edx], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
...@@ -758,8 +757,8 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -758,8 +757,8 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
mov edx, [esp + 8 + 12] // dst_u mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm7, 8 psrlw xmm5, 8
wloop: wloop:
movdqa xmm0, [eax] movdqa xmm0, [eax]
...@@ -769,11 +768,11 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -769,11 +768,11 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
lea eax, [eax + 32] lea eax, [eax + 32]
pavgb xmm0, xmm2 pavgb xmm0, xmm2
pavgb xmm1, xmm3 pavgb xmm1, xmm3
pand xmm0, xmm7 // UYVY -> UVUV pand xmm0, xmm5 // UYVY -> UVUV
pand xmm1, xmm7 pand xmm1, xmm5
packuswb xmm0, xmm1 packuswb xmm0, xmm1
movdqa xmm1, xmm0 movdqa xmm1, xmm0
pand xmm0, xmm7 // U pand xmm0, xmm5 // U
packuswb xmm0, xmm0 packuswb xmm0, xmm0
movq qword ptr [edx], xmm0 movq qword ptr [edx], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
...@@ -797,14 +796,14 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -797,14 +796,14 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) { uint8* dst_y, int pix) {
asm volatile( asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" "pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm7\n" "psrlw $0x8,%%xmm5\n"
"1:" "1:"
"movdqa (%0),%%xmm0\n" "movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n" "movdqa 0x10(%0),%%xmm1\n"
"lea 0x20(%0),%0\n" "lea 0x20(%0),%0\n"
"pand %%xmm7,%%xmm0\n" "pand %%xmm5,%%xmm0\n"
"pand %%xmm7,%%xmm1\n" "pand %%xmm5,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n" "packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,(%1)\n" "movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n" "lea 0x10(%1),%1\n"
...@@ -814,15 +813,18 @@ static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, ...@@ -814,15 +813,18 @@ static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "memory" : "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
); );
} }
static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix) { uint8* dst_u, uint8* dst_y, int pix) {
asm volatile( asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" "pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm7\n" "psrlw $0x8,%%xmm5\n"
"1:" "1:"
"movdqa (%0),%%xmm0\n" "movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n" "movdqa 0x10(%0),%%xmm1\n"
...@@ -835,7 +837,7 @@ static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, ...@@ -835,7 +837,7 @@ static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
"psrlw $0x8,%%xmm1\n" "psrlw $0x8,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n" "packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,%%xmm1\n" "movdqa %%xmm0,%%xmm1\n"
"pand %%xmm7,%%xmm0\n" "pand %%xmm5,%%xmm0\n"
"packuswb %%xmm0,%%xmm0\n" "packuswb %%xmm0,%%xmm0\n"
"movq %%xmm0,(%1)\n" "movq %%xmm0,(%1)\n"
"lea 0x8(%1),%1\n" "lea 0x8(%1),%1\n"
...@@ -850,7 +852,10 @@ static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, ...@@ -850,7 +852,10 @@ static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
"+r"(dst_y), // %2 "+r"(dst_y), // %2
"+r"(pix) // %3 "+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_yuy2)) // %4 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
: "memory" : "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
); );
} }
#define HAS_UYVYTOI420ROW_SSE2 #define HAS_UYVYTOI420ROW_SSE2
...@@ -872,15 +877,18 @@ static void UYVYToI420RowY_SSE2(const uint8* src_uyvy, ...@@ -872,15 +877,18 @@ static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "memory" : "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
); );
} }
static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix) { uint8* dst_u, uint8* dst_y, int pix) {
asm volatile( asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" "pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm7\n" "psrlw $0x8,%%xmm5\n"
"1:" "1:"
"movdqa (%0),%%xmm0\n" "movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n" "movdqa 0x10(%0),%%xmm1\n"
...@@ -889,11 +897,11 @@ static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -889,11 +897,11 @@ static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
"lea 0x20(%0),%0\n" "lea 0x20(%0),%0\n"
"pavgb %%xmm2,%%xmm0\n" "pavgb %%xmm2,%%xmm0\n"
"pavgb %%xmm3,%%xmm1\n" "pavgb %%xmm3,%%xmm1\n"
"pand %%xmm7,%%xmm0\n" "pand %%xmm5,%%xmm0\n"
"pand %%xmm7,%%xmm1\n" "pand %%xmm5,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n" "packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,%%xmm1\n" "movdqa %%xmm0,%%xmm1\n"
"pand %%xmm7,%%xmm0\n" "pand %%xmm5,%%xmm0\n"
"packuswb %%xmm0,%%xmm0\n" "packuswb %%xmm0,%%xmm0\n"
"movq %%xmm0,(%1)\n" "movq %%xmm0,(%1)\n"
"lea 0x8(%1),%1\n" "lea 0x8(%1),%1\n"
...@@ -908,7 +916,10 @@ static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -908,7 +916,10 @@ static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
"+r"(dst_y), // %2 "+r"(dst_y), // %2
"+r"(pix) // %3 "+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_uyvy)) // %4 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
: "memory" : "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
); );
} }
#endif #endif
...@@ -975,7 +986,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, ...@@ -975,7 +986,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
void (*YUY2ToI420RowY)(const uint8* src_yuy2, void (*YUY2ToI420RowY)(const uint8* src_yuy2,
uint8* dst_y, int pix); uint8* dst_y, int pix);
#if defined(HAS_YUY2TOI420ROW_SSE2) #if defined(HAS_YUY2TOI420ROW_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) && IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
...@@ -1022,7 +1033,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -1022,7 +1033,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
void (*UYVYToI420RowY)(const uint8* src_uyvy, void (*UYVYToI420RowY)(const uint8* src_uyvy,
uint8* dst_y, int pix); uint8* dst_y, int pix);
#if defined(HAS_UYVYTOI420ROW_SSE2) #if defined(HAS_UYVYTOI420ROW_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) && IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
...@@ -1053,7 +1064,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, ...@@ -1053,7 +1064,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
} }
// Convert I420 to ARGB. // Convert I420 to ARGB.
// TODO(fbarchard): Add SSE2 version and supply C version for fallback.
int I420ToARGB(const uint8* src_y, int src_stride_y, int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v, const uint8* src_v, int src_stride_v,
...@@ -1065,8 +1075,34 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, ...@@ -1065,8 +1075,34 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 4 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow4_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
if (width % 2 == 0) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
}
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width); FastConvertYUVToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
...@@ -1074,7 +1110,7 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, ...@@ -1074,7 +1110,7 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
src_v += src_stride_v; src_v += src_stride_v;
} }
} }
// MMX used for FastConvertYUVToRGB32Row requires an emms instruction. // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
EMMS(); EMMS();
return 0; return 0;
} }
...@@ -1091,6 +1127,25 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, ...@@ -1091,6 +1127,25 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*FastConvertYUVToBGRARow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_MMX)
if (width % 2 == 0) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_MMX;
} else
#endif
{
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C;
}
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width); FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
...@@ -1104,7 +1159,7 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, ...@@ -1104,7 +1159,7 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Convert I420 to BGRA. // Convert I420 to ABGR.
int I420ToABGR(const uint8* src_y, int src_stride_y, int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v, const uint8* src_v, int src_stride_v,
...@@ -1116,6 +1171,25 @@ int I420ToABGR(const uint8* src_y, int src_stride_y, ...@@ -1116,6 +1171,25 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*FastConvertYUVToABGRRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOABGRROW_MMX)
if (width % 2 == 0) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_MMX;
} else
#endif
{
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C;
}
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width); FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
...@@ -1141,14 +1215,33 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, ...@@ -1141,14 +1215,33 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
if (width % 2 == 0) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
}
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width); FastConvertYUVToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
src_u += src_stride_u; src_u += src_stride_u;
src_v += src_stride_v; src_v += src_stride_v;
} }
// MMX used for FastConvertYUVToRGB32Row requires an emms instruction. // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
EMMS(); EMMS();
return 0; return 0;
} }
...@@ -1165,14 +1258,31 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, ...@@ -1165,14 +1258,31 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*FastConvertYUV444ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_MMX;
#else
{
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C;
}
#endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width); FastConvertYUV444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
src_u += src_stride_u; src_u += src_stride_u;
src_v += src_stride_v; src_v += src_stride_v;
} }
// MMX used for FastConvertYUVToRGB32Row requires an emms instruction. // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
EMMS(); EMMS();
return 0; return 0;
} }
...@@ -1187,178 +1297,34 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, ...@@ -1187,178 +1297,34 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
void (*FastConvertYToARGBRow)(const uint8* y_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0) &&
IS_ALIGNED(dst_argb, 8) && (dst_stride_argb % 8 == 0)) {
FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
if (width % 2 == 0) {
FastConvertYToARGBRow = FastConvertYToARGBRow_MMX;
} else
#endif
{
FastConvertYToARGBRow = FastConvertYToARGBRow_C;
}
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
FastConvertYToRGB32Row(src_y, dst_argb, width); FastConvertYToARGBRow(src_y, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
} }
// MMX used for FastConvertYUVToRGB32Row requires an emms instruction. // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
EMMS(); EMMS();
return 0; return 0;
} }
// TODO(fbarchard): 64 bit version
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_I400TOARGBROW_SSE2
__declspec(naked)
static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_y
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
wloop:
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
punpcklbw xmm0, xmm0
movdqa xmm1, xmm0
punpcklwd xmm0, xmm0
punpckhwd xmm1, xmm1
por xmm0, xmm7
por xmm1, xmm7
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
ja wloop
ret
}
}
#define HAS_ABGRTOARGBROW_SSSE3
__declspec(naked)
static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_abgr
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
movdqa xmm7, _kShuffleMaskABGRToARGB
convertloop :
movdqa xmm0, [eax]
lea eax, [eax + 16]
pshufb xmm0, xmm7
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
ja convertloop
ret
}
}
#define HAS_BGRATOARGBROW_SSSE3
__declspec(naked)
static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_bgra
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
movdqa xmm7, _kShuffleMaskBGRAToARGB
convertloop :
movdqa xmm0, [eax]
lea eax, [eax + 16]
pshufb xmm0, xmm7
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
ja convertloop
ret
}
}
#elif (defined(__x86_64__) || defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
// TODO(yuche): consider moving ARGB related codes to a separate file.
#define HAS_I400TOARGBROW_SSE2
static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n"
"pslld $0x18,%%xmm7\n"
"1:"
"movq (%0),%%xmm0\n"
"lea 0x8(%0),%0\n"
"punpcklbw %%xmm0,%%xmm0\n"
"movdqa %%xmm0,%%xmm1\n"
"punpcklwd %%xmm0,%%xmm0\n"
"punpckhwd %%xmm1,%%xmm1\n"
"por %%xmm7,%%xmm0\n"
"por %%xmm7,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"movdqa %%xmm1,0x10(%1)\n"
"lea 0x20(%1),%1\n"
"sub $0x8,%2\n"
"ja 1b\n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
: "memory"
);
}
#define HAS_ABGRTOARGBROW_SSSE3
static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
int pix) {
asm volatile(
"movdqa (%3),%%xmm7\n"
"1:"
"movdqa (%0),%%xmm0\n"
"lea 0x10(%0),%0\n"
"pshufb %%xmm7,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x4,%2\n"
"ja 1b\n"
: "+r"(src_abgr), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(kShuffleMaskABGRToARGB) // %3
: "memory"
);
}
#define HAS_BGRATOARGBROW_SSSE3
static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
int pix) {
asm volatile(
"movdqa (%3),%%xmm7\n"
"1:"
"movdqa (%0),%%xmm0\n"
"lea 0x10(%0),%0\n"
"pshufb %%xmm7,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x4,%2\n"
"ja 1b\n"
: "+r"(src_bgra), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(kShuffleMaskBGRAToARGB) // %3
: "memory"
);
}
#endif
static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
// Copy a Y to RGB.
for (int x = 0; x < pix; ++x) {
uint8 y = src_y[0];
dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
dst_argb[3] = 255u;
dst_argb += 4;
++src_y;
}
}
// Convert I400 to ARGB. // Convert I400 to ARGB.
int I400ToARGB(const uint8* src_y, int src_stride_y, int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
...@@ -1370,7 +1336,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, ...@@ -1370,7 +1336,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
} }
void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix); void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix);
#if defined(HAS_I400TOARGBROW_SSE2) #if defined(HAS_I400TOARGBROW_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
(width % 8 == 0) && (width % 8 == 0) &&
IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) && IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
...@@ -1389,22 +1355,6 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, ...@@ -1389,22 +1355,6 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
// To support in-place conversion.
uint8 r = src_abgr[0];
uint8 g = src_abgr[1];
uint8 b = src_abgr[2];
uint8 a = src_abgr[3];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = a;
dst_argb += 4;
src_abgr += 4;
}
}
int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
...@@ -1415,7 +1365,7 @@ int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, ...@@ -1415,7 +1365,7 @@ int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
} }
void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix); void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix);
#if defined(HAS_ABGRTOARGBROW_SSSE3) #if defined(HAS_ABGRTOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 4 == 0) && (width % 4 == 0) &&
IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) && IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
...@@ -1434,22 +1384,6 @@ void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix); ...@@ -1434,22 +1384,6 @@ void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix);
return 0; return 0;
} }
static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
// To support in-place conversion.
uint8 a = src_bgra[0];
uint8 r = src_bgra[1];
uint8 g = src_bgra[2];
uint8 b = src_bgra[3];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = a;
dst_argb += 4;
src_bgra += 4;
}
}
// Convert BGRA to ARGB. // Convert BGRA to ARGB.
int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
...@@ -1461,7 +1395,7 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, ...@@ -1461,7 +1395,7 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
} }
void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix); void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix);
#if defined(HAS_BGRATOARGBROW_SSSE3) #if defined(HAS_BGRATOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 4 == 0) && (width % 4 == 0) &&
IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) && IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
...@@ -1491,7 +1425,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, ...@@ -1491,7 +1425,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
} }
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
#if defined(HAS_ARGBTOYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 4 == 0) && (width % 4 == 0) &&
IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) && IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
...@@ -1522,7 +1456,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, ...@@ -1522,7 +1456,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
} }
void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix); void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
#if defined(HAS_RAWTOARGBROW_SSSE3) #if defined(HAS_RAWTOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) && IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
...@@ -1552,7 +1486,7 @@ int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24, ...@@ -1552,7 +1486,7 @@ int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
} }
void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix); void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(HAS_BG24TOARGBROW_SSSE3) #if defined(HAS_BG24TOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) && (width % 16 == 0) &&
IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) && IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
......
...@@ -13,9 +13,13 @@ ...@@ -13,9 +13,13 @@
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#define kMaxStride (2048 * 4)
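// Presumably the widest row supported by the scratch row buffers used in the
// wrapper row functions: 2048 pixels at 4 bytes per ARGB pixel.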
// The following are available on all x86 platforms // The following are available on all x86 platforms
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \ #if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \
&& !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_BGRATOARGBROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3
#define HAS_BG24TOARGBROW_SSSE3 #define HAS_BG24TOARGBROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3
...@@ -23,19 +27,41 @@ ...@@ -23,19 +27,41 @@
#define HAS_RAWTOYROW_SSSE3 #define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOUVROW_SSSE3 #define HAS_RGB24TOUVROW_SSSE3
#define HAS_RAWTOUVROW_SSSE3 #define HAS_RAWTOUVROW_SSSE3
#endif
// The following are available only on Windows
#if defined(WIN32) \
&& !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_BGRATOYROW_SSSE3 #define HAS_BGRATOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#endif
// The following are available on Windows and Linux
#if (defined(WIN32) || defined(__x86_64__) || \
(defined(__i386__) && !defined(__pic__))) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3 #define HAS_ABGRTOUVROW_SSSE3
#endif #endif
// The following are available on Linux (32/64 bit)
// TODO(fbarchard): enable for fpic on linux
#if (defined(__x86_64__) || \
(defined(__i386__) && !defined(__pic__))) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_FASTCONVERTYUVTOARGBROW_SSE2
#define HAS_FASTCONVERTYUVTOBGRAROW_SSE2
#define HAS_FASTCONVERTYUVTOABGRROW_SSE2
#endif
// The following are available on Windows and GCC 32 bit
#if (defined(WIN32) || \
defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_FASTCONVERTYUVTOARGBROW_MMX
#define HAS_FASTCONVERTYUVTOBGRAROW_MMX
#define HAS_FASTCONVERTYUVTOABGRROW_MMX
#endif
extern "C" { extern "C" {
#ifdef HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
...@@ -75,56 +101,128 @@ void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb, ...@@ -75,56 +101,128 @@ void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width); uint8* dst_u, uint8* dst_v, int width);
#ifdef HAS_BG24TOARGBROW_SSSE3 #ifdef HAS_BG24TOARGBROW_SSSE3
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
#endif #endif
void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
#endif
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
#if defined(_MSC_VER) #if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var #define SIMD_ALIGNED(var) __declspec(align(16)) var
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var #define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#else #else // __GNUC__
#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#define TALIGN16(t, var) t var __attribute__((aligned(16))) #define TALIGN16(t, var) t var __attribute__((aligned(16)))
typedef signed char __attribute__((vector_size(16))) vec8;
typedef unsigned char __attribute__((vector_size(16))) uvec8;
#endif #endif
#ifdef OSX extern "C" SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); extern "C" SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); extern "C" SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
#else void FastConvertYUVToARGBRow_C(const uint8* y_buf,
extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]); const uint8* u_buf,
extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]); const uint8* v_buf,
extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]); uint8* rgb_buf,
#endif int width);
void FastConvertYUVToRGB32Row(const uint8* y_buf,
const uint8* u_buf, void FastConvertYUVToBGRARow_C(const uint8* y_buf,
const uint8* v_buf, const uint8* u_buf,
uint8* rgb_buf, const uint8* v_buf,
int width); uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow(const uint8* y_buf,
const uint8* u_buf, void FastConvertYUVToABGRRow_C(const uint8* y_buf,
const uint8* v_buf, const uint8* u_buf,
uint8* rgb_buf, const uint8* v_buf,
int width); uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow(const uint8* y_buf, void FastConvertYToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void FastConvertYUV444ToRGB32Row(const uint8* y_buf, #ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width);
#endif
#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
void FastConvertYToRGB32Row(const uint8* y_buf, void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
uint8* rgb_buf, const uint8* u_buf,
int width); const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYToARGBRow_MMX(const uint8* y_buf,
uint8* rgb_buf,
int width);
#endif
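// Row functions come in _SSE2 / _MMX / _C variants; callers test CPU flags
// and pointer/stride alignment at run time and fall back to the _C version.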
// Method to force C version. // Method to force C version.
//#define USE_MMX 0 //#define USE_MMX 0
......
...@@ -15,62 +15,128 @@ extern "C" { ...@@ -15,62 +15,128 @@ extern "C" {
#ifdef HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYROW_SSSE3
// Constant multiplication table for converting ARGB to I400. // Constant multiplication table for converting ARGB to I400.
extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = { static const vec8 kARGBToY = {
13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
}; };
extern "C" TALIGN16(const uint8, kAdd16[16]) = { static const uvec8 kAddY16 = {
1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
}; };
#ifdef HAS_ARGBTOUVROW_SSSE3
static const vec8 kARGBToU = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
static const uvec8 kARGBToV = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
static const uvec8 kAddUV128 = {
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
#endif
// Shuffle table for converting BG24 to ARGB. // Shuffle table for converting BG24 to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = { static const uvec8 kShuffleMaskBG24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
}; };
// Shuffle table for converting RAW to ARGB. // Shuffle table for converting RAW to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { static const uvec8 kShuffleMaskRAWToARGB = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
}; };
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { // Shuffle table for converting ABGR to ARGB.
static const uvec8 kShuffleMaskABGRToARGB = {
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};
// Shuffle table for converting BGRA to ARGB.
static const uvec8 kShuffleMaskBGRAToARGB = {
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
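// The shuffle tables above drive pshufb: output byte i of each 16 byte group
// is taken from input byte mask[i]. As an illustration only (the helper name
// below is hypothetical and not part of the library; the real scalar
// fallbacks are the ABGRToARGBRow_C / BGRAToARGBRow_C functions):
static void ShuffleBytes16_Sketch(const uint8* src, const uint8 mask[16],
                                  uint8* dst) {
  // Illustrative sketch only; not part of libyuv.
  for (int i = 0; i < 16; ++i) {
    dst[i] = src[mask[i]];  // e.g. ABGR -> ARGB swaps bytes 0 and 2.
  }
}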
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile( asm volatile(
"movdqa (%3),%%xmm7\n" "pcmpeqb %%xmm5,%%xmm5\n"
"movdqa (%4),%%xmm6\n" "pslld $0x18,%%xmm5\n"
"movdqa %%xmm6,%%xmm5\n"
"psllw $0x4,%%xmm5\n" // Generate a mask of 0x10 on each byte.
"1:" "1:"
"movdqa (%0),%%xmm0\n" "movq (%0),%%xmm0\n"
"pmaddubsw %%xmm7,%%xmm0\n" "lea 0x8(%0),%0\n"
"movdqa 0x10(%0),%%xmm1\n" "punpcklbw %%xmm0,%%xmm0\n"
"psrlw $0x7,%%xmm0\n" "movdqa %%xmm0,%%xmm1\n"
"pmaddubsw %%xmm7,%%xmm1\n" "punpcklwd %%xmm0,%%xmm0\n"
"lea 0x20(%0),%0\n" "punpckhwd %%xmm1,%%xmm1\n"
"psrlw $0x7,%%xmm1\n" "por %%xmm5,%%xmm0\n"
"packuswb %%xmm1,%%xmm0\n" "por %%xmm5,%%xmm1\n"
"pmaddubsw %%xmm6,%%xmm0\n" "movdqa %%xmm0,(%1)\n"
"packuswb %%xmm0,%%xmm0\n" "movdqa %%xmm1,0x10(%1)\n"
"paddb %%xmm5,%%xmm0\n" "lea 0x20(%1),%1\n"
"movq %%xmm0,(%1)\n"
"lea 0x8(%1),%1\n"
"sub $0x8,%2\n" "sub $0x8,%2\n"
"ja 1b\n" "ja 1b\n"
: "+r"(src_argb), // %0 : "+r"(src_y), // %0
"+r"(dst_y), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: "r"(kMultiplyMaskARGBToI400), // %3 :
"r"(kAdd16) // %4 : "memory", "cc"
: "memory" #if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
); );
} }
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
asm volatile(
"movdqa %3,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"lea 0x10(%0),%0\n"
"pshufb %%xmm5,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x4,%2\n"
"ja 1b\n"
: "+r"(src_abgr), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "m"(kShuffleMaskABGRToARGB) // %3
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm5"
#endif #endif
#ifdef HAS_BG24TOARGBROW_SSSE3 );
}
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
asm volatile(
"movdqa %3,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"lea 0x10(%0),%0\n"
"pshufb %%xmm5,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x4,%2\n"
"ja 1b\n"
: "+r"(src_bgra), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "m"(kShuffleMaskBGRAToARGB) // %3
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm5"
#endif
);
}
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
asm volatile( asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 "pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000
"pslld $0x18,%%xmm7\n" "pslld $0x18,%%xmm5\n"
"movdqa (%3),%%xmm6\n" "movdqa %3,%%xmm4\n"
"1:" "1:"
"movdqa (%0),%%xmm0\n" "movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n" "movdqa 0x10(%0),%%xmm1\n"
...@@ -78,19 +144,19 @@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { ...@@ -78,19 +144,19 @@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
"lea 0x30(%0),%0\n" "lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n" "movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm6,%%xmm2\n" "pshufb %%xmm4,%%xmm2\n"
"por %%xmm7,%%xmm2\n" "por %%xmm5,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm6,%%xmm0\n" "pshufb %%xmm4,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n" "movdqa %%xmm2,0x20(%1)\n"
"por %%xmm7,%%xmm0\n" "por %%xmm5,%%xmm0\n"
"pshufb %%xmm6,%%xmm1\n" "pshufb %%xmm4,%%xmm1\n"
"movdqa %%xmm0,(%1)\n" "movdqa %%xmm0,(%1)\n"
"por %%xmm7,%%xmm1\n" "por %%xmm5,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm6,%%xmm3\n" "pshufb %%xmm4,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n" "movdqa %%xmm1,0x10(%1)\n"
"por %%xmm7,%%xmm3\n" "por %%xmm5,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n" "movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n" "lea 0x40(%1),%1\n"
"sub $0x10,%2\n" "sub $0x10,%2\n"
...@@ -98,16 +164,19 @@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { ...@@ -98,16 +164,19 @@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
: "+r"(src_bg24), // %0 : "+r"(src_bg24), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: "r"(kShuffleMaskBG24ToARGB) // %3 : "m"(kShuffleMaskBG24ToARGB) // %3
: "memory" : "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
); );
} }
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile( asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 "pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000
"pslld $0x18,%%xmm7\n" "pslld $0x18,%%xmm5\n"
"movdqa (%3),%%xmm6\n" "movdqa %3,%%xmm4\n"
"1:" "1:"
"movdqa (%0),%%xmm0\n" "movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n" "movdqa 0x10(%0),%%xmm1\n"
...@@ -115,19 +184,19 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { ...@@ -115,19 +184,19 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
"lea 0x30(%0),%0\n" "lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n" "movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
"pshufb %%xmm6,%%xmm2\n" "pshufb %%xmm4,%%xmm2\n"
"por %%xmm7,%%xmm2\n" "por %%xmm5,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
"pshufb %%xmm6,%%xmm0\n" "pshufb %%xmm4,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n" "movdqa %%xmm2,0x20(%1)\n"
"por %%xmm7,%%xmm0\n" "por %%xmm5,%%xmm0\n"
"pshufb %%xmm6,%%xmm1\n" "pshufb %%xmm4,%%xmm1\n"
"movdqa %%xmm0,(%1)\n" "movdqa %%xmm0,(%1)\n"
"por %%xmm7,%%xmm1\n" "por %%xmm5,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
"pshufb %%xmm6,%%xmm3\n" "pshufb %%xmm4,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n" "movdqa %%xmm1,0x10(%1)\n"
"por %%xmm7,%%xmm3\n" "por %%xmm5,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n" "movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n" "lea 0x40(%1),%1\n"
"sub $0x10,%2\n" "sub $0x10,%2\n"
...@@ -135,147 +204,320 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { ...@@ -135,147 +204,320 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: "r"(kShuffleMaskRAWToARGB) // %3 : "m"(kShuffleMaskRAWToARGB) // %3
: "memory" : "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
); );
} }
#endif
#if defined(__x86_64__) void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile(
"movdqa %4,%%xmm5\n"
"movdqa %3,%%xmm4\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"movdqa 0x20(%0),%%xmm2\n"
"movdqa 0x30(%0),%%xmm3\n"
"pmaddubsw %%xmm4,%%xmm0\n"
"pmaddubsw %%xmm4,%%xmm1\n"
"pmaddubsw %%xmm4,%%xmm2\n"
"pmaddubsw %%xmm4,%%xmm3\n"
"lea 0x40(%0),%0\n"
"phaddw %%xmm1,%%xmm0\n"
"phaddw %%xmm3,%%xmm2\n"
"psrlw $0x7,%%xmm0\n"
"psrlw $0x7,%%xmm2\n"
"packuswb %%xmm2,%%xmm0\n"
"paddb %%xmm5,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kARGBToY), // %3
"m"(kAddY16) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
// 64 bit linux gcc version );
}
#endif
void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi #ifdef HAS_ARGBTOUVROW_SSSE3
const uint8* u_buf, // rsi void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
const uint8* v_buf, // rdx uint8* dst_u, uint8* dst_v, int width) {
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile( asm volatile(
"movdqa %5,%%xmm7\n"
"movdqa %6,%%xmm6\n"
"movdqa %7,%%xmm5\n"
"sub %1,%2\n"
"1:" "1:"
"movzb (%1),%%r10\n" "movdqa (%0),%%xmm0\n"
"lea 1(%1),%1\n" "movdqa 0x10(%0),%%xmm1\n"
"movzb (%2),%%r11\n" "movdqa 0x20(%0),%%xmm2\n"
"lea 1(%2),%2\n" "movdqa 0x30(%0),%%xmm3\n"
"movq 2048(%5,%%r10,8),%%xmm0\n" "pavgb (%0,%4,1),%%xmm0\n"
"movzb (%0),%%r10\n" "pavgb 0x10(%0,%4,1),%%xmm1\n"
"movq 4096(%5,%%r11,8),%%xmm1\n" "pavgb 0x20(%0,%4,1),%%xmm2\n"
"movzb 0x1(%0),%%r11\n" "pavgb 0x30(%0,%4,1),%%xmm3\n"
"paddsw %%xmm1,%%xmm0\n" "lea 0x40(%0),%0\n"
"movq (%5,%%r10,8),%%xmm2\n" "movdqa %%xmm0,%%xmm4\n"
"lea 2(%0),%0\n" "shufps $0x88,%%xmm1,%%xmm0\n"
"movq (%5,%%r11,8),%%xmm3\n" "shufps $0xdd,%%xmm1,%%xmm4\n"
"paddsw %%xmm0,%%xmm2\n" "pavgb %%xmm4,%%xmm0\n"
"paddsw %%xmm0,%%xmm3\n" "movdqa %%xmm2,%%xmm4\n"
"shufps $0x44,%%xmm3,%%xmm2\n" "shufps $0x88,%%xmm3,%%xmm2\n"
"psraw $0x6,%%xmm2\n" "shufps $0xdd,%%xmm3,%%xmm4\n"
"packuswb %%xmm2,%%xmm2\n" "pavgb %%xmm4,%%xmm2\n"
"movq %%xmm2,0x0(%3)\n" "movdqa %%xmm0,%%xmm1\n"
"lea 8(%3),%3\n" "movdqa %%xmm2,%%xmm3\n"
"sub $0x2,%4\n" "pmaddubsw %%xmm7,%%xmm0\n"
"pmaddubsw %%xmm7,%%xmm2\n"
"pmaddubsw %%xmm6,%%xmm1\n"
"pmaddubsw %%xmm6,%%xmm3\n"
"phaddw %%xmm2,%%xmm0\n"
"phaddw %%xmm3,%%xmm1\n"
"psraw $0x8,%%xmm0\n"
"psraw $0x8,%%xmm1\n"
"packsswb %%xmm1,%%xmm0\n"
"paddb %%xmm5,%%xmm0\n"
"movlps %%xmm0,(%1)\n"
"movhps %%xmm0,(%1,%2,1)\n"
"lea 0x8(%1),%1\n"
"sub $0x10,%3\n"
"ja 1b\n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"(static_cast<intptr_t>(src_stride_argb)), // %4
"m"(kARGBToU), // %5
"m"(kARGBToV), // %6
"m"(kAddUV128) // %7
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
#endif
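// For reference, the per-block math implemented by ARGBToUVRow_SSSE3 above,
// written as a scalar sketch. The helper name is illustrative only and not
// part of the library, and the exact SIMD rounding differs slightly. Each
// U/V pair is produced from a 2x2 block of ARGB pixels (bytes B,G,R,A).
static inline void ARGBBlockToUV_Sketch(const uint8* row0, const uint8* row1,
                                        uint8* dst_u, uint8* dst_v) {
  // Illustrative sketch only; not part of libyuv.
  // Average the four pixels of the 2x2 block per channel.
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  // Apply the kARGBToU / kARGBToV coefficients (8.8 fixed point), bias by 128.
  *dst_u = static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *dst_v = static_cast<uint8>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}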
// The following code requires 6 registers and prefers 7 registers.
// Using 7 registers requires -fpic to be off and -fomit-frame-pointer to be on.
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
#if defined(__x86_64__)
#define REG_a "rax"
#define REG_d "rdx"
#else
#define REG_a "eax"
#define REG_d "edx"
#endif
#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif
#if defined(__APPLE__)
// The REG6 version uses one fewer register but is slower
#define REG6
#endif
#ifdef REG6
// The 6 register version only has REG_a available as a temporary
#define CLOBBER "%"REG_a
#define YUVTORGB \
"1:" \
"movzb (%1),%%"REG_a"\n" \
"lea 1(%1),%1\n" \
"movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
"movzb (%2),%%"REG_a"\n" \
"lea 1(%2),%2\n" \
"movq 4096(%5,%%"REG_a",8),%%xmm1\n" \
"paddsw %%xmm1,%%xmm0\n" \
"movzb (%0),%%"REG_a"\n" \
"movq 0(%5,%%"REG_a",8),%%xmm2\n" \
"movzb 0x1(%0),%%"REG_a"\n" \
"movq 0(%5,%%"REG_a",8),%%xmm3\n" \
"lea 2(%0),%0\n" \
"paddsw %%xmm0,%%xmm2\n" \
"paddsw %%xmm0,%%xmm3\n" \
"shufps $0x44,%%xmm3,%%xmm2\n" \
"psraw $0x6,%%xmm2\n" \
"packuswb %%xmm2,%%xmm2\n" \
"movq %%xmm2,0x0(%3)\n" \
"lea 8(%3),%3\n" \
"sub $0x2,%4\n" \
"ja 1b\n" "ja 1b\n"
#else
#define CLOBBER "%"REG_a, "%"REG_d
// This version produces 2 pixels per loop iteration
#define YUVTORGB \
"1:" \
"movzb (%1),%%"REG_a"\n" \
"lea 1(%1),%1\n" \
"movzb (%2),%%"REG_d"\n" \
"lea 1(%2),%2\n" \
"movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
"movzb 0(%0),%%"REG_a"\n" \
"movq 4096(%5,%%"REG_d",8),%%xmm1\n" \
"paddsw %%xmm1,%%xmm0\n" \
"movzb 1(%0),%%"REG_d"\n" \
"punpcklqdq %%xmm0,%%xmm0\n" \
"lea 2(%0),%0\n" \
"movq 0(%5,%%"REG_a",8),%%xmm1\n" \
"movhps 0(%5,%%"REG_d",8),%%xmm1\n" \
"paddsw %%xmm0,%%xmm1\n" \
"psraw $6,%%xmm1\n" \
"packuswb %%xmm1,%%xmm1\n" \
"movq %%xmm1,0(%3)\n" \
"lea 8(%3),%3\n" \
"sub $0x2,%4\n" \
"ja 1b\n"
// This version produces 4 pixels per loop iteration
#define YUVTORGB4 \
"1:" \
"movzb 0(%1),%%"REG_a"\n" \
"movzb 0(%2),%%"REG_d"\n" \
"movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
"movzb 0(%0),%%"REG_a"\n" \
"movq 4096(%5,%%"REG_d",8),%%xmm1\n" \
"paddsw %%xmm1,%%xmm0\n" \
"movzb 1(%0),%%"REG_d"\n" \
"punpcklqdq %%xmm0,%%xmm0\n" \
"movq 0(%5,%%"REG_a",8),%%xmm2\n" \
"movhps 0(%5,%%"REG_d",8),%%xmm2\n" \
"paddsw %%xmm0,%%xmm2\n" \
"psraw $6,%%xmm2\n" \
"movzb 1(%1),%%"REG_a"\n" \
"movzb 1(%2),%%"REG_d"\n" \
"movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
"movzb 2(%0),%%"REG_a"\n" \
"movq 4096(%5,%%"REG_d",8),%%xmm1\n" \
"paddsw %%xmm1,%%xmm0\n" \
"movzb 3(%0),%%"REG_d"\n" \
"punpcklqdq %%xmm0,%%xmm0\n" \
"movq 0(%5,%%"REG_a",8),%%xmm3\n" \
"movhps 0(%5,%%"REG_d",8),%%xmm3\n" \
"paddsw %%xmm0,%%xmm3\n" \
"psraw $6,%%xmm3\n" \
"lea 2(%1),%1\n" \
"lea 2(%2),%2\n" \
"lea 4(%0),%0\n" \
"packuswb %%xmm3,%%xmm2\n" \
"movdqa %%xmm2,0(%3)\n" \
"lea 16(%3),%3\n" \
"sub $0x4,%4\n" \
"ja 1b\n"
#endif
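// The 2048 and 4096 byte offsets above index into the coefficient tables,
// which are laid out as 768 rows of four int16s (8 bytes per row): rows
// 0-255 hold the Y terms, rows 256-511 the U terms and rows 512-767 the V
// terms, in 6 bit fixed point to match the final psraw $6.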
// 6 or 7 registers
void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, // rdi
const uint8* u_buf, // rsi
const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile(
YUVTORGB
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
"+r"(rgb_buf), // %3 "+r"(rgb_buf), // %3
"+r"(width) // %4 "+rm"(width) // %4
: "r" (_kCoefficientsRgbY) // %5 : "r" (kCoefficientsRgbY) // %5
: "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" : "memory", "cc", CLOBBER
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
#endif
); );
} }
void FastConvertYUVToBGRARow(const uint8* y_buf, // rdi // 6 or 7 registers
const uint8* u_buf, // rsi void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, // rdi
const uint8* v_buf, // rdx const uint8* u_buf, // rsi
uint8* rgb_buf, // rcx const uint8* v_buf, // rdx
int width) { // r8 uint8* rgb_buf, // rcx
int width) { // r8
asm volatile( asm volatile(
"1:" YUVTORGB4
"movzb (%1),%%r10\n"
"lea 1(%1),%1\n"
"movzb (%2),%%r11\n"
"lea 1(%2),%2\n"
"movq 2048(%5,%%r10,8),%%xmm0\n"
"movzb (%0),%%r10\n"
"movq 4096(%5,%%r11,8),%%xmm1\n"
"movzb 0x1(%0),%%r11\n"
"paddsw %%xmm1,%%xmm0\n"
"movq (%5,%%r10,8),%%xmm2\n"
"lea 2(%0),%0\n"
"movq (%5,%%r11,8),%%xmm3\n"
"paddsw %%xmm0,%%xmm2\n"
"paddsw %%xmm0,%%xmm3\n"
"shufps $0x44,%%xmm3,%%xmm2\n"
"psraw $0x6,%%xmm2\n"
"packuswb %%xmm2,%%xmm2\n"
"movq %%xmm2,0x0(%3)\n"
"lea 8(%3),%3\n"
"sub $0x2,%4\n"
"ja 1b\n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
"+r"(rgb_buf), // %3 "+r"(rgb_buf), // %3
"+r"(width) // %4 "+rm"(width) // %4
: "r" (_kCoefficientsBgraY) // %5 : "r" (kCoefficientsRgbY) // %5
: "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" : "memory", "cc", CLOBBER
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
#endif
); );
} }
void FastConvertYUVToABGRRow(const uint8* y_buf, // rdi void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, // rdi
const uint8* u_buf, // rsi const uint8* u_buf, // rsi
const uint8* v_buf, // rdx const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx uint8* rgb_buf, // rcx
int width) { // r8 int width) { // r8
asm volatile( asm volatile(
"1:" YUVTORGB
"movzb (%1),%%r10\n" : "+r"(y_buf), // %0
"lea 1(%1),%1\n" "+r"(u_buf), // %1
"movzb (%2),%%r11\n" "+r"(v_buf), // %2
"lea 1(%2),%2\n" "+r"(rgb_buf), // %3
"movq 2048(%5,%%r10,8),%%xmm0\n" "+rm"(width) // %4
"movzb (%0),%%r10\n" : "r" (kCoefficientsBgraY) // %5
"movq 4096(%5,%%r11,8),%%xmm1\n" : "memory", "cc", CLOBBER
"movzb 0x1(%0),%%r11\n" #if defined(__SSE2__)
"paddsw %%xmm1,%%xmm0\n" , "xmm0", "xmm1", "xmm2", "xmm3"
"movq (%5,%%r10,8),%%xmm2\n" #endif
"lea 2(%0),%0\n" );
"movq (%5,%%r11,8),%%xmm3\n" }
"paddsw %%xmm0,%%xmm2\n"
"paddsw %%xmm0,%%xmm3\n" void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, // rdi
"shufps $0x44,%%xmm3,%%xmm2\n" const uint8* u_buf, // rsi
"psraw $0x6,%%xmm2\n" const uint8* v_buf, // rdx
"packuswb %%xmm2,%%xmm2\n" uint8* rgb_buf, // rcx
"movq %%xmm2,0x0(%3)\n" int width) { // r8
"lea 8(%3),%3\n" asm volatile(
"sub $0x2,%4\n" YUVTORGB
"ja 1b\n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
"+r"(rgb_buf), // %3 "+r"(rgb_buf), // %3
"+r"(width) // %4 "+rm"(width) // %4
: "r" (_kCoefficientsAbgrY) // %5 : "r" (kCoefficientsAbgrY) // %5
: "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" : "memory", "cc", CLOBBER
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
#endif
); );
} }
void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi // 6 registers
const uint8* u_buf, // rsi void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, // rdi
const uint8* v_buf, // rdx const uint8* u_buf, // rsi
uint8* rgb_buf, // rcx const uint8* v_buf, // rdx
int width) { // r8 uint8* rgb_buf, // rcx
int width) { // r8
asm volatile( asm volatile(
"1:" "1:"
"movzb (%1),%%r10\n" "movzb (%1),%%"REG_a"\n"
"lea 1(%1),%1\n" "lea 1(%1),%1\n"
"movzb (%2),%%r11\n" "movq 2048(%5,%%"REG_a",8),%%xmm0\n"
"movzb (%2),%%"REG_a"\n"
"lea 1(%2),%2\n" "lea 1(%2),%2\n"
"movq 2048(%5,%%r10,8),%%xmm0\n" "movq 4096(%5,%%"REG_a",8),%%xmm1\n"
"movzb (%0),%%r10\n"
"movq 4096(%5,%%r11,8),%%xmm1\n"
"paddsw %%xmm1,%%xmm0\n" "paddsw %%xmm1,%%xmm0\n"
"movq (%5,%%r10,8),%%xmm2\n" "movzb (%0),%%"REG_a"\n"
"lea 1(%0),%0\n" "lea 1(%0),%0\n"
"movq 0(%5,%%"REG_a",8),%%xmm2\n"
"paddsw %%xmm0,%%xmm2\n" "paddsw %%xmm0,%%xmm2\n"
"shufps $0x44,%%xmm2,%%xmm2\n" "shufps $0x44,%%xmm2,%%xmm2\n"
"psraw $0x6,%%xmm2\n" "psraw $0x6,%%xmm2\n"
...@@ -288,23 +530,26 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi ...@@ -288,23 +530,26 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi
"+r"(u_buf), // %1 "+r"(u_buf), // %1
"+r"(v_buf), // %2 "+r"(v_buf), // %2
"+r"(rgb_buf), // %3 "+r"(rgb_buf), // %3
"+r"(width) // %4 "+rm"(width) // %4
: "r" (_kCoefficientsRgbY) // %5 : "r" (kCoefficientsRgbY) // %5
: "memory", "r10", "r11", "xmm0", "xmm1", "xmm2" : "memory", "cc", "%"REG_a
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2"
#endif
); );
} }
void FastConvertYToRGB32Row(const uint8* y_buf, // rdi // 5 registers
uint8* rgb_buf, // rcx void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
int width) { // r8 uint8* rgb_buf, // rcx
int width) { // r8
asm volatile( asm volatile(
"1:" "1:"
"movzb (%0),%%r10\n" "movzb (%0),%%"REG_a"\n"
"movzb 0x1(%0),%%r11\n" "movzb 0x1(%0),%%"REG_d"\n"
"movq (%3,%%r10,8),%%xmm2\n" "movq (%3,%%"REG_a",8),%%xmm2\n"
"lea 2(%0),%0\n" "lea 2(%0),%0\n"
"movq (%3,%%r11,8),%%xmm3\n" "movhps (%3,%%"REG_d",8),%%xmm2\n"
"shufps $0x44,%%xmm3,%%xmm2\n"
"psraw $0x6,%%xmm2\n" "psraw $0x6,%%xmm2\n"
"packuswb %%xmm2,%%xmm2\n" "packuswb %%xmm2,%%xmm2\n"
"movq %%xmm2,0x0(%1)\n" "movq %%xmm2,0x0(%1)\n"
...@@ -313,28 +558,39 @@ void FastConvertYToRGB32Row(const uint8* y_buf, // rdi ...@@ -313,28 +558,39 @@ void FastConvertYToRGB32Row(const uint8* y_buf, // rdi
"ja 1b\n" "ja 1b\n"
: "+r"(y_buf), // %0 : "+r"(y_buf), // %0
"+r"(rgb_buf), // %1 "+r"(rgb_buf), // %1
"+r"(width) // %2 "+rm"(width) // %2
: "r" (_kCoefficientsRgbY) // %3 : "r" (kCoefficientsRgbY) // %3
: "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" : "memory", "cc", "%"REG_a, "%"REG_d
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2"
#endif
); );
} }
#elif defined(__i386__) #endif
// 32 bit gcc version
#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
// 32 bit mmx gcc version
void FastConvertYUVToRGB32Row(const uint8* y_buf, #ifdef OSX
const uint8* u_buf, #define UNDERSCORE "_"
const uint8* v_buf, #else
uint8* rgb_buf, #define UNDERSCORE ""
int width); #endif
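// Mach-O (OS X) prefixes C symbols with an underscore, so the assembly below
// uses UNDERSCORE when naming the coefficient tables.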
void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
asm( asm(
".text\n" ".text\n"
#if defined(OSX) || defined(IOS) #if defined(OSX) || defined(IOS)
".globl _FastConvertYUVToRGB32Row\n" ".globl _FastConvertYUVToARGBRow_MMX\n"
"_FastConvertYUVToRGB32Row:\n" "_FastConvertYUVToARGBRow_MMX:\n"
#else #else
".global FastConvertYUVToRGB32Row\n" ".global FastConvertYUVToARGBRow_MMX\n"
"FastConvertYUVToRGB32Row:\n" "FastConvertYUVToARGBRow_MMX:\n"
#endif #endif
"pusha\n" "pusha\n"
"mov 0x24(%esp),%edx\n" "mov 0x24(%esp),%edx\n"
...@@ -348,19 +604,19 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, ...@@ -348,19 +604,19 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
"lea 1(%edi),%edi\n" "lea 1(%edi),%edi\n"
"movzbl (%esi),%ebx\n" "movzbl (%esi),%ebx\n"
"lea 1(%esi),%esi\n" "lea 1(%esi),%esi\n"
"movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n" "movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax\n" "movzbl (%edx),%eax\n"
"paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
"movzbl 0x1(%edx),%ebx\n" "movzbl 0x1(%edx),%ebx\n"
"movq _kCoefficientsRgbY(,%eax,8),%mm1\n" "movq " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n"
"lea 2(%edx),%edx\n" "lea 2(%edx),%edx\n"
"movq _kCoefficientsRgbY(,%ebx,8),%mm2\n" "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n"
"paddsw %mm0,%mm1\n" "paddsw %mm0,%mm1\n"
"paddsw %mm0,%mm2\n" "paddsw %mm0,%mm2\n"
"psraw $0x6,%mm1\n" "psraw $0x6,%mm1\n"
"psraw $0x6,%mm2\n" "psraw $0x6,%mm2\n"
"packuswb %mm2,%mm1\n" "packuswb %mm2,%mm1\n"
"movntq %mm1,0x0(%ebp)\n" "movq %mm1,0x0(%ebp)\n"
"lea 8(%ebp),%ebp\n" "lea 8(%ebp),%ebp\n"
"sub $0x2,%ecx\n" "sub $0x2,%ecx\n"
"ja 1b\n" "ja 1b\n"
...@@ -368,19 +624,19 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, ...@@ -368,19 +624,19 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,
"ret\n" "ret\n"
); );
void FastConvertYUVToBGRARow(const uint8* y_buf, void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
asm( asm(
".text\n" ".text\n"
#if defined(OSX) || defined(IOS) #if defined(OSX) || defined(IOS)
".globl _FastConvertYUVToBGRARow\n" ".globl _FastConvertYUVToBGRARow_MMX\n"
"_FastConvertYUVToBGRARow:\n" "_FastConvertYUVToBGRARow_MMX:\n"
#else #else
".global FastConvertYUVToBGRARow\n" ".global FastConvertYUVToBGRARow_MMX\n"
"FastConvertYUVToBGRARow:\n" "FastConvertYUVToBGRARow_MMX:\n"
#endif #endif
"pusha\n" "pusha\n"
"mov 0x24(%esp),%edx\n" "mov 0x24(%esp),%edx\n"
...@@ -394,19 +650,19 @@ void FastConvertYUVToBGRARow(const uint8* y_buf, ...@@ -394,19 +650,19 @@ void FastConvertYUVToBGRARow(const uint8* y_buf,
"lea 1(%edi),%edi\n" "lea 1(%edi),%edi\n"
"movzbl (%esi),%ebx\n" "movzbl (%esi),%ebx\n"
"lea 1(%esi),%esi\n" "lea 1(%esi),%esi\n"
"movq _kCoefficientsBgraY+2048(,%eax,8),%mm0\n" "movq " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax\n" "movzbl (%edx),%eax\n"
"paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n" "paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
"movzbl 0x1(%edx),%ebx\n" "movzbl 0x1(%edx),%ebx\n"
"movq _kCoefficientsBgraY(,%eax,8),%mm1\n" "movq " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n"
"lea 2(%edx),%edx\n" "lea 2(%edx),%edx\n"
"movq _kCoefficientsBgraY(,%ebx,8),%mm2\n" "movq " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n"
"paddsw %mm0,%mm1\n" "paddsw %mm0,%mm1\n"
"paddsw %mm0,%mm2\n" "paddsw %mm0,%mm2\n"
"psraw $0x6,%mm1\n" "psraw $0x6,%mm1\n"
"psraw $0x6,%mm2\n" "psraw $0x6,%mm2\n"
"packuswb %mm2,%mm1\n" "packuswb %mm2,%mm1\n"
"movntq %mm1,0x0(%ebp)\n" "movq %mm1,0x0(%ebp)\n"
"lea 8(%ebp),%ebp\n" "lea 8(%ebp),%ebp\n"
"sub $0x2,%ecx\n" "sub $0x2,%ecx\n"
"ja 1b\n" "ja 1b\n"
...@@ -414,19 +670,19 @@ void FastConvertYUVToBGRARow(const uint8* y_buf, ...@@ -414,19 +670,19 @@ void FastConvertYUVToBGRARow(const uint8* y_buf,
"ret\n" "ret\n"
); );
void FastConvertYUVToABGRRow(const uint8* y_buf, void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
asm( asm(
".text\n" ".text\n"
#if defined(OSX) || defined(IOS) #if defined(OSX) || defined(IOS)
".globl _FastConvertYUVToABGRRow\n" ".globl _FastConvertYUVToABGRRow_MMX\n"
"_FastConvertYUVToABGRRow:\n" "_FastConvertYUVToABGRRow_MMX:\n"
#else #else
".global FastConvertYUVToABGRRow\n" ".global FastConvertYUVToABGRRow_MMX\n"
"FastConvertYUVToABGRRow:\n" "FastConvertYUVToABGRRow_MMX:\n"
#endif #endif
"pusha\n" "pusha\n"
"mov 0x24(%esp),%edx\n" "mov 0x24(%esp),%edx\n"
...@@ -440,19 +696,19 @@ void FastConvertYUVToABGRRow(const uint8* y_buf, ...@@ -440,19 +696,19 @@ void FastConvertYUVToABGRRow(const uint8* y_buf,
"lea 1(%edi),%edi\n" "lea 1(%edi),%edi\n"
"movzbl (%esi),%ebx\n" "movzbl (%esi),%ebx\n"
"lea 1(%esi),%esi\n" "lea 1(%esi),%esi\n"
"movq _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n" "movq " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax\n" "movzbl (%edx),%eax\n"
"paddsw _kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n" "paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
"movzbl 0x1(%edx),%ebx\n" "movzbl 0x1(%edx),%ebx\n"
"movq _kCoefficientsAbgrY(,%eax,8),%mm1\n" "movq " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n"
"lea 2(%edx),%edx\n" "lea 2(%edx),%edx\n"
"movq _kCoefficientsAbgrY(,%ebx,8),%mm2\n" "movq " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n"
"paddsw %mm0,%mm1\n" "paddsw %mm0,%mm1\n"
"paddsw %mm0,%mm2\n" "paddsw %mm0,%mm2\n"
"psraw $0x6,%mm1\n" "psraw $0x6,%mm1\n"
"psraw $0x6,%mm2\n" "psraw $0x6,%mm2\n"
"packuswb %mm2,%mm1\n" "packuswb %mm2,%mm1\n"
"movntq %mm1,0x0(%ebp)\n" "movq %mm1,0x0(%ebp)\n"
"lea 8(%ebp),%ebp\n" "lea 8(%ebp),%ebp\n"
"sub $0x2,%ecx\n" "sub $0x2,%ecx\n"
"ja 1b\n" "ja 1b\n"
...@@ -460,19 +716,19 @@ void FastConvertYUVToABGRRow(const uint8* y_buf, ...@@ -460,19 +716,19 @@ void FastConvertYUVToABGRRow(const uint8* y_buf,
"ret\n" "ret\n"
); );
void FastConvertYUV444ToRGB32Row(const uint8* y_buf, void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
asm( asm(
".text\n" ".text\n"
#if defined(OSX) || defined(IOS) #if defined(OSX) || defined(IOS)
".globl _FastConvertYUV444ToRGB32Row\n" ".globl _FastConvertYUV444ToARGBRow_MMX\n"
"_FastConvertYUV444ToRGB32Row:\n" "_FastConvertYUV444ToARGBRow_MMX:\n"
#else #else
".global FastConvertYUV444ToRGB32Row\n" ".global FastConvertYUV444ToARGBRow_MMX\n"
"FastConvertYUV444ToRGB32Row:\n" "FastConvertYUV444ToARGBRow_MMX:\n"
#endif #endif
"pusha\n" "pusha\n"
"mov 0x24(%esp),%edx\n" "mov 0x24(%esp),%edx\n"
...@@ -486,11 +742,11 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf, ...@@ -486,11 +742,11 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
"lea 1(%edi),%edi\n" "lea 1(%edi),%edi\n"
"movzbl (%esi),%ebx\n" "movzbl (%esi),%ebx\n"
"lea 1(%esi),%esi\n" "lea 1(%esi),%esi\n"
"movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n" "movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax\n" "movzbl (%edx),%eax\n"
"paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
"lea 1(%edx),%edx\n" "lea 1(%edx),%edx\n"
"paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n" "paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n"
"psraw $0x6,%mm0\n" "psraw $0x6,%mm0\n"
"packuswb %mm0,%mm0\n" "packuswb %mm0,%mm0\n"
"movd %mm0,0x0(%ebp)\n" "movd %mm0,0x0(%ebp)\n"
...@@ -501,17 +757,17 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf, ...@@ -501,17 +757,17 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
"ret\n" "ret\n"
); );
void FastConvertYToRGB32Row(const uint8* y_buf, void FastConvertYToARGBRow_MMX(const uint8* y_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width); int width);
asm( asm(
".text\n" ".text\n"
#if defined(OSX) || defined(IOS) #if defined(OSX) || defined(IOS)
".globl _FastConvertYToRGB32Row\n" ".globl _FastConvertYToARGBRow_MMX\n"
"_FastConvertYToRGB32Row:\n" "_FastConvertYToARGBRow_MMX:\n"
#else #else
".global FastConvertYToRGB32Row\n" ".global FastConvertYToARGBRow_MMX\n"
"FastConvertYToRGB32Row:\n" "FastConvertYToARGBRow_MMX:\n"
#endif #endif
"push %ebx\n" "push %ebx\n"
"mov 0x8(%esp),%eax\n" "mov 0x8(%esp),%eax\n"
...@@ -520,10 +776,10 @@ void FastConvertYToRGB32Row(const uint8* y_buf, ...@@ -520,10 +776,10 @@ void FastConvertYToRGB32Row(const uint8* y_buf,
"1:" "1:"
"movzbl (%eax),%ebx\n" "movzbl (%eax),%ebx\n"
"movq _kCoefficientsRgbY(,%ebx,8),%mm0\n" "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n"
"psraw $0x6,%mm0\n" "psraw $0x6,%mm0\n"
"movzbl 0x1(%eax),%ebx\n" "movzbl 0x1(%eax),%ebx\n"
"movq _kCoefficientsRgbY(,%ebx,8),%mm1\n" "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n"
"psraw $0x6,%mm1\n" "psraw $0x6,%mm1\n"
"packuswb %mm1,%mm0\n" "packuswb %mm1,%mm0\n"
"lea 0x2(%eax),%eax\n" "lea 0x2(%eax),%eax\n"
...@@ -535,125 +791,36 @@ void FastConvertYToRGB32Row(const uint8* y_buf, ...@@ -535,125 +791,36 @@ void FastConvertYToRGB32Row(const uint8* y_buf,
"ret\n" "ret\n"
); );
#else #endif
// C reference code that mimics the YUV assembly.
#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
(((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
static inline void YuvPixel(uint8 y,
uint8 u,
uint8 v,
uint8* rgb_buf,
int ashift,
int rshift,
int gshift,
int bshift) {
int b = _kCoefficientsRgbY[256+u][0];
int g = _kCoefficientsRgbY[256+u][1];
int r = _kCoefficientsRgbY[256+u][2];
int a = _kCoefficientsRgbY[256+u][3];
b = paddsw(b, _kCoefficientsRgbY[512+v][0]);
g = paddsw(g, _kCoefficientsRgbY[512+v][1]);
r = paddsw(r, _kCoefficientsRgbY[512+v][2]);
a = paddsw(a, _kCoefficientsRgbY[512+v][3]);
b = paddsw(b, _kCoefficientsRgbY[y][0]);
g = paddsw(g, _kCoefficientsRgbY[y][1]);
r = paddsw(r, _kCoefficientsRgbY[y][2]);
a = paddsw(a, _kCoefficientsRgbY[y][3]);
b >>= 6;
g >>= 6;
r >>= 6;
a >>= 6;
*reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
(packuswb(g) << gshift) |
(packuswb(r) << rshift) |
(packuswb(a) << ashift);
}
void FastConvertYUVToRGB32Row(const uint8* y_buf, void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
const uint8* u_buf, SIMD_ALIGNED(uint8 row[kMaxStride]);
const uint8* v_buf, ABGRToARGBRow_SSSE3(src_argb, row, pix);
uint8* rgb_buf, ARGBToYRow_SSSE3(row, dst_y, pix);
int width) {
for (int x = 0; x < width; x += 2) {
uint8 u = u_buf[x >> 1];
uint8 v = v_buf[x >> 1];
uint8 y0 = y_buf[x];
YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0);
if ((x + 1) < width) {
uint8 y1 = y_buf[x + 1];
YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0);
}
rgb_buf += 8; // Advance 2 pixels.
}
} }
void FastConvertYUVToBGRARow(const uint8* y_buf, void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
const uint8* u_buf, SIMD_ALIGNED(uint8 row[kMaxStride]);
const uint8* v_buf, BGRAToARGBRow_SSSE3(src_argb, row, pix);
uint8* rgb_buf, ARGBToYRow_SSSE3(row, dst_y, pix);
int width) {
for (int x = 0; x < width; x += 2) {
uint8 u = u_buf[x >> 1];
uint8 v = v_buf[x >> 1];
uint8 y0 = y_buf[x];
YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24);
if ((x + 1) < width) {
uint8 y1 = y_buf[x + 1];
YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24);
}
rgb_buf += 8; // Advance 2 pixels.
}
} }
void FastConvertYUVToABGRRow(const uint8* y_buf, #ifdef HAS_ARGBTOUVROW_SSSE3
const uint8* u_buf, void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
const uint8* v_buf, uint8* dst_u, uint8* dst_v, int pix) {
uint8* rgb_buf, SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
int width) { ABGRToARGBRow_SSSE3(src_argb, row, pix);
for (int x = 0; x < width; x += 2) { ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
uint8 u = u_buf[x >> 1]; ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
uint8 v = v_buf[x >> 1];
uint8 y0 = y_buf[x];
YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16);
if ((x + 1) < width) {
uint8 y1 = y_buf[x + 1];
YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16);
}
rgb_buf += 8; // Advance 2 pixels.
}
} }
void FastConvertYUV444ToRGB32Row(const uint8* y_buf, void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
const uint8* u_buf, uint8* dst_u, uint8* dst_v, int pix) {
const uint8* v_buf, SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
uint8* rgb_buf, BGRAToARGBRow_SSSE3(src_argb, row, pix);
int width) { BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
for (int x = 0; x < width; ++x) { ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
uint8 u = u_buf[x];
uint8 v = v_buf[x];
uint8 y = y_buf[x];
YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0);
rgb_buf += 4; // Advance 1 pixel.
}
}
void FastConvertYToRGB32Row(const uint8* y_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width; ++x) {
uint8 y = y_buf[x];
YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0);
rgb_buf += 4; // Advance 1 pixel.
}
} }
#endif #endif
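// Note: the ABGR/BGRA Y and UV rows above are thin wrappers: they shuffle
// into ARGB order in a kMaxStride scratch row, then reuse the ARGB row
// functions.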
} // extern "C" } // extern "C"
...@@ -10,8 +10,6 @@ ...@@ -10,8 +10,6 @@
#include "row.h" #include "row.h"
#define kMaxStride (2048 * 4)
extern "C" { extern "C" {
#define MAKETABLE(NAME) \ #define MAKETABLE(NAME) \
...@@ -232,11 +230,7 @@ SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\ ...@@ -232,11 +230,7 @@ SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\
0 \ 0 \
} }
#ifdef OSX
MAKETABLE(kCoefficientsRgbY) MAKETABLE(kCoefficientsRgbY)
#else
MAKETABLE(_kCoefficientsRgbY)
#endif
#undef RGBY #undef RGBY
#undef RGBU #undef RGBU
...@@ -264,12 +258,7 @@ MAKETABLE(_kCoefficientsRgbY) ...@@ -264,12 +258,7 @@ MAKETABLE(_kCoefficientsRgbY)
0 \ 0 \
} }
#ifdef OSX
MAKETABLE(kCoefficientsBgraY) MAKETABLE(kCoefficientsBgraY)
#else
MAKETABLE(_kCoefficientsBgraY)
#endif
#undef RGBY #undef RGBY
#undef RGBU #undef RGBU
...@@ -297,12 +286,39 @@ MAKETABLE(_kCoefficientsBgraY) ...@@ -297,12 +286,39 @@ MAKETABLE(_kCoefficientsBgraY)
0 \ 0 \
} }
#ifdef OSX
MAKETABLE(kCoefficientsAbgrY) MAKETABLE(kCoefficientsAbgrY)
#else
MAKETABLE(_kCoefficientsAbgrY)
#endif
void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
// To support in-place conversion.
uint8 r = src_abgr[0];
uint8 g = src_abgr[1];
uint8 b = src_abgr[2];
uint8 a = src_abgr[3];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = a;
dst_argb += 4;
src_abgr += 4;
}
}
void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
// To support in-place conversion.
uint8 a = src_bgra[0];
uint8 r = src_bgra[1];
uint8 g = src_bgra[2];
uint8 b = src_bgra[3];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = a;
dst_argb += 4;
src_bgra += 4;
}
}
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) { void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) { for (int x = 0; x < pix; ++x) {
...@@ -466,4 +482,133 @@ void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, ...@@ -466,4 +482,133 @@ void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
#endif #endif
#endif #endif
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
// Replicate Y into the B, G and R channels; alpha is set to 255.
for (int x = 0; x < pix; ++x) {
uint8 y = src_y[0];
dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
dst_argb[3] = 255u;
dst_argb += 4;
++src_y;
}
}
// C reference code that mimics the YUV assembly.
#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
(((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
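// These macros mirror the saturating SIMD instructions of the same names:
// packuswb clamps to [0, 255] and paddsw adds with signed 16 bit saturation.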
static inline void YuvPixel(uint8 y,
uint8 u,
uint8 v,
uint8* rgb_buf,
int ashift,
int rshift,
int gshift,
int bshift) {
int b = kCoefficientsRgbY[256+u][0];
int g = kCoefficientsRgbY[256+u][1];
int r = kCoefficientsRgbY[256+u][2];
int a = kCoefficientsRgbY[256+u][3];
b = paddsw(b, kCoefficientsRgbY[512+v][0]);
g = paddsw(g, kCoefficientsRgbY[512+v][1]);
r = paddsw(r, kCoefficientsRgbY[512+v][2]);
a = paddsw(a, kCoefficientsRgbY[512+v][3]);
b = paddsw(b, kCoefficientsRgbY[y][0]);
g = paddsw(g, kCoefficientsRgbY[y][1]);
r = paddsw(r, kCoefficientsRgbY[y][2]);
a = paddsw(a, kCoefficientsRgbY[y][3]);
b >>= 6;
g >>= 6;
r >>= 6;
a >>= 6;
*reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
(packuswb(g) << gshift) |
(packuswb(r) << rshift) |
(packuswb(a) << ashift);
}
void FastConvertYUVToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
y_buf += 2;
u_buf += 1;
v_buf += 1;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
}
}
void FastConvertYUVToBGRARow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
y_buf += 2;
u_buf += 1;
v_buf += 1;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24);
}
}
void FastConvertYUVToABGRRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
y_buf += 2;
u_buf += 1;
v_buf += 1;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
}
}
void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width; ++x) {
YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
y_buf += 1;
u_buf += 1;
v_buf += 1;
rgb_buf += 4; // Advance 1 pixel.
}
}
void FastConvertYToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width; ++x) {
YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0);
y_buf += 1;
rgb_buf += 4; // Advance 1 pixel.
}
}
} // extern "C" } // extern "C"
...@@ -74,6 +74,160 @@ extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { ...@@ -74,6 +74,160 @@ extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
}; };
// Shuffle table for converting ABGR to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};
// Shuffle table for converting BGRA to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
__declspec(naked)
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_y
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
wloop:
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
punpcklbw xmm0, xmm0
movdqa xmm1, xmm0
punpcklwd xmm0, xmm0
punpckhwd xmm1, xmm1
por xmm0, xmm5
por xmm1, xmm5
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
ja wloop
ret
}
}
__declspec(naked)
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_abgr
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
movdqa xmm5, _kShuffleMaskABGRToARGB
convertloop :
movdqa xmm0, [eax]
lea eax, [eax + 16]
pshufb xmm0, xmm5
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
ja convertloop
ret
}
}
__declspec(naked)
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_bgra
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
movdqa xmm5, _kShuffleMaskBGRAToARGB
convertloop :
movdqa xmm0, [eax]
lea eax, [eax + 16]
pshufb xmm0, xmm5
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
ja convertloop
ret
}
}
__declspec(naked)
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_bg24
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
movdqa xmm4, _kShuffleMaskBG24ToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm4
por xmm2, xmm5
    palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
movdqa [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
movdqa [edx], xmm0
por xmm1, xmm5
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
movdqa [edx + 16], xmm1
por xmm3, xmm5
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
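
BG24ToARGBRow_SSSE3 consumes 48 source bytes (16 3-byte pixels) per iteration, realigns them across registers with palignr, expands each pixel to 4 bytes through kShuffleMaskBG24ToARGB (defined earlier, not shown in this hunk) and ORs in opaque alpha, so pix must be a multiple of 16 with aligned buffers. Assuming 24BG stores B,G,R in memory, the per-pixel effect is simply (scalar sketch):

// Reference for the SSSE3 routine above: 3-byte B,G,R to 4-byte B,G,R,A.
static void BG24ToARGBRow_Reference(const uint8* src_bg24, uint8* dst_argb,
                                    int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_argb[4 * x + 0] = src_bg24[3 * x + 0];  // B
    dst_argb[4 * x + 1] = src_bg24[3 * x + 1];  // G
    dst_argb[4 * x + 2] = src_bg24[3 * x + 2];  // R
    dst_argb[4 * x + 3] = 0xff;                 // A
  }
}

RAWToARGBRow_SSSE3 below is identical in structure; its kShuffleMaskRAWToARGB additionally reverses each 3-byte group to swap R and B.
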
__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_raw
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
movdqa xmm4, _kShuffleMaskRAWToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm4
por xmm2, xmm5
    palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
movdqa [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
movdqa [edx], xmm0
por xmm1, xmm5
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
movdqa [edx + 16], xmm1
por xmm3, xmm5
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
// Convert 16 ARGB pixels (64 bytes) to 16 Y values // Convert 16 ARGB pixels (64 bytes) to 16 Y values
__declspec(naked) __declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
...@@ -81,25 +235,25 @@ __asm { ...@@ -81,25 +235,25 @@ __asm {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */ mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */ mov ecx, [esp + 12] /* pix */
movdqa xmm7, _kARGBToY movdqa xmm5, _kAddY16
movdqa xmm6, _kAddY16 movdqa xmm4, _kARGBToY
convertloop : convertloop :
movdqa xmm0, [eax] movdqa xmm0, [eax]
movdqa xmm1, [eax + 16] movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32] movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48] movdqa xmm3, [eax + 48]
pmaddubsw xmm0, xmm7 pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm7 pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm7 pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm7 pmaddubsw xmm3, xmm4
lea eax, [eax + 64] lea eax, [eax + 64]
phaddw xmm0, xmm1 phaddw xmm0, xmm1
phaddw xmm2, xmm3 phaddw xmm2, xmm3
psrlw xmm0, 7 psrlw xmm0, 7
psrlw xmm2, 7 psrlw xmm2, 7
packuswb xmm0, xmm2 packuswb xmm0, xmm2
paddb xmm0, xmm6 paddb xmm0, xmm5
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 sub ecx, 16
...@@ -114,25 +268,25 @@ __asm { ...@@ -114,25 +268,25 @@ __asm {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */ mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */ mov ecx, [esp + 12] /* pix */
movdqa xmm7, _kBGRAToY movdqa xmm5, _kAddY16
movdqa xmm6, _kAddY16 movdqa xmm4, _kBGRAToY
convertloop : convertloop :
movdqa xmm0, [eax] movdqa xmm0, [eax]
movdqa xmm1, [eax + 16] movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32] movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48] movdqa xmm3, [eax + 48]
pmaddubsw xmm0, xmm7 pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm7 pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm7 pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm7 pmaddubsw xmm3, xmm4
lea eax, [eax + 64] lea eax, [eax + 64]
phaddw xmm0, xmm1 phaddw xmm0, xmm1
phaddw xmm2, xmm3 phaddw xmm2, xmm3
psrlw xmm0, 7 psrlw xmm0, 7
psrlw xmm2, 7 psrlw xmm2, 7
packuswb xmm0, xmm2 packuswb xmm0, xmm2
paddb xmm0, xmm6 paddb xmm0, xmm5
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 sub ecx, 16
...@@ -147,25 +301,25 @@ __asm { ...@@ -147,25 +301,25 @@ __asm {
mov eax, [esp + 4] /* src_argb */ mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */ mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */ mov ecx, [esp + 12] /* pix */
movdqa xmm7, _kABGRToY movdqa xmm5, _kAddY16
movdqa xmm6, _kAddY16 movdqa xmm4, _kABGRToY
convertloop : convertloop :
movdqa xmm0, [eax] movdqa xmm0, [eax]
movdqa xmm1, [eax + 16] movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32] movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48] movdqa xmm3, [eax + 48]
pmaddubsw xmm0, xmm7 pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm7 pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm7 pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm7 pmaddubsw xmm3, xmm4
lea eax, [eax + 64] lea eax, [eax + 64]
phaddw xmm0, xmm1 phaddw xmm0, xmm1
phaddw xmm2, xmm3 phaddw xmm2, xmm3
psrlw xmm0, 7 psrlw xmm0, 7
psrlw xmm2, 7 psrlw xmm2, 7
packuswb xmm0, xmm2 packuswb xmm0, xmm2
paddb xmm0, xmm6 paddb xmm0, xmm5
movdqa [edx], xmm0 movdqa [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 sub ecx, 16
...@@ -366,230 +520,138 @@ __asm { ...@@ -366,230 +520,138 @@ __asm {
} }
} }
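
The three ...ToYRow_SSSE3 hunks above only move the constants from xmm7/xmm6 into xmm4/xmm5; the computation is unchanged: pmaddubsw against a per-format coefficient vector, horizontal adds, a shift right by 7, a pack, then the +16 bias from _kAddY16. The coefficient tables (_kARGBToY and friends) are defined elsewhere in this file, so the weights below are placeholders; this is only a sketch of the shape of the computation:

// Scalar sketch of the luma extraction in ARGBToYRow_SSSE3 and friends.
// kYB/kYG/kYR are placeholder BT.601-style weights for the sketch; the real
// values live in the _kARGBToY / _kBGRAToY / _kABGRToY tables.
enum { kYB = 13, kYG = 65, kYR = 33 };

static void ARGBToYRow_Reference(const uint8* src_argb, uint8* dst_y,
                                 int pix) {
  for (int x = 0; x < pix; ++x) {
    int b = src_argb[4 * x + 0];
    int g = src_argb[4 * x + 1];
    int r = src_argb[4 * x + 2];
    dst_y[x] = static_cast<uint8>(((kYB * b + kYG * g + kYR * r) >> 7) + 16);
  }
}
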
__declspec(naked) #define YUVTORGB(TABLE) __asm { \
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { __asm convertloop : \
__asm { __asm movzx eax, byte ptr [edi] \
mov eax, [esp + 4] // src_bg24 __asm lea edi, [edi + 1] \
mov edx, [esp + 8] // dst_argb __asm movzx ebx, byte ptr [esi] \
mov ecx, [esp + 12] // pix __asm lea esi, [esi + 1] \
pcmpeqb xmm7, xmm7 // generate mask 0xff000000 __asm movq mm0, [TABLE + 2048 + 8 * eax] \
pslld xmm7, 24 __asm movzx eax, byte ptr [edx] \
movdqa xmm6, _kShuffleMaskBG24ToARGB __asm paddsw mm0, [TABLE + 4096 + 8 * ebx] \
__asm movzx ebx, byte ptr [edx + 1] \
convertloop : __asm movq mm1, [TABLE + 8 * eax] \
movdqa xmm0, [eax] __asm lea edx, [edx + 2] \
movdqa xmm1, [eax + 16] __asm movq mm2, [TABLE + 8 * ebx] \
movdqa xmm3, [eax + 32] __asm paddsw mm1, mm0 \
lea eax, [eax + 48] __asm paddsw mm2, mm0 \
movdqa xmm2, xmm3 __asm psraw mm1, 6 \
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} __asm psraw mm2, 6 \
pshufb xmm2, xmm6 __asm packuswb mm1, mm2 \
por xmm2, xmm7 __asm movq [ebp], mm1 \
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} __asm lea ebp, [ebp + 8] \
pshufb xmm0, xmm6 __asm sub ecx, 2 \
movdqa [edx + 32], xmm2 __asm ja convertloop \
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
}
}
__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_raw
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24
movdqa xmm6, _kShuffleMaskRAWToARGB
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm3, [eax + 32]
lea eax, [eax + 48]
movdqa xmm2, xmm3
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
pshufb xmm2, xmm6
por xmm2, xmm7
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm6
movdqa [edx + 32], xmm2
por xmm0, xmm7
pshufb xmm1, xmm6
movdqa [edx], xmm0
por xmm1, xmm7
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm6
movdqa [edx + 16], xmm1
por xmm3, xmm7
movdqa [edx + 48], xmm3
lea edx, [edx + 64]
sub ecx, 16
ja convertloop
ret
} }
}
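
The YUVTORGB macro above folds the loop body shared by the _MMX row functions that follow: for each pair of output pixels it fetches one 8-byte entry from each of three 256-entry sections of a coefficient table (U terms at byte offset 2048, V terms at 4096, Y terms at 0), sums them as packed signed 16-bit channel values with paddsw, arithmetic-shifts right by 6 and packs with unsigned saturation. The kCoefficientsRgbY/BgraY/AbgrY tables themselves are defined elsewhere, so the layout below is an assumption used only to sketch the scheme:

// Sketch of the table-driven conversion behind YUVTORGB(kCoefficientsRgbY).
// Assumed layout: 3 * 256 entries of four packed 16-bit channel terms,
// indexed by Y, then 256 + U, then 512 + V (matching offsets 2048/4096).
struct PackedChannels { short c[4]; };  // e.g. B, G, R, A terms

static uint8 SatToByte(int v) {
  return static_cast<uint8>(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void YuvToPixelFromTable(const PackedChannels* table,
                                uint8 y, uint8 u, uint8 v, uint8* out4) {
  for (int i = 0; i < 4; ++i) {
    int sum = table[256 + u].c[i] + table[512 + v].c[i] + table[y].c[i];
    out4[i] = SatToByte(sum >> 6);  // psraw 6 then packuswb in the asm
  }
}

The _MMX variants below also replace the old functions' pushad/popad with explicit pushes of only the registers that must be preserved (ebx, esi, edi, ebp), and store with a plain movq instead of the non-temporal movntq.
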
__declspec(naked) __declspec(naked)
void FastConvertYUVToRGB32Row(const uint8* y_buf, void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
__asm { __asm {
pushad push ebx
mov edx, [esp + 32 + 4] push esi
mov edi, [esp + 32 + 8] push edi
mov esi, [esp + 32 + 12] push ebp
mov ebp, [esp + 32 + 16] mov edx, [esp + 16 + 4]
mov ecx, [esp + 32 + 20] mov edi, [esp + 16 + 8]
mov esi, [esp + 16 + 12]
convertloop : mov ebp, [esp + 16 + 16]
movzx eax, byte ptr [edi] mov ecx, [esp + 16 + 20]
lea edi, [edi + 1]
movzx ebx, byte ptr [esi] YUVTORGB(kCoefficientsRgbY)
lea esi, [esi + 1]
movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax] pop ebp
movzx eax, byte ptr [edx] pop edi
paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx] pop esi
movzx ebx, byte ptr [edx + 1] pop ebx
movq mm1, [_kCoefficientsRgbY + 8 * eax]
lea edx, [edx + 2]
movq mm2, [_kCoefficientsRgbY + 8 * ebx]
paddsw mm1, mm0
paddsw mm2, mm0
psraw mm1, 6
psraw mm2, 6
packuswb mm1, mm2
movntq [ebp], mm1
lea ebp, [ebp + 8]
sub ecx, 2
ja convertloop
popad
ret ret
} }
} }
__declspec(naked) __declspec(naked)
void FastConvertYUVToBGRARow(const uint8* y_buf, void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
__asm { __asm {
pushad push ebx
mov edx, [esp + 32 + 4] push esi
mov edi, [esp + 32 + 8] push edi
mov esi, [esp + 32 + 12] push ebp
mov ebp, [esp + 32 + 16] mov edx, [esp + 16 + 4]
mov ecx, [esp + 32 + 20] mov edi, [esp + 16 + 8]
mov esi, [esp + 16 + 12]
convertloop : mov ebp, [esp + 16 + 16]
movzx eax, byte ptr [edi] mov ecx, [esp + 16 + 20]
lea edi, [edi + 1]
movzx ebx, byte ptr [esi] YUVTORGB(kCoefficientsBgraY)
lea esi, [esi + 1]
movq mm0, [_kCoefficientsBgraY + 2048 + 8 * eax] pop ebp
movzx eax, byte ptr [edx] pop edi
paddsw mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx] pop esi
movzx ebx, byte ptr [edx + 1] pop ebx
movq mm1, [_kCoefficientsBgraY + 8 * eax]
lea edx, [edx + 2]
movq mm2, [_kCoefficientsBgraY + 8 * ebx]
paddsw mm1, mm0
paddsw mm2, mm0
psraw mm1, 6
psraw mm2, 6
packuswb mm1, mm2
movntq [ebp], mm1
lea ebp, [ebp + 8]
sub ecx, 2
ja convertloop
popad
ret ret
} }
} }
__declspec(naked) __declspec(naked)
void FastConvertYUVToABGRRow(const uint8* y_buf, void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
__asm { __asm {
pushad push ebx
mov edx, [esp + 32 + 4] push esi
mov edi, [esp + 32 + 8] push edi
mov esi, [esp + 32 + 12] push ebp
mov ebp, [esp + 32 + 16] mov edx, [esp + 16 + 4]
mov ecx, [esp + 32 + 20] mov edi, [esp + 16 + 8]
mov esi, [esp + 16 + 12]
convertloop : mov ebp, [esp + 16 + 16]
movzx eax, byte ptr [edi] mov ecx, [esp + 16 + 20]
lea edi, [edi + 1]
movzx ebx, byte ptr [esi] YUVTORGB(kCoefficientsAbgrY)
lea esi, [esi + 1]
movq mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax] pop ebp
movzx eax, byte ptr [edx] pop edi
paddsw mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx] pop esi
movzx ebx, byte ptr [edx + 1] pop ebx
movq mm1, [_kCoefficientsAbgrY + 8 * eax]
lea edx, [edx + 2]
movq mm2, [_kCoefficientsAbgrY + 8 * ebx]
paddsw mm1, mm0
paddsw mm2, mm0
psraw mm1, 6
psraw mm2, 6
packuswb mm1, mm2
movntq [ebp], mm1
lea ebp, [ebp + 8]
sub ecx, 2
ja convertloop
popad
ret ret
} }
} }
__declspec(naked) __declspec(naked)
void FastConvertYUV444ToRGB32Row(const uint8* y_buf, void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
__asm { __asm {
pushad push ebx
mov edx, [esp + 32 + 4] // Y push esi
mov edi, [esp + 32 + 8] // U push edi
mov esi, [esp + 32 + 12] // V push ebp
mov ebp, [esp + 32 + 16] // rgb mov edx, [esp + 16 + 4]
mov ecx, [esp + 32 + 20] // width mov edi, [esp + 16 + 8]
mov esi, [esp + 16 + 12]
mov ebp, [esp + 16 + 16]
mov ecx, [esp + 16 + 20]
convertloop : convertloop :
movzx eax, byte ptr [edi] movzx eax, byte ptr [edi]
lea edi, [edi + 1] lea edi, [edi + 1]
movzx ebx, byte ptr [esi] movzx ebx, byte ptr [esi]
lea esi, [esi + 1] lea esi, [esi + 1]
movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax] movq mm0, [kCoefficientsRgbY + 2048 + 8 * eax]
movzx eax, byte ptr [edx] movzx eax, byte ptr [edx]
paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx] paddsw mm0, [kCoefficientsRgbY + 4096 + 8 * ebx]
lea edx, [edx + 1] lea edx, [edx + 1]
paddsw mm0, [_kCoefficientsRgbY + 8 * eax] paddsw mm0, [kCoefficientsRgbY + 8 * eax]
psraw mm0, 6 psraw mm0, 6
packuswb mm0, mm0 packuswb mm0, mm0
movd [ebp], mm0 movd [ebp], mm0
...@@ -597,15 +659,18 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf, ...@@ -597,15 +659,18 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
sub ecx, 1 sub ecx, 1
ja convertloop ja convertloop
popad pop ebp
pop edi
pop esi
pop ebx
ret ret
} }
} }
__declspec(naked) __declspec(naked)
void FastConvertYToRGB32Row(const uint8* y_buf, void FastConvertYToARGBRow_MMX(const uint8* y_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
__asm { __asm {
push ebx push ebx
mov eax, [esp + 4 + 4] // Y mov eax, [esp + 4 + 4] // Y
...@@ -614,10 +679,10 @@ void FastConvertYToRGB32Row(const uint8* y_buf, ...@@ -614,10 +679,10 @@ void FastConvertYToRGB32Row(const uint8* y_buf,
convertloop : convertloop :
movzx ebx, byte ptr [eax] movzx ebx, byte ptr [eax]
movq mm0, [_kCoefficientsRgbY + 8 * ebx] movq mm0, [kCoefficientsRgbY + 8 * ebx]
psraw mm0, 6 psraw mm0, 6
movzx ebx, byte ptr [eax + 1] movzx ebx, byte ptr [eax + 1]
movq mm1, [_kCoefficientsRgbY + 8 * ebx] movq mm1, [kCoefficientsRgbY + 8 * ebx]
psraw mm1, 6 psraw mm1, 6
packuswb mm0, mm1 packuswb mm0, mm1
lea eax, [eax + 2] lea eax, [eax + 2]
......
...@@ -42,6 +42,7 @@ enum FourCC { ...@@ -42,6 +42,7 @@ enum FourCC {
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
FOURCC_M420 = FOURCC('M', '4', '2', '0'), FOURCC_M420 = FOURCC('M', '4', '2', '0'),
FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
......
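
FOURCC_Q420 joins the existing four-character codes; all of them are produced by the FOURCC() packing macro defined earlier in this header (not shown in this hunk). As a sketch of what such a macro typically expands to, with the first character landing in the least significant byte:

// Illustrative packing only; the real FOURCC() macro in video_common.h may
// differ in casting details.
#define FOURCC_SKETCH(a, b, c, d)                                 \
  ((static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) |     \
   (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
// FOURCC_SKETCH('Q', '4', '2', '0') == 0x30323451
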