Commit 1d160cb9 authored by fbarchard@google.com

Attenuate ARGB pixels NEON optimized

BUG=164
TEST=./libyuv_unittest --gtest_filter=*Atten*
Review URL: https://webrtc-codereview.appspot.com/937031

git-svn-id: http://libyuv.googlecode.com/svn/trunk@506 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 326a521a
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 505
Version: 506
License: BSD
License File: LICENSE
......
......@@ -53,7 +53,7 @@ int ArmCpuCaps(const char* cpuinfo_name);
// returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) {
LIBYUV_API extern int cpu_info_;
return (cpu_info_ == 1 ? InitCpuFlags() : cpu_info_) & test_flag;
return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
}
// For testing, allow CPU flags to be disabled.
......
......@@ -139,7 +139,7 @@ extern "C" {
#if !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(LIBYUV_SSSE3_ONLY)
#define HAS_ARGBATTENUATE_SSE2
#define HAS_ARGBATTENUATEROW_SSE2
#define HAS_ARGBBLENDROW_SSE2
#define HAS_MIRRORROW_SSE2
#endif
......@@ -221,6 +221,7 @@ extern "C" {
// Effects
#define HAS_ARGBINTERPOLATEROW_NEON
#define HAS_ARGBBLENDROW_NEON
#define HAS_ARGBATTENUATEROW_NEON
#endif
// The following are available on Mips platforms
......@@ -935,6 +936,12 @@ void YToARGBRow_SSE2(const uint8* src_y,
void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_Any_SSE2(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_Any_NEON(const uint8* src_y,
uint8* dst_argb,
int width);
// ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
......@@ -1194,6 +1201,13 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y,
void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
int width);
void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
int width);
void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
int width);
// Inverse table for unattenuate, shared by C and SSE2.
extern uint32 fixed_invtbl8[256];
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 505
#define LIBYUV_VERSION 506
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -18,6 +18,7 @@
#endif
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "libyuv/scale.h" // For ScalePlane()
#include "libyuv/video_common.h"
#include "libyuv/row.h"
......@@ -215,12 +216,7 @@ int I444ToI420(const uint8* src_y, int src_stride_y,
return 0;
}
// use Bilinear for upsampling chroma
void ScalePlaneBilinear(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr);
// TODO(fbarchard): Enable bilinear when fast enough, or add a specialized upsampler.
// 411 chroma is 1/4 width, 1x height
// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
......@@ -256,19 +252,15 @@ int I411ToI420(const uint8* src_y, int src_stride_y,
int halfheight = (height + 1) >> 1;
int quarterwidth = (width + 3) >> 2;
// Resample U plane.
ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height
halfwidth, halfheight, // to 1/2 width, 1/2 height
src_stride_u,
dst_stride_u,
src_u, dst_u);
// Resample U plane from 1/4 width, 1x height to 1/2 width, 1/2 height.
ScalePlane(src_u, src_stride_u, quarterwidth, height,
dst_u, dst_stride_u, halfwidth, halfheight,
kFilterNone);
// Resample V plane.
ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height
halfwidth, halfheight, // to 1/2 width, 1/2 height
src_stride_v,
dst_stride_v,
src_v, dst_v);
ScalePlane(src_v, src_stride_v, quarterwidth, height,
dst_v, dst_stride_v, halfwidth, halfheight,
kFilterNone);
return 0;
}
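To make the resampling dimensions concrete, here is the arithmetic for a hypothetical 640x480 frame (illustrative only, not part of the change):
// For width = 640, height = 480:
//   quarterwidth = (width + 3) >> 2  = 160   (I411 chroma width)
//   halfwidth    = (width + 1) >> 1  = 320   (I420 chroma width)
//   halfheight   = (height + 1) >> 1 = 240   (I420 chroma height)
// so each chroma plane is resampled from 160x480 to 320x240 by ScalePlane.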
......@@ -1738,7 +1730,6 @@ static void JpegI400ToI420(void* opaque,
LIBYUV_API
int MJPGSize(const uint8* sample, size_t sample_size,
int* width, int* height) {
// TODO(fbarchard): Port to C
MJpegDecoder mjpeg_decoder;
bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret) {
......@@ -1764,7 +1755,7 @@ int MJPGToI420(const uint8* sample,
return -1;
}
// TODO(fbarchard): Port to C
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret && (mjpeg_decoder.GetWidth() != w ||
......
......@@ -230,13 +230,19 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
uint8* rgb_buf,
int width) = YToARGBRow_C;
#if defined(HAS_YTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8) &&
if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
YToARGBRow = YToARGBRow_SSE2;
YToARGBRow = YToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_SSE2;
}
}
#elif defined(HAS_YTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_NEON;
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
YToARGBRow = YToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_NEON;
}
}
#endif
......@@ -941,7 +947,7 @@ int MJPGToARGB(const uint8* sample,
return -1;
}
// TODO(fbarchard): Port to C
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret && (mjpeg_decoder.GetWidth() != w ||
......
......@@ -16,6 +16,7 @@
#include "libyuv/format_conversion.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "libyuv/scale.h" // For ScalePlane()
#include "libyuv/video_common.h"
#include "libyuv/row.h"
......@@ -98,12 +99,7 @@ int I420ToI422(const uint8* src_y, int src_stride_y,
return 0;
}
// use Bilinear for upsampling chroma
void ScalePlaneBilinear(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr);
// TODO(fbarchard): Enable bilinear when fast enough, or add a specialized upsampler.
LIBYUV_API
int I420ToI444(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
......@@ -136,19 +132,15 @@ int I420ToI444(const uint8* src_y, int src_stride_y,
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
// Upsample U plane.
ScalePlaneBilinear(halfwidth, halfheight,
width, height,
src_stride_u,
dst_stride_u,
src_u, dst_u);
// Upsample U plane from 1/2 width, 1/2 height to 1x width, 1x height.
ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
dst_u, dst_stride_u, width, height,
kFilterNone);
// Upsample V plane.
ScalePlaneBilinear(halfwidth, halfheight,
width, height,
src_stride_v,
dst_stride_v,
src_v, dst_v);
ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
dst_v, dst_stride_v, width, height,
kFilterNone);
return 0;
}
......@@ -187,19 +179,15 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
int halfheight = (height + 1) >> 1;
int quarterwidth = (width + 3) >> 2;
// Resample U plane.
ScalePlaneBilinear(halfwidth, halfheight, // from 1/2 width, 1/2 height
quarterwidth, height, // to 1/4 width, 1x height
src_stride_u,
dst_stride_u,
src_u, dst_u);
// Resample U plane from 1/2 width, 1/2 height to 1/4 width, 1x height.
ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
dst_u, dst_stride_u, quarterwidth, height,
kFilterNone);
// Resample V plane.
ScalePlaneBilinear(halfwidth, halfheight, // from 1/2 width, 1/2 height
quarterwidth, height, // to 1/4 width, 1x height
src_stride_v,
dst_stride_v,
src_v, dst_v);
ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
dst_v, dst_stride_v, quarterwidth, height,
kFilterNone);
return 0;
}
......@@ -360,7 +348,6 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
return 0;
}
// TODO(fbarchard): Deprecate, move or expand 422 support?
LIBYUV_API
int I422ToUYVY(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
......
......@@ -138,9 +138,8 @@ static int MipsCpuCaps(const char* search_string) {
#endif
// CPU detect function for SIMD instruction sets.
// TODO(fbarchard): Use constant if/when valgrind says cpu_info is initialized.
LIBYUV_API
int cpu_info_ = 1; // 1 means cpu info is not initialized yet.
int cpu_info_ = kCpuInit; // cpu_info is not initialized yet.
// Test environment variable for disabling CPU features. Any non-zero value
// to disable. Zero ignored to make it easy to set the variable on/off.
......
......@@ -767,18 +767,32 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
int width) = ARGBAttenuateRow_C;
#if defined(HAS_ARGBATTENUATE_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
#if defined(HAS_ARGBATTENUATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBATTENUATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_NEON;
}
}
#endif
......@@ -1126,9 +1140,8 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
}
// Interpolate 2 ARGB images by specified amount (0 to 255).
// TODO(fbarchard): Check width is multiple of 16. Do Any version.
// TODO(fbarchard): Consider selecting a specialized interpolator so
// interpolation doesn't need to be checked on each row.
// TODO(fbarchard): Consider selecting a specialization for interpolation so the
// row function doesn't need to check the interpolation fraction on each row.
LIBYUV_API
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
......@@ -1147,15 +1160,14 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = ARGBInterpolateRow_C;
#if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
}
#elif defined(HAS_ARGBINTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
ARGBInterpolateRow = ARGBInterpolateRow_NEON;
}
#endif
......
......@@ -113,8 +113,8 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"vtbl.8 d0, {d2, d3}, d6 \n"
"vtbl.8 d1, {d2, d3}, d7 \n"
// TODO: rework shuffle above to write
// out with 4 instead of 8 writes
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
"vst1.32 {d4[0]}, [r9], %3 \n"
"vst1.32 {d4[1]}, [r9], %3 \n"
"vst1.32 {d5[0]}, [r9], %3 \n"
......@@ -276,7 +276,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"cmp %6, #4 \n"
"blt 2f \n"
//TODO(frkoenig) : clean this up
// TODO(frkoenig): Clean this up.
// 4x8 block
"mov r9, %0 \n"
"vld1.64 {d0}, [r9], %1 \n"
......
......@@ -141,6 +141,8 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
3, 4, 2)
RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
7, 1, 4)
RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
7, 1, 4)
RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
15, 2, 4)
RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
......@@ -157,6 +159,8 @@ RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
7, 4, 2)
RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
7, 1, 4)
RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
7, 1, 4)
RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
7, 2, 4)
RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
......@@ -226,6 +230,28 @@ YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
#endif
#undef YANY
// Attenuate is destructive, so the last16 method cannot be used due to overlap.
#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
int n = width & ~MASK; \
ARGBTOY_SIMD(src_argb, dst_y, n); \
ARGBTOY_C(src_argb + n * SBPP, \
dst_y + n * BPP, width & MASK); \
}
#ifdef HAS_ARGBATTENUATEROW_SSSE3
YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
4, 4, 3)
#endif
#ifdef HAS_ARGBATTENUATEROW_SSE2
YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C,
4, 4, 3)
#endif
#ifdef HAS_ARGBATTENUATEROW_NEON
YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
4, 4, 7)
#endif
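For reference, the NEON instantiation above expands to roughly the following (a sketch with comments added; parameter names come from the YANY macro):
void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width) {
  int n = width & ~7;                         // largest multiple of 8 <= width
  ARGBAttenuateRow_NEON(src_argb, dst_y, n);  // NEON on the bulk of the row
  ARGBAttenuateRow_C(src_argb + n * 4,        // C on the 0..7 leftover pixels
                     dst_y + n * 4, width & 7);
}
Callers guard with width >= 8, so the SIMD call always gets at least one full group of 8 pixels.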
// RGB/YUV to UV does multiple of 16 with SIMD and remainder with C.
#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP) \
void NAMEANY(const uint8* src_argb, int src_stride_argb, \
......
......@@ -2418,6 +2418,61 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
);
}
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
// Attenuate 8 pixels.
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d0, d3 \n" // b * a
"vmull.u8 q11, d1, d3 \n" // g * a
"vmull.u8 q12, d2, d3 \n" // r * a
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q10", "q11", "q12"
);
}
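A scalar sketch of what the NEON loop above computes per pixel (vmull.u8 widens the channel-by-alpha product to 16 bits; vqrshrn.u16 #8 narrows it back with a rounding shift; alpha is copied through unchanged):
for (int i = 0; i < width; ++i) {
  uint32 b = src_argb[0], g = src_argb[1], r = src_argb[2], a = src_argb[3];
  dst_argb[0] = (uint8)((b * a + 128) >> 8);  // vmull.u8 + vqrshrn.u16 #8
  dst_argb[1] = (uint8)((g * a + 128) >> 8);
  dst_argb[2] = (uint8)((r * a + 128) >> 8);
  dst_argb[3] = (uint8)a;                     // alpha passes through (d3)
  src_argb += 4;
  dst_argb += 4;
}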
#ifdef ARGBATTENUATEROW_VQRDMULH
// TODO(fbarchard): Remove this. Works but is slower and off by 2.
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
// Attenuate 8 pixels.
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q0, d0 \n"
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q2, d4 \n"
"vmovl.u8 q8, d6 \n"
"vshl.u16 q0, q0, #7 \n" // b << 7
"vshl.u16 q1, q1, #7 \n" // g << 7
"vshl.u16 q2, q2, #7 \n" // r << 7
"vqrdmulh.s16 q0, q0, q8 \n" // b * a
"vqrdmulh.s16 q1, q1, q8 \n" // g * a
"vqrdmulh.s16 q2, q2, q8 \n" // r * a
"vmovn.u16 d0, q0 \n"
"vmovn.u16 d2, q1 \n"
"vmovn.u16 d4, q2 \n"
"vst4.8 {d0, d2, d4, d6}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q8"
);
}
#endif
#endif // __ARM_NEON__
#ifdef __cplusplus
......
......@@ -3519,7 +3519,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
}
#endif // HAS_ARGBBLENDROW_SSSE3
#ifdef HAS_ARGBATTENUATE_SSE2
#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time.
// aligned to 16 bytes
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
......@@ -3564,7 +3564,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
#endif
);
}
#endif // HAS_ARGBATTENUATE_SSE2
#endif // HAS_ARGBATTENUATEROW_SSE2
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
......@@ -4132,7 +4132,6 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find 64 bit way to avoid masking.
// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used. movd %%xmm0,%1
......
......@@ -1675,7 +1675,6 @@ static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
// Read 8 UV from 411.
......@@ -3701,7 +3700,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
}
#endif // HAS_ARGBBLENDROW_SSSE3
#ifdef HAS_ARGBATTENUATE_SSE2
#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
......@@ -3743,7 +3742,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
ret
}
}
#endif // HAS_ARGBATTENUATE_SSE2
#endif // HAS_ARGBATTENUATEROW_SSE2
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
......
......@@ -3091,18 +3091,18 @@ void ScalePlaneBilinear(int src_width, int src_height,
int dst_width, int source_y_fraction) =
ScaleFilterRows_C;
#if defined(HAS_SCALEFILTERROWS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 16)) {
ScaleFilterRows = ScaleFilterRows_NEON;
}
#endif
#if defined(HAS_SCALEFILTERROWS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleFilterRows = ScaleFilterRows_SSE2;
}
#endif
#if defined(HAS_SCALEFILTERROWS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_width, 16)) {
ScaleFilterRows = ScaleFilterRows_Unaligned_SSSE3;
if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleFilterRows = ScaleFilterRows_SSSE3;
......@@ -3110,7 +3110,7 @@ void ScalePlaneBilinear(int src_width, int src_height,
}
#endif
#if defined(HAS_SCALEFILTERROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4)) {
ScaleFilterRows = ScaleFilterRows_MIPS_DSPR2;
}
......@@ -3129,7 +3129,6 @@ void ScalePlaneBilinear(int src_width, int src_height,
int yf = (y >> 8) & 255;
const uint8* src = src_ptr + yi * src_stride;
ScaleFilterRows(row, src, src_stride, src_width, yf);
row[src_width] = row[src_width - 1];
ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
dst_ptr += dst_stride;
y += dy;
......
......@@ -856,8 +856,7 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb,
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint8* src_ptr1 = src_argb + src_stride;
uint8* end = dst_argb + (dst_width << 2);
do {
for (int x = 0; x < dst_width - 1; x += 2) {
dst_argb[0] = (src_argb[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
dst_argb[1] = (src_argb[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
dst_argb[2] = (src_argb[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
......@@ -869,7 +868,14 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb,
src_argb += 8;
src_ptr1 += 8;
dst_argb += 8;
} while (dst_argb < end);
}
if (dst_width & 1) {
dst_argb[0] = (src_argb[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
dst_argb[1] = (src_argb[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
dst_argb[2] = (src_argb[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
dst_argb[3] = (src_argb[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
dst_argb += 4;
}
// Duplicate the last pixel (4 bytes) for filtering.
dst_argb[0] = dst_argb[-4];
dst_argb[1] = dst_argb[-3];
......@@ -975,21 +981,20 @@ static void ScaleARGBBilinear(int src_width, int src_height,
ptrdiff_t src_stride,
int dst_width, int source_y_fraction) =
ScaleARGBFilterRows_C;
// TODO(fbarchard): Check aligned width.
#if defined(HAS_SCALEARGBFILTERROWS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2;
}
#endif
#if defined(HAS_SCALEARGBFILTERROWS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERROWS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 4)) {
ScaleARGBFilterRows = ScaleARGBFilterRows_NEON;
}
#endif
......
......@@ -478,8 +478,8 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
/* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
align_buffer_64(dst_argb32_c, kWidth * 4 * kHeight); \
align_buffer_64(dst_argb32_opt, kWidth * 4 * kHeight); \
memset(dst_argb32_c, 0, kWidth * 4 * kHeight); \
memset(dst_argb32_opt, 0, kWidth * 4 * kHeight); \
memset(dst_argb32_c, 1, kWidth * 4 * kHeight); \
memset(dst_argb32_opt, 2, kWidth * 4 * kHeight); \
FMT_B##ToARGB(dst_argb_c, kStrideB, \
dst_argb32_c, kWidth * 4, \
kWidth, kHeight); \
......@@ -534,6 +534,12 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
align_buffer_64(dst_y_opt, kWidth * kHeight); \
align_buffer_64(dst_u_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
align_buffer_64(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
memset(dst_y_c, 1, kWidth * kHeight); \
memset(dst_u_c, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
memset(dst_v_c, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
memset(dst_y_opt, 2, kWidth * kHeight); \
memset(dst_u_opt, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
memset(dst_v_opt, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
srandom(time(NULL)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kStride; ++j) \
......@@ -753,11 +759,11 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \
align_buffer_page_end(src_argb, kStrideA * kHeightA); \
align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \
align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \
memset(dst_argb_c, 0, kStrideB * kHeightB); \
memset(dst_argb_opt, 0, kStrideB * kHeightB); \
for (int i = 0; i < kStrideA * kHeightA; ++i) { \
src_argb[i] = (random() & 0xff); \
} \
memset(dst_argb_c, 0, kStrideB * kHeightB); \
memset(dst_argb_opt, 0, kStrideB * kHeightB); \
MaskCpuFlags(0); \
FMT_A##To##FMT_B(src_argb, kStrideA, \
dst_argb_c, kStrideB, \
......
......@@ -98,12 +98,75 @@ TEST_F(libyuvTest, TestAttenuate) {
EXPECT_EQ(32, atten_pixels[128][1]);
EXPECT_EQ(21, atten_pixels[128][2]);
EXPECT_EQ(128, atten_pixels[128][3]);
EXPECT_EQ(255, atten_pixels[255][0]);
EXPECT_EQ(127, atten_pixels[255][1]);
EXPECT_EQ(85, atten_pixels[255][2]);
EXPECT_NEAR(255, atten_pixels[255][0], 1);
EXPECT_NEAR(127, atten_pixels[255][1], 1);
EXPECT_NEAR(85, atten_pixels[255][2], 1);
EXPECT_EQ(255, atten_pixels[255][3]);
}
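The switch from EXPECT_EQ to EXPECT_NEAR presumably accommodates the >>8 approximation used by the SIMD paths; a quick check of the fully opaque case:
// Ideal attenuation for a = 255 is the identity: c * 255 / 255 = c.
// A >>8 based approximation gives, for c = 255:
//   (255 * 255) >> 8       = 254   (truncating multiply-shift)
//   (255 * 255 + 128) >> 8 = 254   (rounding multiply-shift, as in the NEON path)
// i.e. one less than the expected 255, hence the one-unit tolerance above.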
static int TestAttenuateI(int width, int height, int benchmark_iterations,
int invert, int off) {
const int kBpp = 4;
const int kStride = (width * kBpp + 15) & ~15;
align_buffer_64(src_argb, kStride * height + off);
align_buffer_64(dst_argb_c, kStride * height);
align_buffer_64(dst_argb_opt, kStride * height);
srandom(time(NULL));
for (int i = 0; i < kStride * height; ++i) {
src_argb[i + off] = (random() & 0xff);
}
memset(dst_argb_c, 0, kStride * height);
memset(dst_argb_opt, 0, kStride * height);
MaskCpuFlags(0);
ARGBAttenuate(src_argb + off, kStride,
dst_argb_c, kStride,
width, invert * height);
MaskCpuFlags(-1);
for (int i = 0; i < benchmark_iterations; ++i) {
ARGBAttenuate(src_argb + off, kStride,
dst_argb_opt, kStride,
width, invert * height);
}
int max_diff = 0;
for (int i = 0; i < kStride * height; ++i) {
int abs_diff =
abs(static_cast<int>(dst_argb_c[i]) -
static_cast<int>(dst_argb_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
free_aligned_buffer_64(src_argb)
free_aligned_buffer_64(dst_argb_c)
free_aligned_buffer_64(dst_argb_opt)
return max_diff;
}
TEST_F(libyuvTest, ARGBAttenuate_Any) {
int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_,
benchmark_iterations_, +1, 0);
EXPECT_LE(max_diff, 2);
}
TEST_F(libyuvTest, ARGBAttenuate_Unaligned) {
int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
benchmark_iterations_, +1, 1);
EXPECT_LE(max_diff, 2);
}
TEST_F(libyuvTest, ARGBAttenuate_Invert) {
int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
benchmark_iterations_, -1, 0);
EXPECT_LE(max_diff, 2);
}
TEST_F(libyuvTest, ARGBAttenuate_Opt) {
int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
benchmark_iterations_, +1, 0);
EXPECT_LE(max_diff, 2);
}
TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
SIMD_ALIGNED(int32 added_pixels[16][16][4]);
......@@ -632,7 +695,7 @@ TEST_F(libyuvTest, ARGBInterpolate##TERP##N) { \
#define TESTINTERPOLATE(TERP) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
benchmark_width_ - 4, TERP, 1, _Any, +, 0) \
benchmark_width_ - 1, TERP, 1, _Any, +, 0) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
benchmark_width_, TERP, 1, _Unaligned, +, 1) \
TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
......@@ -648,42 +711,38 @@ TESTINTERPOLATE(255)
static int TestBlend(int width, int height, int benchmark_iterations,
int invert, int off) {
const int BPP_A = 4;
const int STRIDE_A = 1;
const int BPP_B = 4;
const int STRIDE_B = 1;
const int kStrideA = (width * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;
const int kStrideB = (width * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;
align_buffer_64(src_argb_a, kStrideA * height + off);
align_buffer_64(src_argb_b, kStrideA * height + off);
align_buffer_64(dst_argb_c, kStrideB * height);
align_buffer_64(dst_argb_opt, kStrideB * height);
const int kBpp = 4;
const int kStride = width * kBpp;
align_buffer_64(src_argb_a, kStride * height + off);
align_buffer_64(src_argb_b, kStride * height + off);
align_buffer_64(dst_argb_c, kStride * height);
align_buffer_64(dst_argb_opt, kStride * height);
srandom(time(NULL));
for (int i = 0; i < kStrideA * height; ++i) {
for (int i = 0; i < kStride * height; ++i) {
src_argb_a[i + off] = (random() & 0xff);
src_argb_b[i + off] = (random() & 0xff);
}
ARGBAttenuate(src_argb_a + off, kStrideA, src_argb_a + off, kStrideA, width,
ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
height);
ARGBAttenuate(src_argb_b + off, kStrideA, src_argb_b + off, kStrideA, width,
ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
height);
memset(dst_argb_c, 255, kStrideB * height);
memset(dst_argb_opt, 255, kStrideB * height);
memset(dst_argb_c, 255, kStride * height);
memset(dst_argb_opt, 255, kStride * height);
MaskCpuFlags(0);
ARGBBlend(src_argb_a + off, kStrideA,
src_argb_b + off, kStrideA,
dst_argb_c, kStrideB,
ARGBBlend(src_argb_a + off, kStride,
src_argb_b + off, kStride,
dst_argb_c, kStride,
width, invert * height);
MaskCpuFlags(-1);
for (int i = 0; i < benchmark_iterations; ++i) {
ARGBBlend(src_argb_a + off, kStrideA,
src_argb_b + off, kStrideA,
dst_argb_opt, kStrideB,
ARGBBlend(src_argb_a + off, kStride,
src_argb_b + off, kStride,
dst_argb_opt, kStride,
width, invert * height);
}
int max_diff = 0;
for (int i = 0; i < kStrideB * height; ++i) {
for (int i = 0; i < kStride * height; ++i) {
int abs_diff =
abs(static_cast<int>(dst_argb_c[i]) -
static_cast<int>(dst_argb_opt[i]));
......