Commit 9bcc9a25 authored by fbarchard@google.com's avatar fbarchard@google.com

ARGBInterpolateRow_SSSE3 for motion blur. Used to use bilinear row filter,…

ARGBInterpolateRow_SSSE3 for motion blur.  Used to use bilinear row filter, which extrudes edges.  This branches off the code so the extrude can be removed for Interpolate.
BUG=none
TEST=build\release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*
Review URL: https://webrtc-codereview.appspot.com/786007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@354 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent a2cc341b
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 353
Version: 354
License: BSD
License File: LICENSE
......
......@@ -65,6 +65,10 @@ typedef signed char int8;
defined(__i386__) || defined(_M_IX86)
#define CPU_X86 1
#endif
// Detect compiler is for arm.
#if defined(__arm__) || defined(_M_ARM)
#define CPU_ARM 1
#endif
#define ALIGNP(p, t) \
(reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
......
......@@ -21,7 +21,7 @@ extern "C" {
// Compute a hash for specified memory. Seed of 5381 recommended.
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
// Sum Square Error - used to compute Mean Square Error or PSNR
// Sum Square Error - used to compute Mean Square Error or PSNR.
uint64 ComputeSumSquareError(const uint8* src_a,
const uint8* src_b, int count);
......
......@@ -12,7 +12,7 @@
#define INCLUDE_LIBYUV_CONVERT_H_
#include "libyuv/basic_types.h"
// TODO(fbarchard): Remove the following headers includes
// TODO(fbarchard): Remove the following headers includes.
#include "libyuv/convert_from.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
......@@ -22,7 +22,7 @@ namespace libyuv {
extern "C" {
#endif
// Alias
// Alias.
#define I420ToI420 I420Copy
// Copy I420 to I420.
......@@ -112,56 +112,63 @@ int V210ToI420(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_v, int dst_stride_v,
int width, int height);
// ARGB little endian (bgra in memory) to I420
// ARGB little endian (bgra in memory) to I420.
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// BGRA little endian (argb in memory) to I420
// BGRA little endian (argb in memory) to I420.
int BGRAToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// ABGR little endian (rgba in memory) to I420
// ABGR little endian (rgba in memory) to I420.
int ABGRToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// RGB little endian (bgr in memory) to I420
// RGBA little endian (rgba in memory) to I420.
int RGBAToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// RGB little endian (bgr in memory) to I420.
int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// RGB big endian (rgb in memory) to I420
// RGB big endian (rgb in memory) to I420.
int RAWToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// RGB16 (RGBP fourcc) little endian to I420
// RGB16 (RGBP fourcc) little endian to I420.
int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// RGB15 (RGBO fourcc) little endian to I420
// RGB15 (RGBO fourcc) little endian to I420.
int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// RGB12 (R444 fourcc) little endian to I420
// RGB12 (R444 fourcc) little endian to I420.
int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
......@@ -169,7 +176,7 @@ int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
int width, int height);
#ifdef HAVE_JPEG
// src_width/height provided by capture
// src_width/height provided by capture.
// dst_width/height for clipping determine final size.
int MJPGToI420(const uint8* sample, size_t sample_size,
uint8* dst_y, int dst_stride_y,
......
......@@ -28,7 +28,7 @@ namespace libyuv {
extern "C" {
#endif
// Alias
// Alias.
#define ARGBToARGB ARGBCopy
// Copy ARGB to ARGB.
......@@ -112,17 +112,17 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
// uint8* dst_argb, int dst_stride_argb,
// int width, int height);
// BGRA little endian (argb in memory) to ARGB
// BGRA little endian (argb in memory) to ARGB.
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// ABGR little endian (rgba in memory) to ARGB
// ABGR little endian (rgba in memory) to ARGB.
int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// RGBA little endian (abgr in memory) to ARGB
// RGBA little endian (abgr in memory) to ARGB.
int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
......@@ -130,27 +130,27 @@ int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
// Deprecated function name.
#define BG24ToARGB RGB24ToARGB
// RGB little endian (bgr in memory) to ARGB
// RGB little endian (bgr in memory) to ARGB.
int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// RGB big endian (rgb in memory) to ARGB
// RGB big endian (rgb in memory) to ARGB.
int RAWToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// RGB16 (RGBP fourcc) little endian to ARGB
// RGB16 (RGBP fourcc) little endian to ARGB.
int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// RGB15 (RGBO fourcc) little endian to ARGB
// RGB15 (RGBO fourcc) little endian to ARGB.
int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// RGB12 (R444 fourcc) little endian to ARGB
// RGB12 (R444 fourcc) little endian to ARGB.
int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
......@@ -164,7 +164,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
int dst_width, int dst_height);
#endif
// Note Bayer formats (BGGR) to ARGB are in format_conversion.h
// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
......
......@@ -19,9 +19,9 @@ namespace libyuv {
extern "C" {
#endif
// See Also convert.h for conversions from formats to I420
// See Also convert.h for conversions from formats to I420.
// I420Copy in convert to I420ToI420
// I420Copy in convert to I420ToI420.
int I420ToI422(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
......@@ -47,7 +47,7 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
......@@ -92,6 +92,12 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int I420ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
int I420ToRGB24(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
......@@ -122,7 +128,7 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
// Note Bayer formats (BGGR) To I420 are in format_conversion.h
// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
......
......@@ -43,7 +43,7 @@ int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Temporary API mapper
// Temporary API mapper.
#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
......@@ -79,7 +79,7 @@ int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
// Temporary API mapper
// Temporary API mapper.
#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
......@@ -107,7 +107,7 @@ int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Temporary API mapper
// Temporary API mapper.
#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
......@@ -132,7 +132,7 @@ int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
uint8* dst_bayer, int dst_stride_bayer,
int width, int height);
// Temporary API mapper
// Temporary API mapper.
#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
......
......@@ -13,6 +13,8 @@
#include "libyuv/basic_types.h"
// NOTE: For a simplified public API use convert.h MJPGToI420().
struct jpeg_common_struct;
struct jpeg_decompress_struct;
struct jpeg_source_mgr;
......@@ -85,10 +87,10 @@ class MJpegDecoder {
int GetVertSubSampFactor(int component);
// Public for testability
// Public for testability.
int GetImageScanlinesPerImcuRow();
// Public for testability
// Public for testability.
int GetComponentScanlinesPerImcuRow(int component);
// Width of a component in bytes.
......
......@@ -13,7 +13,7 @@
#include "libyuv/basic_types.h"
// TODO(fbarchard): Remove the following headers includes
// TODO(fbarchard): Remove the following headers includes.
#include "libyuv/convert.h"
#include "libyuv/convert_argb.h"
......@@ -31,7 +31,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Convert I420 to I400. (calls CopyPlane ignoring u/v)
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
int I420ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
......@@ -103,7 +103,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
int width, int height);
// ARGB little endian (bgra in memory) to I422
// ARGB little endian (bgra in memory) to I422.
int ARGBToI422(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
......
......@@ -31,7 +31,7 @@ enum RotationMode {
kRotateCounterClockwise = 270,
};
// Rotate I420 frame
// Rotate I420 frame.
int I420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
......@@ -40,7 +40,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int src_width, int src_height, RotationMode mode);
// Rotate NV12 input and store in I420
// Rotate NV12 input and store in I420.
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y,
......
......@@ -20,9 +20,9 @@ extern "C" {
// Supported filtering
enum FilterMode {
kFilterNone = 0, // Point sample; Fastest
kFilterNone = 0, // Point sample; Fastest.
kFilterBilinear = 1, // Faster than box, but lower quality scaling down.
kFilterBox = 2 // Highest quality
kFilterBox = 2 // Highest quality.
};
// Scale a YUV plane.
......@@ -52,7 +52,7 @@ int I420Scale(const uint8* src_y, int src_stride_y,
int dst_width, int dst_height,
FilterMode filtering);
// Legacy API. Deprecated
// Legacy API. Deprecated.
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height,
......@@ -61,12 +61,12 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int dst_width, int dst_height,
bool interpolate);
// Legacy API. Deprecated
// Legacy API. Deprecated.
int ScaleOffset(const uint8* src, int src_width, int src_height,
uint8* dst, int dst_width, int dst_height, int dst_yoffset,
bool interpolate);
// For testing, allow disabling of optimizations.
// For testing, allow disabling of specialized scalers.
void SetUseReferenceImpl(bool use);
#ifdef __cplusplus
......
......@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 353
#define LIBYUV_VERSION 354
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
......@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
// Common definitions for video, including fourcc and VideoFormat
// Common definitions for video, including fourcc and VideoFormat.
#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT
#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
......@@ -107,7 +107,7 @@ enum FourCCBpp {
FOURCC_BPP_UYVY = 16,
FOURCC_BPP_M420 = 12,
FOURCC_BPP_Q420 = 12,
FOURCC_BPP_V210 = 22, // 22.5 actually
FOURCC_BPP_V210 = 22, // 128 / 6 actually.
FOURCC_BPP_24BG = 24,
FOURCC_BPP_ARGB = 32,
FOURCC_BPP_BGRA = 32,
......
......@@ -598,7 +598,7 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SETROW_NEON
static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
asm volatile (
asm volatile ( // NOLINT
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
......@@ -669,7 +669,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
#define HAS_SETROW_X86
static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = static_cast<size_t>(width);
asm volatile (
asm volatile ( // NOLINT
"shr $0x2,%1 \n"
"rep stosl \n"
: "+D"(dst), // %0
......@@ -683,7 +683,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
for (int y = 0; y < height; ++y) {
size_t width_tmp = static_cast<size_t>(width);
uint32* d = reinterpret_cast<uint32*>(dst);
asm volatile (
asm volatile ( // NOLINT
"rep stosl \n"
: "+D"(d), // %0
"+c"(width_tmp) // %1
......@@ -1176,17 +1176,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
return 0;
}
#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
(defined(__x86_64__) || defined(__i386__)))
#define HAS_SCALEARGBFILTERROWS_SSSE3
#endif
void ScaleARGBFilterRows_C(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction);
void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction);
// Interpolate 2 ARGB images by specified amount (0 to 255).
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
......@@ -1201,24 +1190,20 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*ScaleARGBFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = ScaleARGBFilterRows_C;
#if defined(HAS_SCALEARGBFILTERROWS_SSSE3)
int source_y_fraction) = ARGBInterpolateRow_C;
#if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
}
#endif
uint8 last16[16];
for (int y = 0; y < height; ++y) {
// Filter extrudes edge for its scaling purpose.
memcpy(last16, dst_argb + width * 4, 16); // Save last 16 beyond end.
ScaleARGBFilterRows(dst_argb, src_argb0, src_argb1 - src_argb0,
width, interpolation);
memcpy(dst_argb + width * 4, last16, 16); // Restore last 16 beyond end.
ARGBInterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
width, interpolation);
src_argb0 += src_stride_argb0;
src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
......
......@@ -83,9 +83,7 @@ extern "C" {
#define HAS_CUMULATIVESUMTOAVERAGE_SSE2
#define HAS_ARGBSHADE_SSE2
#define HAS_ARGBAFFINEROW_SSE2
// HAS_ARGBBLENDROW_SSE2 may be faster than SSSE3 version on some CPUs, so
// enable it here instead of LIBYUV_SSSE3_ONLY section.
#define HAS_ARGBBLENDROW_SSE2
#define HAS_ARGBINTERPOLATEROW_SSSE3
#endif
// The following are Windows only:
......@@ -102,6 +100,7 @@ extern "C" {
!defined(LIBYUV_SSSE3_ONLY)
#define HAS_MIRRORROW_SSE2
#define HAS_ARGBATTENUATE_SSE2
#define HAS_ARGBBLENDROW_SSE2
#endif
// The following are available on Neon platforms
......@@ -553,6 +552,13 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride,
int dst_width, int source_y_fraction);
void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -1081,6 +1081,29 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
}
}
// C reference version: blends two rows of ARGB pixels into one row.
//   dst_ptr           - destination row, dst_width * 4 bytes are written.
//   src_ptr           - first source row.
//   src_stride        - byte offset from src_ptr to the second source row.
//   dst_width         - number of ARGB pixels to produce.
//   source_y_fraction - weight of the second row out of 256 (0 yields a copy
//                       of the first row).
// Each output byte is (row0 * (256 - fraction) + row1 * fraction) >> 8.
// Exactly dst_width pixels are processed, so odd widths no longer touch
// memory past the end of the rows, and a dst_width <= 0 writes nothing.
void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  if (dst_width <= 0) {
    return;
  }
  const int y1_fraction = source_y_fraction;
  const int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  // 4 bytes per ARGB pixel; every channel is blended the same way.
  const ptrdiff_t byte_count = static_cast<ptrdiff_t>(dst_width) * 4;
  for (ptrdiff_t i = 0; i < byte_count; ++i) {
    dst_ptr[i] = static_cast<uint8>(
        (src_ptr[i] * y0_fraction + src_ptr1[i] * y1_fraction) >> 8);
  }
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
......
......@@ -3560,6 +3560,71 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
}
#endif // HAS_ARGBAFFINEROW_SSE2
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
// Blends the row at src_ptr with the row at src_ptr + src_stride and stores
// the result at dst_ptr, 16 bytes (4 ARGB pixels) per loop iteration.
// source_y_fraction is first halved to a 7 bit weight f, then one of three
// loops is selected:
//   f == 0  (label 2): the first row is copied unchanged.
//   f == 64 (label 3): the two rows are averaged with pavgb.
//   otherwise (label 1): each byte is (row0 * (128 - f) + row1 * f) >> 7.
// All loads and stores use movdqa, which requires 16 byte aligned addresses.
// Every loop subtracts 4 from dst_width and repeats while the result is > 0,
// so a dst_width that is not a multiple of 4 is rounded up to the next
// multiple of 4 pixels.
void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
asm volatile (
"sub %1,%0 \n"  // dst_ptr -= src_ptr; dst is then addressed as (%1,%0,1).
"shr %3 \n"  // Halve the fraction to a 7 bit weight f.
"cmp $0x0,%3 \n"
"je 2f \n"  // f == 0: plain copy of the first row.
"cmp $0x40,%3 \n"
"je 3f \n"  // f == 64: average of both rows.
"movd %3,%%xmm0 \n"  // Weight of the second row: f.
"neg %3 \n"
"add $0x80,%3 \n"  // Weight of the first row: 128 - f.
"movd %3,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm5 \n"  // Byte pair (128 - f, f).
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"  // Replicate the weight pair across xmm5.
".p2align 4 \n"
// General case: weighted blend of both rows.
// NOTE(review): f == 0 is presumably special-cased because 128 would not fit
// the signed byte weights pmaddubsw takes from xmm5 - confirm.
"1: \n"
"movdqa (%1),%%xmm0 \n"  // 16 bytes of the first row.
"movdqa (%1,%4,1),%%xmm2 \n"  // 16 bytes of the second row.
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"  // Interleave row0/row1 bytes (low half).
"punpckhbw %%xmm2,%%xmm1 \n"  // Interleave row0/row1 bytes (high half).
"pmaddubsw %%xmm5,%%xmm0 \n"  // row0 * (128 - f) + row1 * f per byte pair.
"pmaddubsw %%xmm5,%%xmm1 \n"
"psrlw $0x7,%%xmm0 \n"  // Divide by 128.
"psrlw $0x7,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"  // Back to 16 bytes.
"sub $0x4,%2 \n"  // 4 pixels done; sets the flags tested by jg below.
"movdqa %%xmm0,(%1,%0,1) \n"  // Store to the destination row.
"lea 0x10(%1),%1 \n"  // Advance 16 bytes; lea leaves the flags untouched.
"jg 1b \n"
"jmp 4f \n"
".p2align 4 \n"
// f == 0: copy the first row.
"2: \n"
"movdqa (%1),%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 2b \n"
"jmp 4f \n"
".p2align 4 \n"
// f == 64: average the two rows.
"3: \n"
"movdqa (%1),%%xmm0 \n"
"pavgb (%1,%4,1),%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 3b \n"
// Common exit for all three loops.
"4: \n"
".p2align 4 \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"(static_cast<intptr_t>(src_stride)) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm5"
#endif
);
}
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
......
......@@ -3664,6 +3664,81 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
}
#endif // HAS_ARGBAFFINEROW_SSE2
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
// Blends the row at src_ptr with the row at src_ptr + src_stride and stores
// the result at dst_ptr, 16 bytes (4 ARGB pixels) per loop iteration.
// source_y_fraction is first halved to a 7 bit weight f, then one of three
// loops is selected:
//   f == 0  (xloop1): the first row is copied unchanged.
//   f == 64 (xloop2): the two rows are averaged with pavgb.
//   otherwise (xloop): each byte is (row0 * (128 - f) + row1 * f) >> 7.
// All loads and stores use movdqa, which requires 16 byte aligned addresses.
// Every loop subtracts 4 from dst_width and repeats while the result is > 0,
// so a dst_width that is not a multiple of 4 is rounded up to the next
// multiple of 4 pixels.
// The function is declared naked, so it saves/restores esi and edi and
// returns with its own ret on each of the three exit paths.
__declspec(naked) __declspec(align(16))
void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
__asm {
push esi
push edi
// Arguments sit 8 bytes further up the stack because of the two pushes.
mov edi, [esp + 8 + 4] // dst_ptr
mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi  // edi = dst - src; dst is then addressed as [esi + edi].
shr eax, 1  // Halve the fraction to a 7 bit weight f.
cmp eax, 0
je xloop1  // f == 0: plain copy of the first row.
cmp eax, 64
je xloop2  // f == 64: average of both rows.
movd xmm0, eax // high fraction 0..127
neg eax
add eax, 128
movd xmm5, eax // low fraction 128..1
punpcklbw xmm5, xmm0  // Byte pair (128 - f, f).
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0  // Replicate the weight pair across xmm5.
align 16
// General case: weighted blend of both rows.
xloop:
movdqa xmm0, [esi]  // 16 bytes of the first row.
movdqa xmm2, [esi + edx]  // 16 bytes of the second row.
movdqa xmm1, xmm0
punpcklbw xmm0, xmm2  // Interleave row0/row1 bytes (low half).
punpckhbw xmm1, xmm2  // Interleave row0/row1 bytes (high half).
pmaddubsw xmm0, xmm5  // row0 * (128 - f) + row1 * f per byte pair.
pmaddubsw xmm1, xmm5
psrlw xmm0, 7  // Divide by 128.
psrlw xmm1, 7
packuswb xmm0, xmm1  // Back to 16 bytes.
sub ecx, 4  // 4 pixels done; sets the flags tested by jg below.
movdqa [esi + edi], xmm0  // Store to the destination row.
lea esi, [esi + 16]  // Advance 16 bytes; lea leaves the flags untouched.
jg xloop
pop edi
pop esi
ret
align 16
// f == 0: copy the first row.
xloop1:
movdqa xmm0, [esi]
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop1
pop edi
pop esi
ret
align 16
// f == 64: average the two rows.
xloop2:
movdqa xmm0, [esi]
pavgb xmm0, [esi + edx]
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop2
pop edi
pop esi
ret
}
}
#endif // _M_IX86
#ifdef __cplusplus
......
......@@ -80,7 +80,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
}
#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \
TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ,) \
TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +) \
TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -)
TESTPLANARTOB(I420, 2, 2, ARGB, 4)
......@@ -151,7 +151,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
}
#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \
TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ,) \
TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +) \
TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -)
TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4)
......@@ -233,7 +233,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) { \
}
#define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, ,) \
TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, , +) \
TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, Invert, -)
TESTATOPLANAR(ARGB, 4, I420, 2, 2)
......@@ -293,7 +293,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##N##_OptVsC) { \
free_aligned_buffer_16(dst_argb_opt) \
}
#define TESTATOB(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \
TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, ,) \
TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, , +) \
TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, Invert, -)
TESTATOB(ARGB, 4, 4, ARGB, 4)
......@@ -853,14 +853,9 @@ TEST_F(libyuvTest, TestShade) {
}
TEST_F(libyuvTest, TestInterpolate) {
// Interpolate internally used bilinear filtering, which duplicates the last
// value, but the interpolate saves and restores it. The buffer must be
// padded by 16 extra bytes. TODO(fbarchard): Reimplement interpolate with
// code that does not duplicate the last value and remove kPad.
const int kPad = 16;
SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
SIMD_ALIGNED(uint8 orig_pixels_1[256][4]);
SIMD_ALIGNED(uint8 interpolate_pixels[256 + kPad][4]);
SIMD_ALIGNED(uint8 interpolate_pixels[256][4]);
orig_pixels_0[0][0] = 16u;
orig_pixels_0[0][1] = 32u;
......@@ -930,7 +925,7 @@ TEST_F(libyuvTest, TestInterpolate) {
EXPECT_EQ(16u, interpolate_pixels[0][2]);
EXPECT_EQ(32u, interpolate_pixels[0][3]);
for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
for (int i = 0; i < benchmark_iterations_ * (1280 * 720 / 256); ++i) {
ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
&interpolate_pixels[0][0], 0, 256, 1, 128);
}
......
......@@ -25,7 +25,9 @@ TEST_F(libyuvTest, TestVersion) {
printf("LIBYUV_VERSION %d\n", LIBYUV_VERSION);
#ifdef LIBYUV_SVNREVISION
const char *ver = strchr(LIBYUV_SVNREVISION, ':');
if (!ver) {
if (ver) {
++ver;
} else {
ver = LIBYUV_SVNREVISION;
}
int svn_revision = atoi(ver);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment