Commit ba3aeed3 authored by fbarchard@google.com's avatar fbarchard@google.com

gcc port of alpha blend and add align to row_win loops

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/439006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@207 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent c6e7e2a8
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 206
Version: 207
License: BSD
License File: LICENSE
......
......@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 206
#define LIBYUV_VERSION 207
#endif // INCLUDE_LIBYUV_VERSION_H_
......@@ -36,6 +36,7 @@ extern "C" {
// http://www.fourcc.org/yuv.php
// http://v4l2spec.bytesex.org/spec/book1.htm
// http://developer.apple.com/quicktime/icefloe/dispatch020.html
// http://msdn.microsoft.com/en-us/library/windows/desktop/dd206750(v=vs.85).aspx#nv12
enum FourCC {
// Canonical fourcc codes used in our code.
......
......@@ -62,9 +62,6 @@ extern "C" {
#define HAS_UYVYTOYROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_UYVYTOUVROW_SSE2
#endif
#if defined(_MSC_VER)
#define HAS_ARGBBLENDROW_SSE2
#endif
......
......@@ -1923,6 +1923,106 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
}
#endif // HAS_YUY2TOYROW_SSE2
#ifdef HAS_ARGBBLENDROW_SSE2
// Blends one row of 32-bit ARGB pixels from src_argb over dst_argb, writing
// the result back into dst_argb.
//
//   src_argb  source row, 4 bytes per pixel.
//   dst_argb  destination row, 4 bytes per pixel; read and updated in place.
//   width     number of pixels in the row. Rows with width <= 0 are ignored.
//
// Each source pixel is loaded as a 32-bit word and classified by its most
// significant byte, which is used as the alpha value:
//   - 0x00 (transparent): the destination pixel is left untouched.
//   - 0xff (opaque):      the source pixel is copied to the destination.
//   - otherwise:          two pixels are blended at once with SSE2, each
//                         channel weighted by that pixel's own alpha for the
//                         source and by the bitwise-inverted alpha for the
//                         destination.
// The counter in %2 holds "pixels remaining minus one", so the loops hand the
// final pixel to the single-pixel tail at label 8.
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  // The asm reads the first source pixel unconditionally and its counter
  // never reaches the exit condition when it starts at or below zero, so
  // empty rows must be rejected before entering it.
  if (width <= 0) {
    return;
  }
  uint32 pixel = 0;
  asm volatile (
    // xmm4 = all ones; XORed with the alpha words to invert them.
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    // %1 becomes (dst - src) so that (%0,%1,1) addresses the destination
    // pixel matching the source pixel at (%0).
    "sub        %0,%1                          \n"
    "mov        (%0),%3                        \n"
    "sub        $0x1,%2                        \n"
    "je         8f                             \n"  // last1
    "cmp        $0xff000000,%3                 \n"
    "jae        2f                             \n"  // opaqueloop
    "cmp        $0xffffff,%3                   \n"
    "ja         3f                             \n"  // translucentloop

    // transparentloop: skip the pixel, destination unchanged.
  "1:                                          \n"
    "sub        $0x1,%2                        \n"
    // lea leaves the flags from the sub intact for the je below.
    "lea        0x4(%0),%0                     \n"
    "je         8f                             \n"  // last1
    "mov        (%0),%3                        \n"
    "cmp        $0xffffff,%3                   \n"
    "jbe        1b                             \n"  // transparentloop
    "cmp        $0xff000000,%3                 \n"
    "jb         3f                             \n"  // translucentloop

    // opaqueloop: copy the source pixel straight to the destination.
  "2:                                          \n"
    "mov        %3,(%0,%1,1)                   \n"
    "lea        0x4(%0),%0                     \n"
    "sub        $0x1,%2                        \n"
    "je         8f                             \n"  // last1
    "mov        (%0),%3                        \n"
    "cmp        $0xff000000,%3                 \n"
    "jae        2b                             \n"  // opaqueloop
    "cmp        $0xffffff,%3                   \n"
    "jbe        1b                             \n"  // transparentloop
    "nop                                       \n"

    // translucentloop: blend two pixels per iteration.
  "3:                                          \n"
    "movq       (%0),%%xmm0                    \n"
    "movq       (%0,%1,1),%%xmm1               \n"
    // Widen every byte to a 16-bit word by duplicating it.
    "punpcklbw  %%xmm0,%%xmm0                  \n"
    "punpcklbw  %%xmm1,%%xmm1                  \n"
    // Broadcast each pixel's alpha word across that pixel's four words.
    "pshuflw    $0xff,%%xmm0,%%xmm2            \n"
    "pshufhw    $0xff,%%xmm2,%%xmm2            \n"
    // xmm3 = inverted alpha, the weight applied to the destination.
    "movdqa     %%xmm2,%%xmm3                  \n"
    "pxor       %%xmm4,%%xmm3                  \n"
    "pmulhuw    %%xmm2,%%xmm0                  \n"
    "pmulhuw    %%xmm3,%%xmm1                  \n"
    "paddw      %%xmm1,%%xmm0                  \n"
    "psrlw      $0x8,%%xmm0                    \n"
    "packuswb   %%xmm0,%%xmm0                  \n"
    "movq       %%xmm0,(%0,%1,1)               \n"
    "lea        0x8(%0),%0                     \n"
    "sub        $0x2,%2                        \n"
    // Zero: exactly one pixel is left. Borrow: the row is finished.
    "jbe        8f                             \n"  // last1
    "mov        (%0),%3                        \n"
    "cmp        $0xffffff,%3                   \n"
    "jbe        1b                             \n"  // transparentloop
    "cmp        $0xff000000,%3                 \n"
    "jb         3b                             \n"  // translucentloop
    "jmp        2b                             \n"  // opaqueloop

    // last1: blend the single remaining pixel, if there is one.
  "8:                                          \n"
    "add        $0x1,%2                        \n"
    "je         9f                             \n"  // done
    // Every loop jumps here before loading the pixel at (%0), so %3 may
    // still hold an earlier pixel. Reload it so the last pixel is blended
    // with its own colour and alpha.
    "mov        (%0),%3                        \n"
    "movd       %3,%%xmm0                      \n"
    "mov        (%0,%1,1),%3                   \n"
    "movd       %3,%%xmm1                      \n"
    "punpcklbw  %%xmm0,%%xmm0                  \n"
    "punpcklbw  %%xmm1,%%xmm1                  \n"
    "pshuflw    $0xff,%%xmm0,%%xmm2            \n"
    "pshufhw    $0xff,%%xmm2,%%xmm2            \n"
    "movdqa     %%xmm2,%%xmm3                  \n"
    "pxor       %%xmm4,%%xmm3                  \n"
    "pmulhuw    %%xmm2,%%xmm0                  \n"
    "pmulhuw    %%xmm3,%%xmm1                  \n"
    "paddw      %%xmm1,%%xmm0                  \n"
    "psrlw      $0x8,%%xmm0                    \n"
    "packuswb   %%xmm0,%%xmm0                  \n"
    "movd       %%xmm0,%3                      \n"
    "mov        %3,(%0,%1,1)                   \n"

    // done
  "9:                                          \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width),      // %2
    "+r"(pixel)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif // HAS_ARGBBLENDROW_SSE2
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
......
This diff is collapsed.
......@@ -1256,6 +1256,8 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
#define HAS_SCALEADDROWS_SSE2
// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): support 1 rows
// TODO(fbarchard): align loops
__declspec(naked)
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
uint16* dst_ptr, int src_width,
......@@ -1699,7 +1701,6 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
uint16* dst_ptr, int src_width, int src_height) {
int tmp_height = 0;
intptr_t tmp_src = 0;
intptr_t tmp_src_stride = static_cast<intptr_t>(src_stride);
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
"sub $0x1,%5 \n"
......@@ -1731,9 +1732,9 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
"+r"(dst_ptr), // %1
"+r"(tmp_height), // %2
"+r"(tmp_src), // %3
"+rm"(src_width), // %4
"+r"(src_width), // %4
"+rm"(src_height) // %5
: "rm"(tmp_src_stride) // %6
: "rm"(static_cast<intptr_t>(src_stride)) // %6
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
......
......@@ -16,7 +16,7 @@ namespace libyuv {
extern "C" {
#endif
#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x)/sizeof(x[0]))))
#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x) / sizeof(x[0]))))
struct FourCCAliasEntry {
uint32 alias;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment