Commit 540e8af8 authored by fbarchard@google.com

remove add 16 from ARGBToYJ and add rounding, for consistency with Windows version. row.h header macros sorted alphabetically.
BUG=269
TESTED=untested
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/32579005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1185 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent b036cf70
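
In scalar terms, the two row kernels touched by this commit compute the following per pixel. This is a minimal C sketch; the 7-bit coefficient values and the helper names are illustrative assumptions, not the library's API:

#include <stdint.h>

// Video-range Y (ARGBToYRow): truncating >>7, then a +16 offset.
// This path is unchanged by the commit.
static uint8_t ToY(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}

// Full-range YJ (ARGBToYJRow) after this commit: no +16 offset, and a
// +64 bias (0.5 in 7-bit fixed point) before the shift rounds to nearest.
static uint8_t ToYJ(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);
}
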
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1184
+Version: 1185
License: BSD
License File: LICENSE
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1184
+#define LIBYUV_VERSION 1185
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -92,6 +92,7 @@ static uvec8 kAddY16 = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
+// 7 bit fixed point 0.5.
static vec16 kAddYJ64 = {
64, 64, 64, 64, 64, 64, 64, 64
};
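
Here 64 is 0.5 expressed in 7-bit fixed point (64/128), so adding it before the >>7 shift turns truncation into round-to-nearest. A small compilable illustration (values chosen for the example):

#include <stdio.h>

int main(void) {
  int sum = 200;                    // 200/128 = 1.5625 in 7-bit fixed point
  printf("%d\n", sum >> 7);         // 1: a plain shift truncates
  printf("%d\n", (sum + 64) >> 7);  // 2: the +0.5 bias rounds to nearest
  return 0;
}
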
@@ -704,6 +705,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
#endif // HAS_RGB24TOARGBROW_SSSE3
#ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %3,%%xmm4 \n"
@@ -743,6 +745,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
#endif // HAS_ARGBTOYROW_SSSE3
#ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %3,%%xmm4 \n"
@@ -788,6 +792,7 @@ static const lvec32 kPermdARGBToY_AVX = {
0, 4, 1, 5, 2, 6, 3, 7
};
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"vbroadcastf128 %3,%%ymm4 \n"
@@ -804,13 +809,13 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea " MEMLEA(0x80,0) ",%0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
"vpsrlw $0x7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x7,%%ymm2,%%ymm2 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"
"vpermd %%ymm0,%%ymm6,%%ymm0 \n"
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
@@ -831,6 +836,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
#endif // HAS_ARGBTOYROW_AVX2
#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"vbroadcastf128 %3,%%ymm4 \n"
@@ -847,13 +853,14 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea " MEMLEA(0x80,0) ",%0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
"vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
"vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
"vpsrlw $0x7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x7,%%ymm2,%%ymm2 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"
"vpermd %%ymm0,%%ymm6,%%ymm0 \n"
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
@@ -863,7 +870,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kARGBToYJ), // %3
"m"(kAddY16), // %4
"m"(kAddYJ64), // %4
"m"(kPermdARGBToY_AVX) // %5
: "memory", "cc"
#if defined(__SSE2__)
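
The operand change above (%4 now binds kAddYJ64 instead of kAddY16, so ymm5 holds the word-sized 0.5 bias) pairs with the new vpaddw: the rounding bias participates in the shift, so it must be added while the sums are still 16-bit words, before packing, whereas the removed code byte-added 16 after packing. In scalar terms, a before/after sketch under the same assumptions as above:

#include <stdint.h>

// Old YJ path (removed): truncate, narrow to a byte, then byte-add 16.
static uint8_t yj_old(uint16_t sum) { return (uint8_t)((sum >> 7) + 16); }

// New YJ path: word-add the 0.5 bias, shift, then narrow; no offset.
static uint8_t yj_new(uint16_t sum) { return (uint8_t)((sum + 64) >> 7); }
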
@@ -210,6 +210,7 @@ static const uvec8 kAddY16 = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
+// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {
64, 64, 64, 64, 64, 64, 64, 64
};
@@ -697,8 +698,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
-movdqa xmm5, kAddY16
movdqa xmm4, kARGBToY
+movdqa xmm5, kAddY16
convertloop:
movdqu xmm0, [eax]
@@ -724,7 +725,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
}
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
__declspec(naked) __declspec(align(16))
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
@@ -787,7 +789,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vpsrlw ymm2, ymm2, 7
vpackuswb ymm0, ymm0, ymm2 // mutates.
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
-vpaddb ymm0, ymm0, ymm5
+vpaddb ymm0, ymm0, ymm5 // add 16 for Y
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
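
By contrast, the Y path can keep its vpaddb after the pack: the +16 offset does not interact with the shift, and assuming the 7-bit Y coefficients sum to well under 128 (as in the kARGBToY values sketched earlier), the packed byte plus 16 cannot wrap. A scalar sketch under that assumption:

#include <stdint.h>

// Y path order of operations: shift, narrow to a byte, then offset.
// With an assumed coefficient sum of 111, the maximum is
// (255 * 111 >> 7) + 16 == 237, so the byte add cannot overflow.
static uint8_t y_narrow(uint16_t sum) { return (uint8_t)((sum >> 7) + 16); }
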