Commit 66d16f41 authored by fbarchard@google.com

argb scale 2x upsample with specialization for 25/75%

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/938014

git-svn-id: http://libyuv.googlecode.com/svn/trunk@486 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f25ab6d8
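For context, here is a scalar sketch (not part of the commit) of what the new specialized paths compute. `pavgb` on x86 and `vrhadd.u8` on NEON are rounding byte averages, `(a + b + 1) >> 1`, so a single average gives the 50/50 blend and a second average toward one of the two rows gives the 25/75 and 75/25 blends. The function and helper names below are hypothetical.

```c
#include <stdint.h>

// Rounding byte average: the scalar equivalent of pavgb / vrhadd.u8.
static inline uint8_t RoundAvg(uint8_t a, uint8_t b) {
  return (uint8_t)(((unsigned)a + (unsigned)b + 1) >> 1);
}

// Blend one output row from r0 (current row) and r1 (next row), where
// source_y_fraction (0..255) says how far the output sits toward r1.
void BlendRowSpecialized(uint8_t* dst, const uint8_t* r0, const uint8_t* r1,
                         int width, int source_y_fraction) {
  for (int x = 0; x < width; ++x) {
    switch (source_y_fraction) {
      case 0:    // Blend 100 / 0 - copy row unchanged.
        dst[x] = r0[x];
        break;
      case 64:   // Blend 75 / 25: two averages biased toward r0, ~(3*r0 + r1)/4.
        dst[x] = RoundAvg(RoundAvg(r1[x], r0[x]), r0[x]);
        break;
      case 128:  // Blend 50 / 50: one average.
        dst[x] = RoundAvg(r0[x], r1[x]);
        break;
      case 192:  // Blend 25 / 75: two averages biased toward r1, ~(r0 + 3*r1)/4.
        dst[x] = RoundAvg(RoundAvg(r0[x], r1[x]), r1[x]);
        break;
      default:   // Other fractions take the general multiply-based blend path.
        dst[x] = (uint8_t)((r0[x] * (256 - source_y_fraction) +
                            r1[x] * source_y_fraction) >> 8);
    }
  }
}
```

The specializations matter for a 2x upsample because every output row then falls at a 0%, 25%, 50%, or 75% offset, so the entire scale runs on cheap averages instead of the multiply-based general path.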
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 485
Version: 486
License: BSD
License File: LICENSE
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 485
#define LIBYUV_VERSION 486
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -1035,24 +1035,26 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
shr eax, 1
cmp eax, 0 // dispatch to specialized filters if applicable.
je xloop100
sub edi, esi
cmp eax, 32
je xloop75
cmp eax, 64
je xloop50
cmp eax, 96
je xloop25
movd xmm0, eax // high fraction 0..127
movd xmm0, eax // high fraction 1..127.
neg eax
add eax, 128
movd xmm5, eax // low fraction 128..1
movd xmm5, eax // low fraction 127..1.
punpcklbw xmm5, xmm0
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
// General purpose row blend.
align 16
xloop:
movdqa xmm0, [esi]
@@ -1069,52 +1071,20 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop
jmp xloop99
punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
// Blend 100 / 0 - Copy row unchanged.
// Blend 25 / 75.
align 16
xloop100:
xloop25:
movdqa xmm0, [esi]
sub ecx, 16
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop100
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
// Blend 75 / 25.
align 16
xloop75:
movdqa xmm1, [esi]
movdqa xmm0, [esi + edx]
movdqa xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 16
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop75
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
jg xloop25
jmp xloop99
// Blend 50 / 50.
align 16
@@ -1126,27 +1096,32 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop50
jmp xloop99
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
// Blend 75 / 25.
align 16
xloop75:
movdqa xmm1, [esi]
movdqa xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 16
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
lea esi, [esi + 16]
jg xloop75
jmp xloop99
// Blend 25 / 75.
// Blend 100 / 0 - Copy row unchanged.
align 16
xloop25:
xloop100:
movdqa xmm0, [esi]
movdqa xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 16
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop25
jg xloop100
// Extrude last pixel.
xloop99:
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
......@@ -1154,7 +1129,6 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
pop edi
pop esi
ret
}
}
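A note on the general-purpose path above: `eax` is halved to 0..127, `xmm5` is filled with repeating `(128 - f, f)` byte pairs, and `pmaddubsw` multiplies the interleaved row bytes by those pairs and sums adjacent products. Assuming the instructions elided from the hunk shift the 16-bit sums right by 7 and pack back to bytes, the per-byte result is modeled by this hypothetical scalar function:

```c
#include <stdint.h>

// Scalar model of the pmaddubsw blend path; f = source_y_fraction >> 1.
// Assumes the elided tail of the hunk does psrlw-by-7 plus a byte pack.
uint8_t BlendGeneral(uint8_t r0, uint8_t r1, int f /* 1..127 */) {
  // punpcklbw pairs (r0, r1); xmm5 holds (128 - f, f); pmaddubsw yields
  // the 16-bit sum r0*(128 - f) + r1*f for each pair.
  return (uint8_t)((r0 * (128 - f) + r1 * f) >> 7);
}
```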
@@ -1171,29 +1145,31 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
shr eax, 1
cmp eax, 0
cmp eax, 0 // dispatch to specialized filters if applicable.
je xloop100
sub edi, esi
cmp eax, 32
je xloop75
cmp eax, 64
je xloop50
cmp eax, 96
je xloop25
movd xmm0, eax // high fraction 0..127
movd xmm0, eax // high fraction 1..127.
neg eax
add eax, 128
movd xmm5, eax // low fraction 128..1
movd xmm5, eax // low fraction 127..1.
punpcklbw xmm5, xmm0
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
// General purpose row blend.
align 16
xloop:
movdqu xmm0, [esi]
movdqu xmm2, [esi + edx]
movdqu xmm1, xmm0
movdqa xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
pmaddubsw xmm0, xmm5
@@ -1205,52 +1181,20 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop
jmp xloop99
punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqu [esi + edi], xmm0
pop edi
pop esi
ret
// Blend 100 / 0 - Copy row unchanged.
// Blend 25 / 75.
align 16
xloop100:
xloop25:
movdqu xmm0, [esi]
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop100
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqu [esi + edi], xmm0
pop edi
pop esi
ret
// Blend 75 / 25.
align 16
xloop75:
movdqu xmm1, [esi]
movdqu xmm0, [esi + edx]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop75
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqu [esi + edi], xmm0
pop edi
pop esi
ret
jg xloop25
jmp xloop99
// Blend 50 / 50.
align 16
@@ -1262,27 +1206,32 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop50
jmp xloop99
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
// Blend 75 / 25.
align 16
xloop75:
movdqu xmm1, [esi]
movdqu xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0
pop edi
pop esi
ret
lea esi, [esi + 16]
jg xloop75
jmp xloop99
// Blend 25 / 75.
// Blend 100 / 0 - Copy row unchanged.
align 16
xloop25:
xloop100:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop25
jg xloop100
// Extrude last pixel.
xloop99:
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
@@ -1290,7 +1239,6 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
pop edi
pop esi
ret
}
}
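The `punpckhbw` / `pshufhw 0xff` / `punpckhqdq` sequence now shared at the `xloop99` label broadcasts the last blended byte across the whole register before one extra store past `dst_width`, so a later horizontal filter can safely read one pixel beyond the row end. A hypothetical scalar equivalent:

```c
#include <stdint.h>

// Hypothetical scalar model of the "extrude last pixel" store: when the
// loop exits, esi points at the row end, so [esi + edi] is the first byte
// past dst_width. One 16-byte store of the last value pads the row.
void ExtrudeLastByte(uint8_t* dst, int dst_width) {
  uint8_t last = dst[dst_width - 1];
  for (int i = 0; i < 16; ++i) {
    dst[dst_width + i] = last;
  }
}
```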
@@ -2068,9 +2016,13 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"sub %1,%0 \n"
"shr %3 \n"
"cmp $0x0,%3 \n"
"je 2f \n"
"je 100f \n"
"cmp $0x20,%3 \n"
"je 75f \n"
"cmp $0x40,%3 \n"
"je 3f \n"
"je 50f \n"
"cmp $0x60,%3 \n"
"je 25f \n"
"movd %3,%%xmm0 \n"
"neg %3 \n"
"add $0x80,%3 \n"
@@ -2078,6 +2030,8 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
// General purpose row blend.
".p2align 4 \n"
"1: \n"
"movdqa (%1),%%xmm0 \n"
@@ -2094,25 +2048,57 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 1b \n"
"jmp 4f \n"
"jmp 99f \n"
// Blend 25 / 75.
".p2align 4 \n"
"2: \n"
"25: \n"
"movdqa (%1),%%xmm0 \n"
"movdqa (%1,%4,1),%%xmm1 \n"
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 2b \n"
"jmp 4f \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
".p2align 4 \n"
"3: \n"
"50: \n"
"movdqa (%1),%%xmm0 \n"
"pavgb (%1,%4,1),%%xmm0 \n"
"movdqa (%1,%4,1),%%xmm1 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 3b \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
".p2align 4 \n"
"4: \n"
"75: \n"
"movdqa (%1),%%xmm1 \n"
"movdqa (%1,%4,1),%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
".p2align 4 \n"
"100: \n"
"movdqa (%1),%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 100b \n"
// Extrude last pixel.
"99: \n"
"punpckhbw %%xmm0,%%xmm0 \n"
"pshufhw $0xff,%%xmm0,%%xmm0 \n"
"punpckhqdq %%xmm0,%%xmm0 \n"
@@ -2137,9 +2123,13 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
"sub %1,%0 \n"
"shr %3 \n"
"cmp $0x0,%3 \n"
"je 2f \n"
"je 100f \n"
"cmp $0x20,%3 \n"
"je 75f \n"
"cmp $0x40,%3 \n"
"je 3f \n"
"je 50f \n"
"cmp $0x60,%3 \n"
"je 25f \n"
"movd %3,%%xmm0 \n"
"neg %3 \n"
"add $0x80,%3 \n"
@@ -2147,11 +2137,13 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
"punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
// General purpose row blend.
".p2align 4 \n"
"1: \n"
"movdqu (%1),%%xmm0 \n"
"movdqu (%1,%4,1),%%xmm2 \n"
"movdqu %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
"pmaddubsw %%xmm5,%%xmm0 \n"
@@ -2163,25 +2155,57 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
"movdqu %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 1b \n"
"jmp 4f \n"
"jmp 99f \n"
// Blend 25 / 75.
".p2align 4 \n"
"2: \n"
"25: \n"
"movdqu (%1),%%xmm0 \n"
"movdqu (%1,%4,1),%%xmm1 \n"
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 2b \n"
"jmp 4f \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
".p2align 4 \n"
"3: \n"
"50: \n"
"movdqu (%1),%%xmm0 \n"
"pavgb (%1,%4,1),%%xmm0 \n"
"movdqu (%1,%4,1),%%xmm1 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 3b \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
".p2align 4 \n"
"4: \n"
"75: \n"
"movdqu (%1),%%xmm1 \n"
"movdqu (%1,%4,1),%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
".p2align 4 \n"
"100: \n"
"movdqu (%1),%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 100b \n"
// Extrude last pixel.
"99: \n"
"punpckhbw %%xmm0,%%xmm0 \n"
"pshufhw $0xff,%%xmm0,%%xmm0 \n"
"punpckhqdq %%xmm0,%%xmm0 \n"
@@ -289,12 +289,17 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
shr eax, 1
cmp eax, 0
je xloop1
cmp eax, 0 // dispatch to specialized filters if applicable.
je xloop100
sub edi, esi
cmp eax, 32
je xloop75
cmp eax, 64
je xloop2
je xloop50
cmp eax, 96
je xloop25
movd xmm0, eax // high fraction 0..127
neg eax
add eax, 128
@@ -319,36 +324,57 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop
jmp xloop99
shufps xmm0, xmm0, 0xff
movdqa [esi + edi], xmm0 // duplicate last pixel for filtering
pop edi
pop esi
ret
// Blend 25 / 75.
align 16
xloop25:
movdqa xmm0, [esi]
movdqa xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop25
jmp xloop99
// Blend 50 / 50.
align 16
xloop1:
xloop50:
movdqa xmm0, [esi]
movdqa xmm1, [esi + edx]
pavgb xmm0, xmm1
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop1
jg xloop50
jmp xloop99
shufps xmm0, xmm0, 0xff
// Blend 75 / 25.
align 16
xloop75:
movdqa xmm1, [esi]
movdqa xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
sub ecx, 4
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
lea esi, [esi + 16]
jg xloop75
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
align 16
xloop2:
xloop100:
movdqa xmm0, [esi]
pavgb xmm0, [esi + edx]
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop2
jg xloop100
// Extrude last pixel.
xloop99:
shufps xmm0, xmm0, 0xff
movdqa [esi + edi], xmm0
pop edi
@@ -585,12 +611,17 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
asm volatile (
"sub %1,%0 \n"
"shr %3 \n"
"cmp $0x0,%3 \n"
"je 2f \n"
"je 100f \n"
"sub %1,%0 \n"
"cmp $0x20,%3 \n"
"je 75f \n"
"cmp $0x40,%3 \n"
"je 3f \n"
"je 50f \n"
"cmp $0x60,%3 \n"
"je 25f \n"
"movd %3,%%xmm0 \n"
"neg %3 \n"
"add $0x80,%3 \n"
@@ -598,6 +629,8 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
// General purpose row blend.
".p2align 4 \n"
"1: \n"
"movdqa (%1),%%xmm0 \n"
@@ -614,25 +647,57 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 1b \n"
"jmp 4f \n"
"jmp 99f \n"
// Blend 25 / 75.
".p2align 4 \n"
"2: \n"
"25: \n"
"movdqa (%1),%%xmm0 \n"
"movdqa (%1,%4,1),%%xmm1 \n"
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 2b \n"
"jmp 4f \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
".p2align 4 \n"
"3: \n"
"50: \n"
"movdqa (%1),%%xmm0 \n"
"pavgb (%1,%4,1),%%xmm0 \n"
"movdqa (%1,%4,1),%%xmm1 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 3b \n"
"4: \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
".p2align 4 \n"
"75: \n"
"movdqa (%1),%%xmm1 \n"
"movdqa (%1,%4,1),%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
".p2align 4 \n"
"100: \n"
"movdqa (%1),%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 100b \n"
// Extrude last pixel.
"99: \n"
"shufps $0xff,%%xmm0,%%xmm0 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
: "+r"(dst_ptr), // %0
@@ -645,6 +710,7 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
, "xmm0", "xmm1", "xmm2", "xmm5"
#endif
);
}
#endif // defined(__x86_64__) || defined(__i386__)
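The ARGB variant differs from the planar one in two small ways: the counter decrements by 4 because each 16-byte vector holds four 4-byte pixels, and the extrude step uses `shufps $0xff` to broadcast the last 32-bit pixel rather than the last byte. A hypothetical scalar model of the ARGB extrude:

```c
#include <stdint.h>
#include <string.h>

// Hypothetical scalar model of the ARGB extrude: replicate the final
// 4-byte pixel into one extra 16-byte store past the row end.
void ExtrudeLastArgbPixel(uint8_t* dst, int dst_width_pixels) {
  const uint8_t* last = dst + (size_t)(dst_width_pixels - 1) * 4;
  for (int i = 0; i < 4; ++i) {  // four pixel slots in one 16-byte store
    memcpy(dst + (size_t)dst_width_pixels * 4 + (size_t)i * 4, last, 4);
  }
}
```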
@@ -477,14 +477,19 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 2f \n"
"beq 100f \n"
"add %2, %1 \n"
"cmp %4, #64 \n"
"beq 75f \n"
"cmp %4, #128 \n"
"beq 3f \n"
"beq 50f \n"
"cmp %4, #192 \n"
"beq 25f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
@@ -497,23 +502,48 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 4f \n"
"b 99f \n"
"2: \n"
// Blend 25 / 75.
"25: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n"
"b 4f \n"
"bgt 25b \n"
"b 99f \n"
"3: \n"
// Blend 50 / 50.
"50: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n"
"4: \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
"vld1.u8 {q1}, [%1]! \n"
"vld1.u8 {q0}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1