Commit c5aac16a authored by fbarchard@google.com

Remove loop alignment for the benefit of modern CPUs that don't require alignment.

BUG=none
TESTED=local libyuv unittest passes
R=brucedawson@google.com, tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/32159004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1180 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent fd89cd79
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1178 Version: 1180
License: BSD License: BSD
License File: LICENSE License File: LICENSE
......
...@@ -200,7 +200,6 @@ extern "C" { ...@@ -200,7 +200,6 @@ extern "C" {
#define HAS_MERGEUVROW_AVX2 #define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2 #define HAS_MIRRORROW_AVX2
#define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBMIRRORROW_AVX2
#define HAS_I422TOARGBROW_AVX2
// Effects: // Effects:
#define HAS_ARGBADDROW_AVX2 #define HAS_ARGBADDROW_AVX2
...@@ -216,6 +215,7 @@ extern "C" { ...@@ -216,6 +215,7 @@ extern "C" {
#define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOUVROW_AVX2
#define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2 #define HAS_ARGBTOYROW_AVX2
#define HAS_I422TOARGBROW_AVX2
#define HAS_I422TORGBAROW_AVX2 #define HAS_I422TORGBAROW_AVX2
#define HAS_I422TOABGRROW_AVX2 #define HAS_I422TOABGRROW_AVX2
#define HAS_INTERPOLATEROW_AVX2 #define HAS_INTERPOLATEROW_AVX2
......
...@@ -11,6 +11,6 @@ ...@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1178 #define LIBYUV_VERSION 1180
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
...@@ -27,7 +27,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { ...@@ -27,7 +27,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pxor xmm0, xmm0 pxor xmm0, xmm0
pxor xmm5, xmm5 pxor xmm5, xmm5
align 4
wloop: wloop:
movdqu xmm1, [eax] movdqu xmm1, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -70,7 +69,6 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { ...@@ -70,7 +69,6 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax sub edx, eax
align 4
wloop: wloop:
vmovdqu ymm1, [eax] vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx] vmovdqu ymm2, [eax + edx]
...@@ -145,7 +143,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { ...@@ -145,7 +143,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pxor xmm7, xmm7 // constant 0 for unpck pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33 movdqa xmm6, kHash16x33
align 4
wloop: wloop:
movdqu xmm1, [eax] // src[0-15] movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -195,7 +192,6 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { ...@@ -195,7 +192,6 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
movd xmm0, [esp + 12] // seed movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33 movdqa xmm6, kHash16x33
align 4
wloop: wloop:
vpmovzxbd xmm3, dword ptr [eax] // src[0-3] vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16 pmulld xmm0, xmm6 // hash *= 33 ^ 16
......
...@@ -263,7 +263,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { ...@@ -263,7 +263,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24 pslld xmm5, 24
align 4
convertloop: convertloop:
movq xmm0, qword ptr [eax] movq xmm0, qword ptr [eax]
lea eax, [eax + 8] lea eax, [eax + 8]
...@@ -292,7 +291,6 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { ...@@ -292,7 +291,6 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
pslld xmm5, 24 pslld xmm5, 24
movdqa xmm4, kShuffleMaskRGB24ToARGB movdqa xmm4, kShuffleMaskRGB24ToARGB
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -332,7 +330,6 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, ...@@ -332,7 +330,6 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
pslld xmm5, 24 pslld xmm5, 24
movdqa xmm4, kShuffleMaskRAWToARGB movdqa xmm4, kShuffleMaskRAWToARGB
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -392,7 +389,6 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, ...@@ -392,7 +389,6 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
sub edx, eax sub edx, eax
sub edx, eax sub edx, eax
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of bgr565 movdqu xmm0, [eax] // fetch 8 pixels of bgr565
movdqa xmm1, xmm0 movdqa xmm1, xmm0
...@@ -442,7 +438,6 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, ...@@ -442,7 +438,6 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
sub edx, eax sub edx, eax
sub edx, eax sub edx, eax
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of 1555 movdqu xmm0, [eax] // fetch 8 pixels of 1555
movdqa xmm1, xmm0 movdqa xmm1, xmm0
...@@ -488,7 +483,6 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, ...@@ -488,7 +483,6 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
sub edx, eax sub edx, eax
sub edx, eax sub edx, eax
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
movdqa xmm2, xmm0 movdqa xmm2, xmm0
...@@ -520,7 +514,6 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -520,7 +514,6 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
movdqa xmm6, kShuffleMaskARGBToRGB24 movdqa xmm6, kShuffleMaskARGBToRGB24
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm0, [eax] // fetch 16 pixels of argb
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -559,7 +552,6 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -559,7 +552,6 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
movdqa xmm6, kShuffleMaskARGBToRAW movdqa xmm6, kShuffleMaskARGBToRAW
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm0, [eax] // fetch 16 pixels of argb
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -604,7 +596,6 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -604,7 +596,6 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
pslld xmm5, 11 pslld xmm5, 11
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // fetch 4 pixels of argb movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B movdqa xmm1, xmm0 // B
...@@ -644,7 +635,6 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -644,7 +635,6 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
pslld xmm7, 15 pslld xmm7, 15
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // fetch 4 pixels of argb movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B movdqa xmm1, xmm0 // B
...@@ -682,7 +672,6 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { ...@@ -682,7 +672,6 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
movdqa xmm3, xmm4 // generate mask 0x00f000f0 movdqa xmm3, xmm4 // generate mask 0x00f000f0
psrlw xmm3, 8 psrlw xmm3, 8
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // fetch 4 pixels of argb movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 movdqa xmm1, xmm0
...@@ -711,7 +700,6 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -711,7 +700,6 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm5, kAddY16 movdqa xmm5, kAddY16
movdqa xmm4, kARGBToY movdqa xmm4, kARGBToY
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -746,7 +734,6 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -746,7 +734,6 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm4, kARGBToYJ movdqa xmm4, kARGBToYJ
movdqa xmm5, kAddYJ64 movdqa xmm5, kAddYJ64
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -784,7 +771,6 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -784,7 +771,6 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vbroadcastf128 ymm5, kAddY16 vbroadcastf128 ymm5, kAddY16
vmovdqu ymm6, kPermdARGBToY_AVX vmovdqu ymm6, kPermdARGBToY_AVX
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
...@@ -824,7 +810,6 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -824,7 +810,6 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vbroadcastf128 ymm5, kAddYJ64 vbroadcastf128 ymm5, kAddYJ64
vmovdqu ymm6, kPermdARGBToY_AVX vmovdqu ymm6, kPermdARGBToY_AVX
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
...@@ -863,7 +848,6 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -863,7 +848,6 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm5, kAddY16 movdqa xmm5, kAddY16
movdqa xmm4, kBGRAToY movdqa xmm4, kBGRAToY
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -897,7 +881,6 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -897,7 +881,6 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm5, kAddY16 movdqa xmm5, kAddY16
movdqa xmm4, kABGRToY movdqa xmm4, kABGRToY
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -931,7 +914,6 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ...@@ -931,7 +914,6 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm5, kAddY16 movdqa xmm5, kAddY16
movdqa xmm4, kRGBAToY movdqa xmm4, kRGBAToY
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -972,7 +954,6 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -972,7 +954,6 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUV128 movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
align 4
convertloop: convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */ /* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -1043,7 +1024,6 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1043,7 +1024,6 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUVJ128 movdqa xmm5, kAddUVJ128
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
align 4
convertloop: convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */ /* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -1116,7 +1096,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, ...@@ -1116,7 +1096,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
vbroadcastf128 ymm7, kARGBToU vbroadcastf128 ymm7, kARGBToU
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
align 4
convertloop: convertloop:
/* step 1 - subsample 32x2 argb pixels to 16x1 */ /* step 1 - subsample 32x2 argb pixels to 16x1 */
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
...@@ -1180,7 +1159,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, ...@@ -1180,7 +1159,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
movdqa xmm5, kAddUV128 movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
align 4
convertloop: convertloop:
/* convert to U and V */ /* convert to U and V */
movdqu xmm0, [eax] // U movdqu xmm0, [eax] // U
...@@ -1238,7 +1216,6 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, ...@@ -1238,7 +1216,6 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
movdqa xmm5, kAddUV128 movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
align 4
convertloop: convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */ /* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -1299,7 +1276,6 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1299,7 +1276,6 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUV128 movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
align 4
convertloop: convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */ /* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -1370,7 +1346,6 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1370,7 +1346,6 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUV128 movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
align 4
convertloop: convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */ /* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -1441,7 +1416,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ...@@ -1441,7 +1416,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUV128 movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v sub edi, edx // stride from u to v
align 4
convertloop: convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */ /* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -1585,7 +1559,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, ...@@ -1585,7 +1559,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
vpxor ymm4, ymm4, ymm4 vpxor ymm4, ymm4, ymm4
align 4
convertloop: convertloop:
READYUV422_AVX2 READYUV422_AVX2
YUVTORGB_AVX2 YUVTORGB_AVX2
...@@ -1631,7 +1604,6 @@ void I422ToBGRARow_AVX2(const uint8* y_buf, ...@@ -1631,7 +1604,6 @@ void I422ToBGRARow_AVX2(const uint8* y_buf,
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
vpxor ymm4, ymm4, ymm4 vpxor ymm4, ymm4, ymm4
align 4
convertloop: convertloop:
READYUV422_AVX2 READYUV422_AVX2
YUVTORGB_AVX2 YUVTORGB_AVX2
...@@ -1677,7 +1649,6 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, ...@@ -1677,7 +1649,6 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
vpxor ymm4, ymm4, ymm4 vpxor ymm4, ymm4, ymm4
align 4
convertloop: convertloop:
READYUV422_AVX2 READYUV422_AVX2
YUVTORGB_AVX2 YUVTORGB_AVX2
...@@ -1723,7 +1694,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ...@@ -1723,7 +1694,6 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
vpxor ymm4, ymm4, ymm4 vpxor ymm4, ymm4, ymm4
align 4
convertloop: convertloop:
READYUV422_AVX2 READYUV422_AVX2
YUVTORGB_AVX2 YUVTORGB_AVX2
...@@ -1864,7 +1834,6 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -1864,7 +1834,6 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4 pxor xmm4, xmm4
align 4
convertloop: convertloop:
READYUV444 READYUV444
YUVTORGB YUVTORGB
...@@ -1908,7 +1877,6 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, ...@@ -1908,7 +1877,6 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
movdqa xmm5, kShuffleMaskARGBToRGB24_0 movdqa xmm5, kShuffleMaskARGBToRGB24_0
movdqa xmm6, kShuffleMaskARGBToRGB24 movdqa xmm6, kShuffleMaskARGBToRGB24
align 4
convertloop: convertloop:
READYUV422 READYUV422
YUVTORGB YUVTORGB
...@@ -1955,7 +1923,6 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, ...@@ -1955,7 +1923,6 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
movdqa xmm5, kShuffleMaskARGBToRAW_0 movdqa xmm5, kShuffleMaskARGBToRAW_0
movdqa xmm6, kShuffleMaskARGBToRAW movdqa xmm6, kShuffleMaskARGBToRAW
align 4
convertloop: convertloop:
READYUV422 READYUV422
YUVTORGB YUVTORGB
...@@ -2007,7 +1974,6 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, ...@@ -2007,7 +1974,6 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
pslld xmm7, 11 pslld xmm7, 11
align 4
convertloop: convertloop:
READYUV422 READYUV422
YUVTORGB YUVTORGB
...@@ -2074,7 +2040,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2074,7 +2040,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4 pxor xmm4, xmm4
align 4
convertloop: convertloop:
READYUV422 READYUV422
YUVTORGB YUVTORGB
...@@ -2119,7 +2084,6 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2119,7 +2084,6 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4 pxor xmm4, xmm4
align 4
convertloop: convertloop:
READYUV411 // modifies EBX READYUV411 // modifies EBX
YUVTORGB YUVTORGB
...@@ -2159,7 +2123,6 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2159,7 +2123,6 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4 pxor xmm4, xmm4
align 4
convertloop: convertloop:
READNV12 READNV12
YUVTORGB YUVTORGB
...@@ -2197,7 +2160,6 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, ...@@ -2197,7 +2160,6 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4 pxor xmm4, xmm4
align 4
convertloop: convertloop:
READNV12 READNV12
YVUTORGB YVUTORGB
...@@ -2236,7 +2198,6 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, ...@@ -2236,7 +2198,6 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
sub edi, esi sub edi, esi
pxor xmm4, xmm4 pxor xmm4, xmm4
align 4
convertloop: convertloop:
READYUV422 READYUV422
YUVTORGB YUVTORGB
...@@ -2278,7 +2239,6 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, ...@@ -2278,7 +2239,6 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4 pxor xmm4, xmm4
align 4
convertloop: convertloop:
READYUV422 READYUV422
YUVTORGB YUVTORGB
...@@ -2318,7 +2278,6 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, ...@@ -2318,7 +2278,6 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
sub edi, esi sub edi, esi
pxor xmm4, xmm4 pxor xmm4, xmm4
align 4
convertloop: convertloop:
READYUV422 READYUV422
YUVTORGB YUVTORGB
...@@ -2363,7 +2322,6 @@ void YToARGBRow_SSE2(const uint8* y_buf, ...@@ -2363,7 +2322,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
mov edx, [esp + 8] // rgb mov edx, [esp + 8] // rgb
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
align 4
convertloop: convertloop:
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
movq xmm0, qword ptr [eax] movq xmm0, qword ptr [eax]
...@@ -2407,7 +2365,6 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { ...@@ -2407,7 +2365,6 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
movdqa xmm5, kShuffleMirror movdqa xmm5, kShuffleMirror
align 4
convertloop: convertloop:
movdqu xmm0, [eax - 16 + ecx] movdqu xmm0, [eax - 16 + ecx]
pshufb xmm0, xmm5 pshufb xmm0, xmm5
...@@ -2429,7 +2386,6 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -2429,7 +2386,6 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
vbroadcastf128 ymm5, kShuffleMirror vbroadcastf128 ymm5, kShuffleMirror
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax - 32 + ecx] vmovdqu ymm0, [eax - 32 + ecx]
vpshufb ymm0, ymm0, ymm5 vpshufb ymm0, ymm0, ymm5
...@@ -2452,7 +2408,6 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { ...@@ -2452,7 +2408,6 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
mov edx, [esp + 8] // dst mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
align 4
convertloop: convertloop:
movdqu xmm0, [eax - 16 + ecx] movdqu xmm0, [eax - 16 + ecx]
movdqa xmm1, xmm0 // swap bytes movdqa xmm1, xmm0 // swap bytes
...@@ -2490,7 +2445,6 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, ...@@ -2490,7 +2445,6 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
lea eax, [eax + ecx * 2 - 16] lea eax, [eax + ecx * 2 - 16]
sub edi, edx sub edi, edx
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
lea eax, [eax - 16] lea eax, [eax - 16]
...@@ -2516,7 +2470,6 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { ...@@ -2516,7 +2470,6 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
lea eax, [eax - 16 + ecx * 4] // last 4 pixels. lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
lea eax, [eax - 16] lea eax, [eax - 16]
...@@ -2544,7 +2497,6 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -2544,7 +2497,6 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
vmovdqu ymm5, kARGBShuffleMirror_AVX2 vmovdqu ymm5, kARGBShuffleMirror_AVX2
align 4
convertloop: convertloop:
vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
...@@ -2570,7 +2522,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { ...@@ -2570,7 +2522,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
psrlw xmm5, 8 psrlw xmm5, 8
sub edi, edx sub edi, edx
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -2609,7 +2560,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { ...@@ -2609,7 +2560,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
vpsrlw ymm5, ymm5, 8 vpsrlw ymm5, ymm5, 8
sub edi, edx sub edi, edx
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
...@@ -2647,7 +2597,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -2647,7 +2597,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
mov ecx, [esp + 4 + 16] // width mov ecx, [esp + 4 + 16] // width
sub edx, eax sub edx, eax
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // read 16 U's movdqu xmm0, [eax] // read 16 U's
movdqu xmm1, [eax + edx] // and 16 V's movdqu xmm1, [eax + edx] // and 16 V's
...@@ -2679,7 +2628,6 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ...@@ -2679,7 +2628,6 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
mov ecx, [esp + 4 + 16] // width mov ecx, [esp + 4 + 16] // width
sub edx, eax sub edx, eax
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] // read 32 U's vmovdqu ymm0, [eax] // read 32 U's
vmovdqu ymm1, [eax + edx] // and 32 V's vmovdqu ymm1, [eax + edx] // and 32 V's
...@@ -2710,7 +2658,6 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { ...@@ -2710,7 +2658,6 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
mov edx, [esp + 8] // dst mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count mov ecx, [esp + 12] // count
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -2734,7 +2681,6 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) { ...@@ -2734,7 +2681,6 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
mov edx, [esp + 8] // dst mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count mov ecx, [esp + 12] // count
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
...@@ -2780,7 +2726,6 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { ...@@ -2780,7 +2726,6 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
psrld xmm1, 8 psrld xmm1, 8
align 4
convertloop: convertloop:
movdqu xmm2, [eax] movdqu xmm2, [eax]
movdqu xmm3, [eax + 16] movdqu xmm3, [eax + 16]
...@@ -2815,7 +2760,6 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -2815,7 +2760,6 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
vpcmpeqb ymm0, ymm0, ymm0 vpcmpeqb ymm0, ymm0, ymm0
vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
align 4
convertloop: convertloop:
vmovdqu ymm1, [eax] vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + 32] vmovdqu ymm2, [eax + 32]
...@@ -2847,7 +2791,6 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { ...@@ -2847,7 +2791,6 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
psrld xmm1, 8 psrld xmm1, 8
align 4
convertloop: convertloop:
movq xmm2, qword ptr [eax] // 8 Y's movq xmm2, qword ptr [eax] // 8 Y's
lea eax, [eax + 8] lea eax, [eax + 8]
...@@ -2884,7 +2827,6 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { ...@@ -2884,7 +2827,6 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
vpcmpeqb ymm0, ymm0, ymm0 vpcmpeqb ymm0, ymm0, ymm0
vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
align 4
convertloop: convertloop:
vpmovzxbd ymm1, qword ptr [eax] vpmovzxbd ymm1, qword ptr [eax]
vpmovzxbd ymm2, qword ptr [eax + 8] vpmovzxbd ymm2, qword ptr [eax + 8]
...@@ -2937,7 +2879,6 @@ void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, ...@@ -2937,7 +2879,6 @@ void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
lea ecx, [ebp * 4] lea ecx, [ebp * 4]
sub edx, ecx // stride - width * 4 sub edx, ecx // stride - width * 4
align 4
convertloop: convertloop:
mov ecx, ebp mov ecx, ebp
rep stosd rep stosd
...@@ -2964,7 +2905,6 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, ...@@ -2964,7 +2905,6 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8 vpsrlw ymm5, ymm5, 8
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
...@@ -2997,7 +2937,6 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, ...@@ -2997,7 +2937,6 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
vpsrlw ymm5, ymm5, 8 vpsrlw ymm5, ymm5, 8
sub edi, edx sub edi, edx
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
...@@ -3040,7 +2979,6 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, ...@@ -3040,7 +2979,6 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
vpsrlw ymm5, ymm5, 8 vpsrlw ymm5, ymm5, 8
sub edi, edx sub edi, edx
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
...@@ -3075,7 +3013,6 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, ...@@ -3075,7 +3013,6 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
mov edx, [esp + 8] // dst_y mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
...@@ -3108,7 +3045,6 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, ...@@ -3108,7 +3045,6 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
vpsrlw ymm5, ymm5, 8 vpsrlw ymm5, ymm5, 8
sub edi, edx sub edi, edx
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
...@@ -3151,7 +3087,6 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, ...@@ -3151,7 +3087,6 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
vpsrlw ymm5, ymm5, 8 vpsrlw ymm5, ymm5, 8
sub edi, edx sub edi, edx
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
...@@ -3190,7 +3125,6 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, ...@@ -3190,7 +3125,6 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8 psrlw xmm5, 8
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -3221,7 +3155,6 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, ...@@ -3221,7 +3155,6 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
psrlw xmm5, 8 psrlw xmm5, 8
sub edi, edx sub edi, edx
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -3263,7 +3196,6 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, ...@@ -3263,7 +3196,6 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
psrlw xmm5, 8 psrlw xmm5, 8
sub edi, edx sub edi, edx
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -3295,7 +3227,6 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, ...@@ -3295,7 +3227,6 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
mov edx, [esp + 8] // dst_y mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix mov ecx, [esp + 12] // pix
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -3326,7 +3257,6 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, ...@@ -3326,7 +3257,6 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
psrlw xmm5, 8 psrlw xmm5, 8
sub edi, edx sub edi, edx
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -3368,7 +3298,6 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, ...@@ -3368,7 +3298,6 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
psrlw xmm5, 8 psrlw xmm5, 8
sub edi, edx sub edi, edx
align 4
convertloop: convertloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -3656,7 +3585,6 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -3656,7 +3585,6 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
psrld xmm5, 8 psrld xmm5, 8
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // read 4 pixels movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm0 // first 2 punpcklbw xmm0, xmm0 // first 2
...@@ -3706,7 +3634,6 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -3706,7 +3634,6 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
movdqa xmm4, kShuffleAlpha0 movdqa xmm4, kShuffleAlpha0
movdqa xmm5, kShuffleAlpha1 movdqa xmm5, kShuffleAlpha1
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // read 4 pixels movdqu xmm0, [eax] // read 4 pixels
pshufb xmm0, xmm4 // isolate first 2 alphas pshufb xmm0, xmm4 // isolate first 2 alphas
...@@ -3751,7 +3678,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -3751,7 +3678,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
vpslld ymm5, ymm5, 24 vpslld ymm5, ymm5, 24
align 4
convertloop: convertloop:
vmovdqu ymm6, [eax] // read 8 pixels. vmovdqu ymm6, [eax] // read 8 pixels.
vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
...@@ -3788,7 +3714,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, ...@@ -3788,7 +3714,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
mov edx, [esp + 8 + 8] // dst_argb mov edx, [esp + 8 + 8] // dst_argb
mov ecx, [esp + 8 + 12] // width mov ecx, [esp + 8 + 12] // width
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // read 4 pixels movdqu xmm0, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 3] // first alpha movzx esi, byte ptr [eax + 3] // first alpha
...@@ -3843,7 +3768,6 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ...@@ -3843,7 +3768,6 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
sub edx, eax sub edx, eax
vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
align 4
convertloop: convertloop:
vmovdqu ymm6, [eax] // read 8 pixels. vmovdqu ymm6, [eax] // read 8 pixels.
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
...@@ -3882,7 +3806,6 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ...@@ -3882,7 +3806,6 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
push esi push esi
push edi push edi
align 4
convertloop: convertloop:
// replace VPGATHER // replace VPGATHER
movzx esi, byte ptr [eax + 3] // alpha0 movzx esi, byte ptr [eax + 3] // alpha0
...@@ -3945,7 +3868,6 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { ...@@ -3945,7 +3868,6 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
movdqa xmm4, kARGBToYJ movdqa xmm4, kARGBToYJ
movdqa xmm5, kAddYJ64 movdqa xmm5, kAddYJ64
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // G movdqu xmm0, [eax] // G
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -4005,7 +3927,6 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { ...@@ -4005,7 +3927,6 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
movdqa xmm3, kARGBToSepiaG movdqa xmm3, kARGBToSepiaG
movdqa xmm4, kARGBToSepiaR movdqa xmm4, kARGBToSepiaR
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // B movdqu xmm0, [eax] // B
movdqu xmm6, [eax + 16] movdqu xmm6, [eax + 16]
...@@ -4068,7 +3989,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, ...@@ -4068,7 +3989,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
pshufd xmm5, xmm5, 0xff pshufd xmm5, xmm5, 0xff
mov ecx, [esp + 16] /* width */ mov ecx, [esp + 16] /* width */
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // B movdqu xmm0, [eax] // B
movdqu xmm7, [eax + 16] movdqu xmm7, [eax + 16]
...@@ -4135,7 +4055,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, ...@@ -4135,7 +4055,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
pcmpeqb xmm6, xmm6 // generate mask 0xff000000 pcmpeqb xmm6, xmm6 // generate mask 0xff000000
pslld xmm6, 24 pslld xmm6, 24
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // read 4 pixels movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm5 // first 2 pixels punpcklbw xmm0, xmm5 // first 2 pixels
...@@ -4173,7 +4092,6 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, ...@@ -4173,7 +4092,6 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
punpcklbw xmm2, xmm2 punpcklbw xmm2, xmm2
punpcklqdq xmm2, xmm2 punpcklqdq xmm2, xmm2
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // read 4 pixels movdqu xmm0, [eax] // read 4 pixels
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -4208,7 +4126,6 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4208,7 +4126,6 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
mov ecx, [esp + 4 + 16] // width mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0 pxor xmm5, xmm5 // constant 0
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb0 movdqu xmm0, [eax] // read 4 pixels from src_argb0
movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm2, [esi] // read 4 pixels from src_argb1
...@@ -4250,7 +4167,6 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4250,7 +4167,6 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
sub ecx, 4 sub ecx, 4
jl convertloop49 jl convertloop49
align 4
convertloop4: convertloop4:
movdqu xmm0, [eax] // read 4 pixels from src_argb0 movdqu xmm0, [eax] // read 4 pixels from src_argb0
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -4296,7 +4212,6 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4296,7 +4212,6 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
mov edx, [esp + 4 + 12] // dst_argb mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width mov ecx, [esp + 4 + 16] // width
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb0 movdqu xmm0, [eax] // read 4 pixels from src_argb0
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -4327,7 +4242,6 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4327,7 +4242,6 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov ecx, [esp + 4 + 16] // width mov ecx, [esp + 4 + 16] // width
vpxor ymm5, ymm5, ymm5 // constant 0 vpxor ymm5, ymm5, ymm5 // constant 0
align 4
convertloop: convertloop:
vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
lea eax, [eax + 32] lea eax, [eax + 32]
...@@ -4364,7 +4278,6 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4364,7 +4278,6 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov edx, [esp + 4 + 12] // dst_argb mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width mov ecx, [esp + 4 + 16] // width
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
lea eax, [eax + 32] lea eax, [eax + 32]
...@@ -4394,7 +4307,6 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, ...@@ -4394,7 +4307,6 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov edx, [esp + 4 + 12] // dst_argb mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width mov ecx, [esp + 4 + 16] // width
align 4
convertloop: convertloop:
vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
lea eax, [eax + 32] lea eax, [eax + 32]
...@@ -4433,7 +4345,6 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, ...@@ -4433,7 +4345,6 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
sub edx, eax sub edx, eax
pxor xmm5, xmm5 // constant 0 pxor xmm5, xmm5 // constant 0
align 4
convertloop: convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
...@@ -4487,7 +4398,6 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, ...@@ -4487,7 +4398,6 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
sub edx, eax sub edx, eax
pxor xmm5, xmm5 // constant 0 pxor xmm5, xmm5 // constant 0
align 4
convertloop: convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
...@@ -4541,7 +4451,6 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -4541,7 +4451,6 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
pcmpeqb xmm5, xmm5 // alpha 255 pcmpeqb xmm5, xmm5 // alpha 255
pslld xmm5, 24 // 0xff000000 pslld xmm5, 24 // 0xff000000
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // read 16 pixels src_sobelx movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
...@@ -4587,7 +4496,6 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -4587,7 +4496,6 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
mov ecx, [esp + 4 + 16] // width mov ecx, [esp + 4 + 16] // width
sub esi, eax sub esi, eax
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // read 16 pixels src_sobelx movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
...@@ -4622,7 +4530,6 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, ...@@ -4622,7 +4530,6 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
sub esi, eax sub esi, eax
pcmpeqb xmm5, xmm5 // alpha 255 pcmpeqb xmm5, xmm5 // alpha 255
align 4
convertloop: convertloop:
movdqu xmm0, [eax] // read 16 pixels src_sobelx movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
...@@ -4697,7 +4604,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, ...@@ -4697,7 +4604,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
packssdw xmm5, xmm5 // 16 bit shorts packssdw xmm5, xmm5 // 16 bit shorts
// 4 pixel loop small blocks. // 4 pixel loop small blocks.
align 4
s4: s4:
// top left // top left
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -4740,7 +4646,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, ...@@ -4740,7 +4646,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
jmp l4b jmp l4b
// 4 pixel loop // 4 pixel loop
align 4
l4: l4:
// top left // top left
movdqu xmm0, [eax] movdqu xmm0, [eax]
...@@ -4793,7 +4698,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, ...@@ -4793,7 +4698,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
jl l1b jl l1b
// 1 pixel loop // 1 pixel loop
align 4
l1: l1:
movdqu xmm0, [eax] movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4] psubd xmm0, [eax + edx * 4]
...@@ -4834,7 +4738,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, ...@@ -4834,7 +4738,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
jne l4b jne l4b
// 4 pixel loop // 4 pixel loop
align 4
l4: l4:
movdqu xmm2, [eax] // 4 argb pixels 16 bytes. movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -4881,7 +4784,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, ...@@ -4881,7 +4784,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
jl l1b jl l1b
// 1 pixel loop // 1 pixel loop
align 4
l1: l1:
movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
lea eax, [eax + 4] lea eax, [eax + 4]
...@@ -4936,7 +4838,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, ...@@ -4936,7 +4838,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
addps xmm4, xmm4 // dudv *= 4 addps xmm4, xmm4 // dudv *= 4
// 4 pixel loop // 4 pixel loop
align 4
l4: l4:
cvttps2dq xmm0, xmm2 // x, y float to int first 2 cvttps2dq xmm0, xmm2 // x, y float to int first 2
cvttps2dq xmm1, xmm3 // x, y float to int next 2 cvttps2dq xmm1, xmm3 // x, y float to int next 2
...@@ -4968,7 +4869,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, ...@@ -4968,7 +4869,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
jl l1b jl l1b
// 1 pixel loop // 1 pixel loop
align 4
l1: l1:
cvttps2dq xmm0, xmm2 // x, y float to int cvttps2dq xmm0, xmm2 // x, y float to int
packssdw xmm0, xmm0 // x, y as shorts packssdw xmm0, xmm0 // x, y as shorts
...@@ -5023,7 +4923,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5023,7 +4923,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vpxor ymm0, ymm0, ymm0 vpxor ymm0, ymm0, ymm0
vpermd ymm5, ymm0, ymm5 vpermd ymm5, ymm0, ymm5
align 4
xloop: xloop:
vmovdqu ymm0, [esi] vmovdqu ymm0, [esi]
vmovdqu ymm2, [esi + edx] vmovdqu ymm2, [esi + edx]
...@@ -5041,7 +4940,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5041,7 +4940,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99 jmp xloop99
// Blend 25 / 75. // Blend 25 / 75.
align 4
xloop25: xloop25:
vmovdqu ymm0, [esi] vmovdqu ymm0, [esi]
vmovdqu ymm1, [esi + edx] vmovdqu ymm1, [esi + edx]
...@@ -5054,7 +4952,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5054,7 +4952,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99 jmp xloop99
// Blend 50 / 50. // Blend 50 / 50.
align 4
xloop50: xloop50:
vmovdqu ymm0, [esi] vmovdqu ymm0, [esi]
vpavgb ymm0, ymm0, [esi + edx] vpavgb ymm0, ymm0, [esi + edx]
...@@ -5065,7 +4962,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5065,7 +4962,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99 jmp xloop99
// Blend 75 / 25. // Blend 75 / 25.
align 4
xloop75: xloop75:
vmovdqu ymm1, [esi] vmovdqu ymm1, [esi]
vmovdqu ymm0, [esi + edx] vmovdqu ymm0, [esi + edx]
...@@ -5078,7 +4974,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5078,7 +4974,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99 jmp xloop99
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
align 4
xloop100: xloop100:
rep movsb rep movsb
...@@ -5124,7 +5019,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5124,7 +5019,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
punpcklwd xmm5, xmm5 punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0 pshufd xmm5, xmm5, 0
align 4
xloop: xloop:
movdqu xmm0, [esi] movdqu xmm0, [esi]
movdqu xmm2, [esi + edx] movdqu xmm2, [esi + edx]
...@@ -5143,7 +5037,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5143,7 +5037,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99 jmp xloop99
// Blend 25 / 75. // Blend 25 / 75.
align 4
xloop25: xloop25:
movdqu xmm0, [esi] movdqu xmm0, [esi]
movdqu xmm1, [esi + edx] movdqu xmm1, [esi + edx]
...@@ -5156,7 +5049,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5156,7 +5049,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99 jmp xloop99
// Blend 50 / 50. // Blend 50 / 50.
align 4
xloop50: xloop50:
movdqu xmm0, [esi] movdqu xmm0, [esi]
movdqu xmm1, [esi + edx] movdqu xmm1, [esi + edx]
...@@ -5168,7 +5060,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5168,7 +5060,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99 jmp xloop99
// Blend 75 / 25. // Blend 75 / 25.
align 4
xloop75: xloop75:
movdqu xmm1, [esi] movdqu xmm1, [esi]
movdqu xmm0, [esi + edx] movdqu xmm0, [esi + edx]
...@@ -5181,7 +5072,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5181,7 +5072,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99 jmp xloop99
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
align 4
xloop100: xloop100:
movdqu xmm0, [esi] movdqu xmm0, [esi]
movdqu [esi + edi], xmm0 movdqu [esi + edi], xmm0
...@@ -5229,7 +5119,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5229,7 +5119,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
punpcklqdq xmm5, xmm5 punpcklqdq xmm5, xmm5
pxor xmm4, xmm4 pxor xmm4, xmm4
align 4
xloop: xloop:
movdqu xmm0, [esi] // row0 movdqu xmm0, [esi] // row0
movdqu xmm2, [esi + edx] // row1 movdqu xmm2, [esi + edx] // row1
...@@ -5255,7 +5144,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5255,7 +5144,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99 jmp xloop99
// Blend 25 / 75. // Blend 25 / 75.
align 4
xloop25: xloop25:
movdqu xmm0, [esi] movdqu xmm0, [esi]
movdqu xmm1, [esi + edx] movdqu xmm1, [esi + edx]
...@@ -5268,7 +5156,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5268,7 +5156,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99 jmp xloop99
// Blend 50 / 50. // Blend 50 / 50.
align 4
xloop50: xloop50:
movdqu xmm0, [esi] movdqu xmm0, [esi]
movdqu xmm1, [esi + edx] movdqu xmm1, [esi + edx]
...@@ -5280,7 +5167,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5280,7 +5167,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99 jmp xloop99
// Blend 75 / 25. // Blend 75 / 25.
align 4
xloop75: xloop75:
movdqu xmm1, [esi] movdqu xmm1, [esi]
movdqu xmm0, [esi + edx] movdqu xmm0, [esi + edx]
...@@ -5293,7 +5179,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -5293,7 +5179,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99 jmp xloop99
// Blend 100 / 0 - Copy row unchanged. // Blend 100 / 0 - Copy row unchanged.
align 4
xloop100: xloop100:
movdqu xmm0, [esi] movdqu xmm0, [esi]
movdqu [esi + edi], xmm0 movdqu [esi + edi], xmm0
...@@ -5319,7 +5204,6 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, ...@@ -5319,7 +5204,6 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
mov ecx, [esp + 16] // pix mov ecx, [esp + 16] // pix
pshufd xmm5, xmm5, 0 pshufd xmm5, xmm5, 0
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -5347,7 +5231,6 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, ...@@ -5347,7 +5231,6 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
pcmpeqb xmm5, xmm5 // generate mask 0x000000ff pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
psrld xmm5, 24 psrld xmm5, 24
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -5377,7 +5260,6 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, ...@@ -5377,7 +5260,6 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
movdqu xmm5, [ecx] movdqu xmm5, [ecx]
mov ecx, [esp + 16] // pix mov ecx, [esp + 16] // pix
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -5404,7 +5286,6 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, ...@@ -5404,7 +5286,6 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
mov ecx, [esp + 16] // pix mov ecx, [esp + 16] // pix
align 4
wloop: wloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
...@@ -5465,7 +5346,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, ...@@ -5465,7 +5346,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
jg shuf_any1 jg shuf_any1
jmp shuf99 jmp shuf99
align 4
shuf_0123: shuf_0123:
movdqu xmm0, [eax] movdqu xmm0, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -5483,7 +5363,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, ...@@ -5483,7 +5363,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
jg shuf_0123 jg shuf_0123
jmp shuf99 jmp shuf99
align 4
shuf_0321: shuf_0321:
movdqu xmm0, [eax] movdqu xmm0, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -5501,7 +5380,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, ...@@ -5501,7 +5380,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
jg shuf_0321 jg shuf_0321
jmp shuf99 jmp shuf99
align 4
shuf_2103: shuf_2103:
movdqu xmm0, [eax] movdqu xmm0, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -5519,7 +5397,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, ...@@ -5519,7 +5397,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
jg shuf_2103 jg shuf_2103
jmp shuf99 jmp shuf99
align 4
shuf_3012: shuf_3012:
movdqu xmm0, [eax] movdqu xmm0, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -5564,7 +5441,6 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, ...@@ -5564,7 +5441,6 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edx, esi sub edx, esi
align 4
convertloop: convertloop:
movq xmm2, qword ptr [esi] // U movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V movq xmm3, qword ptr [esi + edx] // V
...@@ -5602,7 +5478,6 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, ...@@ -5602,7 +5478,6 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
mov ecx, [esp + 8 + 20] // width mov ecx, [esp + 8 + 20] // width
sub edx, esi sub edx, esi
align 4
convertloop: convertloop:
movq xmm2, qword ptr [esi] // U movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V movq xmm3, qword ptr [esi + edx] // V
...@@ -5639,7 +5514,6 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, ...@@ -5639,7 +5514,6 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
// 2 pixel loop. // 2 pixel loop.
align 4
convertloop: convertloop:
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
...@@ -5701,7 +5575,6 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, ...@@ -5701,7 +5575,6 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
mov ecx, [esp + 16] /* width */ mov ecx, [esp + 16] /* width */
// 2 pixel loop. // 2 pixel loop.
align 4
convertloop: convertloop:
vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
lea eax, [eax + 8] lea eax, [eax + 8]
...@@ -5737,7 +5610,6 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, ...@@ -5737,7 +5610,6 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
mov ecx, [esp + 4 + 12] /* width */ mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop. // 1 pixel loop.
align 4
convertloop: convertloop:
movzx edx, byte ptr [eax] movzx edx, byte ptr [eax]
lea eax, [eax + 4] lea eax, [eax + 4]
...@@ -5771,7 +5643,6 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { ...@@ -5771,7 +5643,6 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
mov ecx, [esp + 4 + 12] /* width */ mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop. // 1 pixel loop.
align 4
convertloop: convertloop:
movzx edx, byte ptr [eax] movzx edx, byte ptr [eax]
lea eax, [eax + 4] lea eax, [eax + 4]
...@@ -5813,7 +5684,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, ...@@ -5813,7 +5684,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
pxor xmm5, xmm5 pxor xmm5, xmm5
// 4 pixel loop. // 4 pixel loop.
align 4
convertloop: convertloop:
movdqu xmm0, qword ptr [eax] // generate luma ptr movdqu xmm0, qword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3 pmaddubsw xmm0, xmm3
......
...@@ -103,7 +103,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -103,7 +103,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 12] // dst_ptr mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width mov ecx, [esp + 16] // dst_width
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -133,7 +132,6 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -133,7 +132,6 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8 psrlw xmm5, 8
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -172,7 +170,6 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -172,7 +170,6 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8 psrlw xmm5, 8
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -216,7 +213,6 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -216,7 +213,6 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrld xmm5, 24 psrld xmm5, 24
pslld xmm5, 16 pslld xmm5, 16
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -251,7 +247,6 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -251,7 +247,6 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8 psrlw xmm7, 8
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -314,7 +309,6 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -314,7 +309,6 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf1 movdqa xmm4, kShuf1
movdqa xmm5, kShuf2 movdqa xmm5, kShuf2
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -368,7 +362,6 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, ...@@ -368,7 +362,6 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11 movdqa xmm6, kMadd11
movdqa xmm7, kRound34 movdqa xmm7, kRound34
align 4
wloop: wloop:
movdqu xmm0, [eax] // pixels 0..7 movdqu xmm0, [eax] // pixels 0..7
movdqu xmm1, [eax + esi] movdqu xmm1, [eax + esi]
...@@ -427,7 +420,6 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, ...@@ -427,7 +420,6 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11 movdqa xmm6, kMadd11
movdqa xmm7, kRound34 movdqa xmm7, kRound34
align 4
wloop: wloop:
movdqu xmm0, [eax] // pixels 0..7 movdqu xmm0, [eax] // pixels 0..7
movdqu xmm1, [eax + esi] movdqu xmm1, [eax + esi]
...@@ -484,7 +476,6 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -484,7 +476,6 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf38a movdqa xmm4, kShuf38a
movdqa xmm5, kShuf38b movdqa xmm5, kShuf38b
align 4
xloop: xloop:
movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
...@@ -520,7 +511,6 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ...@@ -520,7 +511,6 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kScaleAc33 movdqa xmm4, kScaleAc33
pxor xmm5, xmm5 pxor xmm5, xmm5
align 4
xloop: xloop:
movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
movdqu xmm6, [eax + esi] movdqu xmm6, [eax + esi]
...@@ -586,7 +576,6 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ...@@ -586,7 +576,6 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kShufAb2 movdqa xmm4, kShufAb2
movdqa xmm5, kScaleAb2 movdqa xmm5, kScaleAb2
align 4
xloop: xloop:
movdqu xmm0, [eax] // average 2 rows into xmm0 movdqu xmm0, [eax] // average 2 rows into xmm0
movdqu xmm1, [eax + esi] movdqu xmm1, [eax + esi]
...@@ -635,7 +624,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -635,7 +624,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pxor xmm4, xmm4 pxor xmm4, xmm4
dec ebx dec ebx
align 4
xloop: xloop:
// first row // first row
movdqu xmm0, [esi] movdqu xmm0, [esi]
...@@ -649,7 +637,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -649,7 +637,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
je ydone je ydone
// sum remaining rows // sum remaining rows
align 4
yloop: yloop:
movdqu xmm2, [eax] // read 16 pixels movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row lea eax, [eax + edx] // advance to next row
...@@ -661,7 +648,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ...@@ -661,7 +648,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
sub ebp, 1 sub ebp, 1
jg yloop jg yloop
align 4
ydone: ydone:
movdqu [edi], xmm0 movdqu [edi], xmm0
movdqu [edi + 16], xmm1 movdqu [edi + 16], xmm1
...@@ -716,7 +702,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -716,7 +702,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
pextrw edx, xmm2, 3 // get x1 integer. preroll pextrw edx, xmm2, 3 // get x1 integer. preroll
// 2 Pixel loop. // 2 Pixel loop.
align 4
xloop2: xloop2:
movdqa xmm1, xmm2 // x0, x1 fractions. movdqa xmm1, xmm2 // x0, x1 fractions.
paddd xmm2, xmm3 // x += dx paddd xmm2, xmm3 // x += dx
...@@ -739,7 +724,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -739,7 +724,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
sub ecx, 2 // 2 pixels sub ecx, 2 // 2 pixels
jge xloop2 jge xloop2
align 4
xloop29: xloop29:
add ecx, 2 - 1 add ecx, 2 - 1
...@@ -757,7 +741,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ...@@ -757,7 +741,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd ebx, xmm0 movd ebx, xmm0
mov [edi], bl mov [edi], bl
align 4
xloop99: xloop99:
pop edi pop edi
...@@ -777,7 +760,6 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, ...@@ -777,7 +760,6 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
mov eax, [esp + 8] // src_ptr mov eax, [esp + 8] // src_ptr
mov ecx, [esp + 12] // dst_width mov ecx, [esp + 12] // dst_width
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
...@@ -806,7 +788,6 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ...@@ -806,7 +788,6 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width mov ecx, [esp + 16] // dst_width
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -833,7 +814,6 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ...@@ -833,7 +814,6 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width mov ecx, [esp + 16] // dst_width
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -864,7 +844,6 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ...@@ -864,7 +844,6 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
mov edx, [esp + 4 + 12] // dst_argb mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // dst_width mov ecx, [esp + 4 + 16] // dst_width
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
...@@ -904,7 +883,6 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, ...@@ -904,7 +883,6 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea ebx, [ebx * 4] lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2] lea edi, [ebx + ebx * 2]
align 4
wloop: wloop:
movd xmm0, [eax] movd xmm0, [eax]
movd xmm1, [eax + ebx] movd xmm1, [eax + ebx]
...@@ -945,7 +923,6 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ...@@ -945,7 +923,6 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
lea ebx, [ebx * 4] lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2] lea edi, [ebx + ebx * 2]
align 4
wloop: wloop:
movq xmm0, qword ptr [eax] // row0 4 pairs movq xmm0, qword ptr [eax] // row0 4 pairs
movhps xmm0, qword ptr [eax + ebx] movhps xmm0, qword ptr [eax + ebx]
...@@ -1006,7 +983,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, ...@@ -1006,7 +983,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
jl xloop49 jl xloop49
// 4 Pixel loop. // 4 Pixel loop.
align 4
xloop4: xloop4:
movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels
...@@ -1026,7 +1002,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, ...@@ -1026,7 +1002,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
sub ecx, 4 // 4 pixels sub ecx, 4 // 4 pixels
jge xloop4 jge xloop4
align 4
xloop49: xloop49:
test ecx, 2 test ecx, 2
je xloop29 je xloop29
...@@ -1047,7 +1022,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, ...@@ -1047,7 +1022,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
// 1 Pixels. // 1 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x2 pixels movd xmm0, [esi + eax * 4] // 1 source x2 pixels
movd dword ptr [edi], xmm0 movd dword ptr [edi], xmm0
align 4
xloop99: xloop99:
pop esi pop esi
...@@ -1097,7 +1071,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, ...@@ -1097,7 +1071,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
pextrw edx, xmm2, 3 // get x1 integer. preroll pextrw edx, xmm2, 3 // get x1 integer. preroll
// 2 Pixel loop. // 2 Pixel loop.
align 4
xloop2: xloop2:
movdqa xmm1, xmm2 // x0, x1 fractions. movdqa xmm1, xmm2 // x0, x1 fractions.
paddd xmm2, xmm3 // x += dx paddd xmm2, xmm3 // x += dx
...@@ -1117,7 +1090,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, ...@@ -1117,7 +1090,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
sub ecx, 2 // 2 pixels sub ecx, 2 // 2 pixels
jge xloop2 jge xloop2
align 4
xloop29: xloop29:
add ecx, 2 - 1 add ecx, 2 - 1
...@@ -1134,7 +1106,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, ...@@ -1134,7 +1106,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
movd [edi], xmm0 movd [edi], xmm0
align 4
xloop99: xloop99:
pop edi pop edi
...@@ -1153,7 +1124,6 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, ...@@ -1153,7 +1124,6 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
mov eax, [esp + 8] // src_argb mov eax, [esp + 8] // src_argb
mov ecx, [esp + 12] // dst_width mov ecx, [esp + 12] // dst_width
align 4
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
lea eax, [eax + 16] lea eax, [eax + 16]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment