Reenable AVX2 scaling with bug fix for any width

BUG=376
TESTED=unittest on scale functions
R=brucedawson@google.com, harryjin@google.com
Review URL: https://webrtc-codereview.appspot.com/30759004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@1135 16f28f9a-4ce2-e073-06de-1de4eb20be90

Reenable AVX2 scaling with bug fix for any width
BUG=376
TESTED=unittest on scale functions
R=brucedawson@google.com, harryjin@google.com
Review URL: https://webrtc-codereview.appspot.com/30759004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@1135 16f28f9a-4ce2-e073-06de-1de4eb20be90
af6f2524 · fbarchard@google.com · 4165437c · af6f2524 · af6f2524 · af6f2524
Commit af6f2524 authored Oct 22, 2014 by fbarchard@google.com
Showing with 54 additions and 56 deletions

README.chromium README.chromium +1 -1

row.h include/libyuv/row.h +1 -2

version.h include/libyuv/version.h +1 -1

row_any.cc source/row_any.cc +6 -10

row_win.cc source/row_win.cc +45 -42

No files found.
--- a/README.chromium
+++ b/README.chromium
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1134
+Version: 1135
 License: BSD
 License File: LICENSE

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -201,8 +201,7 @@ extern "C" {
 #define HAS_ARGBTOYJROW_AVX2
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_I422TOARGBROW_AVX2
-// TODO(fbarchard): fix bug #376.
+#define HAS_INTERPOLATEROW_AVX2
-// #define HAS_INTERPOLATEROW_AVX2
 #define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
 #define HAS_SPLITUVROW_AVX2

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1134
+#define LIBYUV_VERSION 1135
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -581,24 +581,20 @@ YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
    }
 #ifdef HAS_INTERPOLATEROW_AVX2
-NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
+NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, InterpolateRow_C, 1, 1, 31)
-     InterpolateRow_C, 1, 1, 32)
 #endif
 #ifdef HAS_INTERPOLATEROW_SSSE3
-NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3,
+NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, InterpolateRow_C, 1, 1, 15)
-     InterpolateRow_C, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_SSE2
-NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2,
+NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, InterpolateRow_C, 1, 1, 15)
-     InterpolateRow_C, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_NEON
-NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON,
+NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON, InterpolateRow_C, 1, 1, 15)
-     InterpolateRow_C, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
-NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2,
+NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C,
-     InterpolateRow_C, 1, 1, 3)
+     1, 1, 3)
 #endif
 #undef NANY

--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4972,11 +4972,11 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 #endif  // HAS_ARGBAFFINEROW_SSE2
 #ifdef HAS_INTERPOLATEROW_AVX2
-// Bilinear filter 16x2 -> 16x1
+// Bilinear filter 32x2 -> 32x1
 __declspec(naked) __declspec(align(16))
 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
+                         ptrdiff_t src_stride, int dst_width,
-                          int source_y_fraction) {
+                         int source_y_fraction) {
  __asm {
    push       esi
    push       edi
@@ -5023,45 +5023,48 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    jg         xloop
    jmp        xloop99
-    // Blend 25 / 75.
+   // Blend 25 / 75.
-    align      4
+   align      4
-  xloop25:
+ xloop25:
-    vmovdqu    ymm0, [esi]
+   vmovdqu    ymm0, [esi]
-    vpavgb     ymm0, ymm0, [esi + edx]
+   vmovdqu    ymm1, [esi + edx]
-    vpavgb     ymm0, ymm0, [esi + edx]
+   vpavgb     ymm0, ymm0, ymm1
-    sub        ecx, 32
+   vpavgb     ymm0, ymm0, ymm1
-    vmovdqu    [esi + edi], ymm0
+   sub        ecx, 32
-    lea        esi, [esi + 32]
+   vmovdqu    [esi + edi], ymm0
-    jg         xloop25
+   lea        esi, [esi + 32]
-    jmp        xloop99
+   jg         xloop25
+   jmp        xloop99
-    // Blend 50 / 50.
-    align      4
+   // Blend 50 / 50.
-  xloop50:
+   align      4
-    vmovdqu    ymm0, [esi]
+ xloop50:
-    vpavgb     ymm0, ymm0, [esi + edx]
+   vmovdqu    ymm0, [esi]
-    sub        ecx, 32
+   vmovdqu    ymm1, [esi + edx]
-    vmovdqu    [esi + edi], ymm0
+   vpavgb     ymm0, ymm0, ymm1
-    lea        esi, [esi + 32]
+   sub        ecx, 32
-    jg         xloop50
+   vmovdqu    [esi + edi], ymm0
-    jmp        xloop99
+   lea        esi, [esi + 32]
+   jg         xloop50
-    // Blend 75 / 25.
+   jmp        xloop99
-    align      4
-  xloop75:
+   // Blend 75 / 25.
-    vmovdqu    ymm0, [esi + edx]
+   align      4
-    vpavgb     ymm0, ymm0, [esi]
+ xloop75:
-    vpavgb     ymm0, ymm0, [esi]
+   vmovdqu    ymm1, [esi]
-    sub        ecx, 32
+   vmovdqu    ymm0, [esi + edx]
-    vmovdqu     [esi + edi], ymm0
+   vpavgb     ymm0, ymm0, ymm1
-    lea        esi, [esi + 32]
+   vpavgb     ymm0, ymm0, ymm1
-    jg         xloop75
+   sub        ecx, 32
-    jmp        xloop99
+   vmovdqu    [esi + edi], ymm0
+   lea        esi, [esi + 32]
-    // Blend 100 / 0 - Copy row unchanged.
+   jg         xloop75
-    align      4
+   jmp        xloop99
-  xloop100:
-    rep movsb
+   // Blend 100 / 0 - Copy row unchanged.
+   align      4
+ xloop100:
+   rep movsb
  xloop99:
    pop        edi