Commit fadf25ac authored by Vitaly Tuzov

SSE4_1-optimized implementation of resize and warp functions migrated to a separate file

parent 3681dcef
@@ -83,7 +83,9 @@ public:
         uchar* Dstart = D;
         int sy = std::min(cvFloor(y*ify), ssize.height-1);
         const uchar* S = src.data + sy*src.step;
+#ifdef CV_ICC
 #pragma unroll(4)
+#endif
         for(x = 0; x < avxWidth; x += 8)
         {
             const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
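
Note on the hunk above: #pragma unroll(n) is an Intel C++ Compiler directive, and CV_ICC is OpenCV's macro for that compiler, so the new guard keeps the unrolling hint for ICC builds while sparing GCC, Clang, and MSVC an unknown-pragma warning. A minimal sketch of the same idea as a reusable macro (the LOOP_UNROLL_4 name is hypothetical, not something OpenCV defines):

    // Hypothetical portable unroll hint: only the ICC and Clang forms are
    // real pragmas; other compilers get an empty expansion and simply rely
    // on their own optimizer.
    #if defined(__INTEL_COMPILER)
    #  define LOOP_UNROLL_4 _Pragma("unroll(4)")
    #elif defined(__clang__)
    #  define LOOP_UNROLL_4 _Pragma("clang loop unroll_count(4)")
    #else
    #  define LOOP_UNROLL_4
    #endif

    // usage: LOOP_UNROLL_4 for(x = 0; x < avxWidth; x += 8) { ... }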
@@ -106,7 +108,9 @@ public:
         uchar* Dstart = D;
         int sy = std::min(cvFloor(y*ify), ssize.height-1);
         const uchar* S = src.data + sy*src.step;
+#ifdef CV_ICC
 #pragma unroll(4)
+#endif
         for(x = 0; x < avxWidth; x += 8)
         {
             const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
@@ -157,8 +161,8 @@ public:
     const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
                                                                      15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
     const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
-    const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
-                                                                           13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
+    //const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
+    //                                                                       13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
     if(((int64)(dst.data + dst.step) & 0x1f) == 0)
     {
         for(y = range.start; y < range.end; y++)
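
The branch on ((int64)(dst.data + dst.step) & 0x1f) == 0 above picks one of two loop variants once per invocation: when the output rows stay 32-byte aligned, the loop can use the aligned AVX2 store; otherwise it must fall back to the unaligned one. A standalone sketch of the distinction (illustrative only, not OpenCV code):

    #include <immintrin.h>
    #include <cstdint>

    // Writes one 32-byte vector, choosing the aligned store only when the
    // destination address has its low five bits clear (32-byte aligned).
    static inline void store32(unsigned char* dst, __m256i v)
    {
        if ((reinterpret_cast<std::uintptr_t>(dst) & 0x1f) == 0)
            _mm256_store_si256(reinterpret_cast<__m256i*>(dst), v);   // faults if misaligned
        else
            _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), v);  // no alignment requirement
    }

Hoisting the test out of the pixel loop, as the invoker does here, pays for the branch once per image rather than once per vector.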
@@ -168,7 +172,9 @@ public:
             int sy = std::min(cvFloor(y*ify), ssize.height-1);
             const uchar* S = src.data + sy*src.step;
             const uchar* S2 = S - 2;
+#ifdef CV_ICC
 #pragma unroll(4)
+#endif
             for(x = 0; x < avxWidth; x += 16)
             {
                 const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
@@ -200,7 +206,9 @@ public:
             int sy = std::min(cvFloor(y*ify), ssize.height-1);
             const uchar* S = src.data + sy*src.step;
             const uchar* S2 = S - 2;
+#ifdef CV_ICC
 #pragma unroll(4)
+#endif
             for(x = 0; x < avxWidth; x += 16)
             {
                 const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
...
This diff is collapsed.
@@ -61,11 +61,27 @@ void resizeNN4_AVX2(const Range&, const Mat&, Mat&, int*, int, double);
 int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
 #endif
 }
-namespace opt_SSE41
+namespace opt_SSE4_1
 {
 #if CV_TRY_SSE4_1
 void resizeNN2_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double);
 void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double);
+int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width);
+void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width);
+void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width);
+void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width);
+void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
+class WarpPerspectiveLine_SSE4
+{
+public:
+    static Ptr<WarpPerspectiveLine_SSE4> getImpl(const double *M);
+    virtual void processNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) = 0;
+    virtual void process(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) = 0;
+    virtual ~WarpPerspectiveLine_SSE4() {};
+};
 #endif
 }
 }
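
These declarations are what the generic imgproc code links against; the definitions now live in the separately compiled SSE4.1 translation unit. The usual call-site idiom inside the OpenCV build pairs the compile-time CV_TRY_SSE4_1 check with a runtime CPU-feature test. A hedged sketch of that pattern (the wrapper name resizeNN2_dispatch and the include path are assumptions, not the actual invoker):

    #include "opencv2/core.hpp"
    #include "imgwarp.hpp"   // assumed header carrying the opt_SSE4_1 declarations

    namespace cv {

    // Hypothetical wrapper illustrating the dispatch idiom.
    static void resizeNN2_dispatch(const Range& range, const Mat& src, Mat& dst,
                                   int* x_ofs, int pix_size4, double ify)
    {
    #if CV_TRY_SSE4_1
        if (CV_CPU_HAS_SUPPORT_SSE4_1)   // runtime check: CPU actually has SSE4.1
        {
            opt_SSE4_1::resizeNN2_SSE4_1(range, src, dst, x_ofs, pix_size4, ify);
            return;
        }
    #endif
        // ...portable scalar fallback would run here...
    }

    } // namespace cv

For the warp-perspective path, the WarpPerspectiveLine_SSE4::getImpl(M) factory returns a Ptr to a concrete implementation built from the transform matrix M, and callers then invoke processNN or process once per block line.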
...
This diff is collapsed.