Commit fadf25ac authored by Vitaly Tuzov

SSE4_1-optimized implementation of resize and warp functions migrated to a separate file

parent 3681dcef
@@ -83,7 +83,9 @@ public:
         uchar* Dstart = D;
         int sy = std::min(cvFloor(y*ify), ssize.height-1);
         const uchar* S = src.data + sy*src.step;
+#ifdef CV_ICC
 #pragma unroll(4)
+#endif
         for(x = 0; x < avxWidth; x += 8)
         {
             const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
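
Note on the hunk above: #pragma unroll(n) is an Intel C++ Compiler directive, and CV_ICC is OpenCV's macro for that compiler, so the new guard keeps the unrolling hint for ICC builds while sparing GCC, Clang, and MSVC an unknown-pragma warning. A minimal sketch of the same idea as a reusable macro (the LOOP_UNROLL_4 name is hypothetical, not something OpenCV defines):

    // Hypothetical portable unroll hint: only the ICC and Clang forms are
    // real pragmas; other compilers get an empty expansion and simply rely
    // on their own optimizer.
    #if defined(__INTEL_COMPILER)
    #  define LOOP_UNROLL_4 _Pragma("unroll(4)")
    #elif defined(__clang__)
    #  define LOOP_UNROLL_4 _Pragma("clang loop unroll_count(4)")
    #else
    #  define LOOP_UNROLL_4
    #endif

    // usage: LOOP_UNROLL_4 for(x = 0; x < avxWidth; x += 8) { ... }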
@@ -106,7 +108,9 @@ public:
         uchar* Dstart = D;
         int sy = std::min(cvFloor(y*ify), ssize.height-1);
         const uchar* S = src.data + sy*src.step;
+#ifdef CV_ICC
 #pragma unroll(4)
+#endif
         for(x = 0; x < avxWidth; x += 8)
         {
             const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
@@ -157,8 +161,8 @@ public:
     const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
                                                                      15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
     const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
-    const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
-                                                                           13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
+    //const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
+    //                                                                       13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
     if(((int64)(dst.data + dst.step) & 0x1f) == 0)
     {
         for(y = range.start; y < range.end; y++)
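
The branch on ((int64)(dst.data + dst.step) & 0x1f) == 0 above picks one of two loop variants once per invocation: when the output rows stay 32-byte aligned, the loop can use the aligned AVX2 store; otherwise it must fall back to the unaligned one. A standalone sketch of the distinction (illustrative only, not OpenCV code):

    #include <immintrin.h>
    #include <cstdint>

    // Writes one 32-byte vector, choosing the aligned store only when the
    // destination address has its low five bits clear (32-byte aligned).
    static inline void store32(unsigned char* dst, __m256i v)
    {
        if ((reinterpret_cast<std::uintptr_t>(dst) & 0x1f) == 0)
            _mm256_store_si256(reinterpret_cast<__m256i*>(dst), v);   // faults if misaligned
        else
            _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), v);  // no alignment requirement
    }

Hoisting the test out of the pixel loop, as the invoker does here, pays for the branch once per image rather than once per vector.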
@@ -168,7 +172,9 @@ public:
             int sy = std::min(cvFloor(y*ify), ssize.height-1);
             const uchar* S = src.data + sy*src.step;
             const uchar* S2 = S - 2;
+#ifdef CV_ICC
 #pragma unroll(4)
+#endif
             for(x = 0; x < avxWidth; x += 16)
             {
                 const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
@@ -200,7 +206,9 @@ public:
             int sy = std::min(cvFloor(y*ify), ssize.height-1);
             const uchar* S = src.data + sy*src.step;
             const uchar* S2 = S - 2;
+#ifdef CV_ICC
 #pragma unroll(4)
+#endif
             for(x = 0; x < avxWidth; x += 16)
             {
                 const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
...
This diff is collapsed.
@@ -61,11 +61,27 @@ void resizeNN4_AVX2(const Range&, const Mat&, Mat&, int*, int, double);
 int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
 #endif
 }
-namespace opt_SSE41
+namespace opt_SSE4_1
 {
 #if CV_TRY_SSE4_1
 void resizeNN2_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double);
 void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double);
+int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width);
+void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width);
+void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width);
+void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width);
+void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
+class WarpPerspectiveLine_SSE4
+{
+public:
+    static Ptr<WarpPerspectiveLine_SSE4> getImpl(const double *M);
+    virtual void processNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) = 0;
+    virtual void process(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) = 0;
+    virtual ~WarpPerspectiveLine_SSE4() {};
+};
 #endif
 }
 }
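
These declarations are what the generic imgproc code links against; the definitions now live in the separately compiled SSE4.1 translation unit. The usual call-site idiom inside the OpenCV build pairs the compile-time CV_TRY_SSE4_1 check with a runtime CPU-feature test. A hedged sketch of that pattern (the wrapper name resizeNN2_dispatch and the include path are assumptions, not the actual invoker):

    #include "opencv2/core.hpp"
    #include "imgwarp.hpp"   // assumed header carrying the opt_SSE4_1 declarations

    namespace cv {

    // Hypothetical wrapper illustrating the dispatch idiom.
    static void resizeNN2_dispatch(const Range& range, const Mat& src, Mat& dst,
                                   int* x_ofs, int pix_size4, double ify)
    {
    #if CV_TRY_SSE4_1
        if (CV_CPU_HAS_SUPPORT_SSE4_1)   // runtime check: CPU actually has SSE4.1
        {
            opt_SSE4_1::resizeNN2_SSE4_1(range, src, dst, x_ofs, pix_size4, ify);
            return;
        }
    #endif
        // ...portable scalar fallback would run here...
    }

    } // namespace cv

For the warp-perspective path, the WarpPerspectiveLine_SSE4::getImpl(M) factory returns a Ptr to a concrete implementation built from the transform matrix M, and callers then invoke processNN or process once per block line.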
...
This diff is collapsed.