Commit fadf25ac authored by Vitaly Tuzov's avatar Vitaly Tuzov

SSE4_1 optimized implementation of resize and warp functions migrated to separate file

parent 3681dcef
......@@ -83,7 +83,9 @@ public:
uchar* Dstart = D;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
for(x = 0; x < avxWidth; x += 8)
{
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
......@@ -106,7 +108,9 @@ public:
uchar* Dstart = D;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
for(x = 0; x < avxWidth; x += 8)
{
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
......@@ -157,8 +161,8 @@ public:
const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
//const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
// 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
if(((int64)(dst.data + dst.step) & 0x1f) == 0)
{
for(y = range.start; y < range.end; y++)
......@@ -168,7 +172,9 @@ public:
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + sy*src.step;
const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
for(x = 0; x < avxWidth; x += 16)
{
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
......@@ -200,7 +206,9 @@ public:
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.data + sy*src.step;
const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
for(x = 0; x < avxWidth; x += 16)
{
const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
......
This diff is collapsed.
......@@ -61,11 +61,27 @@ void resizeNN4_AVX2(const Range&, const Mat&, Mat&, int*, int, double);
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
#endif
}
namespace opt_SSE41
// Declarations for the SSE4.1-optimized resize / remap / warp kernels that were
// migrated to a separate translation unit. All symbols are compiled only when
// the build can try SSE4.1 dispatch (CV_TRY_SSE4_1).
// NOTE(review): the suffix naming is inconsistent within this namespace —
// some symbols use "_SSE4_1" (resizeNN2_SSE4_1) and others "_SSE41"
// (VResizeLanczos4Vec_32f16u_SSE41); consider unifying in a follow-up.
namespace opt_SSE4_1
{
#if CV_TRY_SSE4_1
// Nearest-neighbor resize, 2-byte and 4-byte pixel variants.
void resizeNN2_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double);
void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double);
// Vertical Lanczos4 resize step, 32f accumulator -> 16u output; returns the
// number of elements processed (remainder handled by the scalar caller).
int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width);
// convertMaps helpers: pack float map coordinates into the fixed-point
// short/ushort representation used by remap.
void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width);
void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width);
void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width);
// Computes one block line of affine-warp coordinates from the precomputed
// per-column deltas (adelta/bdelta) and the row origin (X0, Y0).
void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
// Abstract interface for the SSE4.1 perspective-warp line processor.
// Concrete implementations are selected at runtime via the getImpl() factory
// (presumably specialized on properties of the 3x3 matrix M — confirm in the
// implementation file).
class WarpPerspectiveLine_SSE4
{
public:
    // Factory: returns the implementation appropriate for matrix M.
    static Ptr<WarpPerspectiveLine_SSE4> getImpl(const double *M);
    // Nearest-neighbor variant: fills xy with integer source coordinates for
    // one block line of width bw starting at homogeneous origin (X0, Y0, W0).
    virtual void processNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) = 0;
    // Interpolating variant: additionally writes the sub-pixel fraction
    // indices into alpha for the bilinear/bicubic fetch stage.
    virtual void process(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) = 0;
    virtual ~WarpPerspectiveLine_SSE4() {};
};
#endif
}
}
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment