Commit 9548093b authored by Vitaly Tuzov

Horizontal line processing for pyrDown() reworked using wide universal intrinsics.

parent 3bc9912f
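For context, pyrDown() filters each source row with the 5-tap binomial kernel 1 4 6 4 1 before the vertical pass and decimation, and it is this horizontal pass that the commit rewrites on top of the wide universal intrinsics extended below. A minimal scalar sketch of the pass (illustrative only, not the actual pyramids.cpp code; border handling and multi-channel support omitted):

    // One output row of the horizontal pyrDown step, single channel.
    // The SIMD version computes several output columns per iteration
    // in wide registers instead of this scalar loop.
    static void pyrDownRowH(const int* src, int* row, int dwidth)
    {
        for (int x = 0; x < dwidth; x++)
        {
            const int* s = src + 2*x;                            // every second source column
            row[x] = s[-2] + 4*s[-1] + 6*s[0] + 4*s[1] + s[2];   // 1 4 6 4 1
        }
    }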
@@ -1610,6 +1610,16 @@ inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
}
inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
{
return v_int32x8(_mm256_permutevar8x32_epi32(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
{
return v_float32x8(_mm256_permutevar8x32_ps(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
////////// Matrix operations /////////
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
......
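The v_pack_triplets overloads added here (and mirrored in the other backends further down) compact the first three lanes of every four-lane group toward the start of the vector; the tail lanes are left unspecified. A self-contained scalar model of the operation, matching the generic v_reg loop in the next hunk:

    // Pack the first 3 lanes of each 4-lane group:
    // {a0,a1,a2,_, b0,b1,b2,_} -> {a0,a1,a2, b0,b1,b2, ?, ?}
    static void pack_triplets(const int* src, int* dst, int nlanes)
    {
        for (int i = 0; i < nlanes/4; i++)
        {
            dst[3*i    ] = src[4*i    ];
            dst[3*i + 1] = src[4*i + 1];
            dst[3*i + 2] = src[4*i + 2];
        }
    }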
@@ -1908,7 +1908,6 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_re
template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
{
v_reg<float, n> c;
int j = 0;
for (int i = 0; i < n/4; i++)
{
c.s[3*i ] = vec.s[4*i ];
......
@@ -1597,29 +1597,49 @@ inline v_int8x16 v_lut(const schar* tab, const int* idx)
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
short CV_DECL_ALIGNED(32) elems[8] =
schar CV_DECL_ALIGNED(32) elems[16] =
{
*(short*)(tab+idx[0]),
*(short*)(tab+idx[1]),
*(short*)(tab+idx[2]),
*(short*)(tab+idx[3]),
*(short*)(tab+idx[4]),
*(short*)(tab+idx[5]),
*(short*)(tab+idx[6]),
*(short*)(tab+idx[7])
tab[idx[0]],
tab[idx[0] + 1],
tab[idx[1]],
tab[idx[1] + 1],
tab[idx[2]],
tab[idx[2] + 1],
tab[idx[3]],
tab[idx[3] + 1],
tab[idx[4]],
tab[idx[4] + 1],
tab[idx[5]],
tab[idx[5] + 1],
tab[idx[6]],
tab[idx[6] + 1],
tab[idx[7]],
tab[idx[7] + 1]
};
return v_int8x16(vreinterpretq_s8_s16(vld1q_s16(elems)));
return v_int8x16(vld1q_s8(elems));
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
int CV_DECL_ALIGNED(32) elems[4] =
schar CV_DECL_ALIGNED(32) elems[16] =
{
*(int*)(tab + idx[0]),
*(int*)(tab + idx[1]),
*(int*)(tab + idx[2]),
*(int*)(tab + idx[3])
tab[idx[0]],
tab[idx[0] + 1],
tab[idx[0] + 2],
tab[idx[0] + 3],
tab[idx[1]],
tab[idx[1] + 1],
tab[idx[1] + 2],
tab[idx[1] + 3],
tab[idx[2]],
tab[idx[2] + 1],
tab[idx[2] + 2],
tab[idx[2] + 3],
tab[idx[3]],
tab[idx[3] + 1],
tab[idx[3] + 2],
tab[idx[3] + 3]
};
return v_int8x16(vreinterpretq_s8_s32(vld1q_s32(elems)));
return v_int8x16(vld1q_s8(elems));
}
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
@@ -1642,23 +1662,22 @@ inline v_int16x8 v_lut(const short* tab, const int* idx)
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
int CV_DECL_ALIGNED(32) elems[4] =
short CV_DECL_ALIGNED(32) elems[8] =
{
*(int*)(tab + idx[0]),
*(int*)(tab + idx[1]),
*(int*)(tab + idx[2]),
*(int*)(tab + idx[3])
tab[idx[0]],
tab[idx[0] + 1],
tab[idx[1]],
tab[idx[1] + 1],
tab[idx[2]],
tab[idx[2] + 1],
tab[idx[3]],
tab[idx[3] + 1]
};
return v_int16x8(vreinterpretq_s16_s32(vld1q_s32(elems)));
return v_int16x8(vld1q_s16(elems));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
int64 CV_DECL_ALIGNED(32) elems[2] =
{
*(int64*)(tab + idx[0]),
*(int64*)(tab + idx[1])
};
return v_int16x8(vreinterpretq_s16_s64(vld1q_s64(elems)));
return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
@@ -1677,12 +1696,7 @@ inline v_int32x4 v_lut(const int* tab, const int* idx)
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
int64 CV_DECL_ALIGNED(32) elems[2] =
{
*(int64*)(tab + idx[0]),
*(int64*)(tab + idx[1])
};
return v_int32x4(vreinterpretq_s32_s64(vld1q_s64(elems)));
return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
@@ -1800,7 +1814,8 @@ inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100)))));
int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
@@ -1824,6 +1839,10 @@ inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
......
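The observable behaviour of the NEON lookup helpers above does not change: v_lut_pairs still gathers two consecutive elements per index and v_lut_quads four. The rewrite only fills the temporary buffer element by element (or with vld1 loads of the native lane type) instead of dereferencing tab + idx[i] through a wider integer type, which was a type-punned load with no alignment guarantee. A scalar model of the pairs variant (the quads variant copies four elements per index):

    // dst[2*i + j] = tab[idx[i] + j] for j = 0, 1
    static void lut_pairs(const short* tab, const int* idx, short* dst, int nidx)
    {
        for (int i = 0; i < nidx; i++)
        {
            dst[2*i    ] = tab[idx[i]    ];
            dst[2*i + 1] = tab[idx[i] + 1];
        }
    }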
@@ -2789,7 +2789,7 @@ inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
return v_int32x4(_mm_load_si128((const __m128i*)(tab + idx[0])));
return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
@@ -2801,7 +2801,7 @@ inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
return v_int64x2(_mm_load_si128((const __m128i*)(tab + idx[0])));
return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
@@ -2817,7 +2817,7 @@ inline v_float64x2 v_lut(const double* tab, const int* idx)
{
return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
}
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_load_si128((const __m128i*)(tab + idx[0])))); }
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_loadu_si128((const __m128i*)(tab + idx[0])))); }
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
@@ -2932,7 +2932,7 @@ inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100)));
#else
__m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
__m128i a = _mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8))));
__m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1);
return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2));
#endif
}
@@ -2948,6 +2948,10 @@ inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
////////////// FP16 support ///////////////////////////
inline v_float32x4 v_load_expand(const float16_t* ptr)
......
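The switch from _mm_load_si128 to _mm_loadu_si128 in the SSE lookups above is needed because the aligned-load intrinsic requires a 16-byte-aligned address, while tab + idx[0] only carries the alignment of the element type. A minimal sketch of the corrected pattern (the helper name is made up for illustration):

    #include <emmintrin.h>

    static __m128i lut_quad(const int* tab, const int* idx)
    {
        // tab + idx[0] may be only 4-byte aligned (e.g. idx[0] == 1), so
        // _mm_load_si128 could fault; the unaligned load is safe for any index.
        return _mm_loadu_si128((const __m128i*)(tab + idx[0]));
    }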
@@ -1160,6 +1160,10 @@ inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
/////// FP16 support ////////
// [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adopt)
......