Commit 64cf206f authored by Tomoaki Teshima

Optimize blend using universal intrinsics

 - add more channels/depth performance test for blend
parent 90538392
...@@ -1368,6 +1368,24 @@ OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NO ...@@ -1368,6 +1368,24 @@ OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NO
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
// adopted from sse_utils.hpp // adopted from sse_utils.hpp
// Reads 32 consecutive bytes starting at ptr (unaligned loads) and splits them
// into two 16-byte vectors: bytes at even offsets (0, 2, 4, ...) end up in a,
// bytes at odd offsets (1, 3, 5, ...) end up in b, both in their original order.
//
// The split is done with four rounds of byte unpacklo/unpackhi; each round
// doubles the distance between bytes that started out adjacent, so after the
// fourth round the two streams are fully separated.
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
{
    // Raw input: two 16-byte halves of the interleaved stream.
    const __m128i srcLo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
    const __m128i srcHi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr + 16));

    // Round 1: pair byte i of the first half with byte i of the second half.
    const __m128i r1Lo = _mm_unpacklo_epi8(srcLo, srcHi);
    const __m128i r1Hi = _mm_unpackhi_epi8(srcLo, srcHi);

    // Round 2: same-stream bytes now form groups of four.
    const __m128i r2Lo = _mm_unpacklo_epi8(r1Lo, r1Hi);
    const __m128i r2Hi = _mm_unpackhi_epi8(r1Lo, r1Hi);

    // Round 3: each register holds 8 bytes of the first stream followed by
    // 8 bytes of the second stream.
    const __m128i r3Lo = _mm_unpacklo_epi8(r2Lo, r2Hi);
    const __m128i r3Hi = _mm_unpackhi_epi8(r2Lo, r2Hi);

    // Round 4: merge the halves so each output holds one complete stream.
    a.val = _mm_unpacklo_epi8(r3Lo, r3Hi);
    b.val = _mm_unpackhi_epi8(r3Lo, r3Hi);
}
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{ {
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr); __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
...@@ -1507,6 +1525,15 @@ inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& ...@@ -1507,6 +1525,15 @@ inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8&
_mm_storeu_si128((__m128i*)(ptr + 8), t1); _mm_storeu_si128((__m128i*)(ptr + 8), t1);
} }
// Writes 32 bytes to ptr (unaligned stores) by alternating the bytes of a and b:
// a[0], b[0], a[1], b[1], ..., a[15], b[15].
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b)
{
    // Byte-wise zip of the lower eight lanes, then of the upper eight lanes.
    const __m128i zippedLow = _mm_unpacklo_epi8(a.val, b.val);
    const __m128i zippedHigh = _mm_unpackhi_epi8(a.val, b.val);

    // First 16 output bytes come from the low lanes, the next 16 from the high lanes.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), zippedLow);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr + 16), zippedHigh);
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
const v_uint8x16& c ) const v_uint8x16& c )
{ {
......
...@@ -56,7 +56,7 @@ namespace ocl { ...@@ -56,7 +56,7 @@ namespace ocl {
typedef Size_MatType BlendLinearFixture; typedef Size_MatType BlendLinearFixture;
OCL_PERF_TEST_P(BlendLinearFixture, BlendLinear, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4))) OCL_PERF_TEST_P(BlendLinearFixture, BlendLinear, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC3, CV_32FC4, CV_8UC1, CV_8UC3, CV_8UC4)))
{ {
Size_MatType_t params = GetParam(); Size_MatType_t params = GetParam();
const Size srcSize = get<0>(params); const Size srcSize = get<0>(params);
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment