submodule / opencv / Commits / 3a426660

Commit 3a426660 authored Jan 12, 2015 by Ilya Lavrenov

sse_utils.hpp

parent a340ea87
Showing 3 changed files with 581 additions and 227 deletions (+581 −227)
base.hpp (modules/core/include/opencv2/core/base.hpp): +2 −0
sse_utils.hpp (modules/core/include/opencv2/core/sse_utils.hpp): +497 −0
color.cpp (modules/imgproc/src/color.cpp): +82 −227
modules/core/include/opencv2/core/base.hpp (view file @ 3a426660)
...
...
@@ -813,4 +813,6 @@ inline float32x2_t cv_vsqrt_f32(float32x2_t val)
}
// cv
#include "sse_utils.hpp"
#endif //__OPENCV_CORE_BASE_HPP__
modules/core/include/opencv2/core/sse_utils.hpp 0 → 100644 (view file @ 3a426660)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2015, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
#define __OPENCV_CORE_SSE_UTILS_HPP__
#ifndef __cplusplus
# error sse_utils.hpp header must be compiled as C++
#endif
#if CV_SSE2
inline void _mm_deinterliv_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
}
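For orientation, a minimal usage sketch (not part of the commit; it mirrors the call sites in modules/imgproc/src/color.cpp further down). On entry the six registers are just six consecutive 16-byte chunks of interleaved data, which is why the first unpack layer pairs, say, v_r0 with v_g1; the parameter names only describe the contents after the call.

// Hypothetical sketch: deinterleave 96 bytes (32 packed RGB pixels) into planes.
inline void load_and_deinterleave_rgb32_sketch(const unsigned char * src,
                                               __m128i & v_r0, __m128i & v_r1,
                                               __m128i & v_g0, __m128i & v_g1,
                                               __m128i & v_b0, __m128i & v_b1)
{
    v_r0 = _mm_loadu_si128((__m128i const *)(src));        // R0 G0 B0 R1 ...
    v_r1 = _mm_loadu_si128((__m128i const *)(src + 16));
    v_g0 = _mm_loadu_si128((__m128i const *)(src + 32));
    v_g1 = _mm_loadu_si128((__m128i const *)(src + 48));
    v_b0 = _mm_loadu_si128((__m128i const *)(src + 64));
    v_b1 = _mm_loadu_si128((__m128i const *)(src + 80));
    _mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); // now planar: 32 R, 32 G, 32 B
}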
inline void _mm_deinterliv_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
}
inline void _mm_interlive_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                               __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
}
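Each pack round here inverts one unpack round of the deinterleave: AND with 0x00ff keeps the even-indexed byte of every 16-bit lane, a right shift by 8 keeps the odd-indexed byte, and _mm_packus_epi16 narrows the two masked halves back into a single register. A hypothetical one-round illustration (not in the commit):

// Sketch: split two byte registers x = {x0..x15}, y = {y0..y15} into even/odd bytes.
inline void pack_round_sketch(__m128i x, __m128i y, __m128i & even, __m128i & odd)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);
    even = _mm_packus_epi16(_mm_and_si128(x, v_mask),
                            _mm_and_si128(y, v_mask));  // {x0, x2, ..., x14, y0, y2, ..., y14}
    odd  = _mm_packus_epi16(_mm_srli_epi16(x, 8),
                            _mm_srli_epi16(y, 8));      // {x1, x3, ..., x15, y1, y3, ..., y15}
}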
inline void _mm_interlive_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                               __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
}
inline void _mm_deinterliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
}
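A structural observation (ours, not the commit's): the number of unpack layers tracks the batch size. The epi8 versions above use five layers for 32 pixels per batch, these epi16 versions four layers for 16 pixels, and the ps versions further down three layers for 8 pixels; each layer is one riffle of the register pairs, so log2(pixels per batch) layers suffice to sort the stride-3 or stride-4 data into planes.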
inline void _mm_deinterliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
}
inline void _mm_interliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                               __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
}
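A caveat worth flagging: _mm_packus_epi32 is an SSE4.1 intrinsic (declared in smmintrin.h), so these epi16 interleave paths need more than the surrounding CV_SSE2 guard suggests on compilers that enforce intrinsic availability. For testing, the net effect of the three-channel variant can be checked against a scalar reference like the following sketch (hypothetical, not part of the commit): as used in color.cpp, the six output registers stored back in argument order hold the packed stream this loop produces.

// Scalar reference for the 16-pixel, 3-channel ushort interleave: planar r/g/b
// in (the contents of v_r0:v_r1, v_g0:v_g1, v_b0:v_b1), packed RGBRGB... out.
static void interleave3_u16_ref(const unsigned short r[16], const unsigned short g[16],
                                const unsigned short b[16], unsigned short dst[48])
{
    for (int i = 0; i < 16; ++i)
    {
        dst[3 * i]     = r[i];
        dst[3 * i + 1] = g[i];
        dst[3 * i + 2] = b[i];
    }
}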
inline void _mm_interliv_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                               __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}
inline void _mm_deinterliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
}
inline void _mm_deinterliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
}
inline void _mm_interliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                            __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}
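The float interleave swaps the mask/pack machinery for _mm_shuffle_ps: mask_lo = _MM_SHUFFLE(2, 0, 2, 0) picks the even-indexed lanes of both sources and mask_hi = _MM_SHUFFLE(3, 1, 3, 1) the odd-indexed ones. A hypothetical illustration (not in the commit):

__m128 a  = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);
__m128 b  = _mm_setr_ps(4.f, 5.f, 6.f, 7.f);
__m128 lo = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); // {0, 2, 4, 6}: even lanes
__m128 hi = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); // {1, 3, 5, 7}: odd lanes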
inline void _mm_interliv_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                            __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}
#endif
#endif //__OPENCV_CORE_SSE_UTILS_HPP__
modules/imgproc/src/color.cpp (view file @ 3a426660)
...
...
@@ -102,205 +102,6 @@
static IppStatus sts = ippInit();
#endif
#if CV_SSE2
#define _MM_DEINTERLIV_EPI8(layer0_chunk0, layer0_chunk1, layer0_chunk2, \
layer0_chunk3, layer0_chunk4, layer0_chunk5) \
{ \
__m128i layer1_chunk0 = _mm_unpacklo_epi8(layer0_chunk0, layer0_chunk3); \
__m128i layer1_chunk1 = _mm_unpackhi_epi8(layer0_chunk0, layer0_chunk3); \
__m128i layer1_chunk2 = _mm_unpacklo_epi8(layer0_chunk1, layer0_chunk4); \
__m128i layer1_chunk3 = _mm_unpackhi_epi8(layer0_chunk1, layer0_chunk4); \
__m128i layer1_chunk4 = _mm_unpacklo_epi8(layer0_chunk2, layer0_chunk5); \
__m128i layer1_chunk5 = _mm_unpackhi_epi8(layer0_chunk2, layer0_chunk5); \
\
__m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3); \
__m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3); \
__m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4); \
__m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4); \
__m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5); \
__m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5); \
\
__m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3); \
__m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3); \
__m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4); \
__m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4); \
__m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5); \
__m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5); \
\
__m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3); \
__m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3); \
__m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4); \
__m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4); \
__m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5); \
__m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5); \
\
layer0_chunk0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3); \
layer0_chunk1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3); \
layer0_chunk2 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4); \
layer0_chunk3 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4); \
layer0_chunk4 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5); \
layer0_chunk5 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5); \
}
#define _MM_INTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) \
{ \
__m128i v_mask = _mm_set1_epi16(0x00ff); \
\
__m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); \
__m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); \
__m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); \
__m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); \
__m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); \
__m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8)); \
\
__m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask)); \
__m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); \
__m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); \
__m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); \
__m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask)); \
__m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8)); \
\
__m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); \
__m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); \
__m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); \
__m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); \
__m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); \
__m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8)); \
\
__m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); \
__m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); \
__m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); \
__m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); \
__m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); \
__m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8)); \
\
v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); \
v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); \
v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); \
v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); \
v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); \
v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); \
}
#define _MM_DEINTERLIV_EPI16(layer0_chunk0, layer0_chunk1, layer0_chunk2, \
layer0_chunk3, layer0_chunk4, layer0_chunk5) \
{ \
__m128i layer1_chunk0 = _mm_unpacklo_epi16(layer0_chunk0, layer0_chunk3); \
__m128i layer1_chunk1 = _mm_unpackhi_epi16(layer0_chunk0, layer0_chunk3); \
__m128i layer1_chunk2 = _mm_unpacklo_epi16(layer0_chunk1, layer0_chunk4); \
__m128i layer1_chunk3 = _mm_unpackhi_epi16(layer0_chunk1, layer0_chunk4); \
__m128i layer1_chunk4 = _mm_unpacklo_epi16(layer0_chunk2, layer0_chunk5); \
__m128i layer1_chunk5 = _mm_unpackhi_epi16(layer0_chunk2, layer0_chunk5); \
\
__m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3); \
__m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3); \
__m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4); \
__m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4); \
__m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5); \
__m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5); \
\
__m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3); \
__m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3); \
__m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4); \
__m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4); \
__m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5); \
__m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5); \
\
layer0_chunk0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3); \
layer0_chunk1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3); \
layer0_chunk2 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4); \
layer0_chunk3 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4); \
layer0_chunk4 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5); \
layer0_chunk5 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5); \
}
#define _MM_INTERLIV_EPI16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) \
{ \
__m128i v_mask = _mm_set1_epi32(0x0000ffff); \
\
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); \
__m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16)); \
__m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); \
__m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); \
__m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); \
__m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16)); \
\
__m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); \
__m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); \
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); \
__m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); \
__m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); \
__m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); \
\
__m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); \
__m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); \
__m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); \
__m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); \
__m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); \
__m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); \
\
v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); \
v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); \
v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); \
v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); \
v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); \
v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16)); \
}
#define _MM_DEINTERLIV_PS(layer0_chunk0, layer0_chunk1, layer0_chunk2, \
layer0_chunk3, layer0_chunk4, layer0_chunk5) \
{ \
__m128 layer1_chunk0 = _mm_unpacklo_ps(layer0_chunk0, layer0_chunk3); \
__m128 layer1_chunk1 = _mm_unpackhi_ps(layer0_chunk0, layer0_chunk3); \
__m128 layer1_chunk2 = _mm_unpacklo_ps(layer0_chunk1, layer0_chunk4); \
__m128 layer1_chunk3 = _mm_unpackhi_ps(layer0_chunk1, layer0_chunk4); \
__m128 layer1_chunk4 = _mm_unpacklo_ps(layer0_chunk2, layer0_chunk5); \
__m128 layer1_chunk5 = _mm_unpackhi_ps(layer0_chunk2, layer0_chunk5); \
\
__m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3); \
__m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3); \
__m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4); \
__m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4); \
__m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5); \
__m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5); \
\
layer0_chunk0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3); \
layer0_chunk1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3); \
layer0_chunk2 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4); \
layer0_chunk3 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4); \
layer0_chunk4 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5); \
layer0_chunk5 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5); \
}
#define _MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) \
{ \
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); \
\
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo); \
__m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); \
__m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo); \
__m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi); \
__m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo); \
__m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi); \
\
__m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo); \
__m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi); \
__m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo); \
__m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi); \
__m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo); \
__m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi); \
\
v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo); \
v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi); \
v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo); \
v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi); \
v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo); \
v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi); \
}
#endif
namespace cv
{
...
...
@@ -1703,7 +1504,34 @@ struct RGB2Gray<ushort>
__m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
__m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));

_MM_DEINTERLIV_EPI16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_deinterliv_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

__m128i v_gray0;
process(v_r0, v_g0, v_b0, v_gray0);

__m128i v_gray1;
process(v_r1, v_g1, v_b1, v_gray1);

_mm_storeu_si128((__m128i *)(dst + i), v_gray0);
_mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1);
    }
}
else if (scn == 4)
{
    for ( ; i <= n - 16; i += 16, src += scn * 16)
    {
        __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
        __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
        __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
        __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
        __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
        __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
        __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48));
        __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56));

        _mm_deinterliv_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);

        __m128i v_gray0;
        process(v_r0, v_g0, v_b0,
...
...
@@ -1768,7 +1596,34 @@ struct RGB2Gray<float>
__m128 v_b0 = _mm_loadu_ps(src + 16);
__m128 v_b1 = _mm_loadu_ps(src + 20);

_MM_DEINTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_deinterliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

__m128 v_gray0;
process(v_r0, v_g0, v_b0, v_gray0);

__m128 v_gray1;
process(v_r1, v_g1, v_b1, v_gray1);

_mm_storeu_ps(dst + i, v_gray0);
_mm_storeu_ps(dst + i + 4, v_gray1);
    }
}
else if (scn == 4)
{
    for ( ; i <= n - 8; i += 8, src += scn * 8)
    {
        __m128 v_r0 = _mm_loadu_ps(src);
        __m128 v_r1 = _mm_loadu_ps(src + 4);
        __m128 v_g0 = _mm_loadu_ps(src + 8);
        __m128 v_g1 = _mm_loadu_ps(src + 12);
        __m128 v_b0 = _mm_loadu_ps(src + 16);
        __m128 v_b1 = _mm_loadu_ps(src + 20);
        __m128 v_a0 = _mm_loadu_ps(src + 24);
        __m128 v_a1 = _mm_loadu_ps(src + 28);

        _mm_deinterliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);

        __m128 v_gray0;
        process(v_r0, v_g0, v_b0,
...
...
@@ -1966,7 +1821,7 @@ struct RGB2YCrCb_f<float>
__m128 v_b0 = _mm_loadu_ps(src + 16);
__m128 v_b1 = _mm_loadu_ps(src + 20);

_MM_DEINTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_deinterliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

__m128 v_y0, v_cr0, v_cb0;
process(v_r0, v_g0, v_b0,
...
...
@@ -1976,7 +1831,7 @@ struct RGB2YCrCb_f<float>
process(v_r1, v_g1, v_b1,
        v_y1, v_cr1, v_cb1);

_MM_INTERLIV_PS(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1)
_mm_interliv_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);

_mm_storeu_ps(dst + i, v_y0);
_mm_storeu_ps(dst + i + 4, v_y1);
...
...
@@ -2331,7 +2186,7 @@ struct RGB2YCrCb_i<uchar>
__m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 64));
__m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 80));

_MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

__m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero;
process(_mm_unpacklo_epi8(v_r0, v_zero),
...
...
@@ -2363,7 +2218,7 @@ struct RGB2YCrCb_i<uchar>
__m128i v_cr_1 = _mm_packus_epi16(v_cr0, v_cr1);
__m128i v_cb_1 = _mm_packus_epi16(v_cb0, v_cb1);

_MM_INTERLIV_EPI8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1)
_mm_interlive_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1);

_mm_storeu_si128((__m128i *)(dst + i), v_y_0);
_mm_storeu_si128((__m128i *)(dst + i + 16), v_y_1);
...
...
@@ -2473,7 +2328,7 @@ struct RGB2YCrCb_i<ushort>
__m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
__m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));

_MM_DEINTERLIV_EPI16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_deinterliv_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

__m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero;
process(v_r0, v_g0, v_b0,
...
...
@@ -2483,7 +2338,7 @@ struct RGB2YCrCb_i<ushort>
process(v_r1, v_g1, v_b1,
        v_y1, v_cr1, v_cb1);

_MM_INTERLIV_EPI16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1)
_mm_interliv_epi16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);

_mm_storeu_si128((__m128i *)(dst + i), v_y0);
_mm_storeu_si128((__m128i *)(dst + i + 8), v_y1);
...
...
@@ -2681,7 +2536,7 @@ struct YCrCb2RGB_f<float>
__m128 v_cb0 = _mm_loadu_ps(src + i + 16);
__m128 v_cb1 = _mm_loadu_ps(src + i + 20);

_MM_DEINTERLIV_PS(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1)
_mm_deinterliv_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);

__m128 v_r0, v_g0, v_b0;
process(v_y0, v_cr0, v_cb0,
...
...
@@ -2691,7 +2546,7 @@ struct YCrCb2RGB_f<float>
process(v_y1, v_cr1, v_cb1,
        v_r1, v_g1, v_b1);

_MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_interliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

_mm_storeu_ps(dst, v_r0);
_mm_storeu_ps(dst + 4, v_r1);
...
...
@@ -3094,7 +2949,7 @@ struct YCrCb2RGB_i<uchar>
__m128i v_cb0 = _mm_loadu_si128((__m128i const *)(src + i + 64));
__m128i v_cb1 = _mm_loadu_si128((__m128i const *)(src + i + 80));

_MM_DEINTERLIV_EPI8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1)
_mm_deinterliv_epi8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);

__m128i v_r_0 = v_zero, v_g_0 = v_zero, v_b_0 = v_zero;
process(_mm_unpacklo_epi8(v_y0, v_zero),
...
...
@@ -3132,7 +2987,7 @@ struct YCrCb2RGB_i<uchar>
std::swap(v_r1, v_b1);
}

_MM_INTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_interlive_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

_mm_storeu_si128((__m128i *)(dst), v_r0);
_mm_storeu_si128((__m128i *)(dst + 16), v_r1);
...
...
@@ -3355,7 +3210,7 @@ struct RGB2XYZ_f<float>
__m128 v_b0 = _mm_loadu_ps(src + 16);
__m128 v_b1 = _mm_loadu_ps(src + 20);

_MM_DEINTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_deinterliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

__m128 v_x0, v_y0, v_z0;
process(v_r0, v_g0, v_b0,
...
...
@@ -3365,7 +3220,7 @@ struct RGB2XYZ_f<float>
process(v_r1, v_g1, v_b1,
        v_x1, v_y1, v_z1);

_MM_INTERLIV_PS(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1)
_mm_interliv_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);

_mm_storeu_ps(dst + i, v_x0);
_mm_storeu_ps(dst + i + 4, v_x1);
...
...
@@ -3781,7 +3636,7 @@ struct XYZ2RGB_f<float>
__m128 v_z0 = _mm_loadu_ps(src + i + 16);
__m128 v_z1 = _mm_loadu_ps(src + i + 20);

_MM_DEINTERLIV_PS(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1)
_mm_deinterliv_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);

__m128 v_r0, v_g0, v_b0;
process(v_x0, v_y0, v_z0,
...
...
@@ -3791,7 +3646,7 @@ struct XYZ2RGB_f<float>
process(v_x1, v_y1, v_z1,
        v_r1, v_g1, v_b1);

_MM_INTERLIV_PS(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1)
_mm_interliv_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);

_mm_storeu_ps(dst, v_b0);
_mm_storeu_ps(dst + 4, v_b1);
...
...
@@ -4361,7 +4216,7 @@ struct HSV2RGB_b
v_g1 = _mm_mul_ps(v_g1, v_scale_inv);
v_b1 = _mm_mul_ps(v_b1, v_scale_inv);

_MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_interliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

_mm_store_ps(buf, v_r0);
_mm_store_ps(buf + 4, v_r1);
...
...
@@ -4412,7 +4267,7 @@ struct HSV2RGB_b
__m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
__m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));

_MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

process(_mm_unpacklo_epi8(v_r0, v_zero),
        _mm_unpacklo_epi8(v_g0, v_zero),
...
...
@@ -4606,7 +4461,7 @@ struct RGB2HLS_b
__m128 v_s0f = _mm_load_ps(buf + 16);
__m128 v_s1f = _mm_load_ps(buf + 20);

_MM_DEINTERLIV_PS(v_h0f, v_h1f, v_l0f, v_l1f, v_s0f, v_s1f)
_mm_deinterliv_ps(v_h0f, v_h1f, v_l0f, v_l1f, v_s0f, v_s1f);

v_l0f = _mm_mul_ps(v_l0f, v_scale);
v_l1f = _mm_mul_ps(v_l1f, v_scale);
...
...
@@ -4729,7 +4584,7 @@ struct RGB2HLS_b
__m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1);
__m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1);

_MM_INTERLIV_EPI8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1)
_mm_interlive_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);

_mm_storeu_si128((__m128i *)(dst + j), v_h0);
_mm_storeu_si128((__m128i *)(dst + j + 16), v_h1);
...
...
@@ -4861,7 +4716,7 @@ struct HLS2RGB_b
v_g1 = _mm_mul_ps(v_g1, v_scale_inv);
v_b1 = _mm_mul_ps(v_b1, v_scale_inv);

_MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_interliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

_mm_store_ps(buf, v_r0);
_mm_store_ps(buf + 4, v_r1);
...
...
@@ -4912,7 +4767,7 @@ struct HLS2RGB_b
__m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
__m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));

_MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

process(_mm_unpacklo_epi8(v_r0, v_zero),
        _mm_unpacklo_epi8(v_g0, v_zero),
...
...
@@ -5360,7 +5215,7 @@ struct Lab2RGB_b
v_b0 = _mm_sub_ps(v_b0, v_128);
v_b1 = _mm_sub_ps(v_b1, v_128);

_MM_INTERLIV_PS(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_interliv_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

_mm_store_ps(buf, v_r0);
_mm_store_ps(buf + 4, v_r1);
...
...
@@ -5411,7 +5266,7 @@ struct Lab2RGB_b
__m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
__m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));

_MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

process(_mm_unpacklo_epi8(v_r0, v_zero),
        _mm_unpacklo_epi8(v_g0, v_zero),
...
...
@@ -5713,7 +5568,7 @@ struct RGB2Luv_b
__m128 v_v0f = _mm_load_ps(buf + 16);
__m128 v_v1f = _mm_load_ps(buf + 20);

_MM_DEINTERLIV_PS(v_l0f, v_l1f, v_u0f, v_u1f, v_v0f, v_v1f)
_mm_deinterliv_ps(v_l0f, v_l1f, v_u0f, v_u1f, v_v0f, v_v1f);

v_l0f = _mm_mul_ps(v_l0f, v_scale);
v_l1f = _mm_mul_ps(v_l1f, v_scale);
...
...
@@ -5839,7 +5694,7 @@ struct RGB2Luv_b
__m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1);
__m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1);

_MM_INTERLIV_EPI8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1)
_mm_interlive_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);

_mm_storeu_si128((__m128i *)(dst + j), v_l0);
_mm_storeu_si128((__m128i *)(dst + j + 16), v_l1);
...
...
@@ -5920,7 +5775,7 @@ struct Luv2RGB_b
v_v0 = _mm_sub_ps(_mm_mul_ps(v_v0, v_coeff2), v_140);
v_v1 = _mm_sub_ps(_mm_mul_ps(v_v1, v_coeff2), v_140);

_MM_INTERLIV_PS(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1)
_mm_interliv_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);

_mm_store_ps(buf, v_l0);
_mm_store_ps(buf + 4, v_l1);
...
...
@@ -5971,7 +5826,7 @@ struct Luv2RGB_b
__m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
__m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));

_MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1)
_mm_deinterliv_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

process(_mm_unpacklo_epi8(v_r0, v_zero),
        _mm_unpacklo_epi8(v_g0, v_zero),
...
...