Commit c46f119e authored by ChipKerchner

Convert demosaic functions to HAL

parent d513fb4c
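
This commit replaces the raw SSE2 intrinsics in the demosaicing code with OpenCV's HAL universal intrinsics (opencv2/core/hal/intrin.hpp): the SSE2-specific interpolator class becomes a CV_SIMD128 implementation, while the existing NEON intrinsics are kept behind #if CV_NEON with a portable universal-intrinsics fallback for other 128-bit backends. Below is a minimal sketch of the conversion pattern, not part of the commit; the helper name scale_row_u16 is illustrative only, and each universal intrinsic is annotated with the SSE2 call it would replace.

#include "opencv2/core/hal/intrin.hpp"

#if CV_SIMD128
// Scale a row of 16-bit pixels by a Q16 fixed-point gain, 8 lanes at a time.
static void scale_row_u16(const ushort* src, ushort* dst, int n, ushort gain_q16)
{
    cv::v_uint16x8 vgain = cv::v_setall_u16(gain_q16);  // was _mm_set1_epi16
    int i = 0;
    for (; i <= n - 8; i += 8)
    {
        cv::v_uint16x8 vs = cv::v_load(src + i);        // was _mm_loadu_si128
        cv::v_uint16x8 vr = cv::v_mul_hi(vs, vgain);    // was _mm_mulhi_epu16
        cv::v_store(dst + i, vr);                       // was _mm_storeu_si128
    }
    for (; i < n; i++)                                  // scalar tail
        dst[i] = (ushort)(((unsigned)src[i] * gain_q16) >> 16);
}
#endif

The same mechanical mapping (load/store, shifts, packs, zips) is what the hunks below apply to bayer2Gray, bayer2RGB, bayer2RGBA and bayer2RGB_EA.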
@@ -86,6 +86,7 @@
#include "precomp.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include <limits>
@@ -111,7 +112,7 @@ public:
return 0;
}
int bayer2RGBA(const T*, int, T*, int, int) const
int bayer2RGBA(const T*, int, T*, int, int, const T) const
{
return 0;
}
@@ -122,279 +123,14 @@ public:
}
};
#if CV_SSE2
#if CV_SIMD128
class SIMDBayerInterpolator_8u
{
public:
SIMDBayerInterpolator_8u()
{
use_simd = checkHardwareSupport(CV_CPU_SSE2);
}
int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
int width, int bcoeff, int gcoeff, int rcoeff) const
{
if( !use_simd )
return 0;
__m128i _b2y = _mm_set1_epi16((short)(rcoeff*2));
__m128i _g2y = _mm_set1_epi16((short)(gcoeff*2));
__m128i _r2y = _mm_set1_epi16((short)(bcoeff*2));
const uchar* bayer_end = bayer + width;
for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
{
__m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
__m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
__m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
__m128i b1 = _mm_add_epi16(_mm_srli_epi16(_mm_slli_epi16(r0, 8), 7),
_mm_srli_epi16(_mm_slli_epi16(r2, 8), 7));
__m128i b0 = _mm_add_epi16(b1, _mm_srli_si128(b1, 2));
b1 = _mm_slli_epi16(_mm_srli_si128(b1, 2), 1);
__m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 7), _mm_srli_epi16(r2, 7));
__m128i g1 = _mm_srli_epi16(_mm_slli_epi16(r1, 8), 7);
g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
g1 = _mm_slli_epi16(_mm_srli_si128(g1, 2), 2);
r0 = _mm_srli_epi16(r1, 8);
r1 = _mm_slli_epi16(_mm_add_epi16(r0, _mm_srli_si128(r0, 2)), 2);
r0 = _mm_slli_epi16(r0, 3);
g0 = _mm_add_epi16(_mm_mulhi_epi16(b0, _b2y), _mm_mulhi_epi16(g0, _g2y));
g1 = _mm_add_epi16(_mm_mulhi_epi16(b1, _b2y), _mm_mulhi_epi16(g1, _g2y));
g0 = _mm_add_epi16(g0, _mm_mulhi_epi16(r0, _r2y));
g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(r1, _r2y));
g0 = _mm_srli_epi16(g0, 2);
g1 = _mm_srli_epi16(g1, 2);
g0 = _mm_packus_epi16(g0, g0);
g1 = _mm_packus_epi16(g1, g1);
g0 = _mm_unpacklo_epi8(g0, g1);
_mm_storeu_si128((__m128i*)dst, g0);
}
return (int)(bayer - (bayer_end - width));
}
int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
{
if( !use_simd )
return 0;
/*
B G B G | B G B G | B G B G | B G B G
G R G R | G R G R | G R G R | G R G R
B G B G | B G B G | B G B G | B G B G
*/
__m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
__m128i mask = _mm_set1_epi16(blue < 0 ? -1 : 0), z = _mm_setzero_si128();
__m128i masklo = _mm_set1_epi16(0x00ff);
const uchar* bayer_end = bayer + width;
for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
{
__m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
__m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
__m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
__m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklo), _mm_and_si128(r2, masklo));
__m128i nextb1 = _mm_srli_si128(b1, 2);
__m128i b0 = _mm_add_epi16(b1, nextb1);
b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1);
b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
// b0 b2 ... b14 b1 b3 ... b15
b0 = _mm_packus_epi16(b0, b1);
__m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_srli_epi16(r2, 8));
__m128i g1 = _mm_and_si128(r1, masklo);
g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
g1 = _mm_srli_si128(g1, 2);
g0 = _mm_srli_epi16(_mm_add_epi16(g0, delta2), 2);
// g0 g2 ... g14 g1 g3 ... g15
g0 = _mm_packus_epi16(g0, g1);
r0 = _mm_srli_epi16(r1, 8);
r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
// r0 r2 ... r14 r1 r3 ... r15
r0 = _mm_packus_epi16(r0, r1);
b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
b0 = _mm_xor_si128(b0, b1);
r0 = _mm_xor_si128(r0, b1);
// b1 g1 b3 g3 b5 g5...
b1 = _mm_unpackhi_epi8(b0, g0);
// b0 g0 b2 g2 b4 g4 ....
b0 = _mm_unpacklo_epi8(b0, g0);
// r1 0 r3 0 r5 0 ...
r1 = _mm_unpackhi_epi8(r0, z);
// r0 0 r2 0 r4 0 ...
r0 = _mm_unpacklo_epi8(r0, z);
// 0 b0 g0 r0 0 b2 g2 r2 ...
g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
// 0 b8 g8 r8 0 b10 g10 r10 ...
g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
// b1 g1 r1 0 b3 g3 r3 0 ...
r0 = _mm_unpacklo_epi16(b1, r1);
// b9 g9 r9 0 b11 g11 r11 0 ...
r1 = _mm_unpackhi_epi16(b1, r1);
// 0 b0 g0 r0 b1 g1 r1 0 ...
b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
// 0 b4 g4 r4 b5 g5 r5 0 ...
b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
_mm_storel_epi64((__m128i*)(dst-1+0), b0);
_mm_storel_epi64((__m128i*)(dst-1+6*1), _mm_srli_si128(b0, 8));
_mm_storel_epi64((__m128i*)(dst-1+6*2), b1);
_mm_storel_epi64((__m128i*)(dst-1+6*3), _mm_srli_si128(b1, 8));
// 0 b8 g8 r8 b9 g9 r9 0 ...
g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
// 0 b12 g12 r12 b13 g13 r13 0 ...
g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
_mm_storel_epi64((__m128i*)(dst-1+6*4), g0);
_mm_storel_epi64((__m128i*)(dst-1+6*5), _mm_srli_si128(g0, 8));
_mm_storel_epi64((__m128i*)(dst-1+6*6), g1);
}
return (int)(bayer - (bayer_end - width));
}
int bayer2RGBA(const uchar*, int, uchar*, int, int) const
{
return 0;
}
int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
{
if (!use_simd)
return 0;
const uchar* bayer_end = bayer + width;
__m128i masklow = _mm_set1_epi16(0x00ff);
__m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
__m128i full = _mm_set1_epi16(-1), z = _mm_setzero_si128();
__m128i mask = _mm_set1_epi16(blue > 0 ? -1 : 0);
for ( ; bayer <= bayer_end - 18; bayer += 14, dst += 42)
{
/*
B G B G | B G B G | B G B G | B G B G
G R G R | G R G R | G R G R | G R G R
B G B G | B G B G | B G B G | B G B G
*/
__m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
__m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
__m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
__m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklow), _mm_and_si128(r2, masklow));
__m128i nextb1 = _mm_srli_si128(b1, 2);
__m128i b0 = _mm_add_epi16(b1, nextb1);
b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1);
b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
// b0 b2 ... b14 b1 b3 ... b15
b0 = _mm_packus_epi16(b0, b1);
// vertical sum
__m128i r0g = _mm_srli_epi16(r0, 8);
__m128i r2g = _mm_srli_epi16(r2, 8);
__m128i sumv = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(r0g, r2g), delta1), 1);
// horizontal sum
__m128i g1 = _mm_and_si128(masklow, r1);
__m128i nextg1 = _mm_srli_si128(g1, 2);
__m128i sumg = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(g1, nextg1), delta1), 1);
// gradients
__m128i gradv = _mm_adds_epi16(_mm_subs_epu16(r0g, r2g), _mm_subs_epu16(r2g, r0g));
__m128i gradg = _mm_adds_epi16(_mm_subs_epu16(nextg1, g1), _mm_subs_epu16(g1, nextg1));
__m128i gmask = _mm_cmpgt_epi16(gradg, gradv);
__m128i g0 = _mm_add_epi16(_mm_and_si128(gmask, sumv), _mm_and_si128(sumg, _mm_xor_si128(gmask, full)));
// g0 g2 ... g14 g1 g3 ...
g0 = _mm_packus_epi16(g0, nextg1);
r0 = _mm_srli_epi16(r1, 8);
r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
// r0 r2 ... r14 r1 r3 ... r15
r0 = _mm_packus_epi16(r0, r1);
b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
b0 = _mm_xor_si128(b0, b1);
r0 = _mm_xor_si128(r0, b1);
// b1 g1 b3 g3 b5 g5...
b1 = _mm_unpackhi_epi8(b0, g0);
// b0 g0 b2 g2 b4 g4 ....
b0 = _mm_unpacklo_epi8(b0, g0);
// r1 0 r3 0 r5 0 ...
r1 = _mm_unpackhi_epi8(r0, z);
// r0 0 r2 0 r4 0 ...
r0 = _mm_unpacklo_epi8(r0, z);
// 0 b0 g0 r0 0 b2 g2 r2 ...
g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
// 0 b8 g8 r8 0 b10 g10 r10 ...
g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
// b1 g1 r1 0 b3 g3 r3 0 ...
r0 = _mm_unpacklo_epi16(b1, r1);
// b9 g9 r9 0 b11 g11 r11 0 ...
r1 = _mm_unpackhi_epi16(b1, r1);
// 0 b0 g0 r0 b1 g1 r1 0 ...
b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
// 0 b4 g4 r4 b5 g5 r5 0 ...
b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
_mm_storel_epi64((__m128i*)(dst+0), b0);
_mm_storel_epi64((__m128i*)(dst+6*1), _mm_srli_si128(b0, 8));
_mm_storel_epi64((__m128i*)(dst+6*2), b1);
_mm_storel_epi64((__m128i*)(dst+6*3), _mm_srli_si128(b1, 8));
// 0 b8 g8 r8 b9 g9 r9 0 ...
g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
// 0 b12 g12 r12 b13 g13 r13 0 ...
g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
_mm_storel_epi64((__m128i*)(dst+6*4), g0);
_mm_storel_epi64((__m128i*)(dst+6*5), _mm_srli_si128(g0, 8));
_mm_storel_epi64((__m128i*)(dst+6*6), g1);
}
return int(bayer - (bayer_end - width));
}
bool use_simd;
};
#elif CV_NEON
class SIMDBayerInterpolator_8u
{
public:
SIMDBayerInterpolator_8u()
{
}
int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
int width, int bcoeff, int gcoeff, int rcoeff) const
{
/*
B G B G | B G B G | B G B G | B G B G
G R G R | G R G R | G R G R | G R G R
B G B G | B G B G | B G B G | B G B G
*/
#if CV_NEON
uint16x8_t masklo = vdupq_n_u16(255);
const uchar* bayer_end = bayer + width;
@@ -440,6 +176,40 @@ public:
vst1_u8(dst, p.val[0]);
vst1_u8(dst + 8, p.val[1]);
}
#else
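// Universal-intrinsics fallback: the same algorithm as the NEON branch above,
// written with the portable v_uint16x8/v_uint8x16 types from intrin.hpp.
// As in the old SSE2 code, _b2y deliberately holds rcoeff and _r2y holds
// bcoeff, and the coefficients are doubled so that v_mul_hi, which keeps the
// high 16 bits of the 32-bit product, returns the weighted samples at the
// right fixed-point scale.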
v_uint16x8 _b2y = v_setall_u16((ushort)(rcoeff*2));
v_uint16x8 _g2y = v_setall_u16((ushort)(gcoeff*2));
v_uint16x8 _r2y = v_setall_u16((ushort)(bcoeff*2));
const uchar* bayer_end = bayer + width;
for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
{
v_uint16x8 r0 = v_load((ushort*)bayer);
v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
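// r0/r1/r2 are three consecutive Bayer rows (see the layout comment above):
// even bytes of r0 and r2 are B samples, odd bytes are G; r1 is G R G R ...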
v_uint16x8 b1 = ((r0 << 8) >> 7) + ((r2 << 8) >> 7);
v_uint16x8 b0 = v_rotate_right<1>(b1) + b1;
b1 = v_rotate_right<1>(b1) << 1;
v_uint16x8 g0 = (r0 >> 7) + (r2 >> 7);
v_uint16x8 g1 = (r1 << 8) >> 7;
g0 += v_rotate_right<1>(g1) + g1;
g1 = v_rotate_right<1>(g1) << 2;
r0 = r1 >> 8;
r1 = (v_rotate_right<1>(r0) + r0) << 2;
r0 = r0 << 3;
g0 = (v_mul_hi(b0, _b2y) + v_mul_hi(g0, _g2y) + v_mul_hi(r0, _r2y)) >> 2;
g1 = (v_mul_hi(b1, _b2y) + v_mul_hi(g1, _g2y) + v_mul_hi(r1, _r2y)) >> 2;
v_uint8x16 pack_lo, pack_hi;
v_zip(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g0)),
v_pack_u(v_reinterpret_as_s16(g1), v_reinterpret_as_s16(g1)),
pack_lo, pack_hi);
v_store(dst, pack_lo);
}
#endif
return (int)(bayer - (bayer_end - width));
}
@@ -451,6 +221,8 @@ public:
G R G R | G R G R | G R G R | G R G R
B G B G | B G B G | B G B G | B G B G
*/
#if CV_NEON
uint16x8_t masklo = vdupq_n_u16(255);
uint8x16x3_t pix;
const uchar* bayer_end = bayer + width;
@@ -484,21 +256,109 @@ public:
vst3q_u8(dst-1, pix);
}
#else
v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2);
v_uint16x8 mask = v_setall_u16(blue < 0 ? (ushort)(-1) : 0);
v_uint16x8 masklo = v_setall_u16(0x00ff);
v_uint8x16 z = v_setzero_u8();
const uchar* bayer_end = bayer + width;
for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
{
v_uint16x8 r0 = v_load((ushort*)bayer);
v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
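// b0 below becomes the rounded average of the four diagonal B neighbours
// (blue at R sites); b1 becomes the rounded vertical B average (blue at G sites)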
v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo);
v_uint16x8 nextb1 = v_rotate_right<1>(b1);
v_uint16x8 b0 = b1 + nextb1;
b1 = (nextb1 + delta1) >> 1;
b0 = (b0 + delta2) >> 2;
// b0 b2 ... b14 b1 b3 ... b15
b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
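// (v_pack_u narrows 16-bit lanes to 8-bit with saturation; the reinterpret
// round-trip is only there because v_pack_u takes signed 16-bit inputs)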
v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8);
v_uint16x8 g1 = r1 & masklo;
g0 += v_rotate_right<1>(g1) + g1;
g1 = v_rotate_right<1>(g1);
g0 = (g0 + delta2) >> 2;
// g0 g2 ... g14 g1 g3 ... g15
g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1)));
r0 = r1 >> 8;
r1 = v_rotate_right<1>(r0) + r0;
r1 = (r1 + delta1) >> 1;
// r0 r2 ... r14 r1 r3 ... r15
r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
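// conditional swap via XOR: mask is all ones when blue < 0, which
// exchanges the contents of b0 and r0 (BGR vs RGB output order)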
b1 = (b0 ^ r0) & mask;
b0 = b0 ^ b1;
r0 = r0 ^ b1;
// b1 g1 b3 g3 b5 g5...
v_uint8x16 pack_lo, pack_hi;
v_zip(v_reinterpret_as_u8(b0), v_reinterpret_as_u8(g0), pack_lo, pack_hi);
b1 = v_reinterpret_as_u16(pack_hi);
// b0 g0 b2 g2 b4 g4 ....
b0 = v_reinterpret_as_u16(pack_lo);
// r1 0 r3 0 r5 0 ...
v_zip(v_reinterpret_as_u8(r0), z, pack_lo, pack_hi);
r1 = v_reinterpret_as_u16(pack_hi);
// r0 0 r2 0 r4 0 ...
r0 = v_reinterpret_as_u16(pack_lo);
// 0 b0 g0 r0 0 b2 g2 r2 ...
v_zip(b0, r0, g0, g1);
g0 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g0)));
// 0 b8 g8 r8 0 b10 g10 r10 ...
g1 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g1)));
// b1 g1 r1 0 b3 g3 r3 0 ...
v_zip(b1, r1, r0, r1);
// b9 g9 r9 0 b11 g11 r11 0 ...
// 0 b0 g0 r0 b1 g1 r1 0 ...
v_uint32x4 pack32_lo, pack32_hi;
v_zip(v_reinterpret_as_u32(g0), v_reinterpret_as_u32(r0), pack32_lo, pack32_hi);
b0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo)));
// 0 b4 g4 r4 b5 g5 r5 0 ...
b1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi)));
v_store_low(dst-1+0, v_reinterpret_as_u8(b0));
v_store_high(dst-1+6*1, v_reinterpret_as_u8(b0));
v_store_low(dst-1+6*2, v_reinterpret_as_u8(b1));
v_store_high(dst-1+6*3, v_reinterpret_as_u8(b1));
// 0 b8 g8 r8 b9 g9 r9 0 ...
v_zip(v_reinterpret_as_u32(g1), v_reinterpret_as_u32(r1), pack32_lo, pack32_hi);
g0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo)));
// 0 b12 g12 r12 b13 g13 r13 0 ...
g1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi)));
v_store_low(dst-1+6*4, v_reinterpret_as_u8(g0));
v_store_high(dst-1+6*5, v_reinterpret_as_u8(g0));
v_store_low(dst-1+6*6, v_reinterpret_as_u8(g1));
}
#endif
return (int)(bayer - (bayer_end - width));
}
int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue, const uchar alpha) const
{
/*
B G B G | B G B G | B G B G | B G B G
G R G R | G R G R | G R G R | G R G R
B G B G | B G B G | B G B G | B G B G
*/
#if CV_NEON
uint16x8_t masklo = vdupq_n_u16(255);
uint8x16x4_t pix;
const uchar* bayer_end = bayer + width;
pix.val[3] = vdupq_n_u8(255);
pix.val[3] = vdupq_n_u8(alpha);
for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 )
{
@@ -529,13 +389,198 @@ public:
vst4q_u8(dst-1, pix);
}
#else
v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2);
v_uint16x8 mask = v_setall_u16(blue < 0 ? (ushort)(-1) : 0);
v_uint16x8 masklo = v_setall_u16(0x00ff);
v_uint8x16 a = v_setall_u8(alpha);
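// the alpha value is broadcast to every byte lane; zipping it with the
// red plane below interleaves it into the 4-channel output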
const uchar* bayer_end = bayer + width;
for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 )
{
v_uint16x8 r0 = v_load((ushort*)bayer);
v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo);
v_uint16x8 nextb1 = v_rotate_right<1>(b1);
v_uint16x8 b0 = b1 + nextb1;
b1 = (nextb1 + delta1) >> 1;
b0 = (b0 + delta2) >> 2;
// b0 b2 ... b14 b1 b3 ... b15
b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8);
v_uint16x8 g1 = r1 & masklo;
g0 += v_rotate_right<1>(g1) + g1;
g1 = v_rotate_right<1>(g1);
g0 = (g0 + delta2) >> 2;
// g0 g2 ... g14 g1 g3 ... g15
g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1)));
r0 = r1 >> 8;
r1 = v_rotate_right<1>(r0) + r0;
r1 = (r1 + delta1) >> 1;
// r0 r2 ... r14 r1 r3 ... r15
r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
b1 = (b0 ^ r0) & mask;
b0 = b0 ^ b1;
r0 = r0 ^ b1;
// b1 g1 b3 g3 b5 g5...
v_uint8x16 pack_lo, pack_hi;
v_zip(v_reinterpret_as_u8(b0), v_reinterpret_as_u8(g0), pack_lo, pack_hi);
b1 = v_reinterpret_as_u16(pack_hi);
// b0 g0 b2 g2 b4 g4 ....
b0 = v_reinterpret_as_u16(pack_lo);
// r1 a r3 a r5 a ...
v_zip(v_reinterpret_as_u8(r0), a, pack_lo, pack_hi);
r1 = v_reinterpret_as_u16(pack_hi);
// r0 a r2 a r4 a ...
r0 = v_reinterpret_as_u16(pack_lo);
// a b0 g0 r0 a b2 g2 r2 ...
v_zip(b0, r0, g0, g1);
// a b8 g8 r8 a b10 g10 r10 ...
// b1 g1 r1 a b3 g3 r3 a ...
v_zip(b1, r1, r0, r1);
// b9 g9 r9 a b11 g11 r11 a ...
// a b0 g0 r0 b1 g1 r1 a ...
v_uint32x4 pack32_lo, pack32_hi;
v_zip(v_reinterpret_as_u32(g0), v_reinterpret_as_u32(r0), pack32_lo, pack32_hi);
b0 = v_reinterpret_as_u16(pack32_lo);
// a b4 g4 r4 b5 g5 r5 a ...
b1 = v_reinterpret_as_u16(pack32_hi);
v_store_low(dst-1+0, v_reinterpret_as_u8(b0));
v_store_high(dst-1+8*1, v_reinterpret_as_u8(b0));
v_store_low(dst-1+8*2, v_reinterpret_as_u8(b1));
v_store_high(dst-1+8*3, v_reinterpret_as_u8(b1));
// a b8 g8 r8 b9 g9 r9 a ...
v_zip(v_reinterpret_as_u32(g1), v_reinterpret_as_u32(r1), pack32_lo, pack32_hi);
g0 = v_reinterpret_as_u16(pack32_lo);
// a b12 g12 r12 b13 g13 r13 a ...
g1 = v_reinterpret_as_u16(pack32_hi);
v_store_low(dst-1+8*4, v_reinterpret_as_u8(g0));
v_store_high(dst-1+8*5, v_reinterpret_as_u8(g0));
v_store_low(dst-1+8*6, v_reinterpret_as_u8(g1));
}
#endif
return (int)(bayer - (bayer_end - width));
}
int bayer2RGB_EA(const uchar*, int, uchar*, int, int) const
int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
{
return 0;
const uchar* bayer_end = bayer + width;
v_uint16x8 masklow = v_setall_u16(0x00ff);
v_uint16x8 delta1 = v_setall_u16(1), delta2 = v_setall_u16(2);
v_uint16x8 full = v_setall_u16((ushort)(-1));
v_uint8x16 z = v_setzero_u8();
v_uint16x8 mask = v_setall_u16(blue > 0 ? (ushort)(-1) : 0);
for ( ; bayer <= bayer_end - 18; bayer += 14, dst += 42)
{
/*
B G B G | B G B G | B G B G | B G B G
G R G R | G R G R | G R G R | G R G R
B G B G | B G B G | B G B G | B G B G
*/
v_uint16x8 r0 = v_load((ushort*)bayer);
v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
v_uint16x8 b1 = (r0 & masklow) + (r2 & masklow);
v_uint16x8 nextb1 = v_rotate_right<1>(b1);
v_uint16x8 b0 = b1 + nextb1;
b1 = (nextb1 + delta1) >> 1;
b0 = (b0 + delta2) >> 2;
// b0 b2 ... b14 b1 b3 ... b15
b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
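// edge-aware green interpolation at R sites: compute both the vertical and
// horizontal rounded averages of the neighbouring greens, then keep the
// average taken along the direction with the smaller gradient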
// vertical sum
v_uint16x8 r0g = r0 >> 8;
v_uint16x8 r2g = r2 >> 8;
v_uint16x8 sumv = ((r0g + r2g) + delta1) >> 1;
// horizontal sum
v_uint16x8 g1 = r1 & masklow;
v_uint16x8 nextg1 = v_rotate_right<1>(g1);
v_uint16x8 sumg = (g1 + nextg1 + delta1) >> 1;
// gradients: saturating u16 subtraction zeroes the negative half,
// so (a - b) + (b - a) is the absolute difference
v_uint16x8 gradv = (r0g - r2g) + (r2g - r0g);
v_uint16x8 gradg = (nextg1 - g1) + (g1 - nextg1);
v_uint16x8 gmask = gradg > gradv;
v_uint16x8 g0 = (gmask & sumv) + (sumg & (gmask ^ full));
// g0 g2 ... g14 g1 g3 ...
g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(nextg1)));
r0 = r1 >> 8;
r1 = v_rotate_right<1>(r0) + r0;
r1 = (r1 + delta1) >> 1;
// r0 r2 ... r14 r1 r3 ... r15
r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
b1 = (b0 ^ r0) & mask;
b0 = b0 ^ b1;
r0 = r0 ^ b1;
// b1 g1 b3 g3 b5 g5...
v_uint8x16 pack_lo, pack_hi;
v_zip(v_reinterpret_as_u8(b0), v_reinterpret_as_u8(g0), pack_lo, pack_hi);
b1 = v_reinterpret_as_u16(pack_hi);
// b0 g0 b2 g2 b4 g4 ....
b0 = v_reinterpret_as_u16(pack_lo);
// r1 0 r3 0 r5 0 ...
v_zip(v_reinterpret_as_u8(r0), z, pack_lo, pack_hi);
r1 = v_reinterpret_as_u16(pack_hi);
// r0 0 r2 0 r4 0 ...
r0 = v_reinterpret_as_u16(pack_lo);
// 0 b0 g0 r0 0 b2 g2 r2 ...
v_zip(b0, r0, g0, g1);
g0 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g0)));
// 0 b8 g8 r8 0 b10 g10 r10 ...
g1 = v_reinterpret_as_u16(v_rotate_left<1>(v_reinterpret_as_u8(g1)));
// b1 g1 r1 0 b3 g3 r3 0 ...
v_zip(b1, r1, r0, r1);
// b9 g9 r9 0 b11 g11 r11 0 ...
// 0 b0 g0 r0 b1 g1 r1 0 ...
v_uint32x4 pack32_lo, pack32_hi;
v_zip(v_reinterpret_as_u32(g0), v_reinterpret_as_u32(r0), pack32_lo, pack32_hi);
b0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo)));
// 0 b4 g4 r4 b5 g5 r5 0 ...
b1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi)));
v_store_low(dst+0, v_reinterpret_as_u8(b0));
v_store_high(dst+6*1, v_reinterpret_as_u8(b0));
v_store_low(dst+6*2, v_reinterpret_as_u8(b1));
v_store_high(dst+6*3, v_reinterpret_as_u8(b1));
// 0 b8 g8 r8 b9 g9 r9 0 ...
v_zip(v_reinterpret_as_u32(g1), v_reinterpret_as_u32(r1), pack32_lo, pack32_hi);
g0 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_lo)));
// 0 b12 g12 r12 b13 g13 r13 0 ...
g1 = v_reinterpret_as_u16(v_rotate_right<1>(v_reinterpret_as_u8(pack32_hi)));
v_store_low(dst+6*4, v_reinterpret_as_u8(g0));
v_store_high(dst+6*5, v_reinterpret_as_u8(g0));
v_store_low(dst+6*6, v_reinterpret_as_u8(g1));
}
return int(bayer - (bayer_end - width));
}
};
#else
@@ -775,7 +820,7 @@ public:
// simd optimization for dcn == 3 and, via the new RGBA path, dcn == 4
int delta = dcn == 4 ?
vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue) :
vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue, alpha) :
vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue);
bayer += delta;
dst += delta*dcn;