Commit a54f6bb0 authored by Ilya Lavrenov

cv::cvtColor

parent 6f05a250
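For context, a minimal usage sketch (not part of this diff) showing how the conversions whose inner loops this commit vectorizes are reached through the public cv::cvtColor API; the helper name is illustrative only.

// Illustrative sketch: exercises the cvtColor code paths touched by this commit.
#include <opencv2/imgproc/imgproc.hpp>

void exercise_cvtcolor_paths(const cv::Mat& bgr)        // e.g. CV_8UC3, CV_16UC3 or CV_32FC3
{
    cv::Mat gray, ycrcb, xyz, rgb;
    cv::cvtColor(bgr,   gray,  cv::COLOR_BGR2GRAY);     // RGB2Gray (new NEON paths for 16U/32F)
    cv::cvtColor(bgr,   ycrcb, cv::COLOR_BGR2YCrCb);    // RGB2YCrCb_f / RGB2YCrCb_i
    cv::cvtColor(ycrcb, rgb,   cv::COLOR_YCrCb2BGR);    // YCrCb2RGB_f / YCrCb2RGB_i
    cv::cvtColor(bgr,   xyz,   cv::COLOR_BGR2XYZ);      // RGB2XYZ_f / RGB2XYZ_i
    cv::cvtColor(xyz,   rgb,   cv::COLOR_XYZ2BGR);      // XYZ2RGB_i
}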
@@ -98,6 +98,11 @@ PERF_TEST_P(Size_MatType_BorderType, blur16x16,
Size size = get<0>(GetParam());
int type = get<1>(GetParam());
BorderType btype = get<2>(GetParam());
double eps = 1e-3;
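// On NEON builds the vectorized box filter may round integer results differently,
// so allow an absolute difference of 1 for integer depths.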
#if CV_NEON
eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : eps;
#endif
Mat src(size, type);
Mat dst(size, type);
@@ -106,7 +111,7 @@ PERF_TEST_P(Size_MatType_BorderType, blur16x16,
TEST_CYCLE() blur(src, dst, Size(16,16), Point(-1,-1), btype);
SANITY_CHECK(dst, 1e-3);
SANITY_CHECK(dst, eps);
}
PERF_TEST_P(Size_MatType_BorderType3x3, box3x3,
......
@@ -769,7 +769,6 @@ template<typename _Tp> struct RGB2Gray
float coeffs[3];
};
template<> struct RGB2Gray<uchar>
{
typedef uchar channel_type;
@@ -800,6 +799,166 @@ template<> struct RGB2Gray<uchar>
int tab[256*3];
};
#if CV_NEON
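// NEON RGB2Gray for 16-bit input: widening u16->u32 multiply-accumulate with the
// three coefficients, 8 then 4 pixels per iteration, scalar CV_DESCALE tail.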
template <>
struct RGB2Gray<ushort>
{
typedef ushort channel_type;
RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
srccn(_srccn)
{
static const int coeffs0[] = { R2Y, G2Y, B2Y };
memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
if( blueIdx == 0 )
std::swap(coeffs[0], coeffs[2]);
v_cb = vdup_n_u16(coeffs[0]);
v_cg = vdup_n_u16(coeffs[1]);
v_cr = vdup_n_u16(coeffs[2]);
v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
}
void operator()(const ushort* src, ushort* dst, int n) const
{
int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;
for ( ; i <= n - 8; i += 8, src += scn * 8)
{
uint16x8_t v_b, v_r, v_g;
if (scn == 3)
{
uint16x8x3_t v_src = vld3q_u16(src);
v_b = v_src.val[0];
v_g = v_src.val[1];
v_r = v_src.val[2];
}
else
{
uint16x8x4_t v_src = vld4q_u16(src);
v_b = v_src.val[0];
v_g = v_src.val[1];
v_r = v_src.val[2];
}
uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16(
vmull_u16(vget_low_u16(v_b), v_cb),
vget_low_u16(v_g), v_cg),
vget_low_u16(v_r), v_cr);
uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16(
vmull_u16(vget_high_u16(v_b), v_cb),
vget_high_u16(v_g), v_cg),
vget_high_u16(v_r), v_cr);
uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift));
uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift));
vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1));
}
for ( ; i <= n - 4; i += 4, src += scn * 4)
{
uint16x4_t v_b, v_r, v_g;
if (scn == 3)
{
uint16x4x3_t v_src = vld3_u16(src);
v_b = v_src.val[0];
v_g = v_src.val[1];
v_r = v_src.val[2];
}
else
{
uint16x4x4_t v_src = vld4_u16(src);
v_b = v_src.val[0];
v_g = v_src.val[1];
v_r = v_src.val[2];
}
uint32x4_t v_dst = vmlal_u16(vmlal_u16(
vmull_u16(v_b, v_cb),
v_g, v_cg),
v_r, v_cr);
vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift)));
}
for( ; i < n; i++, src += scn)
dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
}
int srccn, coeffs[3];
uint16x4_t v_cb, v_cg, v_cr;
uint32x4_t v_delta;
};
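// NEON RGB2Gray for float input: multiply-accumulate (vmlaq_f32) of the three
// coefficients, 8 then 4 pixels per iteration, scalar tail for the remainder.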
template <>
struct RGB2Gray<float>
{
typedef float channel_type;
RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
{
static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
if(blueIdx == 0)
std::swap(coeffs[0], coeffs[2]);
v_cb = vdupq_n_f32(coeffs[0]);
v_cg = vdupq_n_f32(coeffs[1]);
v_cr = vdupq_n_f32(coeffs[2]);
}
void operator()(const float * src, float * dst, int n) const
{
int scn = srccn, i = 0;
float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
if (scn == 3)
{
for ( ; i <= n - 8; i += 8, src += scn * 8)
{
float32x4x3_t v_src = vld3q_f32(src);
vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
v_src = vld3q_f32(src + scn * 4);
vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
}
for ( ; i <= n - 4; i += 4, src += scn * 4)
{
float32x4x3_t v_src = vld3q_f32(src);
vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
}
}
else
{
for ( ; i <= n - 8; i += 8, src += scn * 8)
{
float32x4x4_t v_src = vld4q_f32(src);
vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
v_src = vld4q_f32(src + scn * 4);
vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
}
for ( ; i <= n - 4; i += 4, src += scn * 4)
{
float32x4x4_t v_src = vld4q_f32(src);
vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
}
}
for ( ; i < n; i++, src += scn)
dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
}
int srccn;
float coeffs[3];
float32x4_t v_cb, v_cg, v_cr;
};
#else
template<> struct RGB2Gray<ushort>
{
@@ -823,6 +982,7 @@ template<> struct RGB2Gray<ushort>
int coeffs[3];
};
#endif
///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
@@ -855,6 +1015,72 @@ template<typename _Tp> struct RGB2YCrCb_f
float coeffs[5];
};
#if CV_NEON
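// NEON RGB->YCrCb for float input: Y as a 3-term multiply-accumulate,
// Cr/Cb as (channel - Y)*coeff + half, 4 pixels per iteration with a scalar tail.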
template <>
struct RGB2YCrCb_f<float>
{
typedef float channel_type;
RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
srccn(_srccn), blueIdx(_blueIdx)
{
static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
if(blueIdx==0)
std::swap(coeffs[0], coeffs[2]);
v_c0 = vdupq_n_f32(coeffs[0]);
v_c1 = vdupq_n_f32(coeffs[1]);
v_c2 = vdupq_n_f32(coeffs[2]);
v_c3 = vdupq_n_f32(coeffs[3]);
v_c4 = vdupq_n_f32(coeffs[4]);
v_delta = vdupq_n_f32(ColorChannel<float>::half());
}
void operator()(const float * src, float * dst, int n) const
{
int scn = srccn, bidx = blueIdx, i = 0;
const float delta = ColorChannel<float>::half();
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
n *= 3;
if (scn == 3)
for ( ; i <= n - 12; i += 12, src += 12)
{
float32x4x3_t v_src = vld3q_f32(src), v_dst;
v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
vst3q_f32(dst + i, v_dst);
}
else
for ( ; i <= n - 12; i += 12, src += 16)
{
float32x4x4_t v_src = vld4q_f32(src);
float32x4x3_t v_dst;
v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
vst3q_f32(dst + i, v_dst);
}
for ( ; i < n; i += 3, src += scn)
{
float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
float Cr = (src[bidx^2] - Y)*C3 + delta;
float Cb = (src[bidx] - Y)*C4 + delta;
dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
}
}
int srccn, blueIdx;
float coeffs[5];
float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
};
#endif
template<typename _Tp> struct RGB2YCrCb_i
{
@@ -887,6 +1113,224 @@ template<typename _Tp> struct RGB2YCrCb_i
int coeffs[5];
};
#if CV_NEON
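// NEON RGB->YCrCb for 8-bit input: widen u8->s16->s32, fixed-point arithmetic with
// rounding by (1 << (yuv_shift - 1)), 8 pixels per iteration, scalar tail.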
template <>
struct RGB2YCrCb_i<uchar>
{
typedef uchar channel_type;
RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
: srccn(_srccn), blueIdx(_blueIdx)
{
static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
if (blueIdx==0)
std::swap(coeffs[0], coeffs[2]);
v_c0 = vdup_n_s16(coeffs[0]);
v_c1 = vdup_n_s16(coeffs[1]);
v_c2 = vdup_n_s16(coeffs[2]);
v_c3 = vdupq_n_s32(coeffs[3]);
v_c4 = vdupq_n_s32(coeffs[4]);
v_delta = vdupq_n_s32(ColorChannel<uchar>::half()*(1 << yuv_shift));
v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
}
void operator()(const uchar * src, uchar * dst, int n) const
{
int scn = srccn, bidx = blueIdx, i = 0;
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
n *= 3;
for ( ; i <= n - 24; i += 24, src += scn * 8)
{
uint8x8x3_t v_dst;
int16x8x3_t v_src16;
if (scn == 3)
{
uint8x8x3_t v_src = vld3_u8(src);
v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
}
else
{
uint8x8x4_t v_src = vld4_u8(src);
v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
}
int16x4x3_t v_src0;
v_src0.val[0] = vget_low_s16(v_src16.val[0]);
v_src0.val[1] = vget_low_s16(v_src16.val[1]);
v_src0.val[2] = vget_low_s16(v_src16.val[2]);
int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y0), v_c3);
v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y0), v_c4);
v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
v_src0.val[0] = vget_high_s16(v_src16.val[0]);
v_src0.val[1] = vget_high_s16(v_src16.val[1]);
v_src0.val[2] = vget_high_s16(v_src16.val[2]);
int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y1), v_c3);
v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y1), v_c4);
v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
v_dst.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1)));
v_dst.val[2] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1)));
vst3_u8(dst + i, v_dst);
}
for ( ; i < n; i += 3, src += scn)
{
int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
dst[i] = saturate_cast<uchar>(Y);
dst[i+1] = saturate_cast<uchar>(Cr);
dst[i+2] = saturate_cast<uchar>(Cb);
}
}
int srccn, blueIdx, coeffs[5];
int16x4_t v_c0, v_c1, v_c2;
int32x4_t v_c3, v_c4, v_delta, v_delta2;
};
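// NEON RGB->YCrCb for 16-bit input: widen u16->s32, same fixed-point scheme,
// 8 then 4 pixels per iteration, scalar tail.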
template <>
struct RGB2YCrCb_i<ushort>
{
typedef ushort channel_type;
RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
: srccn(_srccn), blueIdx(_blueIdx)
{
static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
if (blueIdx==0)
std::swap(coeffs[0], coeffs[2]);
v_c0 = vdupq_n_s32(coeffs[0]);
v_c1 = vdupq_n_s32(coeffs[1]);
v_c2 = vdupq_n_s32(coeffs[2]);
v_c3 = vdupq_n_s32(coeffs[3]);
v_c4 = vdupq_n_s32(coeffs[4]);
v_delta = vdupq_n_s32(ColorChannel<ushort>::half()*(1 << yuv_shift));
v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
}
void operator()(const ushort * src, ushort * dst, int n) const
{
int scn = srccn, bidx = blueIdx, i = 0;
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
n *= 3;
for ( ; i <= n - 24; i += 24, src += scn * 8)
{
uint16x8x3_t v_src, v_dst;
int32x4x3_t v_src0;
if (scn == 3)
v_src = vld3q_u16(src);
else
{
uint16x8x4_t v_src_ = vld4q_u16(src);
v_src.val[0] = v_src_.val[0];
v_src.val[1] = v_src_.val[1];
v_src.val[2] = v_src_.val[2];
}
v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0])));
v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1])));
v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y0), v_c3);
v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y0), v_c4);
v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y1), v_c3);
v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y1), v_c4);
v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
v_dst.val[1] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1));
v_dst.val[2] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1));
vst3q_u16(dst + i, v_dst);
}
for ( ; i <= n - 12; i += 12, src += scn * 4)
{
uint16x4x3_t v_dst;
int32x4x3_t v_src0;
if (scn == 3)
{
uint16x4x3_t v_src = vld3_u16(src);
v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
}
else
{
uint16x4x4_t v_src = vld4_u16(src);
v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
}
int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta2), yuv_shift);
int32x4_t v_Cr = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y), v_c3);
v_Cr = vshrq_n_s32(vaddq_s32(v_Cr, v_delta2), yuv_shift);
int32x4_t v_Cb = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y), v_c4);
v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift);
v_dst.val[0] = vqmovun_s32(v_Y);
v_dst.val[1] = vqmovun_s32(v_Cr);
v_dst.val[2] = vqmovun_s32(v_Cb);
vst3_u16(dst + i, v_dst);
}
for ( ; i < n; i += 3, src += scn)
{
int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
dst[i] = saturate_cast<ushort>(Y);
dst[i+1] = saturate_cast<ushort>(Cr);
dst[i+2] = saturate_cast<ushort>(Cb);
}
}
int srccn, blueIdx, coeffs[5];
int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2;
};
#endif
template<typename _Tp> struct YCrCb2RGB_f
{
@@ -923,6 +1367,80 @@ template<typename _Tp> struct YCrCb2RGB_f
float coeffs[4];
};
#if CV_NEON
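// NEON YCrCb->RGB for float input: inverse transform via multiply-accumulate,
// 4 pixels per iteration; the 4-channel path fills alpha with ColorChannel<float>::max().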
template <>
struct YCrCb2RGB_f<float>
{
typedef float channel_type;
YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
: dstcn(_dstcn), blueIdx(_blueIdx)
{
static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
v_c0 = vdupq_n_f32(coeffs[0]);
v_c1 = vdupq_n_f32(coeffs[1]);
v_c2 = vdupq_n_f32(coeffs[2]);
v_c3 = vdupq_n_f32(coeffs[3]);
v_delta = vdupq_n_f32(ColorChannel<float>::half());
v_alpha = vdupq_n_f32(ColorChannel<float>::max());
}
void operator()(const float* src, float* dst, int n) const
{
int dcn = dstcn, bidx = blueIdx, i = 0;
const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
n *= 3;
if (dcn == 3)
for ( ; i <= n - 12; i += 12, dst += 12)
{
float32x4x3_t v_src = vld3q_f32(src + i), v_dst;
float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
vst3q_f32(dst, v_dst);
}
else
for ( ; i <= n - 12; i += 12, dst += 16)
{
float32x4x3_t v_src = vld3q_f32(src + i);
float32x4x4_t v_dst;
float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
v_dst.val[3] = v_alpha;
vst4q_f32(dst, v_dst);
}
for ( ; i < n; i += 3, dst += dcn)
{
float Y = src[i], Cr = src[i+1], Cb = src[i+2];
float b = Y + (Cb - delta)*C3;
float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
float r = Y + (Cr - delta)*C0;
dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
if( dcn == 4 )
dst[3] = alpha;
}
}
int dstcn, blueIdx;
float coeffs[4];
float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
};
#endif
template<typename _Tp> struct YCrCb2RGB_i
{
@@ -962,20 +1480,268 @@ template<typename _Tp> struct YCrCb2RGB_i
int coeffs[4];
};
#if CV_NEON
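// NEON YCrCb->RGB for 8-bit input: fixed-point inverse transform, 8 pixels per
// iteration, saturating narrow to u8, optional alpha channel.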
template <>
struct YCrCb2RGB_i<uchar>
{
typedef uchar channel_type;
YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
: dstcn(_dstcn), blueIdx(_blueIdx)
{
static const int coeffs0[] = {22987, -11698, -5636, 29049};
memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
v_c0 = vdupq_n_s32(coeffs[0]);
v_c1 = vdupq_n_s32(coeffs[1]);
v_c2 = vdupq_n_s32(coeffs[2]);
v_c3 = vdupq_n_s32(coeffs[3]);
v_delta = vdup_n_s16(ColorChannel<uchar>::half());
v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
}
void operator()(const uchar* src, uchar* dst, int n) const
{
int dcn = dstcn, bidx = blueIdx, i = 0;
const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
n *= 3;
for ( ; i <= n - 24; i += 24, dst += dcn * 8)
{
uint8x8x3_t v_src = vld3_u8(src + i);
int16x8x3_t v_src16;
v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
int16x4_t v_Y = vget_low_s16(v_src16.val[0]),
v_Cr = vget_low_s16(v_src16.val[1]),
v_Cb = vget_low_s16(v_src16.val[2]);
int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
v_g0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
int32x4_t v_r0 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
v_Y = vget_high_s16(v_src16.val[0]);
v_Cr = vget_high_s16(v_src16.val[1]);
v_Cb = vget_high_s16(v_src16.val[2]);
int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
v_g1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
int32x4_t v_r1 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
v_r1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
uint8x8_t v_b = vqmovun_s16(vcombine_s16(vmovn_s32(v_b0), vmovn_s32(v_b1)));
uint8x8_t v_g = vqmovun_s16(vcombine_s16(vmovn_s32(v_g0), vmovn_s32(v_g1)));
uint8x8_t v_r = vqmovun_s16(vcombine_s16(vmovn_s32(v_r0), vmovn_s32(v_r1)));
if (dcn == 3)
{
uint8x8x3_t v_dst;
v_dst.val[bidx] = v_b;
v_dst.val[1] = v_g;
v_dst.val[bidx^2] = v_r;
vst3_u8(dst, v_dst);
}
else
{
uint8x8x4_t v_dst;
v_dst.val[bidx] = v_b;
v_dst.val[1] = v_g;
v_dst.val[bidx^2] = v_r;
v_dst.val[3] = v_alpha;
vst4_u8(dst, v_dst);
}
}
for ( ; i < n; i += 3, dst += dcn)
{
uchar Y = src[i];
uchar Cr = src[i+1];
uchar Cb = src[i+2];
int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
dst[bidx] = saturate_cast<uchar>(b);
dst[1] = saturate_cast<uchar>(g);
dst[bidx^2] = saturate_cast<uchar>(r);
if( dcn == 4 )
dst[3] = alpha;
}
}
int dstcn, blueIdx;
int coeffs[4];
int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2;
int16x4_t v_delta;
uint8x8_t v_alpha;
};
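// NEON YCrCb->RGB for 16-bit input: fixed-point inverse transform, 8 then 4 pixels
// per iteration, saturating narrow to u16 (vqmovun_s32), optional alpha channel.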
template <>
struct YCrCb2RGB_i<ushort>
{
typedef ushort channel_type;
YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
: dstcn(_dstcn), blueIdx(_blueIdx)
{
static const int coeffs0[] = {22987, -11698, -5636, 29049};
memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
v_c0 = vdupq_n_s32(coeffs[0]);
v_c1 = vdupq_n_s32(coeffs[1]);
v_c2 = vdupq_n_s32(coeffs[2]);
v_c3 = vdupq_n_s32(coeffs[3]);
v_delta = vdupq_n_s32(ColorChannel<ushort>::half());
v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
v_alpha2 = vget_low_u16(v_alpha);
}
void operator()(const ushort* src, ushort* dst, int n) const
{
int dcn = dstcn, bidx = blueIdx, i = 0;
const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
n *= 3;
for ( ; i <= n - 24; i += 24, dst += dcn * 8)
{
uint16x8x3_t v_src = vld3q_u16(src + i);
int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
int32x4_t v_b0 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
v_b0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
v_g0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
int32x4_t v_r0 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
v_r0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))),
v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))),
v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
int32x4_t v_b1 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
v_b1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
v_g1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
int32x4_t v_r1 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
v_r1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_b0), vqmovun_s32(v_b1));
uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_g0), vqmovun_s32(v_g1));
uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_r0), vqmovun_s32(v_r1));
if (dcn == 3)
{
uint16x8x3_t v_dst;
v_dst.val[bidx] = v_b;
v_dst.val[1] = v_g;
v_dst.val[bidx^2] = v_r;
vst3q_u16(dst, v_dst);
}
else
{
uint16x8x4_t v_dst;
v_dst.val[bidx] = v_b;
v_dst.val[1] = v_g;
v_dst.val[bidx^2] = v_r;
v_dst.val[3] = v_alpha;
vst4q_u16(dst, v_dst);
}
}
for ( ; i <= n - 12; i += 12, dst += dcn * 4)
{
uint16x4x3_t v_src = vld3_u16(src + i);
int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
int32x4_t v_b = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b, v_delta2), yuv_shift), v_Y);
int32x4_t v_g = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
v_g = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g, v_delta2), yuv_shift), v_Y);
int32x4_t v_r = vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c0);
v_r = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r, v_delta2), yuv_shift), v_Y);
uint16x4_t v_bd = vqmovun_s32(v_b);
uint16x4_t v_gd = vqmovun_s32(v_g);
uint16x4_t v_rd = vqmovun_s32(v_r);
if (dcn == 3)
{
uint16x4x3_t v_dst;
v_dst.val[bidx] = v_bd;
v_dst.val[1] = v_gd;
v_dst.val[bidx^2] = v_rd;
vst3_u16(dst, v_dst);
}
else
{
uint16x4x4_t v_dst;
v_dst.val[bidx] = v_bd;
v_dst.val[1] = v_gd;
v_dst.val[bidx^2] = v_rd;
v_dst.val[3] = v_alpha2;
vst4_u16(dst, v_dst);
}
}
for ( ; i < n; i += 3, dst += dcn)
{
ushort Y = src[i];
ushort Cr = src[i+1];
ushort Cb = src[i+2];
int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
dst[bidx] = saturate_cast<ushort>(b);
dst[1] = saturate_cast<ushort>(g);
dst[bidx^2] = saturate_cast<ushort>(r);
if( dcn == 4 )
dst[3] = alpha;
}
}
int dstcn, blueIdx;
int coeffs[4];
int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta;
uint16x8_t v_alpha;
uint16x4_t v_alpha2;
};
#endif
////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
static const float sRGB2XYZ_D65[] =
{
0.412453f, 0.357580f, 0.180423f,
0.212671f, 0.715160f, 0.072169f,
0.019334f, 0.119193f, 0.950227f
};
static const float XYZ2sRGB_D65[] =
{
3.240479f, -1.53715f, -0.498535f,
-0.969256f, 1.875991f, 0.041556f,
0.055648f, -0.204043f, 1.057311f
};
@@ -1013,6 +1779,78 @@ template<typename _Tp> struct RGB2XYZ_f
float coeffs[9];
};
#if CV_NEON
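// NEON RGB->XYZ for float input: 3x3 matrix product per pixel via multiply-accumulate,
// 4 pixels per iteration, scalar tail.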
template <>
struct RGB2XYZ_f<float>
{
typedef float channel_type;
RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
{
memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
if(blueIdx == 0)
{
std::swap(coeffs[0], coeffs[2]);
std::swap(coeffs[3], coeffs[5]);
std::swap(coeffs[6], coeffs[8]);
}
v_c0 = vdupq_n_f32(coeffs[0]);
v_c1 = vdupq_n_f32(coeffs[1]);
v_c2 = vdupq_n_f32(coeffs[2]);
v_c3 = vdupq_n_f32(coeffs[3]);
v_c4 = vdupq_n_f32(coeffs[4]);
v_c5 = vdupq_n_f32(coeffs[5]);
v_c6 = vdupq_n_f32(coeffs[6]);
v_c7 = vdupq_n_f32(coeffs[7]);
v_c8 = vdupq_n_f32(coeffs[8]);
}
void operator()(const float* src, float* dst, int n) const
{
int scn = srccn, i = 0;
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
n *= 3;
if (scn == 3)
for ( ; i <= n - 12; i += 12, src += 12)
{
float32x4x3_t v_src = vld3q_f32(src), v_dst;
v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
vst3q_f32(dst + i, v_dst);
}
else
for ( ; i <= n - 12; i += 12, src += 16)
{
float32x4x4_t v_src = vld4q_f32(src);
float32x4x3_t v_dst;
v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
vst3q_f32(dst + i, v_dst);
}
for ( ; i < n; i += 3, src += scn)
{
float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
}
}
int srccn;
float coeffs[9];
float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
};
#endif
template<typename _Tp> struct RGB2XYZ_i
{
@@ -1055,6 +1893,247 @@ template<typename _Tp> struct RGB2XYZ_i
int coeffs[9];
};
#if CV_NEON
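// NEON RGB->XYZ for 8-bit input: fixed-point 3x3 matrix product with widening
// u8->u16->u32 multiply-accumulate, 8 pixels per iteration, scalar tail.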
template <>
struct RGB2XYZ_i<uchar>
{
typedef uchar channel_type;
RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
{
static const int coeffs0[] =
{
1689, 1465, 739,
871, 2929, 296,
79, 488, 3892
};
for( int i = 0; i < 9; i++ )
coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
if(blueIdx == 0)
{
std::swap(coeffs[0], coeffs[2]);
std::swap(coeffs[3], coeffs[5]);
std::swap(coeffs[6], coeffs[8]);
}
v_c0 = vdup_n_u16(coeffs[0]);
v_c1 = vdup_n_u16(coeffs[1]);
v_c2 = vdup_n_u16(coeffs[2]);
v_c3 = vdup_n_u16(coeffs[3]);
v_c4 = vdup_n_u16(coeffs[4]);
v_c5 = vdup_n_u16(coeffs[5]);
v_c6 = vdup_n_u16(coeffs[6]);
v_c7 = vdup_n_u16(coeffs[7]);
v_c8 = vdup_n_u16(coeffs[8]);
v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
}
void operator()(const uchar * src, uchar * dst, int n) const
{
int scn = srccn, i = 0;
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
n *= 3;
for ( ; i <= n - 24; i += 24, src += scn * 8)
{
uint8x8x3_t v_dst;
uint16x8x3_t v_src16;
if (scn == 3)
{
uint8x8x3_t v_src = vld3_u8(src);
v_src16.val[0] = vmovl_u8(v_src.val[0]);
v_src16.val[1] = vmovl_u8(v_src.val[1]);
v_src16.val[2] = vmovl_u8(v_src.val[2]);
}
else
{
uint8x8x4_t v_src = vld4_u8(src);
v_src16.val[0] = vmovl_u8(v_src.val[0]);
v_src16.val[1] = vmovl_u8(v_src.val[1]);
v_src16.val[2] = vmovl_u8(v_src.val[2]);
}
uint16x4_t v_s0 = vget_low_u16(v_src16.val[0]),
v_s1 = vget_low_u16(v_src16.val[1]),
v_s2 = vget_low_u16(v_src16.val[2]);
uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
v_s0 = vget_high_u16(v_src16.val[0]),
v_s1 = vget_high_u16(v_src16.val[1]),
v_s2 = vget_high_u16(v_src16.val[2]);
uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
v_dst.val[0] = vqmovn_u16(vcombine_u16(vmovn_u32(v_X0), vmovn_u32(v_X1)));
v_dst.val[1] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Y0), vmovn_u32(v_Y1)));
v_dst.val[2] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Z0), vmovn_u32(v_Z1)));
vst3_u8(dst + i, v_dst);
}
for ( ; i < n; i += 3, src += scn)
{
int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
dst[i] = saturate_cast<uchar>(X);
dst[i+1] = saturate_cast<uchar>(Y);
dst[i+2] = saturate_cast<uchar>(Z);
}
}
int srccn, coeffs[9];
uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
uint32x4_t v_delta;
};
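// NEON RGB->XYZ for 16-bit input: fixed-point 3x3 matrix product with widening
// u16->u32 multiply-accumulate, 8 then 4 pixels per iteration, scalar tail.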
template <>
struct RGB2XYZ_i<ushort>
{
typedef ushort channel_type;
RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
{
static const int coeffs0[] =
{
1689, 1465, 739,
871, 2929, 296,
79, 488, 3892
};
for( int i = 0; i < 9; i++ )
coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
if(blueIdx == 0)
{
std::swap(coeffs[0], coeffs[2]);
std::swap(coeffs[3], coeffs[5]);
std::swap(coeffs[6], coeffs[8]);
}
v_c0 = vdup_n_u16(coeffs[0]);
v_c1 = vdup_n_u16(coeffs[1]);
v_c2 = vdup_n_u16(coeffs[2]);
v_c3 = vdup_n_u16(coeffs[3]);
v_c4 = vdup_n_u16(coeffs[4]);
v_c5 = vdup_n_u16(coeffs[5]);
v_c6 = vdup_n_u16(coeffs[6]);
v_c7 = vdup_n_u16(coeffs[7]);
v_c8 = vdup_n_u16(coeffs[8]);
v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
}
void operator()(const ushort * src, ushort * dst, int n) const
{
int scn = srccn, i = 0;
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
n *= 3;
for ( ; i <= n - 24; i += 24, src += scn * 8)
{
uint16x8x3_t v_src, v_dst;
if (scn == 3)
v_src = vld3q_u16(src);
else
{
uint16x8x4_t v_src4 = vld4q_u16(src);
v_src.val[0] = v_src4.val[0];
v_src.val[1] = v_src4.val[1];
v_src.val[2] = v_src4.val[2];
}
uint16x4_t v_s0 = vget_low_u16(v_src.val[0]),
v_s1 = vget_low_u16(v_src.val[1]),
v_s2 = vget_low_u16(v_src.val[2]);
uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
v_s0 = vget_high_u16(v_src.val[0]),
v_s1 = vget_high_u16(v_src.val[1]),
v_s2 = vget_high_u16(v_src.val[2]);
uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
v_dst.val[0] = vcombine_u16(vqmovn_u32(v_X0), vqmovn_u32(v_X1));
v_dst.val[1] = vcombine_u16(vqmovn_u32(v_Y0), vqmovn_u32(v_Y1));
v_dst.val[2] = vcombine_u16(vqmovn_u32(v_Z0), vqmovn_u32(v_Z1));
vst3q_u16(dst + i, v_dst);
}
for ( ; i <= n - 12; i += 12, src += scn * 4)
{
uint16x4x3_t v_dst;
uint16x4_t v_s0, v_s1, v_s2;
if (scn == 3)
{
uint16x4x3_t v_src = vld3_u16(src);
v_s0 = v_src.val[0];
v_s1 = v_src.val[1];
v_s2 = v_src.val[2];
}
else
{
uint16x4x4_t v_src = vld4_u16(src);
v_s0 = v_src.val[0];
v_s1 = v_src.val[1];
v_s2 = v_src.val[2];
}
uint32x4_t v_X = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
uint32x4_t v_Y = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
uint32x4_t v_Z = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
v_dst.val[0] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_X, v_delta), xyz_shift));
v_dst.val[1] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Y, v_delta), xyz_shift));
v_dst.val[2] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Z, v_delta), xyz_shift));
vst3_u16(dst + i, v_dst);
}
for ( ; i < n; i += 3, src += scn)
{
int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
dst[i] = saturate_cast<ushort>(X);
dst[i+1] = saturate_cast<ushort>(Y);
dst[i+2] = saturate_cast<ushort>(Z);
}
}
int srccn, coeffs[9];
uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
uint32x4_t v_delta;
};
#endif
template<typename _Tp> struct XYZ2RGB_f
{
@@ -1141,6 +2220,278 @@ template<typename _Tp> struct XYZ2RGB_i
int coeffs[9];
};
#if CV_NEON
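// NEON XYZ->RGB for 8-bit input: signed fixed-point 3x3 matrix product (coefficients
// may be negative), widen u8->s16->s32, 8 pixels per iteration, optional alpha.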
template <>
struct XYZ2RGB_i<uchar>
{
typedef uchar channel_type;
XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
: dstcn(_dstcn), blueIdx(_blueIdx)
{
static const int coeffs0[] =
{
13273, -6296, -2042,
-3970, 7684, 170,
228, -836, 4331
};
for(int i = 0; i < 9; i++)
coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
if(blueIdx == 0)
{
std::swap(coeffs[0], coeffs[6]);
std::swap(coeffs[1], coeffs[7]);
std::swap(coeffs[2], coeffs[8]);
}
v_c0 = vdup_n_s16(coeffs[0]);
v_c1 = vdup_n_s16(coeffs[1]);
v_c2 = vdup_n_s16(coeffs[2]);
v_c3 = vdup_n_s16(coeffs[3]);
v_c4 = vdup_n_s16(coeffs[4]);
v_c5 = vdup_n_s16(coeffs[5]);
v_c6 = vdup_n_s16(coeffs[6]);
v_c7 = vdup_n_s16(coeffs[7]);
v_c8 = vdup_n_s16(coeffs[8]);
v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
v_alpha = vmovn_u16(vdupq_n_u16(ColorChannel<uchar>::max()));
}
void operator()(const uchar* src, uchar* dst, int n) const
{
int dcn = dstcn, i = 0;
uchar alpha = ColorChannel<uchar>::max();
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
n *= 3;
for ( ; i <= n - 24; i += 24, dst += dcn * 8)
{
uint8x8x3_t v_src = vld3_u8(src + i);
int16x8x3_t v_src16;
v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
int16x4_t v_s0 = vget_low_s16(v_src16.val[0]),
v_s1 = vget_low_s16(v_src16.val[1]),
v_s2 = vget_low_s16(v_src16.val[2]);
int32x4_t v_X0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
int32x4_t v_Z0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
v_s0 = vget_high_s16(v_src16.val[0]),
v_s1 = vget_high_s16(v_src16.val[1]),
v_s2 = vget_high_s16(v_src16.val[2]);
int32x4_t v_X1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
int32x4_t v_Z1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
uint8x8_t v_b = vqmovun_s16(vcombine_s16(vqmovn_s32(v_X0), vqmovn_s32(v_X1)));
uint8x8_t v_g = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
uint8x8_t v_r = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Z0), vqmovn_s32(v_Z1)));
if (dcn == 3)
{
uint8x8x3_t v_dst;
v_dst.val[0] = v_b;
v_dst.val[1] = v_g;
v_dst.val[2] = v_r;
vst3_u8(dst, v_dst);
}
else
{
uint8x8x4_t v_dst;
v_dst.val[0] = v_b;
v_dst.val[1] = v_g;
v_dst.val[2] = v_r;
v_dst.val[3] = v_alpha;
vst4_u8(dst, v_dst);
}
}
for ( ; i < n; i += 3, dst += dcn)
{
int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
dst[0] = saturate_cast<uchar>(B); dst[1] = saturate_cast<uchar>(G);
dst[2] = saturate_cast<uchar>(R);
if( dcn == 4 )
dst[3] = alpha;
}
}
int dstcn, blueIdx;
int coeffs[9];
int16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
uint8x8_t v_alpha;
int32x4_t v_delta;
};
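// NEON XYZ->RGB for 16-bit input: signed fixed-point 3x3 matrix product, widen
// u16->s32, 8 then 4 pixels per iteration, saturating narrow to u16, optional alpha.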
template <>
struct XYZ2RGB_i<ushort>
{
typedef ushort channel_type;
XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
: dstcn(_dstcn), blueIdx(_blueIdx)
{
static const int coeffs0[] =
{
13273, -6296, -2042,
-3970, 7684, 170,
228, -836, 4331
};
for(int i = 0; i < 9; i++)
coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
if(blueIdx == 0)
{
std::swap(coeffs[0], coeffs[6]);
std::swap(coeffs[1], coeffs[7]);
std::swap(coeffs[2], coeffs[8]);
}
v_c0 = vdupq_n_s32(coeffs[0]);
v_c1 = vdupq_n_s32(coeffs[1]);
v_c2 = vdupq_n_s32(coeffs[2]);
v_c3 = vdupq_n_s32(coeffs[3]);
v_c4 = vdupq_n_s32(coeffs[4]);
v_c5 = vdupq_n_s32(coeffs[5]);
v_c6 = vdupq_n_s32(coeffs[6]);
v_c7 = vdupq_n_s32(coeffs[7]);
v_c8 = vdupq_n_s32(coeffs[8]);
v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
v_alpha2 = vget_low_u16(v_alpha);
}
void operator()(const ushort* src, ushort* dst, int n) const
{
int dcn = dstcn, i = 0;
ushort alpha = ColorChannel<ushort>::max();
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
n *= 3;
for ( ; i <= n - 24; i += 24, dst += dcn * 8)
{
uint16x8x3_t v_src = vld3q_u16(src + i);
int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
int32x4_t v_X0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
int32x4_t v_Z0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
int32x4_t v_X1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
int32x4_t v_Z1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_X0), vqmovun_s32(v_X1));
uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_Z0), vqmovun_s32(v_Z1));
if (dcn == 3)
{
uint16x8x3_t v_dst;
v_dst.val[0] = v_b;
v_dst.val[1] = v_g;
v_dst.val[2] = v_r;
vst3q_u16(dst, v_dst);
}
else
{
uint16x8x4_t v_dst;
v_dst.val[0] = v_b;
v_dst.val[1] = v_g;
v_dst.val[2] = v_r;
v_dst.val[3] = v_alpha;
vst4q_u16(dst, v_dst);
}
}
for ( ; i <= n - 12; i += 12, dst += dcn * 4)
{
uint16x4x3_t v_src = vld3_u16(src + i);
int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
v_s1 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
v_s2 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
int32x4_t v_X = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
int32x4_t v_Z = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
v_X = vshrq_n_s32(vaddq_s32(v_X, v_delta), xyz_shift);
v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta), xyz_shift);
v_Z = vshrq_n_s32(vaddq_s32(v_Z, v_delta), xyz_shift);
uint16x4_t v_b = vqmovun_s32(v_X);
uint16x4_t v_g = vqmovun_s32(v_Y);
uint16x4_t v_r = vqmovun_s32(v_Z);
if (dcn == 3)
{
uint16x4x3_t v_dst;
v_dst.val[0] = v_b;
v_dst.val[1] = v_g;
v_dst.val[2] = v_r;
vst3_u16(dst, v_dst);
}
else
{
uint16x4x4_t v_dst;
v_dst.val[0] = v_b;
v_dst.val[1] = v_g;
v_dst.val[2] = v_r;
v_dst.val[3] = v_alpha2;
vst4_u16(dst, v_dst);
}
}
for ( ; i < n; i += 3, dst += dcn)
{
int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
dst[0] = saturate_cast<ushort>(B); dst[1] = saturate_cast<ushort>(G);
dst[2] = saturate_cast<ushort>(R);
if( dcn == 4 )
dst[3] = alpha;
}
}
int dstcn, blueIdx;
int coeffs[9];
int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8, v_delta;
uint16x4_t v_alpha2;
uint16x8_t v_alpha;
};
#endif
////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
......
@@ -3580,14 +3580,6 @@ public:
_mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
}
}
#elif CV_NEON
for( ; x1 <= bcols - 4; x1 += 4 )
{
int32x4_t v_sx = cv_vrndq_s32_f32(vld1q_f32(sX + x1)),
v_sy = cv_vrndq_s32_f32(vld1q_f32(sY + x1));
int16x4x2_t v_dst = vzip_s16(vqmovn_s32(v_sx), vqmovn_s32(v_sy));
vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
}
#endif
for( ; x1 < bcols; x1++ )
......