Commit 64916d3d authored by Rostislav Vasilikhin's avatar Rostislav Vasilikhin Committed by Alexander Alekhin

Merge pull request #10869 from savuor:color_cpp_split

color.cpp split (#10869)

* initial split is done

* files renamed (these names are excluded during compilation)

* IPP code moved to corresponding files

* splineBuild, splineInterpolate -> color_lab.cpp

* Lab, Luv: little refactored

* it compiles (didn't check work); Lab OCL code moved to color_lab.cpp

* cvtcolor.cl: Lab/Luv part moved to color_lab.cl

* cvtcolor.cl: color_rgb.cl extracted

* cvtcolor.cl: color_yuv.cl separated

* cvtcolor.cl: color_hsv.cl extracted

* cvtcolor.cl: extracted to color_lab.cl and color_rgb.cl

* helper functions moved to hpp file

* Lab, Luv: moved to color_lab.cpp

* CPU XYZ: to color_lab.cpp

* OCL XYZ: to color_lab.cpp

* warning fixed

* CvtHelper added

* CPU YUV: to color_yuv.cpp, helpers to color.hpp

* CPU HLS/HSV: to color_hsv.cpp

* CPU BGR2BGR: to color_rgb.cpp

* CPU RGB: to color_rgb.cpp

* extra arg removed

* CPU YUV: to color_yuv.cpp

* color code decoded

* OclHelper added, some funcs rewritten

* color_lab.cpp: refactored to use OclHelper

* OCL RGB: to color_rgb.cpp

* OCL HLS/HSV: to color_hsv.cpp

* OCL YUV: to color_yuv.cpp

* OCL YUV planes: to color_yuv.cpp

* OCL: color code reduced

* licence to demosaicing.cpp

* IPP func tables to color_rgb.cpp

* code cleanup

* HAVE_OPENCL ifdefs added

* helpers made more common

* fixed two plane YUV with separate mats

* fixed warning in gcc7.2.0

* precomp header fixed

* color space classification functions fixed

* helpers fixed

* rename: isSRGB -> is_sRGB
parent c727e8a4
This source diff could not be displayed because it is too large. You can view the blob instead.
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "opencv2/imgproc.hpp"
#include "opencv2/core/utility.hpp"
#include <limits>
#include "opencl_kernels_imgproc.hpp"
#include "hal_replacement.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/softfloat.hpp"
// Fixed-point descale with round-to-nearest: (x + 2^(n-1)) >> n
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
namespace cv
{
//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
// BT.601 luma weights: Y = R2YF*R + G2YF*G + B2YF*B (the three sum to 1.0)
const float B2YF = 0.114f;
const float G2YF = 0.587f;
const float R2YF = 0.299f;
enum
{
// fixed-point shifts used by the integer conversion paths
yuv_shift = 14,
xyz_shift = 12,
// the same BT.601 weights scaled to Q14 fixed point
R2Y = 4899, // == R2YF*16384
G2Y = 9617, // == G2YF*16384
B2Y = 1868, // == B2YF*16384
BLOCK_SIZE = 256
};
// Per-type channel traits used by the conversion kernels: the saturation
// limit ("white") and the mid-point of the channel range.
template<typename _Tp> struct ColorChannel
{
    typedef float worktype_f;
    // Full-scale value for an integer channel type (e.g. 255 for uchar).
    static _Tp max()
    {
        return std::numeric_limits<_Tp>::max();
    }
    // Mid-point of the range, rounded up (e.g. 128 for uchar).
    static _Tp half()
    {
        return (_Tp)(std::numeric_limits<_Tp>::max()/2 + 1);
    }
};
// Floating-point channels are normalized to [0, 1].
template<> struct ColorChannel<float>
{
    typedef float worktype_f;
    static float max() { return 1.f; }
    static float half() { return 0.5f; }
};
/*template<> struct ColorChannel<double>
{
typedef double worktype_f;
static double max() { return 1.; }
static double half() { return 0.5; }
};*/
//
// Helper functions
//
namespace {
// True iff `code` converts to or from HSV (both the 180-degree hue
// variant and the _FULL 256-valued one).
inline bool isHSV(int code)
{
    return code == COLOR_HSV2BGR      || code == COLOR_HSV2RGB      ||
           code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL ||
           code == COLOR_BGR2HSV      || code == COLOR_RGB2HSV      ||
           code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL;
}
// True iff `code` converts to or from CIE Lab (including the linear-RGB
// LBGR/LRGB variants).
inline bool isLab(int code)
{
    return code == COLOR_Lab2BGR  || code == COLOR_Lab2RGB  ||
           code == COLOR_Lab2LBGR || code == COLOR_Lab2LRGB ||
           code == COLOR_BGR2Lab  || code == COLOR_RGB2Lab  ||
           code == COLOR_LBGR2Lab || code == COLOR_LRGB2Lab;
}
// True for the Lab/Luv codes that treat the RGB side as gamma-corrected
// sRGB (the LBGR/LRGB codes use linear RGB and are not listed).
inline bool is_sRGB(int code)
{
    return code == COLOR_BGR2Lab || code == COLOR_RGB2Lab ||
           code == COLOR_BGR2Luv || code == COLOR_RGB2Luv ||
           code == COLOR_Lab2BGR || code == COLOR_Lab2RGB ||
           code == COLOR_Luv2BGR || code == COLOR_Luv2RGB;
}
// Returns false for the conversion codes whose 3(4)-channel side is stored
// blue-first (the BGR-ordered codes enumerated below) and true for all
// other codes; callers use it to decide whether R and B must be swapped.
inline bool swapBlue(int code)
{
switch (code)
{
case COLOR_BGR2BGRA: case COLOR_BGRA2BGR:
case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555:
case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA:
case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
case COLOR_BGR2YCrCb: case COLOR_BGR2YUV:
case COLOR_YCrCb2BGR: case COLOR_YUV2BGR:
case COLOR_BGR2XYZ: case COLOR_XYZ2BGR:
case COLOR_BGR2HSV: case COLOR_BGR2HLS: case COLOR_BGR2HSV_FULL: case COLOR_BGR2HLS_FULL:
case COLOR_YUV2BGR_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2BGRA_IYUV:
case COLOR_YUV2BGR_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2BGRA_NV12:
case COLOR_Lab2BGR: case COLOR_Luv2BGR: case COLOR_Lab2LBGR: case COLOR_Luv2LBGR:
case COLOR_BGR2Lab: case COLOR_BGR2Luv: case COLOR_LBGR2Lab: case COLOR_LBGR2Luv:
case COLOR_HSV2BGR: case COLOR_HLS2BGR: case COLOR_HSV2BGR_FULL: case COLOR_HLS2BGR_FULL:
case COLOR_YUV2BGR_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2BGR_YUY2:
case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2BGRA_YVYU:
case COLOR_BGR2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: case COLOR_BGR2YUV_YV12: case COLOR_BGRA2YUV_YV12:
return false;
default:
return true;
}
}
// True for the HSV/HLS codes that map hue to the full 0..255 range
// (the _FULL variants) rather than 0..180.
inline bool isFullRangeHSV(int code)
{
    return code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL ||
           code == COLOR_BGR2HLS_FULL || code == COLOR_RGB2HLS_FULL ||
           code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL ||
           code == COLOR_HLS2BGR_FULL || code == COLOR_HLS2RGB_FULL;
}
// Number of destination channels implied by the conversion code:
// 4 for the *2BGRA/*2RGBA family, 3 for the *2BGR/*2RGB family,
// 0 when the code does not fix the destination channel count.
inline int dstChannels(int code)
{
switch( code )
{
case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2RGBA:
case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
case COLOR_GRAY2BGRA:
case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12:
case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2:
return 4;
case COLOR_BGRA2BGR: case COLOR_RGBA2BGR: case COLOR_RGB2BGR:
case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
case COLOR_GRAY2BGR:
case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12:
case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV:
case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU:
case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2:
return 3;
default:
return 0;
}
}
// Bit width of the green field for the packed 16-bit RGB codes:
// 6 for the BGR565 family, 5 for BGR555, 0 for codes that do not
// involve a packed format.
inline int greenBits(int code)
{
switch( code )
{
case COLOR_BGR2BGR565: case COLOR_RGB2BGR565: case COLOR_BGRA2BGR565: case COLOR_RGBA2BGR565:
case COLOR_BGR5652BGR: case COLOR_BGR5652RGB: case COLOR_BGR5652BGRA: case COLOR_BGR5652RGBA:
case COLOR_BGR5652GRAY: case COLOR_GRAY2BGR565:
return 6;
case COLOR_BGR2BGR555: case COLOR_RGB2BGR555: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR555:
case COLOR_BGR5552BGR: case COLOR_BGR5552RGB: case COLOR_BGR5552BGRA: case COLOR_BGR5552RGBA:
case COLOR_BGR5552GRAY: case COLOR_GRAY2BGR555:
return 5;
default:
return 0;
}
}
// Index of the U (Cb) component within the chroma data for the YUV codes:
// 2 for the V-first YV12 targets, 1 for the layouts where U follows another
// chroma sample (YVYU, IYUV targets, NV21/YV12 sources), 0 where U comes
// first, and -1 for codes without a fixed U position.
inline int uIndex(int code)
{
switch( code )
{
case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12:
return 2;
case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21:
case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
return 1;
case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12:
case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2:
return 0;
default:
return -1;
}
}
} // namespace::
template<int i0, int i1 = -1, int i2 = -1>
struct Set
{
static bool contains(int i)
{
return (i == i0 || i == i1 || i == i2);
}
};
template<int i0, int i1>
struct Set<i0, i1, -1>
{
static bool contains(int i)
{
return (i == i0 || i == i1);
}
};
template<int i0>
struct Set<i0, -1, -1>
{
static bool contains(int i)
{
return (i == i0);
}
};
// Relation between source and destination sizes used by the helpers below:
// TO_YUV allocates a 4:2:0 planar buffer (width x height*3/2),
// FROM_YUV is the inverse, NONE keeps the source size.
enum SizePolicy
{
TO_YUV, FROM_YUV, NONE
};
// Common boilerplate for the CPU conversion paths: validates source
// channels, requested destination channels and depth against the Set<>
// lists given as template arguments, handles in-place calls, and creates
// the destination with the size implied by sizePolicy.
template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE >
struct CvtHelper
{
CvtHelper(InputArray _src, OutputArray _dst, int dcn)
{
int stype = _src.type();
scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype);
CV_Assert( VScn::contains(scn) && VDcn::contains(dcn) && VDepth::contains(depth) );
// Work on a copy of the source when the caller passed the same object
// for src and dst, so that _dst.create() below cannot clobber the
// input data (#6653).
if (_src.getObj() == _dst.getObj()) // inplace processing (#6653)
_src.copyTo(src);
else
src = _src.getMat();
Size sz = src.size();
switch (sizePolicy)
{
case TO_YUV:
// 4:2:0 subsampling requires even source dimensions
CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0);
dstSz = Size(sz.width, sz.height / 2 * 3);
break;
case FROM_YUV:
CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0);
dstSz = Size(sz.width, sz.height * 2 / 3);
break;
case NONE:
default:
dstSz = sz;
break;
}
_dst.create(dstSz, CV_MAKETYPE(depth, dcn));
dst = _dst.getMat();
}
Mat src, dst;
int depth, scn;
Size dstSz;
};
#ifdef HAVE_OPENCL
// OpenCL counterpart of CvtHelper: validates channel counts and depth,
// allocates dst according to sizePolicy, and wraps OpenCL kernel creation,
// argument binding and launch.
template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE >
struct OclHelper
{
OclHelper( InputArray _src, OutputArray _dst, int dcn)
{
src = _src.getUMat();
Size sz = src.size(), dstSz;
int scn = src.channels();
int depth = src.depth();
CV_Assert( VScn::contains(scn) && VDcn::contains(dcn) && VDepth::contains(depth) );
switch (sizePolicy)
{
case TO_YUV:
// 4:2:0 output: height grows to 3/2 of the (even) source height
CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
dstSz = Size(sz.width, sz.height / 2 * 3);
break;
case FROM_YUV:
dstSz = Size(sz.width, sz.height * 2 / 3);
break;
case NONE:
default:
dstSz = sz;
break;
}
_dst.create(dstSz, CV_MAKETYPE(depth, dcn));
dst = _dst.getUMat();
}
// Builds the kernel and binds src/dst as its first arguments; returns
// false when the kernel could not be created. The launch grid depends
// on sizePolicy.
bool createKernel(cv::String name, ocl::ProgramSource& source, cv::String options)
{
ocl::Device dev = ocl::Device::getDefault();
// process several rows per work item on Intel GPUs
int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
int pxPerWIx = 1;
cv::String baseOptions = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
src.depth(), src.channels(), pxPerWIy);
switch (sizePolicy)
{
case TO_YUV:
// widen each work item to 2 pixel pairs when all rows are 4-byte aligned
if (dev.isIntel() &&
src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 &&
dst.step % 4 == 0 && dst.offset % 4 == 0)
{
pxPerWIx = 2;
}
globalSize[0] = (size_t)dst.cols/(2*pxPerWIx);
globalSize[1] = ((size_t)dst.rows/3 + pxPerWIy - 1) / pxPerWIy;
baseOptions += format("-D PIX_PER_WI_X=%d ", pxPerWIx);
break;
case FROM_YUV:
globalSize[0] = (size_t)dst.cols/2;
globalSize[1] = ((size_t)dst.rows/2 + pxPerWIy - 1) / pxPerWIy;
break;
case NONE:
default:
globalSize[0] = (size_t)src.cols;
globalSize[1] = ((size_t)src.rows + pxPerWIy - 1) / pxPerWIy;
break;
}
k.create(name.c_str(), source, baseOptions + options);
if(k.empty())
return false;
// bind src and dst, remembering where extra arguments continue
nArgs = k.set(0, ocl::KernelArg::ReadOnlyNoSize(src));
nArgs = k.set(nArgs, ocl::KernelArg::WriteOnly(dst));
return true;
}
// Launches the kernel over the 2D grid computed in createKernel()
// ('false' = do not block waiting for completion).
bool run()
{
return k.run(2, globalSize, NULL, false);
}
// Appends one more kernel argument after src/dst.
template<typename T>
void setArg(const T& arg)
{
nArgs = k.set(nArgs, arg);
}
UMat src, dst;
ocl::Kernel k;
size_t globalSize[2];
int nArgs;
};
#endif
///////////////////////////// Top-level template function ////////////////////////////////
// parallel_for_ body: converts the rows [range.start, range.end) by
// calling the pixel converter `cvt` once per row.
template <typename Cvt>
class CvtColorLoop_Invoker : public ParallelLoopBody
{
typedef typename Cvt::channel_type _Tp;
public:
CvtColorLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt) :
ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_),
width(width_), cvt(_cvt)
{
}
virtual void operator()(const Range& range) const
{
CV_TRACE_FUNCTION();
// advance to the first row of this stripe
const uchar* yS = src_data + static_cast<size_t>(range.start) * src_step;
uchar* yD = dst_data + static_cast<size_t>(range.start) * dst_step;
for( int i = range.start; i < range.end; ++i, yS += src_step, yD += dst_step )
cvt(reinterpret_cast<const _Tp*>(yS), reinterpret_cast<_Tp*>(yD), width);
}
private:
const uchar * src_data;
const size_t src_step;
uchar * dst_data;
const size_t dst_step;
const int width;
const Cvt& cvt;
// not assignable (const members); intentionally left unimplemented
const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&);
};
template <typename Cvt>
void CvtColorLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
{
parallel_for_(Range(0, height),
CvtColorLoop_Invoker<Cvt>(src_data, src_step, dst_data, dst_step, width, cvt),
(width * height) / static_cast<double>(1<<16));
}
#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700)
# define NEED_IPP 1
#else
# define NEED_IPP 0
#endif
#if NEED_IPP
// channel maxima for the IPP pixel types
#define MAX_IPP8u 255
#define MAX_IPP16u 65535
#define MAX_IPP32f 1.0
// Signatures of the IPP entry points used by the functors below:
// reorder = channel shuffling (takes a destination channel order),
// general = plain color-space conversion,
// color2gray = conversion with explicit luma coefficients.
typedef IppStatus (CV_STDCALL* ippiReorderFunc)(const void *, int, void *, int, IppiSize, const int *);
typedef IppStatus (CV_STDCALL* ippiGeneralFunc)(const void *, int, void *, int, IppiSize);
typedef IppStatus (CV_STDCALL* ippiColor2GrayFunc)(const void *, int, void *, int, IppiSize, const Ipp32f *);
// parallel_for_ body around an IPP converter functor: processes the row
// stripe [range.start, range.end) in a single IPP call and clears *ok
// on failure.
template <typename Cvt>
class CvtColorIPPLoop_Invoker :
public ParallelLoopBody
{
public:
CvtColorIPPLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt, bool *_ok) :
ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), width(width_), cvt(_cvt), ok(_ok)
{
// optimistic: any failing stripe resets it below
*ok = true;
}
virtual void operator()(const Range& range) const
{
const void *yS = src_data + src_step * range.start;
void *yD = dst_data + dst_step * range.start;
if( !cvt(yS, static_cast<int>(src_step), yD, static_cast<int>(dst_step), width, range.end - range.start) )
*ok = false;
else
{
CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
}
}
private:
const uchar * src_data;
const size_t src_step;
uchar * dst_data;
const size_t dst_step;
const int width;
const Cvt& cvt;
// shared success flag; only ever lowered to false after construction
bool *ok;
const CvtColorIPPLoop_Invoker& operator= (const CvtColorIPPLoop_Invoker&);
};
// Parallel driver for the IPP-based converters; returns false if any
// stripe reported an IPP failure. `ok` is initialized by the invoker's
// constructor before any stripe runs.
template <typename Cvt>
bool CvtColorIPPLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
{
    bool ok;
    const double nstripes = (width * height) / (double)(1<<16);
    parallel_for_(Range(0, height),
                  CvtColorIPPLoop_Invoker<Cvt>(src_data, src_step, dst_data, dst_step, width, cvt, &ok),
                  nstripes);
    return ok;
}
// Same as CvtColorIPPLoop, but when the source and destination buffers
// alias, the source is copied to a temporary first so the conversion
// never runs in place.
template <typename Cvt>
bool CvtColorIPPLoopCopy(const uchar * src_data, size_t src_step, int src_type, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
{
Mat temp;
// wrap the caller's buffer without copying
Mat src(Size(width, height), src_type, const_cast<uchar*>(src_data), src_step);
Mat source = src;
if( src_data == dst_data )
{
src.copyTo(temp);
source = temp;
}
bool ok;
parallel_for_(Range(0, source.rows),
CvtColorIPPLoop_Invoker<Cvt>(source.data, source.step, dst_data, dst_step,
source.cols, cvt, &ok),
source.total()/(double)(1<<16) );
return ok;
}
// Wraps a plain IPP color conversion function; reports failure when no
// function pointer was supplied or the IPP call returns an error status.
struct IPPGeneralFunctor
{
IPPGeneralFunctor(ippiGeneralFunc _func) : ippiColorConvertGeneral(_func){}
bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
{
return ippiColorConvertGeneral ? CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0 : false;
}
private:
ippiGeneralFunc ippiColorConvertGeneral;
};
// Wraps an IPP channel-reordering function; the destination channel order
// is given by _order0.._order2, while the fourth channel index is fixed
// to 3 (alpha stays in place).
struct IPPReorderFunctor
{
IPPReorderFunctor(ippiReorderFunc _func, int _order0, int _order1, int _order2) : ippiColorConvertReorder(_func)
{
order[0] = _order0;
order[1] = _order1;
order[2] = _order2;
order[3] = 3;
}
bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
{
return ippiColorConvertReorder ? CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, dst, dstStep, ippiSize(cols, rows), order) >= 0 : false;
}
private:
ippiReorderFunc ippiColorConvertReorder;
int order[4];
};
// Two-stage IPP conversion: first reorder the channels into a temporary
// 3-channel Mat, then run the general color conversion on it.
struct IPPReorderGeneralFunctor
{
IPPReorderGeneralFunctor(ippiReorderFunc _func1, ippiGeneralFunc _func2, int _order0, int _order1, int _order2, int _depth) :
ippiColorConvertReorder(_func1), ippiColorConvertGeneral(_func2), depth(_depth)
{
order[0] = _order0;
order[1] = _order1;
order[2] = _order2;
order[3] = 3;
}
bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
{
if (ippiColorConvertReorder == 0 || ippiColorConvertGeneral == 0)
return false;
// intermediate buffer holding the reordered 3-channel image
Mat temp;
temp.create(rows, cols, CV_MAKETYPE(depth, 3));
if(CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows), order) < 0)
return false;
return CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows)) >= 0;
}
private:
ippiReorderFunc ippiColorConvertReorder;
ippiGeneralFunc ippiColorConvertGeneral;
int order[4];
int depth;
};
// Mirror of IPPReorderGeneralFunctor with the stages swapped: run the
// general color conversion into a temporary 3-channel Mat first, then
// reorder the channels into the destination.
struct IPPGeneralReorderFunctor
{
IPPGeneralReorderFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _order0, int _order1, int _order2, int _depth) :
ippiColorConvertGeneral(_func1), ippiColorConvertReorder(_func2), depth(_depth)
{
order[0] = _order0;
order[1] = _order1;
order[2] = _order2;
order[3] = 3;
}
bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
{
if (ippiColorConvertGeneral == 0 || ippiColorConvertReorder == 0)
return false;
Mat temp;
temp.create(rows, cols, CV_MAKETYPE(depth, 3));
if(CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
return false;
return CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
}
private:
ippiGeneralFunc ippiColorConvertGeneral;
ippiReorderFunc ippiColorConvertReorder;
int order[4];
int depth;
};
// Per-depth channel-swap function tables, defined in another translation
// unit (color_rgb.cpp, per the file split); indexed by CV_MAT_DEPTH.
extern ippiReorderFunc ippiSwapChannelsC3C4RTab[8];
extern ippiReorderFunc ippiSwapChannelsC4C3RTab[8];
extern ippiReorderFunc ippiSwapChannelsC3RTab[8];
#endif
#ifdef HAVE_OPENCL
// OpenCL implementations, defined in the per-family color_*.cpp files.
// Lab/Luv (color_lab.cpp)
bool oclCvtColorBGR2Luv( InputArray _src, OutputArray _dst, int bidx, bool srgb );
bool oclCvtColorBGR2Lab( InputArray _src, OutputArray _dst, int bidx, bool srgb );
bool oclCvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb);
bool oclCvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb);
// XYZ
bool oclCvtColorBGR2XYZ( InputArray _src, OutputArray _dst, int bidx );
bool oclCvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx );
// HSV/HLS
bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full );
bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full );
bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full );
bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full );
// RGB reordering, packed 16-bit formats, gray and alpha premultiplication
bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse );
bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits );
bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits );
bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits );
bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits );
bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx );
bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn );
bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst );
bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst );
// YUV / YCrCb
// NOTE(review): lower-case 'color' in the next name is inconsistent with
// the rest of the API; the definition elsewhere uses the same spelling.
bool oclCvtColorBGR2YCrCb( InputArray _src, OutputArray _dst, int bidx);
bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx);
bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx );
bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx );
// subsampled YUV layouts (packed 4:2:2 and planar/semi-planar 4:2:0)
bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx );
bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx );
bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx );
bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx );
bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst );
#endif
// CPU implementations, defined in the per-family color_*.cpp files.
// Lab/Luv
void cvtColorBGR2Lab( InputArray _src, OutputArray _dst, bool swapb, bool srgb);
void cvtColorBGR2Luv( InputArray _src, OutputArray _dst, bool swapb, bool srgb);
void cvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb );
void cvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb );
// XYZ
void cvtColorBGR2XYZ( InputArray _src, OutputArray _dst, bool swapb );
void cvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb );
// YUV / YCrCb (crcb selects the YCrCb channel order)
void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, bool crcb);
void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb);
// subsampled YUV layouts (packed 4:2:2 and planar/semi-planar 4:2:0)
void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn);
void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx );
void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx);
void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst );
void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi );
// HSV/HLS
void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange );
void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange );
void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange);
void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange);
// RGB reordering, packed 16-bit formats, gray and alpha premultiplication
void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb);
void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits);
void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits);
void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb);
void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn);
void cvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits);
void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits);
void cvtColorRGBA2mRGBA(InputArray _src, OutputArray _dst);
void cvtColormRGBA2RGBA(InputArray _src, OutputArray _dst);
} //namespace cv
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
#include "color.hpp"
namespace cv
{
////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
// Converts 8-bit BGR/RGB (srccn = 3 or 4) to 8-bit HSV using fixed-point
// arithmetic and precomputed reciprocal tables. Hue is scaled to
// [0, hrange) with hrange either 180 or 256; S and V span [0, 255].
struct RGB2HSV_b
{
typedef uchar channel_type;
RGB2HSV_b(int _srccn, int _blueIdx, int _hrange)
: srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
{
CV_Assert( hrange == 180 || hrange == 256 );
}
void operator()(const uchar* src, uchar* dst, int n) const
{
int i, bidx = blueIdx, scn = srccn;
const int hsv_shift = 12;
// reciprocal tables shared by all instances: sdiv for saturation,
// one hdiv table per hue range
static int sdiv_table[256];
static int hdiv_table180[256];
static int hdiv_table256[256];
static volatile bool initialized = false;
int hr = hrange;
const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256;
n *= 3;
// NOTE(review): lazy init is guarded only by a volatile flag; concurrent
// first calls may race, though every writer stores identical values --
// confirm this is acceptable on the targeted platforms.
if( !initialized )
{
sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
for( i = 1; i < 256; i++ )
{
sdiv_table[i] = saturate_cast<int>((255 << hsv_shift)/(1.*i));
hdiv_table180[i] = saturate_cast<int>((180 << hsv_shift)/(6.*i));
hdiv_table256[i] = saturate_cast<int>((256 << hsv_shift)/(6.*i));
}
initialized = true;
}
for( i = 0; i < n; i += 3, src += scn )
{
int b = src[bidx], g = src[1], r = src[bidx^2];
int h, s, v = b;
int vmin = b;
int vr, vg;
CV_CALC_MAX_8U( v, g );
CV_CALC_MAX_8U( v, r );
CV_CALC_MIN_8U( vmin, g );
CV_CALC_MIN_8U( vmin, r );
uchar diff = saturate_cast<uchar>(v - vmin);
// vr/vg are all-ones masks when the max channel is r (resp. g);
// they select the hue formula below without branches
vr = v == r ? -1 : 0;
vg = v == g ? -1 : 0;
s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
h = (vr & (g - b)) +
(~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
// wrap negative hues into [0, hr)
h += h < 0 ? hr : 0;
dst[i] = saturate_cast<uchar>(h);
dst[i+1] = (uchar)s;
dst[i+2] = (uchar)v;
}
}
int srccn, blueIdx, hrange;
};
// Converts float BGR/RGB (srccn = 3 or 4, values in [0, 1]) to float HSV.
// Hue is scaled from degrees to [0, hrange); S and V stay in [0, 1].
// blueIdx selects the position of the blue channel (0 for BGR, 2 for RGB).
struct RGB2HSV_f
{
    typedef float channel_type;

    RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
        : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}

    void operator()(const float* src, float* dst, int n) const
    {
        const int scn = srccn, bidx = blueIdx;
        const float hscale = hrange*(1.f/360.f);

        // one HSV triple is written per source pixel
        for( int j = 0; j < n; j++, src += scn, dst += 3 )
        {
            const float b = src[bidx];
            const float g = src[1];
            const float r = src[bidx^2];

            // max (value) and min of the three channels
            float vmax = r, vmin = r;
            if( vmax < g ) vmax = g;
            if( vmax < b ) vmax = b;
            if( vmin > g ) vmin = g;
            if( vmin > b ) vmin = b;

            float diff = vmax - vmin;
            // epsilon guards against division by zero for gray pixels
            const float s = diff/(float)(fabs(vmax) + FLT_EPSILON);
            diff = (float)(60./(diff + FLT_EPSILON));

            // hue sector is chosen by which channel holds the maximum
            float h;
            if( vmax == r )
                h = (g - b)*diff;
            else if( vmax == g )
                h = (b - r)*diff + 120.f;
            else
                h = (r - g)*diff + 240.f;
            if( h < 0 )
                h += 360.f;

            dst[0] = h*hscale;
            dst[1] = s;
            dst[2] = vmax;
        }
    }

    int srccn, blueIdx;
    float hrange;
};
// Converts float HSV back to BGR/RGB (dstcn = 3 or 4). hscale = 6/hrange
// maps the hue to sector units. The scalar tail uses the classic
// sector/table formulation; the SSE2 path emulates the same selection
// branchlessly on 8 pixels per iteration.
struct HSV2RGB_f
{
typedef float channel_type;
HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange)
: dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {
#if CV_SSE2
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
}
#if CV_SSE2
// Branchless SSE2 HSV->RGB for two groups of 4 pixels: the three output
// channels are computed in place of (v_h, v_s, v_v).
void process(__m128& v_h0, __m128& v_h1, __m128& v_s0,
__m128& v_s1, __m128& v_v0, __m128& v_v1) const
{
v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale));
v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale));
// integer part = sector index, fractional part remains in v_h
__m128 v_pre_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h0));
__m128 v_pre_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h1));
v_h0 = _mm_sub_ps(v_h0, v_pre_sector0);
v_h1 = _mm_sub_ps(v_h1, v_pre_sector1);
// tab0..tab3 mirror the scalar tab[]: v, v*(1-s), v*(1-s*h), v*(1-s*(1-h))
__m128 v_tab00 = v_v0;
__m128 v_tab01 = v_v1;
__m128 v_tab10 = _mm_mul_ps(v_v0, _mm_sub_ps(_mm_set1_ps(1.0f), v_s0));
__m128 v_tab11 = _mm_mul_ps(v_v1, _mm_sub_ps(_mm_set1_ps(1.0f), v_s1));
__m128 v_tab20 = _mm_mul_ps(v_v0, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s0, v_h0)));
__m128 v_tab21 = _mm_mul_ps(v_v1, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s1, v_h1)));
__m128 v_tab30 = _mm_mul_ps(v_v0, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s0, _mm_sub_ps(_mm_set1_ps(1.0f), v_h0))));
__m128 v_tab31 = _mm_mul_ps(v_v1, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s1, _mm_sub_ps(_mm_set1_ps(1.0f), v_h1))));
// reduce the sector index modulo 6
__m128 v_sector0 = _mm_div_ps(v_pre_sector0, _mm_set1_ps(6.0f));
__m128 v_sector1 = _mm_div_ps(v_pre_sector1, _mm_set1_ps(6.0f));
v_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector0));
v_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector1));
v_sector0 = _mm_mul_ps(v_sector0, _mm_set1_ps(6.0f));
v_sector1 = _mm_mul_ps(v_sector1, _mm_set1_ps(6.0f));
v_sector0 = _mm_sub_ps(v_pre_sector0, v_sector0);
v_sector1 = _mm_sub_ps(v_pre_sector1, v_sector1);
// per-sector table selection via compare masks (cf. scalar sector_data)
v_h0 = _mm_and_ps(v_tab10, _mm_cmplt_ps(v_sector0, _mm_set1_ps(2.0f)));
v_h1 = _mm_and_ps(v_tab11, _mm_cmplt_ps(v_sector1, _mm_set1_ps(2.0f)));
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
v_s0 = _mm_and_ps(v_tab30, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
v_s1 = _mm_and_ps(v_tab31, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab10, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(3.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab11, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(3.0f))));
v_v0 = _mm_and_ps(v_tab00, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
v_v1 = _mm_and_ps(v_tab01, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab00, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab01, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
}
#endif
void operator()(const float* src, float* dst, int n) const
{
int i = 0, bidx = blueIdx, dcn = dstcn;
float _hscale = hscale;
float alpha = ColorChannel<float>::max();
n *= 3;
#if CV_SSE2
// vector path: deinterleave 8 HSV pixels, convert, then reinterleave
// in the channel order selected by bidx (alpha appended for dcn == 4)
if (haveSIMD)
{
for( ; i <= n - 24; i += 24, dst += dcn * 8 )
{
__m128 v_h0 = _mm_loadu_ps(src + i + 0);
__m128 v_h1 = _mm_loadu_ps(src + i + 4);
__m128 v_s0 = _mm_loadu_ps(src + i + 8);
__m128 v_s1 = _mm_loadu_ps(src + i + 12);
__m128 v_v0 = _mm_loadu_ps(src + i + 16);
__m128 v_v1 = _mm_loadu_ps(src + i + 20);
_mm_deinterleave_ps(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1);
process(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1);
if (dcn == 3)
{
if (bidx)
{
_mm_interleave_ps(v_v0, v_v1, v_s0, v_s1, v_h0, v_h1);
_mm_storeu_ps(dst + 0, v_v0);
_mm_storeu_ps(dst + 4, v_v1);
_mm_storeu_ps(dst + 8, v_s0);
_mm_storeu_ps(dst + 12, v_s1);
_mm_storeu_ps(dst + 16, v_h0);
_mm_storeu_ps(dst + 20, v_h1);
}
else
{
_mm_interleave_ps(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1);
_mm_storeu_ps(dst + 0, v_h0);
_mm_storeu_ps(dst + 4, v_h1);
_mm_storeu_ps(dst + 8, v_s0);
_mm_storeu_ps(dst + 12, v_s1);
_mm_storeu_ps(dst + 16, v_v0);
_mm_storeu_ps(dst + 20, v_v1);
}
}
else
{
__m128 v_a0 = _mm_set1_ps(alpha);
__m128 v_a1 = _mm_set1_ps(alpha);
if (bidx)
{
_mm_interleave_ps(v_v0, v_v1, v_s0, v_s1, v_h0, v_h1, v_a0, v_a1);
_mm_storeu_ps(dst + 0, v_v0);
_mm_storeu_ps(dst + 4, v_v1);
_mm_storeu_ps(dst + 8, v_s0);
_mm_storeu_ps(dst + 12, v_s1);
_mm_storeu_ps(dst + 16, v_h0);
_mm_storeu_ps(dst + 20, v_h1);
_mm_storeu_ps(dst + 24, v_a0);
_mm_storeu_ps(dst + 28, v_a1);
}
else
{
_mm_interleave_ps(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1, v_a0, v_a1);
_mm_storeu_ps(dst + 0, v_h0);
_mm_storeu_ps(dst + 4, v_h1);
_mm_storeu_ps(dst + 8, v_s0);
_mm_storeu_ps(dst + 12, v_s1);
_mm_storeu_ps(dst + 16, v_v0);
_mm_storeu_ps(dst + 20, v_v1);
_mm_storeu_ps(dst + 24, v_a0);
_mm_storeu_ps(dst + 28, v_a1);
}
}
}
}
#endif
// scalar tail: classic sector/table HSV->RGB
for( ; i < n; i += 3, dst += dcn )
{
float h = src[i], s = src[i+1], v = src[i+2];
float b, g, r;
if( s == 0 )
b = g = r = v;
else
{
// maps sector -> indices of (b, g, r) within tab[]
static const int sector_data[][3]=
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
float tab[4];
int sector;
h *= _hscale;
// wrap hue into [0, 6)
if( h < 0 )
do h += 6; while( h < 0 );
else if( h >= 6 )
do h -= 6; while( h >= 6 );
sector = cvFloor(h);
h -= sector;
// guard against rounding pushing the sector out of range
if( (unsigned)sector >= 6u )
{
sector = 0;
h = 0.f;
}
tab[0] = v;
tab[1] = v*(1.f - s);
tab[2] = v*(1.f - s*h);
tab[3] = v*(1.f - s*(1.f - h));
b = tab[sector_data[sector][0]];
g = tab[sector_data[sector][1]];
r = tab[sector_data[sector][2]];
}
dst[bidx] = b;
dst[1] = g;
dst[bidx^2] = r;
if( dcn == 4 )
dst[3] = alpha;
}
}
int dstcn, blueIdx;
float hscale;
#if CV_SSE2
bool haveSIMD;
#endif
};
// 8-bit HSV -> BGR/RGB: widens pixels to float, scales S and V to [0,1]
// (H is passed through unscaled; the float converter is built with the full
// hue range), runs HSV2RGB_f block-wise, then scales results back by 255.
struct HSV2RGB_b
{
typedef uchar channel_type;
HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange)
: dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
{
#if CV_NEON
v_scale_inv = vdupq_n_f32(1.f/255.f);
v_scale = vdupq_n_f32(255.f);
v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
#elif CV_SSE2
v_scale = _mm_set1_ps(255.0f);
v_alpha = _mm_set1_ps(ColorChannel<uchar>::max());
v_zero = _mm_setzero_si128();
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
}
#if CV_SSE2
// Widen 8 interleaved HSV pixels (given as three u16x8 halves) to float and
// multiply by a rotating coefficient vector whose lane pattern is
// (1, 1/255, 1/255, 1, ...): H lanes stay unscaled, S/V lanes get 1/255.
// The 0x49 shuffle rotates the coefficients so the per-channel phase stays
// aligned with the interleaved data across all six output registers.
void process(__m128i v_r, __m128i v_g, __m128i v_b,
const __m128& v_coeffs_,
float * buf) const
{
__m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
__m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
__m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
__m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
__m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
__m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
__m128 v_coeffs = v_coeffs_;
v_r0 = _mm_mul_ps(v_r0, v_coeffs);
v_g1 = _mm_mul_ps(v_g1, v_coeffs);
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49));
v_r1 = _mm_mul_ps(v_r1, v_coeffs);
v_b0 = _mm_mul_ps(v_b0, v_coeffs);
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49))
;
v_g0 = _mm_mul_ps(v_g0, v_coeffs);
v_b1 = _mm_mul_ps(v_b1, v_coeffs);
_mm_store_ps(buf, v_r0);
_mm_store_ps(buf + 4, v_r1);
_mm_store_ps(buf + 8, v_g0);
_mm_store_ps(buf + 12, v_g1);
_mm_store_ps(buf + 16, v_b0);
_mm_store_ps(buf + 20, v_b1);
}
#endif
// Process n pixels in BLOCK_SIZE chunks through an aligned float buffer.
void operator()(const uchar* src, uchar* dst, int n) const
{
int i, j, dcn = dstcn;
uchar alpha = ColorChannel<uchar>::max();
float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
#if CV_SSE2
// Lane pattern (reversed by _mm_set_ps): 1, 1/255, 1/255, 1 -> H unscaled, S/V scaled.
__m128 v_coeffs = _mm_set_ps(1.f, 1.f/255.f, 1.f/255.f, 1.f);
#endif
for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
{
int dn = std::min(n - i, (int)BLOCK_SIZE);
j = 0;
#if CV_NEON
// Widen 8 pixels at a time: H as-is, S and V scaled by 1/255.
for ( ; j <= (dn - 8) * 3; j += 24)
{
uint8x8x3_t v_src = vld3_u8(src + j);
uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
v_t1 = vmovl_u8(v_src.val[1]),
v_t2 = vmovl_u8(v_src.val[2]);
float32x4x3_t v_dst;
v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
vst3q_f32(buf + j, v_dst);
v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
vst3q_f32(buf + j + 12, v_dst);
}
#elif CV_SSE2
if (haveSIMD)
{
for ( ; j <= (dn - 8) * 3; j += 24)
{
__m128i v_src0 = _mm_loadu_si128((__m128i const *)(src + j));
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src + j + 16));
process(_mm_unpacklo_epi8(v_src0, v_zero),
_mm_unpackhi_epi8(v_src0, v_zero),
_mm_unpacklo_epi8(v_src1, v_zero),
v_coeffs,
buf + j);
}
}
#endif
// Scalar tail of the widening step.
for( ; j < dn*3; j += 3 )
{
buf[j] = src[j];
buf[j+1] = src[j+1]*(1.f/255.f);
buf[j+2] = src[j+2]*(1.f/255.f);
}
// Float conversion in place.
cvt(buf, buf, dn);
j = 0;
#if CV_NEON
// Narrow back to u8 with saturation, scaling all channels by 255.
for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
{
float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
if (dcn == 4)
{
uint8x8x4_t v_dst;
v_dst.val[0] = v_dst0;
v_dst.val[1] = v_dst1;
v_dst.val[2] = v_dst2;
v_dst.val[3] = v_alpha;
vst4_u8(dst, v_dst);
}
else
{
uint8x8x3_t v_dst;
v_dst.val[0] = v_dst0;
v_dst.val[1] = v_dst1;
v_dst.val[2] = v_dst2;
vst3_u8(dst, v_dst);
}
}
#elif CV_SSE2
if (dcn == 3 && haveSIMD)
{
// Pack 16 floats -> 16 bytes per iteration (channel-agnostic copy).
for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
{
__m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
__m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
__m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
__m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
__m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
_mm_cvtps_epi32(v_src1));
__m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
_mm_cvtps_epi32(v_src3));
_mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
}
// Step back to a multiple of 3 so the scalar tail starts on a pixel boundary.
int jr = j % 3;
if (jr)
dst -= jr, j -= jr;
}
else if (dcn == 4 && haveSIMD)
{
// 4 pixels (12 floats) in -> 16 bytes out, splicing in the alpha lane.
for ( ; j <= (dn * 3 - 12); j += 12, dst += 16)
{
__m128 v_buf0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
__m128 v_buf1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
__m128 v_buf2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
__m128 v_ba0 = _mm_unpackhi_ps(v_buf0, v_alpha);
__m128 v_ba1 = _mm_unpacklo_ps(v_buf2, v_alpha);
__m128i v_src0 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf0, v_ba0, 0x44));
__m128i v_src1 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba0, v_buf1, 0x4e)), 0x78);
__m128i v_src2 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf1, v_ba1, 0x4e));
__m128i v_src3 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba1, v_buf2, 0xee)), 0x78);
__m128i v_dst0 = _mm_packs_epi32(v_src0, v_src1);
__m128i v_dst1 = _mm_packs_epi32(v_src2, v_src3);
_mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
}
int jr = j % 3;
if (jr)
dst -= jr, j -= jr;
}
#endif
// Scalar tail of the narrowing step.
for( ; j < dn*3; j += 3, dst += dcn )
{
dst[0] = saturate_cast<uchar>(buf[j]*255.f);
dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
if( dcn == 4 )
dst[3] = alpha;
}
}
}
int dstcn;       // output channel count (3 or 4)
HSV2RGB_f cvt;   // float converter doing the actual HSV->RGB math
#if CV_NEON
float32x4_t v_scale, v_scale_inv;
uint8x8_t v_alpha;
#elif CV_SSE2
__m128 v_scale;
__m128 v_alpha;
__m128i v_zero;
bool haveSIMD;
#endif
};
///////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
// Float RGB/BGR -> HLS. Hue is computed in degrees then scaled by
// hscale = hrange/360 into the requested output range; L and S are in [0,1].
struct RGB2HLS_f
{
typedef float channel_type;
RGB2HLS_f(int _srccn, int _blueIdx, float _hrange)
: srccn(_srccn), blueIdx(_blueIdx), hscale(_hrange/360.f) {
#if CV_SSE2
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
}
#if CV_SSE2
// Branch-free HLS for 8 pixels in planar registers. Results are written back
// into the inputs: v_b* <- H, v_g* <- L, v_r* <- S. Comparison masks select
// the correct saturation denominator and hue formula per lane; lanes with
// diff <= FLT_EPSILON (gray) get H = S = 0 via the v_gteps masks.
void process(__m128& v_b0, __m128& v_b1, __m128& v_g0,
__m128& v_g1, __m128& v_r0, __m128& v_r1) const
{
__m128 v_max0 = _mm_max_ps(_mm_max_ps(v_b0, v_g0), v_r0);
__m128 v_max1 = _mm_max_ps(_mm_max_ps(v_b1, v_g1), v_r1);
__m128 v_min0 = _mm_min_ps(_mm_min_ps(v_b0, v_g0), v_r0);
__m128 v_min1 = _mm_min_ps(_mm_min_ps(v_b1, v_g1), v_r1);
__m128 v_diff0 = _mm_sub_ps(v_max0, v_min0);
__m128 v_diff1 = _mm_sub_ps(v_max1, v_min1);
__m128 v_sum0 = _mm_add_ps(v_max0, v_min0);
__m128 v_sum1 = _mm_add_ps(v_max1, v_min1);
__m128 v_l0 = _mm_mul_ps(v_sum0, _mm_set1_ps(0.5f));
__m128 v_l1 = _mm_mul_ps(v_sum1, _mm_set1_ps(0.5f));
// S denominator: sum when L < 0.5, (2 - sum) otherwise.
__m128 v_gel0 = _mm_cmpge_ps(v_l0, _mm_set1_ps(0.5f));
__m128 v_gel1 = _mm_cmpge_ps(v_l1, _mm_set1_ps(0.5f));
__m128 v_s0 = _mm_and_ps(v_gel0, _mm_sub_ps(_mm_set1_ps(2.0f), v_sum0));
__m128 v_s1 = _mm_and_ps(v_gel1, _mm_sub_ps(_mm_set1_ps(2.0f), v_sum1));
v_s0 = _mm_or_ps(v_s0, _mm_andnot_ps(v_gel0, v_sum0));
v_s1 = _mm_or_ps(v_s1, _mm_andnot_ps(v_gel1, v_sum1));
v_s0 = _mm_div_ps(v_diff0, v_s0);
v_s1 = _mm_div_ps(v_diff1, v_s1);
__m128 v_gteps0 = _mm_cmpgt_ps(v_diff0, _mm_set1_ps(FLT_EPSILON));
__m128 v_gteps1 = _mm_cmpgt_ps(v_diff1, _mm_set1_ps(FLT_EPSILON));
v_diff0 = _mm_div_ps(_mm_set1_ps(60.f), v_diff0);
v_diff1 = _mm_div_ps(_mm_set1_ps(60.f), v_diff1);
// Hue: pick the formula for whichever channel equals the max (R, else G, else B).
__m128 v_eqr0 = _mm_cmpeq_ps(v_max0, v_r0);
__m128 v_eqr1 = _mm_cmpeq_ps(v_max1, v_r1);
__m128 v_h0 = _mm_and_ps(v_eqr0, _mm_mul_ps(_mm_sub_ps(v_g0, v_b0), v_diff0));
__m128 v_h1 = _mm_and_ps(v_eqr1, _mm_mul_ps(_mm_sub_ps(v_g1, v_b1), v_diff1));
__m128 v_eqg0 = _mm_cmpeq_ps(v_max0, v_g0);
__m128 v_eqg1 = _mm_cmpeq_ps(v_max1, v_g1);
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(_mm_andnot_ps(v_eqr0, v_eqg0), _mm_add_ps(_mm_mul_ps(_mm_sub_ps(v_b0, v_r0), v_diff0), _mm_set1_ps(120.f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(_mm_andnot_ps(v_eqr1, v_eqg1), _mm_add_ps(_mm_mul_ps(_mm_sub_ps(v_b1, v_r1), v_diff1), _mm_set1_ps(120.f))));
v_h0 = _mm_or_ps(v_h0, _mm_andnot_ps(_mm_or_ps(v_eqr0, v_eqg0), _mm_add_ps(_mm_mul_ps(_mm_sub_ps(v_r0, v_g0), v_diff0), _mm_set1_ps(240.f))));
v_h1 = _mm_or_ps(v_h1, _mm_andnot_ps(_mm_or_ps(v_eqr1, v_eqg1), _mm_add_ps(_mm_mul_ps(_mm_sub_ps(v_r1, v_g1), v_diff1), _mm_set1_ps(240.f))));
// Wrap negative hues into [0, 360) and scale to the output range.
v_h0 = _mm_add_ps(v_h0, _mm_and_ps(_mm_cmplt_ps(v_h0, _mm_setzero_ps()), _mm_set1_ps(360.f)));
v_h1 = _mm_add_ps(v_h1, _mm_and_ps(_mm_cmplt_ps(v_h1, _mm_setzero_ps()), _mm_set1_ps(360.f)));
v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale));
v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale));
v_b0 = _mm_and_ps(v_gteps0, v_h0);
v_b1 = _mm_and_ps(v_gteps1, v_h1);
v_g0 = v_l0;
v_g1 = v_l1;
v_r0 = _mm_and_ps(v_gteps0, v_s0);
v_r1 = _mm_and_ps(v_gteps1, v_s1);
}
#endif
// Convert n scn-channel float pixels to interleaved HLS.
void operator()(const float* src, float* dst, int n) const
{
int i = 0, bidx = blueIdx, scn = srccn;
n *= 3;
#if CV_SSE2
if (haveSIMD)
{
for( ; i <= n - 24; i += 24, src += scn * 8 )
{
__m128 v_b0 = _mm_loadu_ps(src + 0);
__m128 v_b1 = _mm_loadu_ps(src + 4);
__m128 v_g0 = _mm_loadu_ps(src + 8);
__m128 v_g1 = _mm_loadu_ps(src + 12);
__m128 v_r0 = _mm_loadu_ps(src + 16);
__m128 v_r1 = _mm_loadu_ps(src + 20);
if (scn == 3)
{
_mm_deinterleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
}
else
{
// 4-channel input: deinterleave with alpha, then discard it.
__m128 v_a0 = _mm_loadu_ps(src + 24);
__m128 v_a1 = _mm_loadu_ps(src + 28);
_mm_deinterleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1, v_a0, v_a1);
}
if (bidx)
{
__m128 v_tmp0 = v_b0;
__m128 v_tmp1 = v_b1;
v_b0 = v_r0;
v_b1 = v_r1;
v_r0 = v_tmp0;
v_r1 = v_tmp1;
}
process(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
_mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
_mm_storeu_ps(dst + i + 0, v_b0);
_mm_storeu_ps(dst + i + 4, v_b1);
_mm_storeu_ps(dst + i + 8, v_g0);
_mm_storeu_ps(dst + i + 12, v_g1);
_mm_storeu_ps(dst + i + 16, v_r0);
_mm_storeu_ps(dst + i + 20, v_r1);
}
}
#endif
// Scalar tail (and the only path when SSE2 is unavailable).
for( ; i < n; i += 3, src += scn )
{
float b = src[bidx], g = src[1], r = src[bidx^2];
float h = 0.f, s = 0.f, l;
float vmin, vmax, diff;
vmax = vmin = r;
if( vmax < g ) vmax = g;
if( vmax < b ) vmax = b;
if( vmin > g ) vmin = g;
if( vmin > b ) vmin = b;
diff = vmax - vmin;
l = (vmax + vmin)*0.5f;
if( diff > FLT_EPSILON )
{
s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
diff = 60.f/diff;
if( vmax == r )
h = (g - b)*diff;
else if( vmax == g )
h = (b - r)*diff + 120.f;
else
h = (r - g)*diff + 240.f;
if( h < 0.f ) h += 360.f;
}
dst[i] = h*hscale;
dst[i+1] = l;
dst[i+2] = s;
}
}
int srccn, blueIdx;  // input channel count (3 or 4); 0 = BGR, 2 = RGB input
float hscale;        // hrange/360: degrees -> output hue units
#if CV_SSE2
bool haveSIMD;
#endif
};
// 8-bit RGB/BGR -> HLS: widens pixels to float scaled by 1/255, runs
// RGB2HLS_f block-wise, then narrows back: H is stored as-is (already in
// [0,hrange]) while L and S are scaled by 255.
struct RGB2HLS_b
{
typedef uchar channel_type;
RGB2HLS_b(int _srccn, int _blueIdx, int _hrange)
: srccn(_srccn), cvt(3, _blueIdx, (float)_hrange)
{
#if CV_NEON
v_scale_inv = vdupq_n_f32(1.f/255.f);
v_scale = vdupq_n_f32(255.f);
v_alpha = vdup_n_u8(ColorChannel<uchar>::max())
;
#elif CV_SSE2
v_scale_inv = _mm_set1_ps(1.f/255.f);
v_zero = _mm_setzero_si128();
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
}
#if CV_SSE2
// Scale 16 HLS floats by a rotating coefficient vector with lane pattern
// (1, 255, 255, 1, ...) — H lanes stay as-is, L/S lanes get 255 — then pack
// to 16 saturated bytes. The 0x92 shuffle rotates the coefficient phase;
// v_coeffs is passed by reference so the rotation carries across calls.
void process(const float * buf,
__m128 & v_coeffs, uchar * dst) const
{
__m128 v_l0f = _mm_load_ps(buf);
__m128 v_l1f = _mm_load_ps(buf + 4);
__m128 v_u0f = _mm_load_ps(buf + 8);
__m128 v_u1f = _mm_load_ps(buf + 12);
v_l0f = _mm_mul_ps(v_l0f, v_coeffs);
v_u1f = _mm_mul_ps(v_u1f, v_coeffs);
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92));
v_u0f = _mm_mul_ps(v_u0f, v_coeffs);
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92));
v_l1f = _mm_mul_ps(v_l1f, v_coeffs);
__m128i v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f));
__m128i v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f));
__m128i v_l0 = _mm_packus_epi16(v_l, v_u);
_mm_storeu_si128((__m128i *)(dst), v_l0);
}
#endif
// Process n pixels in BLOCK_SIZE chunks through an aligned float buffer.
void operator()(const uchar* src, uchar* dst, int n) const
{
int i, j, scn = srccn;
float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
#if CV_SSE2
// Lane pattern (reversed by _mm_set_ps): 1, 255, 255, 1.
__m128 v_coeffs = _mm_set_ps(1.f, 255.f, 255.f, 1.f);
#endif
for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
{
int dn = std::min(n - i, (int)BLOCK_SIZE);
j = 0;
#if CV_NEON
for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
{
uint16x8_t v_t0, v_t1, v_t2;
if (scn == 3)
{
uint8x8x3_t v_src = vld3_u8(src);
v_t0 = vmovl_u8(v_src.val[0]);
v_t1 = vmovl_u8(v_src.val[1]);
v_t2 = vmovl_u8(v_src.val[2]);
}
else
{
// 4-channel input: load with alpha, keep only the first three planes.
uint8x8x4_t v_src = vld4_u8(src);
v_t0 = vmovl_u8(v_src.val[0]);
v_t1 = vmovl_u8(v_src.val[1]);
v_t2 = vmovl_u8(v_src.val[2]);
}
float32x4x3_t v_dst;
v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
vst3q_f32(buf + j, v_dst);
v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
vst3q_f32(buf + j + 12, v_dst);
}
#elif CV_SSE2
if (scn == 3 && haveSIMD)
{
// 16 bytes in -> 16 scaled floats out (channel-agnostic copy).
for ( ; j <= (dn * 3 - 16); j += 16, src += 16)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)src);
__m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
_mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
_mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
_mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
_mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
}
// Step back to a multiple of 3 so the scalar tail starts on a pixel boundary.
int jr = j % 3;
if (jr)
src -= jr, j -= jr;
}
else if (scn == 4 && haveSIMD)
{
// 4 BGRA pixels in -> 12 floats out, dropping the alpha lane.
for ( ; j <= (dn * 3 - 12); j += 12, src += 16)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)src);
__m128i v_src_lo = _mm_unpacklo_epi8(v_src, v_zero);
__m128i v_src_hi = _mm_unpackhi_epi8(v_src, v_zero);
_mm_storeu_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_lo, v_zero)), v_scale_inv));
_mm_storeu_ps(buf + j + 3, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_lo, v_zero)), v_scale_inv))
;
_mm_storeu_ps(buf + j + 6, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_hi, v_zero)), v_scale_inv));
// The last 4-wide store writes one float past the 3 channels needed;
// save and restore buf[j+8] so the preceding pixel is not clobbered.
float tmp = buf[j + 8];
_mm_storeu_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_unpackhi_epi16(v_src_hi, v_zero), 0x90)), v_scale_inv));
buf[j + 8] = tmp;
}
int jr = j % 3;
if (jr)
src -= jr, j -= jr;
}
#endif
// Scalar tail of the widening step.
for( ; j < dn*3; j += 3, src += scn )
{
buf[j] = src[0]*(1.f/255.f);
buf[j+1] = src[1]*(1.f/255.f);
buf[j+2] = src[2]*(1.f/255.f);
}
cvt(buf, buf, dn);
j = 0;
#if CV_NEON
// Narrow back: H stored directly, L and S scaled by 255.
for ( ; j <= (dn - 8) * 3; j += 24)
{
float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
uint8x8x3_t v_dst;
v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])),
vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0]))));
v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
vst3_u8(dst + j, v_dst);
}
#elif CV_SSE2
if (haveSIMD)
{
// 48 floats -> 48 bytes; three process() calls keep the coefficient
// rotation in phase across the batch.
for ( ; j <= (dn - 16) * 3; j += 48)
{
process(buf + j,
v_coeffs, dst + j);
process(buf + j + 16,
v_coeffs, dst + j + 16);
process(buf + j + 32,
v_coeffs, dst + j + 32);
}
}
#endif
// Scalar tail of the narrowing step.
for( ; j < dn*3; j += 3 )
{
dst[j] = saturate_cast<uchar>(buf[j]);
dst[j+1] = saturate_cast<uchar>(buf[j+1]*255.f);
dst[j+2] = saturate_cast<uchar>(buf[j+2]*255.f);
}
}
}
int srccn;       // input channel count (3 or 4)
RGB2HLS_f cvt;   // float converter doing the actual RGB->HLS math
#if CV_NEON
float32x4_t v_scale, v_scale_inv;
uint8x8_t v_alpha;
#elif CV_SSE2
__m128 v_scale_inv;
__m128i v_zero;
bool haveSIMD;
#endif
};
// Float HLS -> RGB/BGR. _hrange is the hue range of the input (360 for float
// pipelines); hscale maps it onto [0,6) hue sectors. _blueIdx selects BGR (0)
// vs RGB (2) output channel order; _dstcn is 3 or 4 (alpha filled with max).
struct HLS2RGB_f
{
    typedef float channel_type;

    HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {
#if CV_SSE2
        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
    }

#if CV_SSE2
    // Branch-free HLS->RGB for 8 pixels in planar registers. Computes the two
    // interpolation endpoints p1/p2 and the tab2/tab3 blends, then selects the
    // per-channel result by hue sector using comparison masks. Results are
    // written back into the inputs: v_h* <- B, v_l* <- G, v_s* <- R.
    void process(__m128& v_h0, __m128& v_h1, __m128& v_l0,
                 __m128& v_l1, __m128& v_s0, __m128& v_s1) const
    {
        // p2 = L<=0.5 ? L*(1+S) : L+S-L*S;  p1 = 2*L - p2
        __m128 v_lel0 = _mm_cmple_ps(v_l0, _mm_set1_ps(0.5f));
        __m128 v_lel1 = _mm_cmple_ps(v_l1, _mm_set1_ps(0.5f));
        __m128 v_p20 = _mm_andnot_ps(v_lel0, _mm_sub_ps(_mm_add_ps(v_l0, v_s0), _mm_mul_ps(v_l0, v_s0)));
        __m128 v_p21 = _mm_andnot_ps(v_lel1, _mm_sub_ps(_mm_add_ps(v_l1, v_s1), _mm_mul_ps(v_l1, v_s1)));
        v_p20 = _mm_or_ps(v_p20, _mm_and_ps(v_lel0, _mm_mul_ps(v_l0, _mm_add_ps(_mm_set1_ps(1.0f), v_s0))));
        v_p21 = _mm_or_ps(v_p21, _mm_and_ps(v_lel1, _mm_mul_ps(v_l1, _mm_add_ps(_mm_set1_ps(1.0f), v_s1))));
        __m128 v_p10 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l0), v_p20);
        __m128 v_p11 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l1), v_p21);
        // Scale hue into sector units; split into integer sector + fraction.
        v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale));
        v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale));
        __m128 v_pre_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h0));
        __m128 v_pre_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h1));
        v_h0 = _mm_sub_ps(v_h0, v_pre_sector0);
        v_h1 = _mm_sub_ps(v_h1, v_pre_sector1);
        __m128 v_p2_p10 = _mm_sub_ps(v_p20, v_p10);
        __m128 v_p2_p11 = _mm_sub_ps(v_p21, v_p11);
        __m128 v_tab20 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, _mm_sub_ps(_mm_set1_ps(1.0f), v_h0)));
        __m128 v_tab21 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, _mm_sub_ps(_mm_set1_ps(1.0f), v_h1)));
        __m128 v_tab30 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, v_h0));
        __m128 v_tab31 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, v_h1));
        // sector = pre_sector mod 6 (computed as pre_sector - 6*trunc(pre_sector/6)).
        __m128 v_sector0 = _mm_div_ps(v_pre_sector0, _mm_set1_ps(6.0f));
        __m128 v_sector1 = _mm_div_ps(v_pre_sector1, _mm_set1_ps(6.0f));
        v_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector0));
        v_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector1));
        v_sector0 = _mm_mul_ps(v_sector0, _mm_set1_ps(6.0f));
        v_sector1 = _mm_mul_ps(v_sector1, _mm_set1_ps(6.0f));
        v_sector0 = _mm_sub_ps(v_pre_sector0, v_sector0);
        v_sector1 = _mm_sub_ps(v_pre_sector1, v_sector1);
        // Per-sector channel selection (mirrors the scalar sector_data table).
        v_h0 = _mm_and_ps(v_p10, _mm_cmplt_ps(v_sector0, _mm_set1_ps(2.0f)));
        v_h1 = _mm_and_ps(v_p11, _mm_cmplt_ps(v_sector1, _mm_set1_ps(2.0f)));
        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
        v_l0 = _mm_and_ps(v_tab30, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
        v_l1 = _mm_and_ps(v_tab31, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p10, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(3.0f))));
        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p11, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(3.0f))));
        v_s0 = _mm_and_ps(v_p20, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
        v_s1 = _mm_and_ps(v_p21, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
    }
#endif

    // Convert n interleaved HLS float pixels to dcn-channel BGR/RGB pixels.
    void operator()(const float* src, float* dst, int n) const
    {
        int i = 0, bidx = blueIdx, dcn = dstcn;
        float _hscale = hscale;
        float alpha = ColorChannel<float>::max();
        n *= 3;
#if CV_SSE2
        // Vector path: 8 pixels (24 floats) per iteration.
        if (haveSIMD)
        {
            for( ; i <= n - 24; i += 24, dst += dcn * 8 )
            {
                __m128 v_h0 = _mm_loadu_ps(src + i + 0);
                __m128 v_h1 = _mm_loadu_ps(src + i + 4);
                __m128 v_l0 = _mm_loadu_ps(src + i + 8);
                __m128 v_l1 = _mm_loadu_ps(src + i + 12);
                __m128 v_s0 = _mm_loadu_ps(src + i + 16);
                __m128 v_s1 = _mm_loadu_ps(src + i + 20);
                _mm_deinterleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
                process(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
                if (dcn == 3)
                {
                    // bidx != 0 means blue goes last -> swap B and R planes.
                    if (bidx)
                    {
                        _mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1);
                        _mm_storeu_ps(dst + 0, v_s0);
                        _mm_storeu_ps(dst + 4, v_s1);
                        _mm_storeu_ps(dst + 8, v_l0);
                        _mm_storeu_ps(dst + 12, v_l1);
                        _mm_storeu_ps(dst + 16, v_h0);
                        _mm_storeu_ps(dst + 20, v_h1);
                    }
                    else
                    {
                        _mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
                        _mm_storeu_ps(dst + 0, v_h0);
                        _mm_storeu_ps(dst + 4, v_h1);
                        _mm_storeu_ps(dst + 8, v_l0);
                        _mm_storeu_ps(dst + 12, v_l1);
                        _mm_storeu_ps(dst + 16, v_s0);
                        _mm_storeu_ps(dst + 20, v_s1);
                    }
                }
                else
                {
                    // dcn == 4: append a constant alpha plane.
                    __m128 v_a0 = _mm_set1_ps(alpha);
                    __m128 v_a1 = _mm_set1_ps(alpha);
                    if (bidx)
                    {
                        _mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1, v_a0, v_a1);
                        _mm_storeu_ps(dst + 0, v_s0);
                        _mm_storeu_ps(dst + 4, v_s1);
                        _mm_storeu_ps(dst + 8, v_l0);
                        _mm_storeu_ps(dst + 12, v_l1);
                        _mm_storeu_ps(dst + 16, v_h0);
                        _mm_storeu_ps(dst + 20, v_h1);
                        _mm_storeu_ps(dst + 24, v_a0);
                        _mm_storeu_ps(dst + 28, v_a1);
                    }
                    else
                    {
                        _mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1, v_a0, v_a1);
                        _mm_storeu_ps(dst + 0, v_h0);
                        _mm_storeu_ps(dst + 4, v_h1);
                        _mm_storeu_ps(dst + 8, v_l0);
                        _mm_storeu_ps(dst + 12, v_l1);
                        _mm_storeu_ps(dst + 16, v_s0);
                        _mm_storeu_ps(dst + 20, v_s1);
                        _mm_storeu_ps(dst + 24, v_a0);
                        _mm_storeu_ps(dst + 28, v_a1);
                    }
                }
            }
        }
#endif
        // Scalar tail (and the only path when SSE2 is unavailable).
        for( ; i < n; i += 3, dst += dcn )
        {
            float h = src[i], l = src[i+1], s = src[i+2];
            float b, g, r;
            if( s == 0 )
                b = g = r = l;
            else
            {
                // sector_data maps hue sector -> which of tab[0..3] feeds B, G, R.
                static const int sector_data[][3]=
                {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
                float tab[4];
                int sector;
                float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                float p1 = 2*l - p2;
                h *= _hscale;
                // Wrap hue into [0,6) one sector-width at a time.
                if( h < 0 )
                    do h += 6; while( h < 0 );
                else if( h >= 6 )
                    do h -= 6; while( h >= 6 );
                sector = cvFloor(h);
                h -= sector;
                // Guard against floating-point edge cases (e.g. NaN input) that
                // could leave the sector index out of [0,6) and overrun
                // sector_data; clamp like HSV2RGB_f instead of the old
                // debug-only assert.
                if( (unsigned)sector >= 6u )
                {
                    sector = 0;
                    h = 0.f;
                }
                tab[0] = p2;
                tab[1] = p1;
                tab[2] = p1 + (p2 - p1)*(1-h);
                tab[3] = p1 + (p2 - p1)*h;
                b = tab[sector_data[sector][0]];
                g = tab[sector_data[sector][1]];
                r = tab[sector_data[sector][2]];
            }
            dst[bidx] = b;
            dst[1] = g;
            dst[bidx^2] = r;
            if( dcn == 4 )
                dst[3] = alpha;
        }
    }

    int dstcn, blueIdx;  // output channel count (3 or 4); 0 = BGR, 2 = RGB output
    float hscale;        // 6/hrange: input hue units -> sector units
#if CV_SSE2
    bool haveSIMD;       // runtime SSE2 availability
#endif
};
// 8-bit HLS -> BGR/RGB: widens pixels to float, scales L and S to [0,1]
// (H passed through unscaled; the float converter is built with the full
// hue range), runs HLS2RGB_f block-wise, then scales results back by 255.
struct HLS2RGB_b
{
typedef uchar channel_type;
HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange)
: dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
{
#if CV_NEON
v_scale_inv = vdupq_n_f32(1.f/255.f);
v_scale = vdupq_n_f32(255.f);
v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
#elif CV_SSE2
v_scale = _mm_set1_ps(255.f);
v_alpha = _mm_set1_ps(ColorChannel<uchar>::max());
v_zero = _mm_setzero_si128();
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
}
#if CV_SSE2
// Widen 8 interleaved HLS pixels (as three u16x8 halves) to float and
// multiply by a rotating coefficient vector with lane pattern
// (1, 1/255, 1/255, 1, ...): H lanes stay unscaled, L/S lanes get 1/255.
// The 0x49 shuffle keeps the per-channel coefficient phase aligned.
void process(__m128i v_r, __m128i v_g, __m128i v_b,
const __m128& v_coeffs_,
float * buf) const
{
__m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
__m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
__m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
__m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
__m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
__m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
__m128 v_coeffs = v_coeffs_;
v_r0 = _mm_mul_ps(v_r0, v_coeffs);
v_g1 = _mm_mul_ps(v_g1, v_coeffs);
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49));
v_r1 = _mm_mul_ps(v_r1, v_coeffs);
v_b0 = _mm_mul_ps(v_b0, v_coeffs);
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49));
v_g0 = _mm_mul_ps(v_g0, v_coeffs);
v_b1 = _mm_mul_ps(v_b1, v_coeffs);
_mm_store_ps(buf, v_r0);
_mm_store_ps(buf + 4, v_r1);
_mm_store_ps(buf + 8, v_g0);
_mm_store_ps(buf + 12, v_g1);
_mm_store_ps(buf + 16, v_b0);
_mm_store_ps(buf + 20, v_b1);
}
#endif
// Process n pixels in BLOCK_SIZE chunks through an aligned float buffer.
void operator()(const uchar* src, uchar* dst, int n) const
{
int i, j, dcn = dstcn;
uchar alpha = ColorChannel<uchar>::max();
float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
#if CV_SSE2
// Lane pattern (reversed by _mm_set_ps): 1, 1/255, 1/255, 1.
__m128 v_coeffs = _mm_set_ps(1.f, 1.f/255.f, 1.f/255.f, 1.f);
#endif
for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
{
int dn = std::min(n - i, (int)BLOCK_SIZE);
j = 0;
#if CV_NEON
// Widen 8 pixels at a time: H as-is, L and S scaled by 1/255.
for ( ; j <= (dn - 8) * 3; j += 24)
{
uint8x8x3_t v_src = vld3_u8(src + j);
uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
v_t1 = vmovl_u8(v_src.val[1]),
v_t2 = vmovl_u8(v_src.val[2]);
float32x4x3_t v_dst;
v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
vst3q_f32(buf + j, v_dst);
v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
vst3q_f32(buf + j + 12, v_dst);
}
#elif CV_SSE2
if (haveSIMD)
{
for ( ; j <= (dn - 8) * 3; j += 24)
{
__m128i v_src0 = _mm_loadu_si128((__m128i const *)(src + j));
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src + j + 16));
process(_mm_unpacklo_epi8(v_src0, v_zero),
_mm_unpackhi_epi8(v_src0, v_zero),
_mm_unpacklo_epi8(v_src1, v_zero),
v_coeffs,
buf + j);
}
}
#endif
// Scalar tail of the widening step.
for( ; j < dn*3; j += 3 )
{
buf[j] = src[j];
buf[j+1] = src[j+1]*(1.f/255.f);
buf[j+2] = src[j+2]*(1.f/255.f);
}
// Float conversion in place.
cvt(buf, buf, dn);
j = 0;
#if CV_NEON
// Narrow back to u8 with saturation, scaling all channels by 255.
for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
{
float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
if (dcn == 4)
{
uint8x8x4_t v_dst;
v_dst.val[0] = v_dst0;
v_dst.val[1] = v_dst1;
v_dst.val[2] = v_dst2;
v_dst.val[3] = v_alpha;
vst4_u8(dst, v_dst);
}
else
{
uint8x8x3_t v_dst;
v_dst.val[0] = v_dst0;
v_dst.val[1] = v_dst1;
v_dst.val[2] = v_dst2;
vst3_u8(dst, v_dst);
}
}
#elif CV_SSE2
if (dcn == 3 && haveSIMD)
{
// Pack 16 floats -> 16 bytes per iteration (channel-agnostic copy).
for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
{
__m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
__m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
__m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
__m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
__m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
_mm_cvtps_epi32(v_src1));
__m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
_mm_cvtps_epi32(v_src3));
_mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
}
// Step back to a multiple of 3 so the scalar tail starts on a pixel boundary.
int jr = j % 3;
if (jr)
dst -= jr, j -= jr;
}
else if (dcn == 4 && haveSIMD)
{
// 4 pixels (12 floats) in -> 16 bytes out, splicing in the alpha lane.
for ( ; j <= (dn * 3 - 12); j += 12, dst += 16)
{
__m128 v_buf0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
__m128 v_buf1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
__m128 v_buf2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
__m128 v_ba0 = _mm_unpackhi_ps(v_buf0, v_alpha);
__m128 v_ba1 = _mm_unpacklo_ps(v_buf2, v_alpha);
__m128i v_src0 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf0, v_ba0, 0x44));
__m128i v_src1 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba0, v_buf1, 0x4e)), 0x78);
__m128i v_src2 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf1, v_ba1, 0x4e));
__m128i v_src3 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba1, v_buf2, 0xee)), 0x78);
__m128i v_dst0 = _mm_packs_epi32(v_src0, v_src1);
__m128i v_dst1 = _mm_packs_epi32(v_src2, v_src3);
_mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
}
int jr = j % 3;
if (jr)
dst -= jr, j -= jr;
}
#endif
// Scalar tail of the narrowing step.
for( ; j < dn*3; j += 3, dst += dcn )
{
dst[0] = saturate_cast<uchar>(buf[j]*255.f);
dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
if( dcn == 4 )
dst[3] = alpha;
}
}
}
int dstcn;       // output channel count (3 or 4)
HLS2RGB_f cvt;   // float converter doing the actual HLS->RGB math
#if CV_NEON
float32x4_t v_scale, v_scale_inv;
uint8x8_t v_alpha;
#elif CV_SSE2
__m128 v_scale;
__m128 v_alpha;
__m128i v_zero;
bool haveSIMD;
#endif
};
//
// IPP functions
//
#if NEED_IPP
// IPP fast-path function tables, indexed by OpenCV depth (CV_8U=0 ... CV_64F=6);
// zero entries mark depths the IPP conversion does not support.
#if !IPP_DISABLE_RGB_HSV
// Guarded: IPP RGB->HSV results differ enough to break OCL accuracy tests.
static ippiGeneralFunc ippiRGB2HSVTab[] =
{
(ippiGeneralFunc)ippiRGBToHSV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHSV_16u_C3R, 0,
0, 0, 0, 0
};
#endif
static ippiGeneralFunc ippiHSV2RGBTab[] =
{
(ippiGeneralFunc)ippiHSVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHSVToRGB_16u_C3R, 0,
0, 0, 0, 0
};
static ippiGeneralFunc ippiRGB2HLSTab[] =
{
(ippiGeneralFunc)ippiRGBToHLS_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHLS_16u_C3R, 0,
0, (ippiGeneralFunc)ippiRGBToHLS_32f_C3R, 0, 0
};
static ippiGeneralFunc ippiHLS2RGBTab[] =
{
(ippiGeneralFunc)ippiHLSToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHLSToRGB_16u_C3R, 0,
0, (ippiGeneralFunc)ippiHLSToRGB_32f_C3R, 0, 0
};
#endif
//
// HAL functions
//
namespace hal
{
// 8u, 32f
// 8u, 32f
// HAL entry point for BGR/RGB -> HSV/HLS. Dispatch order: custom HAL hook,
// then IPP fast paths (8u, full hue range only), then the generic C++/SIMD
// CvtColorLoop implementations above.
// swapBlue: source is RGB rather than BGR; isFullRange: 8u hue spans 0..255
// instead of 0..179; isHSV selects HSV vs HLS output.
void cvtBGRtoHSV(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int width, int height,
int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV)
{
CV_INSTRUMENT_REGION()
CALL_HAL(cvtBGRtoHSV, cv_hal_cvtBGRtoHSV, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isFullRange, isHSV);
#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
CV_IPP_CHECK()
{
// IPP paths only cover 8-bit full-range hue; IPP expects RGB order, so
// channel-reorder functors bridge from BGR/BGRA inputs.
if(depth == CV_8U && isFullRange)
{
if (isHSV)
{
#if !IPP_DISABLE_RGB_HSV // breaks OCL accuracy tests
if(scn == 3 && !swapBlue)
{
if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height,
IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
return;
}
else if(scn == 4 && !swapBlue)
{
if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
return;
}
else if(scn == 4 && swapBlue)
{
if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 0, 1, 2, depth)) )
return;
}
#endif
}
else
{
if(scn == 3 && !swapBlue)
{
if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height,
IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
return;
}
else if(scn == 4 && !swapBlue)
{
if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
return;
}
else if(scn == 3 && swapBlue)
{
if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height,
IPPGeneralFunctor(ippiRGB2HLSTab[depth])) )
return;
}
else if(scn == 4 && swapBlue)
{
if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 0, 1, 2, depth)) )
return;
}
}
}
}
#endif
// Hue range: 360 for float output; for 8u, 256 (full range) or 180 (fits a byte).
int hrange = depth == CV_32F ? 360 : isFullRange ? 256 : 180;
int blueIdx = swapBlue ? 2 : 0;
if(isHSV)
{
if(depth == CV_8U)
CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HSV_b(scn, blueIdx, hrange));
else
CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HSV_f(scn, blueIdx, static_cast<float>(hrange)));
}
else
{
if( depth == CV_8U )
CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HLS_b(scn, blueIdx, hrange));
else
CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HLS_f(scn, blueIdx, static_cast<float>(hrange)));
}
}
// 8u, 32f
// HAL entry point: converts HSV or HLS pixels back to packed BGR/RGB.
// src_data/src_step, dst_data/dst_step - pixel buffers with row steps in bytes
// depth       - CV_8U or CV_32F
// dcn         - destination channel count (3 or 4); the source always has 3
// swapBlue    - if true the destination channel order is RGB(A) rather than BGR(A)
// isFullRange - for CV_8U only: hue stored in 0..255 instead of 0..180
// isHSV       - source is HSV (true) or HLS (false)
// Dispatch order mirrors cvtBGRtoHSV: HAL hook, IPP (8-bit full-range), then
// the generic CvtColorLoop fallback.
void cvtHSVtoBGR(const uchar * src_data, size_t src_step,
                 uchar * dst_data, size_t dst_step,
                 int width, int height,
                 int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtHSVtoBGR, cv_hal_cvtHSVtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isFullRange, isHSV);

#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
    CV_IPP_CHECK()
    {
        // IPP produces RGB order, so BGR outputs append a channel-swap functor
        // (indices 2,1,0); 4-channel variants also add an alpha channel.
        if (depth == CV_8U && isFullRange)
        {
            if (isHSV)
            {
                if(dcn == 3 && !swapBlue)
                {
                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height,
                                            IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
                        return;
                }
                else if(dcn == 4 && !swapBlue)
                {
                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                        IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
                        return;
                }
                else if(dcn == 3 && swapBlue)
                {
                    // RGB destination matches IPP's native order: no reorder step
                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height,
                                            IPPGeneralFunctor(ippiHSV2RGBTab[depth])) )
                        return;
                }
                else if(dcn == 4 && swapBlue)
                {
                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                        IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
                        return;
                }
            }
            else
            {
                if(dcn == 3 && !swapBlue)
                {
                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height,
                                            IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
                        return;
                }
                else if(dcn == 4 && !swapBlue)
                {
                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                        IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
                        return;
                }
                else if(dcn == 3 && swapBlue)
                {
                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height,
                                            IPPGeneralFunctor(ippiHLS2RGBTab[depth])) )
                        return;
                }
                else if(dcn == 4 && swapBlue)
                {
                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                        IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
                        return;
                }
            }
        }
    }
#endif

    // Generic fallback. Note: full-range 8-bit hue uses 255 in the backward
    // direction (vs 256 in cvtBGRtoHSV above).
    int hrange = depth == CV_32F ? 360 : isFullRange ? 255 : 180;
    int blueIdx = swapBlue ? 2 : 0;
    if(isHSV)
    {
        if( depth == CV_8U )
            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HSV2RGB_b(dcn, blueIdx, hrange));
        else
            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HSV2RGB_f(dcn, blueIdx, static_cast<float>(hrange)));
    }
    else
    {
        if( depth == CV_8U )
            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HLS2RGB_b(dcn, blueIdx, hrange));
        else
            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HLS2RGB_f(dcn, blueIdx, static_cast<float>(hrange)));
    }
}
} // namespace hal
//
// OCL calls
//
#ifdef HAVE_OPENCL
// OpenCL path for HSV -> BGR/RGB conversion. Returns false if the kernel
// cannot be created or run, letting the caller fall back to the CPU path.
bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full )
{
    OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > helper(_src, _dst, dcn);

    // Hue range of the stored data: 360 for float, 255 (full) or 180 otherwise.
    int hueRange;
    if (_src.depth() == CV_32F)
        hueRange = 360;
    else
        hueRange = full ? 255 : 180;

    cv::String buildOpts = format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff",
                                  dcn, bidx, hueRange, 6.f/hueRange);
    return helper.createKernel("HSV2RGB", ocl::imgproc::color_hsv_oclsrc, buildOpts)
           && helper.run();
}
// OpenCL path for HLS -> BGR/RGB conversion. Returns false if the kernel
// cannot be created or run, letting the caller fall back to the CPU path.
bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full )
{
    OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > helper(_src, _dst, dcn);

    // Hue range of the stored data: 360 for float, 255 (full) or 180 otherwise.
    int hueRange;
    if (_src.depth() == CV_32F)
        hueRange = 360;
    else
        hueRange = full ? 255 : 180;

    cv::String buildOpts = format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff",
                                  dcn, bidx, hueRange, 6.f/hueRange);
    return helper.createKernel("HLS2RGB", ocl::imgproc::color_hsv_oclsrc, buildOpts)
           && helper.run();
}
// OpenCL path for BGR/RGB -> HLS conversion. Returns false if the kernel
// cannot be created or run, letting the caller fall back to the CPU path.
bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full )
{
    OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > helper(_src, _dst, 3);

    // hscale maps hue in degrees (0..360) into the destination range:
    // 360 for float, 256 for 8-bit full range, 180 otherwise.
    float destRange;
    if (_src.depth() == CV_32F)
        destRange = 360.f;
    else
        destRange = full ? 256.f : 180.f;
    float hscale = destRange / 360.f;

    cv::String buildOpts = format("-D hscale=%ff -D bidx=%d -D dcn=3", hscale, bidx);
    return helper.createKernel("RGB2HLS", ocl::imgproc::color_hsv_oclsrc, buildOpts)
           && helper.run();
}
// OpenCL path for BGR/RGB -> HSV conversion. The 8-bit kernel uses precomputed
// fixed-point division tables (uploaded once per process as UMats); the float
// kernel only needs a compile-time hue scale. Returns false on kernel
// creation/run failure so the caller can fall back to the CPU path.
bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full )
{
    OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > h(_src, _dst, 3);

    int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 256);

    cv::String options = (_src.depth() == CV_8U ?
                          format("-D hrange=%d -D bidx=%d -D dcn=3", hrange, bidx) :
                          format("-D hscale=%ff -D bidx=%d -D dcn=3", hrange*(1.f/360.f), bidx));

    if(!h.createKernel("RGB2HSV", ocl::imgproc::color_hsv_oclsrc, options))
    {
        return false;
    }

    if(_src.depth() == CV_8U)
    {
        // Lazily-initialized lookup tables shared by all calls:
        // sdiv_table[i] ~ (255<<hsv_shift)/i (saturation divisor),
        // hdiv_table*[i] ~ (hrange<<hsv_shift)/(6*i) (hue divisor), with
        // separate hue tables for the 180 and 256 ranges.
        static UMat sdiv_data;
        static UMat hdiv_data180;
        static UMat hdiv_data256;
        static int sdiv_table[256];
        static int hdiv_table180[256];
        static int hdiv_table256[256];
        // Guarded init: each hue range is initialized at most once; note the
        // saturation table is shared and filled on the first init of either.
        static volatile bool initialized180 = false, initialized256 = false;
        volatile bool & initialized = hrange == 180 ? initialized180 : initialized256;

        if (!initialized)
        {
            int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12;
            UMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256;

            sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;

            int v = 255 << hsv_shift;
            if (!initialized180 && !initialized256)
            {
                for(int i = 1; i < 256; i++ )
                    sdiv_table[i] = saturate_cast<int>(v/(1.*i));
                Mat(1, 256, CV_32SC1, sdiv_table).copyTo(sdiv_data);
            }

            v = hrange << hsv_shift;
            for (int i = 1; i < 256; i++ )
                hdiv_table[i] = saturate_cast<int>(v/(6.*i));

            Mat(1, 256, CV_32SC1, hdiv_table).copyTo(hdiv_data);
            initialized = true;
        }

        h.setArg(ocl::KernelArg::PtrReadOnly(sdiv_data));
        h.setArg(hrange == 256 ? ocl::KernelArg::PtrReadOnly(hdiv_data256) :
                                 ocl::KernelArg::PtrReadOnly(hdiv_data180));
    }

    return h.run();
}
#endif
//
// HAL calls
//
// CPU wrapper: BGR/RGB (3/4-channel, 8U/32F) -> 3-channel HLS via the HAL.
void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange )
{
    CvtHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > helper(_src, _dst, 3);
    hal::cvtBGRtoHSV(helper.src.data, helper.src.step,
                     helper.dst.data, helper.dst.step,
                     helper.src.cols, helper.src.rows,
                     helper.depth, helper.scn,
                     swapb, fullRange, /*isHSV=*/false);
}
// CPU wrapper: BGR/RGB (3/4-channel, 8U/32F) -> 3-channel HSV via the HAL.
void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange )
{
    CvtHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > helper(_src, _dst, 3);
    hal::cvtBGRtoHSV(helper.src.data, helper.src.step,
                     helper.dst.data, helper.dst.step,
                     helper.src.cols, helper.src.rows,
                     helper.depth, helper.scn,
                     swapb, fullRange, /*isHSV=*/true);
}
// CPU wrapper: 3-channel HLS -> BGR/RGB (dcn channels, default 3) via the HAL.
void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange)
{
    if (dcn <= 0)
        dcn = 3; // unspecified destination channel count defaults to 3
    CvtHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > helper(_src, _dst, dcn);
    hal::cvtHSVtoBGR(helper.src.data, helper.src.step,
                     helper.dst.data, helper.dst.step,
                     helper.src.cols, helper.src.rows,
                     helper.depth, dcn,
                     swapb, fullRange, /*isHSV=*/false);
}
// CPU wrapper: 3-channel HSV -> BGR/RGB (dcn channels, default 3) via the HAL.
void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange)
{
    if (dcn <= 0)
        dcn = 3; // unspecified destination channel count defaults to 3
    CvtHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > helper(_src, _dst, dcn);
    hal::cvtHSVtoBGR(helper.src.data, helper.src.step,
                     helper.dst.data, helper.dst.step,
                     helper.src.cols, helper.src.rows,
                     helper.depth, dcn,
                     swapb, fullRange, /*isHSV=*/true);
}
} // namespace cv
This source diff could not be displayed because it is too large. You can view the blob instead.
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
#include "color.hpp"
namespace cv
{
////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
// Generic 3/4-channel <-> 3/4-channel reshuffle: optionally swaps R and B
// (blueIdx is 0 or 2; bidx^2 gives the opposite channel) and adds/drops alpha.
// n is the pixel count. Note the pointer-advance asymmetry: whichever side has
// the variable channel count is advanced by a pointer, the other is indexed.
template<typename _Tp> struct RGB2RGB
{
    typedef _Tp channel_type;

    RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) {}

    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
        int scn = srccn, dcn = dstcn, bidx = blueIdx;
        if( dcn == 3 )
        {
            // 3- or 4-channel source to 3-channel dest (alpha dropped if scn==4)
            n *= 3;
            for( int i = 0; i < n; i += 3, src += scn )
            {
                _Tp t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
                dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
            }
        }
        else if( scn == 3 )
        {
            // 3-channel source to 4-channel dest: alpha filled with channel max
            n *= 3;
            _Tp alpha = ColorChannel<_Tp>::max();
            for( int i = 0; i < n; i += 3, dst += 4 )
            {
                _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2];
                dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
            }
        }
        else
        {
            // 4-channel to 4-channel: swap R/B, alpha passed through
            n *= 4;
            for( int i = 0; i < n; i += 4 )
            {
                _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
                dst[i+bidx] = t0; dst[i+1] = t1; dst[i+(bidx^2)] = t2; dst[i+3] = t3;
            }
        }
    }

    int srccn, dstcn, blueIdx;
};
#if CV_NEON
// NEON specialization of RGB2RGB for 8-bit pixels. Processes 16 pixels per
// iteration with de-interleaving loads (vld3q/vld4q), then 8 pixels, then a
// scalar tail; each branch matches the generic version's semantics exactly.
template<> struct RGB2RGB<uchar>
{
    typedef uchar channel_type;

    RGB2RGB(int _srccn, int _dstcn, int _blueIdx) :
        srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx)
    {
        // Pre-broadcast alpha fill value for the 3 -> 4 channel branch
        v_alpha = vdupq_n_u8(ColorChannel<uchar>::max());
        v_alpha2 = vget_low_u8(v_alpha);
    }

    void operator()(const uchar * src, uchar * dst, int n) const
    {
        int scn = srccn, dcn = dstcn, bidx = blueIdx, i = 0;
        if (dcn == 3)
        {
            n *= 3;
            if (scn == 3)
            {
                // 3 -> 3: pure R/B swap, 16 then 8 pixels per vector iteration
                for ( ; i <= n - 48; i += 48, src += 48 )
                {
                    uint8x16x3_t v_src = vld3q_u8(src), v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3q_u8(dst + i, v_dst);
                }

                for ( ; i <= n - 24; i += 24, src += 24 )
                {
                    uint8x8x3_t v_src = vld3_u8(src), v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3_u8(dst + i, v_dst);
                }

                for ( ; i < n; i += 3, src += 3 )
                {
                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
                }
            }
            else
            {
                // 4 -> 3: drop alpha while swapping (src advances 4 per pixel)
                for ( ; i <= n - 48; i += 48, src += 64 )
                {
                    uint8x16x4_t v_src = vld4q_u8(src);
                    uint8x16x3_t v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3q_u8(dst + i, v_dst);
                }

                for ( ; i <= n - 24; i += 24, src += 32 )
                {
                    uint8x8x4_t v_src = vld4_u8(src);
                    uint8x8x3_t v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3_u8(dst + i, v_dst);
                }

                for ( ; i < n; i += 3, src += 4 )
                {
                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
                }
            }
        }
        else if (scn == 3)
        {
            // 3 -> 4: insert constant alpha channel
            n *= 3;
            for ( ; i <= n - 48; i += 48, dst += 64 )
            {
                uint8x16x3_t v_src = vld3q_u8(src + i);
                uint8x16x4_t v_dst;
                v_dst.val[bidx] = v_src.val[0];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[bidx ^ 2] = v_src.val[2];
                v_dst.val[3] = v_alpha;
                vst4q_u8(dst, v_dst);
            }

            for ( ; i <= n - 24; i += 24, dst += 32 )
            {
                uint8x8x3_t v_src = vld3_u8(src + i);
                uint8x8x4_t v_dst;
                v_dst.val[bidx] = v_src.val[0];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[bidx ^ 2] = v_src.val[2];
                v_dst.val[3] = v_alpha2;
                vst4_u8(dst, v_dst);
            }

            uchar alpha = ColorChannel<uchar>::max();
            for (; i < n; i += 3, dst += 4 )
            {
                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2];
                dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
            }
        }
        else
        {
            // 4 -> 4: swap R/B, pass alpha through
            n *= 4;
            for ( ; i <= n - 64; i += 64 )
            {
                uint8x16x4_t v_src = vld4q_u8(src + i), v_dst;
                v_dst.val[0] = v_src.val[bidx];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[2] = v_src.val[bidx^2];
                v_dst.val[3] = v_src.val[3];
                vst4q_u8(dst + i, v_dst);
            }

            for ( ; i <= n - 32; i += 32 )
            {
                uint8x8x4_t v_src = vld4_u8(src + i), v_dst;
                v_dst.val[0] = v_src.val[bidx];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[2] = v_src.val[bidx^2];
                v_dst.val[3] = v_src.val[3];
                vst4_u8(dst + i, v_dst);
            }

            for ( ; i < n; i += 4)
            {
                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
                dst[i+bidx] = t0; dst[i+1] = t1; dst[i+(bidx^2)] = t2; dst[i+3] = t3;
            }
        }
    }

    int srccn, dstcn, blueIdx;
    uint8x16_t v_alpha;
    uint8x8_t v_alpha2;
};
#endif
/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
// Unpacks 16-bit packed RGB into 8-bit BGR/BGRA pixels.
// greenBits == 6 selects RGB565 (bbbbb gggggg rrrrr); otherwise RGB555, where
// the top bit acts as a 1-bit alpha when dcn == 4. Each 5/6-bit field is
// expanded to 8 bits by left-shifting (low bits are left zero, not replicated).
struct RGB5x52RGB
{
    typedef uchar channel_type;

    RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits)
        : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits)
    {
#if CV_NEON
        v_n3 = vdupq_n_u16(~3);
        v_n7 = vdupq_n_u16(~7);
        v_255 = vdupq_n_u8(255);
        v_0 = vdupq_n_u8(0);
        v_mask = vdupq_n_u16(0x8000); // RGB555 alpha bit
#endif
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx, i = 0;
        if( greenBits == 6 )
        {
#if CV_NEON
            // 16 pixels per iteration; same bit extraction as the scalar tail
            for ( ; i <= n - 16; i += 16, dst += dcn * 16)
            {
                uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
                uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
                uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 3), v_n3)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 3), v_n3)));
                uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 8), v_n7)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 8), v_n7)));
                if (dcn == 3)
                {
                    uint8x16x3_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    vst3q_u8(dst, v_dst);
                }
                else
                {
                    uint8x16x4_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    v_dst.val[3] = v_255; // 565 has no alpha bit: fully opaque
                    vst4q_u8(dst, v_dst);
                }
            }
#endif
            for( ; i < n; i++, dst += dcn )
            {
                unsigned t = ((const ushort*)src)[i];
                dst[bidx] = (uchar)(t << 3);
                dst[1] = (uchar)((t >> 3) & ~3);
                dst[bidx ^ 2] = (uchar)((t >> 8) & ~7);
                if( dcn == 4 )
                    dst[3] = 255;
            }
        }
        else
        {
#if CV_NEON
            for ( ; i <= n - 16; i += 16, dst += dcn * 16)
            {
                uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
                uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
                uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 2), v_n7)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 2), v_n7)));
                uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 7), v_n7)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 7), v_n7)));
                if (dcn == 3)
                {
                    uint8x16x3_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    vst3q_u8(dst, v_dst);
                }
                else
                {
                    uint8x16x4_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    // alpha = 255 where the 0x8000 bit is set, else 0
                    v_dst.val[3] = vbslq_u8(vcombine_u8(vqmovn_u16(vandq_u16(v_src0, v_mask)),
                                                        vqmovn_u16(vandq_u16(v_src1, v_mask))), v_255, v_0);
                    vst4q_u8(dst, v_dst);
                }
            }
#endif
            for( ; i < n; i++, dst += dcn )
            {
                unsigned t = ((const ushort*)src)[i];
                dst[bidx] = (uchar)(t << 3);
                dst[1] = (uchar)((t >> 2) & ~7);
                dst[bidx ^ 2] = (uchar)((t >> 7) & ~7);
                if( dcn == 4 )
                    dst[3] = t & 0x8000 ? 255 : 0;
            }
        }
    }

    int dstcn, blueIdx, greenBits;
#if CV_NEON
    uint16x8_t v_n3, v_n7, v_mask;
    uint8x16_t v_255, v_0;
#endif
};
// Packs 8-bit BGR/BGRA pixels into 16-bit RGB565 or RGB555.
// greenBits == 6 selects RGB565; otherwise RGB555, where for 4-channel input a
// non-zero alpha sets the top (0x8000) bit. Channels are truncated to their
// top 5 (or 6 for green in 565) bits.
struct RGB2RGB5x5
{
    typedef uchar channel_type;

    RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits)
        : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits)
    {
#if CV_NEON
        v_n3 = vdup_n_u8(~3);
        v_n7 = vdup_n_u8(~7);
        v_mask = vdupq_n_u16(0x8000); // RGB555 alpha bit
        v_0 = vdupq_n_u16(0);
        v_full = vdupq_n_u16(0xffff);
#endif
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int scn = srccn, bidx = blueIdx, i = 0;
        if (greenBits == 6)
        {
            if (scn == 3)
            {
#if CV_NEON
                // 8 pixels per iteration; same bit packing as the scalar tail
                for ( ; i <= n - 8; i += 8, src += 24 )
                {
                    uint8x8x3_t v_src = vld3_u8(src);
                    uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
                    v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
                    v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
                    vst1q_u16((ushort *)dst + i, v_dst);
                }
#endif
                for ( ; i < n; i++, src += 3 )
                    ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
            }
            else
            {
#if CV_NEON
                // 4-channel input: alpha is ignored in 565 mode
                for ( ; i <= n - 8; i += 8, src += 32 )
                {
                    uint8x8x4_t v_src = vld4_u8(src);
                    uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
                    v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
                    v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
                    vst1q_u16((ushort *)dst + i, v_dst);
                }
#endif
                for ( ; i < n; i++, src += 4 )
                    ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
            }
        }
        else if (scn == 3)
        {
#if CV_NEON
            for ( ; i <= n - 8; i += 8, src += 24 )
            {
                uint8x8x3_t v_src = vld3_u8(src);
                uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7));
                vst1q_u16((ushort *)dst + i, v_dst);
            }
#endif
            for ( ; i < n; i++, src += 3 )
                ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7));
        }
        else
        {
#if CV_NEON
            for ( ; i <= n - 8; i += 8, src += 32 )
            {
                uint8x8x4_t v_src = vld4_u8(src);
                uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
                // set the 0x8000 bit for pixels whose alpha is non-zero
                v_dst = vorrq_u16(v_dst, vorrq_u16(vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7),
                                                   vbslq_u16(veorq_u16(vceqq_u16(vmovl_u8(v_src.val[3]), v_0), v_full), v_mask, v_0)));
                vst1q_u16((ushort *)dst + i, v_dst);
            }
#endif
            for ( ; i < n; i++, src += 4 )
                ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|
                    ((src[bidx^2]&~7) << 7)|(src[3] ? 0x8000 : 0));
        }
    }

    int srccn, blueIdx, greenBits;
#if CV_NEON
    uint8x8_t v_n3, v_n7;
    uint16x8_t v_mask, v_0, v_full;
#endif
};
///////////////////////////////// Color to/from Grayscale ////////////////////////////////
// Replicates a single gray channel into a 3- or 4-channel image; the alpha of
// a 4-channel destination is filled with the channel type's maximum value.
template<typename _Tp>
struct Gray2RGB
{
    typedef _Tp channel_type;

    Gray2RGB(int _dstcn) : dstcn(_dstcn) {}

    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
        if (dstcn == 3)
        {
            for (int i = 0; i < n; i++, dst += 3)
            {
                const _Tp g = src[i];
                dst[0] = g;
                dst[1] = g;
                dst[2] = g;
            }
        }
        else
        {
            const _Tp alpha = ColorChannel<_Tp>::max();
            for (int i = 0; i < n; i++, dst += 4)
            {
                const _Tp g = src[i];
                dst[0] = g;
                dst[1] = g;
                dst[2] = g;
                dst[3] = alpha;
            }
        }
    }

    int dstcn;
};
// Converts a single-channel 8-bit gray image to packed 16-bit RGB565/RGB555.
// greenBits == 6 -> RGB565: g mapped to (g>>3) | ((g&~3)<<3) | ((g&~7)<<8);
// otherwise RGB555: t = g>>3 replicated as t | (t<<5) | (t<<10).
// Fix: the RGB555 SSE2 loop produced 16 outputs per iteration (two 8-lane
// halves) but advanced i by only 8, redundantly recomputing half of every
// batch; the step now matches the RGB565 loop. The low-half shift also used
// _mm_slli_epi32 where the high half used _mm_slli_epi16 - equivalent here
// (lane values <= 31, so shifted bits never cross a 16-bit lane), but it is
// normalized to epi16 for consistency.
struct Gray2RGB5x5
{
    typedef uchar channel_type;

    Gray2RGB5x5(int _greenBits) : greenBits(_greenBits)
    {
#if CV_NEON
        v_n7 = vdup_n_u8(~7);
        v_n3 = vdup_n_u8(~3);
#elif CV_SSE2
        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
        v_n7 = _mm_set1_epi16(~7);
        v_n3 = _mm_set1_epi16(~3);
        v_zero = _mm_setzero_si128();
#endif
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int i = 0;
        if( greenBits == 6 )
        {
#if CV_NEON
            for ( ; i <= n - 8; i += 8 )
            {
                uint8x8_t v_src = vld1_u8(src + i);
                uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src, 3));
                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n3)), 3));
                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8));
                vst1q_u16((ushort *)dst + i, v_dst);
            }
#elif CV_SSE2
            if (haveSIMD)
            {
                // 16 pixels per iteration: low and high byte halves are
                // widened to 16-bit lanes and packed independently
                for ( ; i <= n - 16; i += 16 )
                {
                    __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i));

                    __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
                    __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3),
                                                 _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3),
                                                              _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8)));
                    _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst);

                    v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
                    v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3),
                                         _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3),
                                                      _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8)));
                    _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst);
                }
            }
#endif
            for ( ; i < n; i++ )
            {
                int t = src[i];
                ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8));
            }
        }
        else
        {
#if CV_NEON
            for ( ; i <= n - 8; i += 8 )
            {
                uint16x8_t v_src = vmovl_u8(vshr_n_u8(vld1_u8(src + i), 3));
                uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10));
                vst1q_u16((ushort *)dst + i, v_dst);
            }
#elif CV_SSE2
            if (haveSIMD)
            {
                // 16 pixels per iteration (was i += 8, which reprocessed half
                // of every batch; output was identical, just twice the work)
                for ( ; i <= n - 16; i += 16 )
                {
                    __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i));

                    __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3);
                    __m128i v_dst = _mm_or_si128(v_src_p,
                                                 _mm_or_si128(_mm_slli_epi16(v_src_p, 5),
                                                              _mm_slli_epi16(v_src_p, 10)));
                    _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst);

                    v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3);
                    v_dst = _mm_or_si128(v_src_p,
                                         _mm_or_si128(_mm_slli_epi16(v_src_p, 5),
                                                      _mm_slli_epi16(v_src_p, 10)));
                    _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst);
                }
            }
#endif
            for( ; i < n; i++ )
            {
                int t = src[i] >> 3;
                ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10));
            }
        }
    }

    int greenBits;
#if CV_NEON
    uint8x8_t v_n7, v_n3;
#elif CV_SSE2
    __m128i v_n7, v_n3, v_zero;
    bool haveSIMD;
#endif
};
// Converts packed 16-bit RGB565/RGB555 to 8-bit gray using the fixed-point
// BT.601 weights B2Y/G2Y/R2Y with yuv_shift fractional bits and round-to-
// nearest (the CV_DESCALE delta). Channels are first expanded to 8 bits by
// shifting, matching RGB5x52RGB's expansion.
struct RGB5x52Gray
{
    typedef uchar channel_type;

    RGB5x52Gray(int _greenBits) : greenBits(_greenBits)
    {
#if CV_NEON
        v_b2y = vdup_n_u16(B2Y);
        v_g2y = vdup_n_u16(G2Y);
        v_r2y = vdup_n_u16(R2Y);
        v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); // rounding term
        v_f8 = vdupq_n_u16(0xf8);
        v_fc = vdupq_n_u16(0xfc);
#elif CV_SSE2
        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
        // Interleave weights for _mm_madd_epi16: (b,g) pairs and (r,1) pairs;
        // the trailing 1 multiplies the rounding delta into the same madd.
        const __m128i v_b2y = _mm_set1_epi16(B2Y);
        const __m128i v_g2y = _mm_set1_epi16(G2Y);
        v_bg2y = _mm_unpacklo_epi16(v_b2y, v_g2y);
        const __m128i v_r2y = _mm_set1_epi16(R2Y);
        const __m128i v_one = _mm_set1_epi16(1);
        v_rd2y = _mm_unpacklo_epi16(v_r2y, v_one);
        v_delta = _mm_slli_epi16(v_one, yuv_shift - 1);
#endif
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int i = 0;
        if( greenBits == 6 )
        {
#if CV_NEON
            // 8 pixels per iteration; widen to 32-bit accumulators for the dot
            // product, then round, shift and narrow back to 8 bits
            for ( ; i <= n - 8; i += 8)
            {
                uint16x8_t v_src = vld1q_u16((ushort *)src + i);
                uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
                           v_t1 = vandq_u16(vshrq_n_u16(v_src, 3), v_fc),
                           v_t2 = vandq_u16(vshrq_n_u16(v_src, 8), v_f8);

                uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
                                              vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
                uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
                                              vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
                v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
                v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);

                vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
            }
#elif CV_SSE2
            if (haveSIMD)
            {
                for ( ; i <= n - 8; i += 8)
                {
                    __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i));
                    // extract and expand b, g (6 bits), r fields to 8 bits
                    __m128i v_b = _mm_srli_epi16(_mm_slli_epi16(v_src, 11), 8),
                            v_g = _mm_srli_epi16(_mm_slli_epi16(_mm_srli_epi16(v_src, 5), 10),8),
                            v_r = _mm_slli_epi16(_mm_srli_epi16(v_src, 11), 3);

                    __m128i v_bg_lo = _mm_unpacklo_epi16(v_b, v_g);
                    __m128i v_rd_lo = _mm_unpacklo_epi16(v_r, v_delta);
                    __m128i v_bg_hi = _mm_unpackhi_epi16(v_b, v_g);
                    __m128i v_rd_hi = _mm_unpackhi_epi16(v_r, v_delta);
                    // madd computes b*B2Y+g*G2Y and r*R2Y+delta*1 per 32-bit lane
                    v_bg_lo = _mm_madd_epi16(v_bg_lo, v_bg2y);
                    v_rd_lo = _mm_madd_epi16(v_rd_lo, v_rd2y);
                    v_bg_hi = _mm_madd_epi16(v_bg_hi, v_bg2y);
                    v_rd_hi = _mm_madd_epi16(v_rd_hi, v_rd2y);
                    __m128i v_bgr_lo = _mm_add_epi32(v_bg_lo, v_rd_lo);
                    __m128i v_bgr_hi = _mm_add_epi32(v_bg_hi, v_rd_hi);
                    v_bgr_lo = _mm_srli_epi32(v_bgr_lo, yuv_shift);
                    v_bgr_hi = _mm_srli_epi32(v_bgr_hi, yuv_shift);

                    __m128i v_dst = _mm_packs_epi32(v_bgr_lo, v_bgr_hi);
                    v_dst = _mm_packus_epi16(v_dst, v_dst);
                    _mm_storel_epi64((__m128i *)(dst + i), v_dst);
                }
            }
#endif
            for ( ; i < n; i++)
            {
                int t = ((ushort*)src)[i];
                dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
                                           ((t >> 3) & 0xfc)*G2Y +
                                           ((t >> 8) & 0xf8)*R2Y, yuv_shift);
            }
        }
        else
        {
#if CV_NEON
            for ( ; i <= n - 8; i += 8)
            {
                uint16x8_t v_src = vld1q_u16((ushort *)src + i);
                uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
                           v_t1 = vandq_u16(vshrq_n_u16(v_src, 2), v_f8),
                           v_t2 = vandq_u16(vshrq_n_u16(v_src, 7), v_f8);

                uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
                                              vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
                uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
                                              vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
                v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
                v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);

                vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
            }
#elif CV_SSE2
            if (haveSIMD)
            {
                for ( ; i <= n - 8; i += 8)
                {
                    __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i));
                    // extract and expand the three 5-bit fields to 8 bits
                    __m128i v_b = _mm_srli_epi16(_mm_slli_epi16(v_src, 11), 8),
                            v_g = _mm_srli_epi16(_mm_slli_epi16(_mm_srli_epi16(v_src, 5), 11),8),
                            v_r = _mm_srli_epi16(_mm_slli_epi16(_mm_srli_epi16(v_src, 10), 11),8);

                    __m128i v_bg_lo = _mm_unpacklo_epi16(v_b, v_g);
                    __m128i v_rd_lo = _mm_unpacklo_epi16(v_r, v_delta);
                    __m128i v_bg_hi = _mm_unpackhi_epi16(v_b, v_g);
                    __m128i v_rd_hi = _mm_unpackhi_epi16(v_r, v_delta);
                    v_bg_lo = _mm_madd_epi16(v_bg_lo, v_bg2y);
                    v_rd_lo = _mm_madd_epi16(v_rd_lo, v_rd2y);
                    v_bg_hi = _mm_madd_epi16(v_bg_hi, v_bg2y);
                    v_rd_hi = _mm_madd_epi16(v_rd_hi, v_rd2y);
                    __m128i v_bgr_lo = _mm_add_epi32(v_bg_lo, v_rd_lo);
                    __m128i v_bgr_hi = _mm_add_epi32(v_bg_hi, v_rd_hi);
                    v_bgr_lo = _mm_srli_epi32(v_bgr_lo, yuv_shift);
                    v_bgr_hi = _mm_srli_epi32(v_bgr_hi, yuv_shift);

                    __m128i v_dst = _mm_packs_epi32(v_bgr_lo, v_bgr_hi);
                    v_dst = _mm_packus_epi16(v_dst, v_dst);
                    _mm_storel_epi64((__m128i *)(dst + i), v_dst);
                }
            }
#endif
            for ( ; i < n; i++)
            {
                int t = ((ushort*)src)[i];
                dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
                                           ((t >> 2) & 0xf8)*G2Y +
                                           ((t >> 7) & 0xf8)*R2Y, yuv_shift);
            }
        }
    }

    int greenBits;
#if CV_NEON
    uint16x4_t v_b2y, v_g2y, v_r2y;
    uint32x4_t v_delta;
    uint16x8_t v_f8, v_fc;
#elif CV_SSE2
    bool haveSIMD;
    __m128i v_bg2y, v_rd2y;
    __m128i v_delta;
#endif
};
template<typename _Tp> struct RGB2Gray
{
typedef _Tp channel_type;
RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
{
static const float coeffs0[] = { R2YF, G2YF, B2YF };
memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
if(blueIdx == 0)
std::swap(coeffs[0], coeffs[2]);
}
void operator()(const _Tp* src, _Tp* dst, int n) const
{
int scn = srccn;
float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
for(int i = 0; i < n; i++, src += scn)
dst[i] = saturate_cast<_Tp>(src[0]*cb + src[1]*cg + src[2]*cr);
}
int srccn;
float coeffs[3];
};
// 8-bit specialization of RGB2Gray: precomputes one 256-entry fixed-point
// product table per channel (tab[v], tab[256+v], tab[512+v]); the rounding
// term of CV_DESCALE is folded into the third table so the per-pixel work is
// three lookups, two adds and a shift.
template<> struct RGB2Gray<uchar>
{
    typedef uchar channel_type;

    RGB2Gray(int _srccn, int blueIdx, const int* coeffs) : srccn(_srccn)
    {
        const int coeffs0[] = { R2Y, G2Y, B2Y };
        if(!coeffs) coeffs = coeffs0;

        // blueIdx selects which coefficient multiplies channel 0 vs channel 2
        const int db = coeffs[blueIdx^2], dg = coeffs[1], dr = coeffs[blueIdx];
        int b = 0, g = 0, r = (1 << (yuv_shift-1)); // rounding bias in r table
        for(int i = 0; i < 256; i++)
        {
            tab[i]     = b;
            tab[i+256] = g;
            tab[i+512] = r;
            b += db;
            g += dg;
            r += dr;
        }
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        const int scn = srccn;
        const int* lut = tab;
        for(int i = 0; i < n; i++, src += scn)
        {
            int acc = lut[src[0]] + lut[src[1]+256] + lut[src[2]+512];
            dst[i] = (uchar)(acc >> yuv_shift);
        }
    }

    int srccn;
    int tab[256*3];
};
#if CV_NEON
// NEON specialization of RGB2Gray for 16-bit pixels: integer dot product with
// B2Y/G2Y/R2Y in 32-bit accumulators, rounded via the yuv_shift delta.
// Processes 8, then 4 pixels per iteration, then a scalar tail.
// Note the 4-pixel loop omits the rounding delta addition before the shift
// (it only adds it inside vshrq; see vaddq there) - behavior kept as-is.
template <>
struct RGB2Gray<ushort>
{
    typedef ushort channel_type;

    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
        srccn(_srccn)
    {
        static const int coeffs0[] = { R2Y, G2Y, B2Y };
        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
        if( blueIdx == 0 )
            std::swap(coeffs[0], coeffs[2]); // BGR order: swap B and R weights

        v_cb = vdup_n_u16(coeffs[0]);
        v_cg = vdup_n_u16(coeffs[1]);
        v_cr = vdup_n_u16(coeffs[2]);
        v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
    }

    void operator()(const ushort* src, ushort* dst, int n) const
    {
        int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;

        for ( ; i <= n - 8; i += 8, src += scn * 8)
        {
            uint16x8_t v_b, v_r, v_g;
            if (scn == 3)
            {
                uint16x8x3_t v_src = vld3q_u16(src);
                v_b = v_src.val[0];
                v_g = v_src.val[1];
                v_r = v_src.val[2];
            }
            else
            {
                uint16x8x4_t v_src = vld4q_u16(src);
                v_b = v_src.val[0];
                v_g = v_src.val[1];
                v_r = v_src.val[2];
            }

            uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16(
                                           vmull_u16(vget_low_u16(v_b), v_cb),
                                                     vget_low_u16(v_g), v_cg),
                                                     vget_low_u16(v_r), v_cr);
            uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16(
                                           vmull_u16(vget_high_u16(v_b), v_cb),
                                                     vget_high_u16(v_g), v_cg),
                                                     vget_high_u16(v_r), v_cr);

            uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift));
            uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift));

            vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1));
        }

        for ( ; i <= n - 4; i += 4, src += scn * 4)
        {
            uint16x4_t v_b, v_r, v_g;
            if (scn == 3)
            {
                uint16x4x3_t v_src = vld3_u16(src);
                v_b = v_src.val[0];
                v_g = v_src.val[1];
                v_r = v_src.val[2];
            }
            else
            {
                uint16x4x4_t v_src = vld4_u16(src);
                v_b = v_src.val[0];
                v_g = v_src.val[1];
                v_r = v_src.val[2];
            }

            uint32x4_t v_dst = vmlal_u16(vmlal_u16(
                                         vmull_u16(v_b, v_cb),
                                                   v_g, v_cg),
                                                   v_r, v_cr);

            vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift)));
        }

        for( ; i < n; i++, src += scn)
            dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
    }

    int srccn, coeffs[3];
    uint16x4_t v_cb, v_cg, v_cr;
    uint32x4_t v_delta;
};
// NEON specialization of RGB2Gray for float pixels: fused multiply-add dot
// product with the float coefficients; 8 then 4 pixels per iteration, scalar
// tail for the remainder.
template <>
struct RGB2Gray<float>
{
    typedef float channel_type;

    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
    {
        static const float coeffs0[] = { R2YF, G2YF, B2YF };
        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
        if(blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]); // BGR order: swap B and R weights

        v_cb = vdupq_n_f32(coeffs[0]);
        v_cg = vdupq_n_f32(coeffs[1]);
        v_cr = vdupq_n_f32(coeffs[2]);
    }

    void operator()(const float * src, float * dst, int n) const
    {
        int scn = srccn, i = 0;
        float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];

        if (scn == 3)
        {
            for ( ; i <= n - 8; i += 8, src += scn * 8)
            {
                float32x4x3_t v_src = vld3q_f32(src);
                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));

                v_src = vld3q_f32(src + scn * 4);
                vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
            }

            for ( ; i <= n - 4; i += 4, src += scn * 4)
            {
                float32x4x3_t v_src = vld3q_f32(src);
                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
            }
        }
        else
        {
            // 4-channel input: alpha (val[3]) is simply ignored
            for ( ; i <= n - 8; i += 8, src += scn * 8)
            {
                float32x4x4_t v_src = vld4q_f32(src);
                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));

                v_src = vld4q_f32(src + scn * 4);
                vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
            }

            for ( ; i <= n - 4; i += 4, src += scn * 4)
            {
                float32x4x4_t v_src = vld4q_f32(src);
                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
            }
        }

        for ( ; i < n; i++, src += scn)
            dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
    }

    int srccn;
    float coeffs[3];
    float32x4_t v_cb, v_cg, v_cr;
};
#elif CV_SSE2
#if CV_SSE4_1
// RGB2Gray<ushort>: fixed-point 16-bit gray conversion, vectorized with SSE4.1.
template <>
struct RGB2Gray<ushort>
{
    typedef ushort channel_type;

    // gray = (src[0]*cb + src[1]*cg + src[2]*cr + round) >> yuv_shift.
    // Default weights are the fixed-point R2Y/G2Y/B2Y constants; when
    // blueIdx == 0 (BGR-ordered input) the blue and red weights are swapped
    // so coeffs[] is always in source-channel order.
    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
        srccn(_srccn)
    {
        static const int coeffs0[] = { R2Y, G2Y, B2Y };
        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
        if( blueIdx == 0 )
            std::swap(coeffs[0], coeffs[2]);

        v_delta = _mm_set1_epi32(1 << (yuv_shift - 1));  // rounding bias
        v_zero = _mm_setzero_si128();

        haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
    }

    // 16s x 8
    // Produces 8 gray values from 4 registers holding channel data for 2
    // pixels each.  _mm_madd_epi16 treats lanes as *signed* 16-bit, so values
    // >= 0x8000 come out 2^16*coeff too small; the v_rgb_hi path masks those
    // lanes and adds back the compensation term prepared by the caller in
    // v_coeffs[1] (coeff << 2 -- equal to coeff*2^16 >> yuv_shift assuming
    // yuv_shift == 14; verify against the yuv_shift definition).
    void process(__m128i* v_rgb, __m128i* v_coeffs,
                 __m128i & v_gray) const
    {
        __m128i v_rgb_hi[4];
        // mask of lanes negative when viewed as signed 16-bit
        v_rgb_hi[0] = _mm_cmplt_epi16(v_rgb[0], v_zero);
        v_rgb_hi[1] = _mm_cmplt_epi16(v_rgb[1], v_zero);
        v_rgb_hi[2] = _mm_cmplt_epi16(v_rgb[2], v_zero);
        v_rgb_hi[3] = _mm_cmplt_epi16(v_rgb[3], v_zero);

        // keep correction coefficients only for the wrapped lanes
        v_rgb_hi[0] = _mm_and_si128(v_rgb_hi[0], v_coeffs[1]);
        v_rgb_hi[1] = _mm_and_si128(v_rgb_hi[1], v_coeffs[1]);
        v_rgb_hi[2] = _mm_and_si128(v_rgb_hi[2], v_coeffs[1]);
        v_rgb_hi[3] = _mm_and_si128(v_rgb_hi[3], v_coeffs[1]);

        // horizontally reduce the corrections into one 8 x 16-bit register
        v_rgb_hi[0] = _mm_hadd_epi16(v_rgb_hi[0], v_rgb_hi[1]);
        v_rgb_hi[2] = _mm_hadd_epi16(v_rgb_hi[2], v_rgb_hi[3]);
        v_rgb_hi[0] = _mm_hadd_epi16(v_rgb_hi[0], v_rgb_hi[2]);

        // per-pixel weighted sums ...
        v_rgb[0] = _mm_madd_epi16(v_rgb[0], v_coeffs[0]);
        v_rgb[1] = _mm_madd_epi16(v_rgb[1], v_coeffs[0]);
        v_rgb[2] = _mm_madd_epi16(v_rgb[2], v_coeffs[0]);
        v_rgb[3] = _mm_madd_epi16(v_rgb[3], v_coeffs[0]);

        v_rgb[0] = _mm_hadd_epi32(v_rgb[0], v_rgb[1]);
        v_rgb[2] = _mm_hadd_epi32(v_rgb[2], v_rgb[3]);

        // ... rounded descale and repack to 16-bit, then apply correction
        v_rgb[0] = _mm_add_epi32(v_rgb[0], v_delta);
        v_rgb[2] = _mm_add_epi32(v_rgb[2], v_delta);
        v_rgb[0] = _mm_srai_epi32(v_rgb[0], yuv_shift);
        v_rgb[2] = _mm_srai_epi32(v_rgb[2], yuv_shift);

        v_gray = _mm_packs_epi32(v_rgb[0], v_rgb[2]);
        v_gray = _mm_add_epi16(v_gray, v_rgb_hi[0]);
    }

    void operator()(const ushort* src, ushort* dst, int n) const
    {
        int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;

        if (scn == 3 && haveSIMD)
        {
            // Weight layout matches the padded-pixel layout produced by the
            // realignment below (one zero lane per pixel).
            __m128i v_coeffs[2];
            v_coeffs[0] = _mm_set_epi16(0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], 0);
            v_coeffs[1] = _mm_slli_epi16(v_coeffs[0], 2);  // correction term, see process()

            for ( ; i <= n - 8; i += 8, src += scn * 8)
            {
                __m128i v_src[3];
                v_src[0] = _mm_loadu_si128((__m128i const *)(src));
                v_src[1] = _mm_loadu_si128((__m128i const *)(src + 8));
                v_src[2] = _mm_loadu_si128((__m128i const *)(src + 16));

                // repack 8 channel triples into 4 registers of 2 pixels each
                __m128i v_rgb[4];
                v_rgb[0] = _mm_slli_si128(v_src[0], 2);
                v_rgb[1] = _mm_alignr_epi8(v_src[1], v_src[0], 10);
                v_rgb[2] = _mm_alignr_epi8(v_src[2], v_src[1], 6);
                v_rgb[3] = _mm_srli_si128(v_src[2], 2);

                __m128i v_gray;
                process(v_rgb, v_coeffs,
                        v_gray);

                _mm_storeu_si128((__m128i *)(dst + i), v_gray);
            }
        }
        else if (scn == 4 && haveSIMD)
        {
            __m128i v_coeffs[2];
            v_coeffs[0] = _mm_set_epi16(0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], 0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0]);
            v_coeffs[1] = _mm_slli_epi16(v_coeffs[0], 2);

            for ( ; i <= n - 8; i += 8, src += scn * 8)
            {
                // 4-channel pixels already occupy 64-bit halves; load directly
                __m128i v_rgb[4];
                v_rgb[0] = _mm_loadu_si128((__m128i const *)(src));
                v_rgb[1] = _mm_loadu_si128((__m128i const *)(src + 8));
                v_rgb[2] = _mm_loadu_si128((__m128i const *)(src + 16));
                v_rgb[3] = _mm_loadu_si128((__m128i const *)(src + 24));

                __m128i v_gray;
                process(v_rgb, v_coeffs,
                        v_gray);

                _mm_storeu_si128((__m128i *)(dst + i), v_gray);
            }
        }

        // scalar tail, and full fallback when SSE4.1 is unavailable
        for( ; i < n; i++, src += scn)
            dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
    }

    int srccn, coeffs[3];
    __m128i v_delta;
    __m128i v_zero;
    bool haveSIMD;
};
#endif // CV_SSE4_1
// RGB2Gray<float>: floating-point gray conversion, vectorized with SSE2.
template <>
struct RGB2Gray<float>
{
    typedef float channel_type;

    // gray = src[0]*coeffs[0] + src[1]*coeffs[1] + src[2]*coeffs[2].
    // Default weights are R2YF/G2YF/B2YF; blueIdx == 0 (BGR input) swaps the
    // blue/red weights so coeffs[] stays in source-channel order.
    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
    {
        static const float coeffs0[] = { R2YF, G2YF, B2YF };
        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
        if(blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]);

        v_cb = _mm_set1_ps(coeffs[0]);
        v_cg = _mm_set1_ps(coeffs[1]);
        v_cr = _mm_set1_ps(coeffs[2]);

        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
    }

    // Weighted sum of 4 pixels' channels. Parameters are positional channel
    // registers (channel 0, 1, 2) -- the names only match B/G/R nominally,
    // since the constructor already reordered the coefficients.
    void process(__m128 v_b, __m128 v_g, __m128 v_r,
                 __m128 & v_gray) const
    {
        v_gray = _mm_mul_ps(v_r, v_cr);
        v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_g, v_cg));
        v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_b, v_cb));
    }

    void operator()(const float * src, float * dst, int n) const
    {
        int scn = srccn, i = 0;
        float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];

        if (scn == 3 && haveSIMD)
        {
            for ( ; i <= n - 8; i += 8, src += scn * 8)
            {
                // load 8 interleaved 3-channel pixels (24 floats) ...
                __m128 v_r0 = _mm_loadu_ps(src);
                __m128 v_r1 = _mm_loadu_ps(src + 4);
                __m128 v_g0 = _mm_loadu_ps(src + 8);
                __m128 v_g1 = _mm_loadu_ps(src + 12);
                __m128 v_b0 = _mm_loadu_ps(src + 16);
                __m128 v_b1 = _mm_loadu_ps(src + 20);

                // ... and split into per-channel registers
                // (after this call v_r* holds channel 0, v_g* channel 1, v_b* channel 2)
                _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

                __m128 v_gray0;
                process(v_r0, v_g0, v_b0,
                        v_gray0);

                __m128 v_gray1;
                process(v_r1, v_g1, v_b1,
                        v_gray1);

                _mm_storeu_ps(dst + i, v_gray0);
                _mm_storeu_ps(dst + i + 4, v_gray1);
            }
        }
        else if (scn == 4 && haveSIMD)
        {
            for ( ; i <= n - 8; i += 8, src += scn * 8)
            {
                // 8 interleaved 4-channel pixels (32 floats); alpha is ignored
                __m128 v_r0 = _mm_loadu_ps(src);
                __m128 v_r1 = _mm_loadu_ps(src + 4);
                __m128 v_g0 = _mm_loadu_ps(src + 8);
                __m128 v_g1 = _mm_loadu_ps(src + 12);
                __m128 v_b0 = _mm_loadu_ps(src + 16);
                __m128 v_b1 = _mm_loadu_ps(src + 20);
                __m128 v_a0 = _mm_loadu_ps(src + 24);
                __m128 v_a1 = _mm_loadu_ps(src + 28);

                _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);

                __m128 v_gray0;
                process(v_r0, v_g0, v_b0,
                        v_gray0);

                __m128 v_gray1;
                process(v_r1, v_g1, v_b1,
                        v_gray1);

                _mm_storeu_ps(dst + i, v_gray0);
                _mm_storeu_ps(dst + i + 4, v_gray1);
            }
        }

        // scalar tail, and full fallback when SSE2 is unavailable
        for ( ; i < n; i++, src += scn)
            dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
    }

    int srccn;
    float coeffs[3];
    __m128 v_cb, v_cg, v_cr;
    bool haveSIMD;
};
#endif // CV_SSE2
#if !CV_NEON && !CV_SSE4_1
// Scalar fallback RGB2Gray<ushort> (used when neither NEON nor SSE4.1 builds
// the vectorized specialization).
template<> struct RGB2Gray<ushort>
{
    typedef ushort channel_type;

    // Fixed-point gray: dst = (src[0]*cb + src[1]*cg + src[2]*cr) >> yuv_shift,
    // rounded.  blueIdx == 0 means BGR ordering, so blue/red weights swap.
    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
    {
        static const int coeffs0[] = { R2Y, G2Y, B2Y };
        const int* c = _coeffs ? _coeffs : coeffs0;
        coeffs[0] = c[0];
        coeffs[1] = c[1];
        coeffs[2] = c[2];
        if( blueIdx == 0 )
            std::swap(coeffs[0], coeffs[2]);
    }

    void operator()(const ushort* src, ushort* dst, int n) const
    {
        int scn = srccn;
        int c0 = coeffs[0], c1 = coeffs[1], c2 = coeffs[2];
        for( int i = 0; i < n; i++, src += scn )
        {
            unsigned acc = (unsigned)(src[0]*c0 + src[1]*c1 + src[2]*c2);
            dst[i] = (ushort)CV_DESCALE(acc, yuv_shift);
        }
    }

    int srccn;
    int coeffs[3];
};
#endif // !CV_NEON && !CV_SSE4_1
/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
// RGBA -> premultiplied RGBA: out = (v * alpha + half) / max for the three
// color channels; alpha is copied through unchanged.
template<typename _Tp>
struct RGBA2mRGBA
{
    typedef _Tp channel_type;

    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
        const _Tp max_val  = ColorChannel<_Tp>::max();
        const _Tp half_val = ColorChannel<_Tp>::half();

        for( int i = 0; i < n; i++, src += 4, dst += 4 )
        {
            // read the whole pixel before writing: safe for in-place operation
            _Tp v0 = src[0], v1 = src[1], v2 = src[2], a = src[3];
            dst[0] = (v0 * a + half_val) / max_val;
            dst[1] = (v1 * a + half_val) / max_val;
            dst[2] = (v2 * a + half_val) / max_val;
            dst[3] = a;
        }
    }
};
// Premultiplied RGBA -> RGBA: out = (v * max + alpha/2) / alpha, rounded;
// fully transparent pixels (alpha == 0) map to zero color channels.
template<typename _Tp>
struct mRGBA2RGBA
{
    typedef _Tp channel_type;

    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
        const _Tp max_val = ColorChannel<_Tp>::max();

        for( int i = 0; i < n; i++, src += 4, dst += 4 )
        {
            // read the whole pixel before writing: safe for in-place operation
            _Tp v0 = src[0], v1 = src[1], v2 = src[2], a = src[3];
            if( a == 0 )
            {
                dst[0] = 0;
                dst[1] = 0;
                dst[2] = 0;
            }
            else
            {
                _Tp a_half = a / 2;   // rounding term
                dst[0] = (v0 * max_val + a_half) / a;
                dst[1] = (v1 * max_val + a_half) / a;
                dst[2] = (v2 * max_val + a_half) / a;
            }
            dst[3] = a;
        }
    }
};
//
// IPP functions
//
#if NEED_IPP
// Per-depth IPP dispatch tables, indexed by CV_MAT_DEPTH:
// slots are { 8u, 8s, 16u, 16s, 32s, 32f, 64f, - }; 0 = no IPP routine.
// "Color2Gray" variants take explicit channel weights; "RGB2Gray" variants
// use IPP's built-in RGB weights.
static ippiColor2GrayFunc ippiColor2GrayC3Tab[] =
{
    (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0,
    0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0
};

static ippiColor2GrayFunc ippiColor2GrayC4Tab[] =
{
    (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0,
    0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0
};

static ippiGeneralFunc ippiRGB2GrayC3Tab[] =
{
    (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0,
    0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0
};

static ippiGeneralFunc ippiRGB2GrayC4Tab[] =
{
    (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0,
    0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0
};
// C++ overloads that expose the per-type ippiGrayToRGB_* entry points under a
// single name, so the templated IPPGray2BGR(A)Functor below can call them
// uniformly.  The C1C4R variants additionally fill the alpha channel with
// 'aval'.
static IppStatus ippiGrayToRGB_C1C3R(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, IppiSize roiSize)
{
    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_8u_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize);
}
static IppStatus ippiGrayToRGB_C1C3R(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, IppiSize roiSize)
{
    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_16u_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize);
}
static IppStatus ippiGrayToRGB_C1C3R(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, IppiSize roiSize)
{
    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_32f_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize);
}

static IppStatus ippiGrayToRGB_C1C4R(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, IppiSize roiSize, Ipp8u aval)
{
    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_8u_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval);
}
static IppStatus ippiGrayToRGB_C1C4R(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, IppiSize roiSize, Ipp16u aval)
{
    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_16u_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval);
}
static IppStatus ippiGrayToRGB_C1C4R(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, IppiSize roiSize, Ipp32f aval)
{
    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_32f_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval);
}
// Adapter that calls an ippiColorToGray_* function with explicit weights.
// Note the weight order {B2YF, G2YF, R2YF}: these functors are used on the
// non-swapped (BGR-ordered) paths of cvtBGRtoGray, so the weights must be in
// source-channel order.
struct IPPColor2GrayFunctor
{
    IPPColor2GrayFunctor(ippiColor2GrayFunc _func) :
        ippiColorToGray(_func)
    {
        coeffs[0] = B2YF;
        coeffs[1] = G2YF;
        coeffs[2] = R2YF;
    }
    // Returns false if no function was supplied or the IPP call failed,
    // allowing the caller to fall back to the generic implementation.
    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    {
        return ippiColorToGray ? CV_INSTRUMENT_FUN_IPP(ippiColorToGray, src, srcStep, dst, dstStep, ippiSize(cols, rows), coeffs) >= 0 : false;
    }
private:
    ippiColor2GrayFunc ippiColorToGray;
    Ipp32f coeffs[3];
};
// Adapter over the ippiGrayToRGB_C1C3R overloads: replicates the single gray
// channel into three channels.
template <typename T>
struct IPPGray2BGRFunctor
{
    IPPGray2BGRFunctor(){}

    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    {
        return ippiGrayToRGB_C1C3R((T*)src, srcStep, (T*)dst, dstStep, ippiSize(cols, rows)) >= 0;
    }
};

// Same as above for four-channel output; the alpha channel is filled with the
// type's maximum value (fully opaque).
template <typename T>
struct IPPGray2BGRAFunctor
{
    IPPGray2BGRAFunctor()
    {
        alpha = ColorChannel<T>::max();
    }

    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    {
        return ippiGrayToRGB_C1C4R((T*)src, srcStep, (T*)dst, dstStep, ippiSize(cols, rows), alpha) >= 0;
    }

    T alpha;
};
// The C3C4R swap-channel wrappers fix the "value for the added channel"
// argument to the type's maximum, so they match the common ippiReorderFunc
// signature used by the dispatch tables below.
static IppStatus CV_STDCALL ippiSwapChannels_8u_C3C4Rf(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep,
         IppiSize roiSize, const int *dstOrder)
{
    return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_8u_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP8u);
}

static IppStatus CV_STDCALL ippiSwapChannels_16u_C3C4Rf(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep,
         IppiSize roiSize, const int *dstOrder)
{
    return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_16u_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP16u);
}

static IppStatus CV_STDCALL ippiSwapChannels_32f_C3C4Rf(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep,
         IppiSize roiSize, const int *dstOrder)
{
    return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_32f_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP32f);
}

// Per-depth dispatch tables (same slot layout as the gray tables above).
// The ones marked "shared" are deliberately non-static: other translation
// units of the color-conversion split use them too.
// shared
ippiReorderFunc ippiSwapChannelsC3C4RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C3C4Rf, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3C4Rf, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C3C4Rf, 0, 0
};

static ippiGeneralFunc ippiCopyAC4C3RTab[] =
{
    (ippiGeneralFunc)ippiCopy_8u_AC4C3R, 0, (ippiGeneralFunc)ippiCopy_16u_AC4C3R, 0,
    0, (ippiGeneralFunc)ippiCopy_32f_AC4C3R, 0, 0
};

// shared
ippiReorderFunc ippiSwapChannelsC4C3RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C4C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4C3R, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C4C3R, 0, 0
};

// shared
ippiReorderFunc ippiSwapChannelsC3RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3R, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C3R, 0, 0
};

#if IPP_VERSION_X100 >= 810
// in-place C4 channel swap: only available starting with IPP 8.1
static ippiReorderFunc ippiSwapChannelsC4RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C4R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4R, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C4R, 0, 0
};
#endif
#endif
//
// HAL functions
//
namespace hal
{
// 8u, 16u, 32f
// Generic BGR<->RGB(A) conversion: channel reorder and/or alpha add/drop.
// Dispatch order: custom HAL hook, IPP specializations, generic row loop.
void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
                 uchar * dst_data, size_t dst_step,
                 int width, int height,
                 int depth, int scn, int dcn, bool swapBlue)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtBGRtoBGR, cv_hal_cvtBGRtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue);

#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
    CV_IPP_CHECK()
    {
        if(scn == 3 && dcn == 4 && !swapBlue)
        {
            if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                 IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) )
                return;
        }
        else if(scn == 4 && dcn == 3 && !swapBlue)
        {
            if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                 IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) )
                return;
        }
        else if(scn == 3 && dcn == 4 && swapBlue)
        {
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) )
                return;
        }
        else if(scn == 4 && dcn == 3 && swapBlue)
        {
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) )
                return;
        }
        else if(scn == 3 && dcn == 3 && swapBlue)
        {
            if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height,
                                    IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) )
                return;
        }
#if IPP_VERSION_X100 >= 810
        else if(scn == 4 && dcn == 4 && swapBlue)
        {
            if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height,
                                    IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) )
                return;
        }
#endif
        // FIX: the '#endif' above must come before this closing brace.
        // Previously the brace closing the CV_IPP_CHECK() block sat inside the
        // IPP_VERSION_X100 >= 810 guard, leaving the braces unbalanced when
        // building against IPP versions older than 8.1.
    }
#endif

    // Generic fallback: per-element reorder over each row.
    int blueIdx = swapBlue ? 2 : 0;
    if( depth == CV_8U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<uchar>(scn, dcn, blueIdx));
    else if( depth == CV_16U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<ushort>(scn, dcn, blueIdx));
    else
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<float>(scn, dcn, blueIdx));
}
// only 8u
// 8u only: pack 3/4-channel BGR(A)/RGB(A) into 16-bit 5-5-5 or 5-6-5.
void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step,
                    uchar * dst_data, size_t dst_step,
                    int width, int height,
                    int scn, bool swapBlue, int greenBits)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtBGRtoBGR5x5, cv_hal_cvtBGRtoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits);

    // Position of the blue channel in the source layout: 0 = BGR, 2 = RGB.
    int blueIdx = swapBlue ? 2 : 0;
    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB5x5(scn, blueIdx, greenBits));
}
// only 8u
// 8u only: unpack 16-bit 5-5-5 / 5-6-5 pixels into 3/4-channel BGR(A)/RGB(A).
void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step,
                    uchar * dst_data, size_t dst_step,
                    int width, int height,
                    int dcn, bool swapBlue, int greenBits)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtBGR5x5toBGR, cv_hal_cvtBGR5x5toBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, greenBits);

    // Position of the blue channel in the destination layout: 0 = BGR, 2 = RGB.
    int blueIdx = swapBlue ? 2 : 0;
    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52RGB(dcn, blueIdx, greenBits));
}
// 8u, 16u, 32f
// BGR/RGB(A) -> single-channel gray.
// Dispatch order: custom HAL hook, IPP (32f only), generic row loop.
void cvtBGRtoGray(const uchar * src_data, size_t src_step,
                  uchar * dst_data, size_t dst_step,
                  int width, int height,
                  int depth, int scn, bool swapBlue)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtBGRtoGray, cv_hal_cvtBGRtoGray, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue);

#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
    CV_IPP_CHECK()
    {
        // Only the 32f cases go to IPP here: BGR input uses ippiColorToGray
        // with explicit (B,G,R) weights, RGB input uses ippiRGBToGray with
        // IPP's built-in weights.
        if(depth == CV_32F && scn == 3 && !swapBlue)
        {
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPColor2GrayFunctor(ippiColor2GrayC3Tab[depth])) )
                return;
        }
        else if(depth == CV_32F && scn == 3 && swapBlue)
        {
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPGeneralFunctor(ippiRGB2GrayC3Tab[depth])) )
                return;
        }
        else if(depth == CV_32F && scn == 4 && !swapBlue)
        {
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPColor2GrayFunctor(ippiColor2GrayC4Tab[depth])) )
                return;
        }
        else if(depth == CV_32F && scn == 4 && swapBlue)
        {
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPGeneralFunctor(ippiRGB2GrayC4Tab[depth])) )
                return;
        }
    }
#endif

    // Generic fallback; a null coefficient pointer selects the defaults.
    int blueIdx = swapBlue ? 2 : 0;
    if( depth == CV_8U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<uchar>(scn, blueIdx, 0));
    else if( depth == CV_16U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<ushort>(scn, blueIdx, 0));
    else
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<float>(scn, blueIdx, 0));
}
// 8u, 16u, 32f
// Gray -> 3/4-channel: replicates the gray value; alpha (if any) is set to
// the type's maximum.  Dispatch: custom HAL hook, IPP, generic row loop.
void cvtGraytoBGR(const uchar * src_data, size_t src_step,
                  uchar * dst_data, size_t dst_step,
                  int width, int height,
                  int depth, int dcn)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtGraytoBGR, cv_hal_cvtGraytoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn);

#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
    CV_IPP_CHECK()
    {
        bool ippres = false;
        if(dcn == 3)
        {
            if( depth == CV_8U )
                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor<Ipp8u>());
            else if( depth == CV_16U )
                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor<Ipp16u>());
            else
                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor<Ipp32f>());
        }
        else if(dcn == 4)
        {
            if( depth == CV_8U )
                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor<Ipp8u>());
            else if( depth == CV_16U )
                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor<Ipp16u>());
            else
                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor<Ipp32f>());
        }
        if(ippres)
            return;
    }
#endif

    // Generic fallback.
    if( depth == CV_8U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB<uchar>(dcn));
    else if( depth == CV_16U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB<ushort>(dcn));
    else
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB<float>(dcn));
}
// only 8u
// 16-bit 5-5-5 / 5-6-5 packed pixels -> gray.
void cvtBGR5x5toGray(const uchar * src_data, size_t src_step,
                     uchar * dst_data, size_t dst_step,
                     int width, int height,
                     int greenBits)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtBGR5x5toGray, cv_hal_cvtBGR5x5toGray, src_data, src_step, dst_data, dst_step, width, height, greenBits);
    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52Gray(greenBits));
}

// only 8u
// Gray -> 16-bit 5-5-5 / 5-6-5 packed pixels.
void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step,
                     uchar * dst_data, size_t dst_step,
                     int width, int height,
                     int greenBits)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtGraytoBGR5x5, cv_hal_cvtGraytoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, greenBits);
    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB5x5(greenBits));
}

// 8u only: RGBA -> alpha-premultiplied RGBA; IPP path first, then generic loop.
void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step,
                             uchar * dst_data, size_t dst_step,
                             int width, int height)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtRGBAtoMultipliedRGBA, cv_hal_cvtRGBAtoMultipliedRGBA, src_data, src_step, dst_data, dst_step, width, height);

#ifdef HAVE_IPP
    CV_IPP_CHECK()
    {
        if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                            IPPGeneralFunctor((ippiGeneralFunc)ippiAlphaPremul_8u_AC4R)))
            return;
    }
#endif

    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGBA2mRGBA<uchar>());
}

// 8u only: alpha-premultiplied RGBA -> RGBA (no IPP equivalent used here).
void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step,
                             uchar * dst_data, size_t dst_step,
                             int width, int height)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtMultipliedRGBAtoRGBA, cv_hal_cvtMultipliedRGBAtoRGBA, src_data, src_step, dst_data, dst_step, width, height);
    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, mRGBA2RGBA<uchar>());
}
} // namespace hal
//
// OCL calls
//
#ifdef HAVE_OPENCL
// OpenCL BGR<->RGB(A) reorder; 'reverse' selects the REVERSE kernel mode.
bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse )
{
    OclHelper< Set<3, 4>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);

    bool built = h.createKernel("RGB", ocl::imgproc::color_rgb_oclsrc,
                                format("-D dcn=%d -D bidx=0 -D %s", dcn, reverse ? "REVERSE" : "ORDER"));
    return built && h.run();
}
// OpenCL BGR/RGB(A) -> packed 5x5; gbits selects 5-5-5 vs 5-6-5.
bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits )
{
    OclHelper< Set<3, 4>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);

    bool built = h.createKernel("RGB2RGB5x5", ocl::imgproc::color_rgb_oclsrc,
                                format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, gbits));
    return built && h.run();
}
// OpenCL packed 5x5 -> BGR/RGB(A).
bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits)
{
    OclHelper< Set<2>, Set<3, 4>, Set<CV_8U> > h(_src, _dst, dcn);

    bool built = h.createKernel("RGB5x52RGB", ocl::imgproc::color_rgb_oclsrc,
                                format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, gbits));
    return built && h.run();
}
// OpenCL packed 5x5 -> gray.
bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits)
{
    OclHelper< Set<2>, Set<1>, Set<CV_8U> > h(_src, _dst, 1);

    bool built = h.createKernel("BGR5x52Gray", ocl::imgproc::color_rgb_oclsrc,
                                format("-D dcn=1 -D bidx=0 -D greenbits=%d", gbits));
    return built && h.run();
}
// OpenCL gray -> packed 5x5.
bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits)
{
    OclHelper< Set<1>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);

    bool built = h.createKernel("Gray2BGR5x5", ocl::imgproc::color_rgb_oclsrc,
                                format("-D dcn=2 -D bidx=0 -D greenbits=%d", gbits));
    return built && h.run();
}
// OpenCL BGR/RGB(A) -> gray; each work item covers a horizontal stripe of
// stripeSize pixels, so the global size along x is scaled accordingly.
bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx)
{
    OclHelper< Set<3, 4>, Set<1>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 1);

    const int stripeSize = 1;
    bool built = h.createKernel("RGB2Gray", ocl::imgproc::color_rgb_oclsrc,
                                format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d", bidx, stripeSize));
    if (!built)
        return false;

    h.globalSize[0] = (h.src.cols + stripeSize - 1)/stripeSize;
    return h.run();
}
// OpenCL gray -> BGR/RGB(A).
bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn)
{
    OclHelper< Set<1>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);

    bool built = h.createKernel("Gray2RGB", ocl::imgproc::color_rgb_oclsrc,
                                format("-D bidx=0 -D dcn=%d", dcn));
    return built && h.run();
}
// OpenCL RGBA -> alpha-premultiplied RGBA.
bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst)
{
    OclHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);

    bool built = h.createKernel("RGBA2mRGBA", ocl::imgproc::color_rgb_oclsrc,
                                "-D dcn=4 -D bidx=3");
    return built && h.run();
}
// OpenCL alpha-premultiplied RGBA -> RGBA.
bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst)
{
    OclHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);

    bool built = h.createKernel("mRGBA2RGBA", ocl::imgproc::color_rgb_oclsrc,
                                "-D dcn=4 -D bidx=3");
    return built && h.run();
}
#endif
//
// HAL calls
//
// The cvtColor* wrappers below validate src/dst through CvtHelper (allowed
// source/destination channel counts and depths are encoded in the Set<>
// template arguments, which also create the destination) and then forward to
// the corresponding hal:: implementation.
void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb)
{
    CvtHelper< Set<3, 4>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);

    hal::cvtBGRtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
                     h.depth, h.scn, dcn, swapb);
}

void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits)
{
    CvtHelper< Set<3, 4>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);

    hal::cvtBGRtoBGR5x5(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
                        h.scn, swapb, gbits);
}

void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits)
{
    // dcn <= 0 means "default": 3-channel output
    if(dcn <= 0) dcn = 3;
    CvtHelper< Set<2>, Set<3, 4>, Set<CV_8U> > h(_src, _dst, dcn);

    hal::cvtBGR5x5toBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
                        dcn, swapb, gbits);
}

void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb)
{
    CvtHelper< Set<3, 4>, Set<1>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 1);

    hal::cvtBGRtoGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
                      h.depth, h.scn, swapb);
}

void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn)
{
    // dcn <= 0 means "default": 3-channel output
    if(dcn <= 0) dcn = 3;
    CvtHelper< Set<1>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);

    hal::cvtGraytoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, h.depth, dcn);
}

void cvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits)
{
    CvtHelper< Set<2>, Set<1>, Set<CV_8U> > h(_src, _dst, 1);

    hal::cvtBGR5x5toGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, gbits);
}

void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits)
{
    CvtHelper< Set<1>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);

    hal::cvtGraytoBGR5x5(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, gbits);
}

void cvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst)
{
    CvtHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);

    hal::cvtRGBAtoMultipliedRGBA(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows);
}

void cvtColormRGBA2RGBA( InputArray _src, OutputArray _dst)
{
    CvtHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);

    hal::cvtMultipliedRGBAtoRGBA(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows);
}
} // namespace cv
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -41,6 +41,50 @@
//
//M*/
/********************************* COPYRIGHT NOTICE *******************************\
Original code for Bayer->BGR/RGB conversion is provided by Dirk Schaefer
from MD-Mathematische Dienste GmbH. Below is the copyright notice:
IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
By downloading, copying, installing or using the software you agree
to this license. If you do not agree to this license, do not download,
install, copy or use the software.
Contributors License Agreement:
Copyright (c) 2002,
MD-Mathematische Dienste GmbH
Im Defdahl 5-10
44141 Dortmund
Germany
www.md-it.de
Redistribution and use in source and binary forms,
with or without modification, are permitted provided
that the following conditions are met:
Redistributions of source code must retain
the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
The name of Contributor may not be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.
\**********************************************************************************/
#include "precomp.hpp"
#include <limits>
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
#if depth == 0
#define DATA_TYPE uchar
#define MAX_NUM 255
#define HALF_MAX_NUM 128
#define COEFF_TYPE int
#define SAT_CAST(num) convert_uchar_sat(num)
#define DEPTH_0
#elif depth == 2
#define DATA_TYPE ushort
#define MAX_NUM 65535
#define HALF_MAX_NUM 32768
#define COEFF_TYPE int
#define SAT_CAST(num) convert_ushort_sat(num)
#define DEPTH_2
#elif depth == 5
#define DATA_TYPE float
#define MAX_NUM 1.0f
#define HALF_MAX_NUM 0.5f
#define COEFF_TYPE float
#define SAT_CAST(num) (num)
#define DEPTH_5
#else
#error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
#endif
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
enum
{
hsv_shift = 12
};
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
#ifndef hscale
#define hscale 0
#endif
#ifndef hrange
#define hrange 0
#endif
#if bidx == 0
#define R_COMP z
#define G_COMP y
#define B_COMP x
#else
#define R_COMP x
#define G_COMP y
#define B_COMP z
#endif
//////////////////////////////////// RGB <-> HSV //////////////////////////////////////
// For each of the six hue sectors (0..5), indices into tab[] that select the
// b, g, r output values respectively, where
// tab = { v, v*(1-s), v*(1-s*h), v*(1-s*(1-h)) } (see HSV2RGB below).
__constant int sector_data[][3] = { { 1, 3, 0 },
                                    { 1, 0, 2 },
                                    { 3, 0, 1 },
                                    { 0, 2, 1 },
                                    { 0, 1, 3 },
                                    { 2, 1, 0 } };
#ifdef DEPTH_0
// 8-bit RGB/BGR -> HSV: H in [0, hrange), S and V in [0, 255].
// sdiv_table and hdiv_table are host-precomputed fixed-point divisor tables
// (scaled by 2^hsv_shift) used to replace the per-pixel divisions by v and by
// diff (presumably sdiv_table[v] ~ 255*2^hsv_shift/v and hdiv_table[d] the
// matching hue divisor -- verify against the host-side setup).
__kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols,
                      __constant int * sdiv_table, __constant int * hdiv_table)
{
    int x = get_global_id(0);
    // each work item processes PIX_PER_WI_Y consecutive rows of one column
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                // NOTE(review): vload4 reads 4 bytes even for 3-channel input;
                // assumes the byte past the pixel is readable.
                uchar4 src_pix = vload4(0, src + src_index);

                int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                int h, s, v = b;
                int vmin = b, diff;
                int vr, vg;

                // v = max channel, vmin = min channel
                v = max(v, g);
                v = max(v, r);
                vmin = min(vmin, g);
                vmin = min(vmin, r);

                diff = v - vmin;
                // all-ones masks for which channel supplied the maximum
                vr = v == r ? -1 : 0;
                vg = v == g ? -1 : 0;

                // s = diff / v via table lookup, rounded
                s = mad24(diff, sdiv_table[v], (1 << (hsv_shift-1))) >> hsv_shift;

                // branchless sector arithmetic:
                // g-b (max=r), 2*diff+b-r (max=g) or 4*diff+r-g (max=b)
                h = (vr & (g - b)) +
                    (~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g))));
                h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;
                h += h < 0 ? hrange : 0;   // wrap negative hue around

                dst[dst_index] = convert_uchar_sat_rte(h);
                dst[dst_index + 1] = (uchar)s;
                dst[dst_index + 2] = (uchar)v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// 8-bit HSV -> RGB/BGR.  H is rescaled to [0, 6) via the build-time 'hscale'
// constant; S and V are normalized to [0, 1].  The hue sector selects, via
// sector_data, which of { v, v*(1-s), v*(1-s*h), v*(1-s*(1-h)) } goes to each
// output channel.  When dcn == 4 the alpha channel is set to MAX_NUM (opaque).
__kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    // each work item processes PIX_PER_WI_Y consecutive rows of one column
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                // NOTE(review): vload4 reads 4 bytes even for 3-channel input;
                // assumes the byte past the pixel is readable.
                uchar4 src_pix = vload4(0, src + src_index);

                float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f);
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;
                    h *= hscale;
                    // bring h into [0, 6)
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );
                    sector = convert_int_sat_rtn(h);
                    h -= sector;                 // fractional position inside the sector
                    if( (unsigned)sector >= 6u ) // numeric safety net
                    {
                        sector = 0;
                        h = 0.f;
                    }

                    tab[0] = v;
                    tab[1] = v*(1.f - s);
                    tab[2] = v*(1.f - s*h);
                    tab[3] = v*(1.f - s*(1.f - h));

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = v;   // zero saturation: pure gray

                dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
                dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
                dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
                dst[dst_index + 3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// RGB -> HSV for 32-bit float images; H is computed in degrees then scaled
// by the build-time hscale factor, S and V stay in [0,1].
__kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                float h, s, v;

                // v = max(r,g,b), vmin = min(r,g,b)
                float vmin, diff;
                v = vmin = r;
                if( v < g ) v = g;
                if( v < b ) v = b;
                if( vmin > g ) vmin = g;
                if( vmin > b ) vmin = b;

                diff = v - vmin;
                // FLT_EPSILON guards both divisions against zero
                s = diff/(float)(fabs(v) + FLT_EPSILON);
                diff = (float)(60.f/(diff + FLT_EPSILON));

                // pick the hue sector from the dominant channel
                if( v == r )
                    h = (g - b)*diff;
                else if( v == g )
                    h = fma(b - r, diff, 120.f);
                else
                    h = fma(r - g, diff, 240.f);

                if( h < 0 )
                    h += 360.f;

                dst[0] = h*hscale;
                dst[1] = s;
                dst[2] = v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// HSV -> RGB for 32-bit float images; same sector interpolation scheme as
// the 8-bit variant, but values stay in their native float ranges.
__kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float h = src_pix.x, s = src_pix.y, v = src_pix.z;
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;
                    h *= hscale;
                    // normalize hue into [0, 6)
                    if(h < 0)
                        do h += 6; while (h < 0);
                    else if (h >= 6)
                        do h -= 6; while (h >= 6);
                    sector = convert_int_sat_rtn(h);
                    h -= sector;  // fractional position within the sector
                    // defensive clamp against FP edge cases
                    if ((unsigned)sector >= 6u)
                    {
                        sector = 0;
                        h = 0.f;
                    }

                    // candidate channel values for this sector
                    tab[0] = v;
                    tab[1] = v*(1.f - s);
                    tab[2] = v*(1.f - s*h);
                    tab[3] = v*(1.f - s*(1.f - h));

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = v;  // achromatic: all channels equal V

                dst[bidx] = b;
                dst[1] = g;
                dst[bidx^2] = r;
#if dcn == 4
                dst[3] = MAX_NUM;  // opaque alpha for 4-channel output
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
///////////////////////////////////// RGB <-> HLS //////////////////////////////////////
#ifdef DEPTH_0
// RGB -> HLS for 8-bit images: channels are normalized to [0,1], H/L/S
// computed in float and rescaled back to the 8-bit range on store.
__kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);
                float h = 0.f, s = 0.f, l;
                float vmin, vmax, diff;

                // vmax = max(r,g,b), vmin = min(r,g,b)
                vmax = vmin = r;
                if (vmax < g) vmax = g;
                if (vmax < b) vmax = b;
                if (vmin > g) vmin = g;
                if (vmin > b) vmin = b;

                diff = vmax - vmin;
                l = (vmax + vmin)*0.5f;  // lightness = midpoint of extremes

                if (diff > FLT_EPSILON)  // chromatic pixel: compute S and H
                {
                    // saturation formula differs below/above L = 0.5
                    s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
                    diff = 60.f/diff;

                    // hue sector from the dominant channel, in degrees
                    if( vmax == r )
                        h = (g - b)*diff;
                    else if( vmax == g )
                        h = fma(b - r, diff, 120.f);
                    else
                        h = fma(r - g, diff, 240.f);

                    if( h < 0.f )
                        h += 360.f;
                }

                dst[dst_index] = convert_uchar_sat_rte(h*hscale);
                dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f);
                dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// HLS -> RGB for 8-bit images: L and S are normalized to [0,1], hue is
// scaled into [0,6) and interpolated within its sector via sector_data.
__kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];

                    // p2/p1 bracket the channel values for this (l, s) pair
                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                    float p1 = 2*l - p2;

                    h *= hscale;
                    // normalize hue into [0, 6)
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );

                    int sector = convert_int_sat_rtn(h);
                    h -= sector;  // fractional position within the sector

                    // candidate channel values, linearly interpolated by h
                    tab[0] = p2;
                    tab[1] = p1;
                    tab[2] = fma(p2 - p1, 1-h, p1);
                    tab[3] = fma(p2 - p1, h, p1);

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = l;  // achromatic: all channels equal L

                dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
                dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
                dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
                dst[dst_index + 3] = MAX_NUM;  // opaque alpha for 4-channel output
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// RGB -> HLS for 32-bit float images; identical math to the 8-bit variant
// but without the [0,255] normalization/denormalization steps.
__kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                float h = 0.f, s = 0.f, l;
                float vmin, vmax, diff;

                // vmax = max(r,g,b), vmin = min(r,g,b)
                vmax = vmin = r;
                if (vmax < g) vmax = g;
                if (vmax < b) vmax = b;
                if (vmin > g) vmin = g;
                if (vmin > b) vmin = b;

                diff = vmax - vmin;
                l = (vmax + vmin)*0.5f;  // lightness = midpoint of extremes

                if (diff > FLT_EPSILON)  // chromatic pixel: compute S and H
                {
                    // saturation formula differs below/above L = 0.5
                    s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
                    diff = 60.f/diff;

                    // hue sector from the dominant channel, in degrees
                    if( vmax == r )
                        h = (g - b)*diff;
                    else if( vmax == g )
                        h = fma(b - r, diff, 120.f);
                    else
                        h = fma(r - g, diff, 240.f);

                    if( h < 0.f ) h += 360.f;
                }

                dst[0] = h*hscale;
                dst[1] = l;
                dst[2] = s;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// HLS -> RGB for 32-bit float images; identical sector interpolation to the
// 8-bit variant but values stay in their native float ranges.
__kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float h = src_pix.x, l = src_pix.y, s = src_pix.z;
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;

                    // p2/p1 bracket the channel values for this (l, s) pair
                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                    float p1 = 2*l - p2;

                    h *= hscale;
                    // normalize hue into [0, 6)
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );

                    sector = convert_int_sat_rtn(h);
                    h -= sector;  // fractional position within the sector

                    // candidate channel values, linearly interpolated by h
                    tab[0] = p2;
                    tab[1] = p1;
                    tab[2] = fma(p2 - p1, 1-h, p1);
                    tab[3] = fma(p2 - p1, h, p1);

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = l;  // achromatic: all channels equal L

                dst[bidx] = b;
                dst[1] = g;
                dst[bidx^2] = r;
#if dcn == 4
                dst[3] = MAX_NUM;  // opaque alpha for 4-channel output
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Per-depth type configuration. 'depth' (0 = CV_8U, 2 = CV_16U, 5 = CV_32F),
// 'scn' and 'dcn' (source/destination channel counts) are supplied at kernel
// build time via -D options.
#if depth == 0
    #define DATA_TYPE uchar
    #define MAX_NUM  255
    #define HALF_MAX_NUM 128
    #define COEFF_TYPE int
    #define SAT_CAST(num) convert_uchar_sat(num)
    #define DEPTH_0
#elif depth == 2
    #define DATA_TYPE ushort
    #define MAX_NUM  65535
    #define HALF_MAX_NUM 32768
    #define COEFF_TYPE int
    #define SAT_CAST(num) convert_ushort_sat(num)
    #define DEPTH_2
#elif depth == 5
    #define DATA_TYPE float
    #define MAX_NUM  1.0f
    #define HALF_MAX_NUM 0.5f
    #define COEFF_TYPE float
    #define SAT_CAST(num) (num)
    #define DEPTH_5
#else
    #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
#endif

// Rounded right shift by n bits (fixed-point descale with rounding).
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))

enum
{
    xyz_shift  = 12,  // fractional bits of the fixed-point XYZ coefficients
};

// Bytes per source / destination pixel.
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)

// Token pasting helpers to build vector type names (e.g. uchar4, float3).
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)

#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
#define DATA_TYPE_3 CAT(DATA_TYPE, 3)
///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////
// RGB -> XYZ. coeffs is a row-major 3x3 conversion matrix (float for
// CV_32F, fixed-point with xyz_shift fractional bits otherwise); any
// channel-order adjustment is assumed to be pre-applied to coeffs by the
// host — TODO confirm against the caller.
__kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset,
                      int rows, int cols, __constant COEFF_TYPE * coeffs)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1) * PIX_PER_WI_Y;

    if (dx < cols)
    {
        int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
        int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (dy < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);

                DATA_TYPE_4 src_pix = vload4(0, src);
                DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;

                // 3x3 matrix multiply; integer path descales with rounding
#ifdef DEPTH_5
                float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2]));
                float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5]));
                float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8]));
#else
                int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift);
                int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift);
                int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift);
#endif
                dst[0] = SAT_CAST(x);
                dst[1] = SAT_CAST(y);
                dst[2] = SAT_CAST(z);

                ++dy;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// XYZ -> RGB. Inverse of RGB2XYZ with the inverse 3x3 matrix in coeffs;
// 4-channel integer output is stored as a single vector write with alpha
// set to MAX_NUM.
__kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset,
                      int rows, int cols, __constant COEFF_TYPE * coeffs)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1) * PIX_PER_WI_Y;

    if (dx < cols)
    {
        int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
        int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (dy < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);

                DATA_TYPE_4 src_pix = vload4(0, src);
                DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;

                // 3x3 matrix multiply; integer path descales with rounding
#ifdef DEPTH_5
                float b = fma(x, coeffs[0], fma(y, coeffs[1], z * coeffs[2]));
                float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5]));
                float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8]));
#else
                int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift);
                int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift);
                int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift);
#endif

                DATA_TYPE dst0 = SAT_CAST(b);
                DATA_TYPE dst1 = SAT_CAST(g);
                DATA_TYPE dst2 = SAT_CAST(r);
#if dcn == 3 || defined DEPTH_5
                dst[0] = dst0;
                dst[1] = dst1;
                dst[2] = dst2;
#if dcn == 4
                dst[3] = MAX_NUM;  // opaque alpha for 4-channel float output
#endif
#else
                // 4-channel integer: one vector store, alpha = MAX_NUM
                *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM);
#endif
                ++dy;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
/////////////////////////////////// [l|s]RGB <-> Lab ///////////////////////////

// Fixed-point shifts and gamma-table geometry shared by the Lab/Luv kernels.
#define lab_shift xyz_shift
#define gamma_shift 3
#define lab_shift2 (lab_shift + gamma_shift)
#define GAMMA_TAB_SIZE 1024
#define GammaTabScale (float)GAMMA_TAB_SIZE
// Evaluate a cubic spline at x. tab stores 4 polynomial coefficients per
// integer interval (c0..c3, lowest degree first); the interval index is
// clamped to [0, n-1], so out-of-range x evaluates the nearest segment.
inline float splineInterpolate(float x, __global const float * tab, int n)
{
    const int ival = clamp(convert_int_sat_rtn(x), 0, n-1);
    const float t = x - ival;  // fractional position inside the interval
    __global const float * c = tab + (ival << 2);
    // Horner evaluation: c[0] + t*(c[1] + t*(c[2] + t*c[3]))
    return fma(fma(fma(c[3], t, c[2]), t, c[1]), t, c[0]);
}
#ifdef DEPTH_0
// BGR -> Lab for 8-bit images, fully fixed-point. gammaTab linearizes the
// input (gamma-corrected values, gamma_shift extra precision bits),
// LabCbrtTab_b is a precomputed cube-root table, and coeffs is the 3x3
// RGB->XYZ matrix. Lscale/Lshift fold the L = 116*fY - 16 mapping into one
// fixed-point multiply-add.
__kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
                      __global const ushort * gammaTab, __global ushort * LabCbrtTab_b,
                      __constant int * coeffs, int Lscale, int Lshift)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const uchar* src_ptr = src + src_index;
                __global uchar* dst_ptr = dst + dst_index;
                uchar4 src_pix = vload4(0, src_ptr);

                int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                    C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                    C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

                // gamma-linearize, then f(X), f(Y), f(Z) via cube-root table
                int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];
                int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)];
                int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)];
                int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)];

                // L from fY; a/b offset by 128 to fit the unsigned range
                int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
                int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 );
                int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 );

                dst_ptr[0] = SAT_CAST(L);
                dst_ptr[1] = SAT_CAST(a);
                dst_ptr[2] = SAT_CAST(b);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// BGR -> Lab for 32-bit float images. When SRGB is defined the input is
// linearized through the spline gammaTab; coeffs is the 3x3 RGB->XYZ
// matrix, _1_3 = 1/3 and _a = 16/116 are the standard Lab constants
// (passed in precomputed; _1_3 is unused here since rootn(., 3) is called
// directly).
__kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _1_3, float _a)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                      C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                      C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

                // clamp input to the valid [0,1] range before linearization
                float R = clamp(src_pix.x, 0.0f, 1.0f);
                float G = clamp(src_pix.y, 0.0f, 1.0f);
                float B = clamp(src_pix.z, 0.0f, 1.0f);

#ifdef SRGB
                R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

                // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
                float X = fma(R, C0, fma(G, C1, B*C2));
                float Y = fma(R, C3, fma(G, C4, B*C5));
                float Z = fma(R, C6, fma(G, C7, B*C8));

                // f(t): cube root above the threshold, linear segment below
                float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);
                float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);
                float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);

                float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);
                float a = 500.f * (FX - FY);
                float b = 200.f * (FY - FZ);

                dst[0] = L;
                dst[1] = a;
                dst[2] = b;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
// Convert one float Lab pixel (srcbuf = {L, a, b}) to a [0,1]-clamped
// color triple in dstbuf, using the inverse 3x3 XYZ->RGB matrix in coeffs
// (channel order is baked into coeffs by the host). lThresh/fThresh are
// the cutoffs selecting the linear vs. cubic branch of the inverse f
// function. Shared by both the 8-bit and float Lab2BGR kernels.
inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    float li = srcbuf[0], ai = srcbuf[1], bi = srcbuf[2];

    float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
          C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
          C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

    float y, fy;
    // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
    // invert L -> Y: linear segment for small L, cubic otherwise
    if (li <= lThresh)
    {
        y = li / 903.3f;
        fy = fma(7.787f, y, 16.0f / 116.0f);
    }
    else
    {
        fy = (li + 16.0f) / 116.0f;
        y = fy * fy * fy;
    }

    // recover f(X) and f(Z) from a and b, then invert each
    float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };

    #pragma unroll
    for (int j = 0; j < 2; j++)
        if (fxz[j] <= fThresh)
            fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
        else
            fxz[j] = fxz[j] * fxz[j] * fxz[j];
    float x = fxz[0], z = fxz[1];

    // XYZ -> RGB matrix multiply, clamped to the displayable range
    float ro = clamp(fma(C0, x, fma(C1, y, C2 * z)), 0.0f, 1.0f);
    float go = clamp(fma(C3, x, fma(C4, y, C5 * z)), 0.0f, 1.0f);
    float bo = clamp(fma(C6, x, fma(C7, y, C8 * z)), 0.0f, 1.0f);

#ifdef SRGB
    // re-apply gamma (inverse of the linearization in BGR2Lab)
    ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
    go = splineInterpolate(go * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
    bo = splineInterpolate(bo * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

    dstbuf[0] = ro, dstbuf[1] = go, dstbuf[2] = bo;
}
#ifdef DEPTH_0
// Lab -> BGR for 8-bit images: decodes the packed 8-bit Lab encoding
// (L in 0..255 maps to 0..100, a/b offset by 128), delegates the math to
// Lab2BGR_f, and rescales the result back to 8 bits.
__kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const uchar* src_ptr = src + src_index;
                __global uchar * dst_ptr = dst + dst_index;
                uchar4 src_pix = vload4(0, src_ptr);

                // undo the 8-bit Lab packing: L *= 100/255, a/b -= 128
                float srcbuf[3], dstbuf[3];
                srcbuf[0] = src_pix.x*(100.f/255.f);
                srcbuf[1] = convert_float(src_pix.y - 128);
                srcbuf[2] = convert_float(src_pix.z - 128);

                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
                    gammaTab,
#endif
                    coeffs, lThresh, fThresh);

#if dcn == 3
                dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
                dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
                dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
#else
                // 4-channel: single vector store with opaque alpha
                *(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),
                    SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// Lab -> BGR for 32-bit float images: passes the Lab triple straight to
// Lab2BGR_f and writes the [0,1] result, with alpha for 4-channel output.
__kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float srcbuf[3], dstbuf[3];
                srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;

                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
                    gammaTab,
#endif
                    coeffs, lThresh, fThresh);

                dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
#if dcn == 4
                dst[3] = MAX_NUM;  // opaque alpha for 4-channel output
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
/////////////////////////////////// [l|s]RGB <-> Luv ///////////////////////////

#define LAB_CBRT_TAB_SIZE 1024
#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
// Scale factor mapping Y in [0, 1.5] onto the cube-root table index range.
__constant float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;
#ifdef DEPTH_5
// BGR -> Luv for 32-bit float images. LabCbrtTab is a spline table of the
// Lab f() function used to get L from Y; coeffs is the 3x3 RGB->XYZ matrix;
// _un/_vn are the white-point chromaticity terms folded into the u/v
// formulas (note u = L*(X*d - _un) via fma with -_un).
__kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);

                float R = src[0], G = src[1], B = src[2];

                // clamp input to [0,1] before optional sRGB linearization
                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);

#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
                float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
                float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));

                // L = 116*f(Y) - 16 via the precomputed cube-root spline
                float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
                L = fma(116.f, L, -16.f);

                // d = 52 / (X + 15Y + 3Z), guarded against division by zero
                float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
                float u = L*fma(X, d, -_un);
                float v = L*fma(2.25f, Y*d, -_vn);

                dst[0] = L;
                dst[1] = u;
                dst[2] = v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
    }
}
#elif defined DEPTH_0
// BGR -> Luv for 8-bit images: normalizes input to [0,1], computes float
// Luv as in the float variant, then packs L into 0..255 (L*2.55) and maps
// u/v affinely into the 8-bit range with the documented constants.
__kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        // advance the base pointers once; the loop then steps row by row
        src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
        dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                float scale = 1.0f / 255.0f;
                float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale;

#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
                float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
                float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));

                // L = 116*f(Y) - 16 via the precomputed cube-root spline
                float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
                L = 116.f*L - 16.f;

                // d = 52 / (X + 15Y + 3Z), guarded against division by zero
                float d = (4*13) / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
                float u = L*(X*d - _un);
                float v = L*fma(2.25f, Y*d, -_vn);

                dst[0] = SAT_CAST(L * 2.55f);
                //0.72033 = 255/(220+134), 96.525 = 134*255/(220+134)
                dst[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f));
                //0.9732 = 255/(140+122), 136.259 = 140*255/(140+122)
                dst[2] = SAT_CAST(fma(v, 0.9732824427480916f, 136.259541984732824f));

                ++y;
                dst += dst_step;
                src += src_step;
            }
    }
}
#endif
#ifdef DEPTH_5
// Luv -> BGR for 32-bit float images. Inverts L -> Y (linear branch below
// L = 8, cubic above), reconstructs X and Z from u', v', applies the
// inverse XYZ->RGB matrix in coeffs, and optionally re-applies sRGB gamma.
__kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);

                float L = src[0], u = src[1], v = src[2], X, Y, Z;
                // invert L -> Y: cubic branch for L >= 8, linear below
                if(L >= 8)
                {
                    Y = fma(L, 1.f/116.f, 16.f/116.f);
                    Y = Y*Y*Y;
                }
                else
                {
                    Y = L * (1.0f/903.3f); // L*(3./29.)^3
                }

                // reconstruct X, Z from the u'/v' chromaticity terms;
                // vp is clamped to avoid blow-up near v' = 0
                float up = 3.f*fma(L, _un, u);
                float vp = 0.25f/fma(L, _vn, v);
                vp = clamp(vp, -0.25f, 0.25f);
                X = 3.f*Y*up*vp;
                Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);

                // inverse XYZ -> RGB matrix multiply, clamped to [0,1]
                float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
                float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
                float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));

                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);

#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

                dst[0] = R;
                dst[1] = G;
                dst[2] = B;
#if dcn == 4
                dst[3] = MAX_NUM;  // opaque alpha for 4-channel output
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
    }
}
#elif defined DEPTH_0
// Luv -> BGR for 8-bit images: unpacks the 8-bit Luv encoding (L*100/255,
// affine u/v with the documented constants), runs the same inverse math
// as the float variant, and saturates back to 8 bits.
__kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        // advance the base pointers once; the loop then steps row by row
        src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
        dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                float d, X, Y, Z;
                float L = src[0]*(100.f/255.f);
                // 1.388235294117647 = (220+134)/255
                float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f);
                // 1.027450980392157 = (140+122)/255
                float v = fma(convert_float(src[2]), 1.027450980392157f, - 140.f);

                // invert L -> Y: cubic branch for L >= 8, linear below
                if(L >= 8)
                {
                    Y = fma(L, 1.f/116.f, 16.f/116.f);
                    Y = Y*Y*Y;
                }
                else
                {
                    Y = L * (1.0f/903.3f); // L*(3./29.)^3
                }

                // reconstruct X, Z from the u'/v' chromaticity terms;
                // vp is clamped to avoid blow-up near v' = 0
                float up = 3.f*fma(L, _un, u);
                float vp = 0.25f/fma(L, _vn, v);
                vp = clamp(vp, -0.25f, 0.25f);
                X = 3.f*Y*up*vp;
                Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);

                //limit X, Y, Z to [0, 2] to fit white point
                X = clamp(X, 0.f, 2.f); Z = clamp(Z, 0.f, 2.f);

                // inverse XYZ -> RGB matrix multiply, clamped to [0,1]
                float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
                float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
                float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));

                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);

#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

                uchar dst0 = SAT_CAST(R * 255.0f);
                uchar dst1 = SAT_CAST(G * 255.0f);
                uchar dst2 = SAT_CAST(B * 255.0f);

#if dcn == 4
                // 4-channel: single vector store with opaque alpha
                *(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM);
#else
                dst[0] = dst0;
                dst[1] = dst1;
                dst[2] = dst2;
#endif
                ++y;
                dst += dst_step;
                src += src_step;
            }
    }
}
#endif
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/

// Per-depth type configuration (mirrors the Lab section). 'depth'
// (0 = CV_8U, 2 = CV_16U, 5 = CV_32F), 'scn', 'dcn' and 'bidx' are
// supplied at kernel build time via -D options.
#if depth == 0
    #define DATA_TYPE uchar
    #define MAX_NUM  255
    #define HALF_MAX_NUM 128
    #define COEFF_TYPE int
    #define SAT_CAST(num) convert_uchar_sat(num)
    #define DEPTH_0
#elif depth == 2
    #define DATA_TYPE ushort
    #define MAX_NUM  65535
    #define HALF_MAX_NUM 32768
    #define COEFF_TYPE int
    #define SAT_CAST(num) convert_ushort_sat(num)
    #define DEPTH_2
#elif depth == 5
    #define DATA_TYPE float
    #define MAX_NUM  1.0f
    #define HALF_MAX_NUM 0.5f
    #define COEFF_TYPE float
    #define SAT_CAST(num) (num)
    #define DEPTH_5
#else
    #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
#endif

// Rounded right shift by n bits (fixed-point descale with rounding).
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))

enum
{
    yuv_shift  = 14,  // fractional bits of the fixed-point gray coefficients
    R2Y        = 4899,
    G2Y        = 9617,
    B2Y        = 1868
};

//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
#define B2YF 0.114f
#define G2YF 0.587f
#define R2YF 0.299f

// Bytes per source / destination pixel.
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)

// Map logical R/G/B to pixel vector components; bidx is the index of the
// blue channel within a pixel (0 for BGR input, otherwise RGB).
#if bidx == 0
    #define R_COMP z
    #define G_COMP y
    #define B_COMP x
#else
    #define R_COMP x
    #define G_COMP y
    #define B_COMP z
#endif

// Token pasting helpers to build vector type names (e.g. uchar4, float3).
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)

#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
#define DATA_TYPE_3 CAT(DATA_TYPE, 3)
///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
// RGB/BGR -> single-channel gray using BT.601 weights.
// Float path (DEPTH_5) uses B2YF/G2YF/R2YF; 8U/16U path uses the fixed-point
// B2Y/G2Y/R2Y constants, rounded and descaled by yuv_shift.
// depth, scn/dcn, bidx (via B_COMP/R_COMP) and PIX_PER_WI_Y are build-time
// -D defines; each work item handles one column over PIX_PER_WI_Y rows.
__kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset,
                       __global uchar * dstptr, int dst_step, int dst_offset,
                       int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                // Only the first 3 channels participate; alpha (if any) is ignored.
                DATA_TYPE_3 src_pix = vload3(0, src);
#ifdef DEPTH_5
                dst[0] = fma(src_pix.B_COMP, B2YF, fma(src_pix.G_COMP, G2YF, src_pix.R_COMP * R2YF));
#else
                dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);
#endif
                ++y;
                src_index += src_step;
                dst_index += dst_step;
            }
        }
    }
}
// Gray -> RGB[A]: replicates the single gray value into the 3 color channels,
// setting alpha to MAX_NUM when dcn == 4. For 8U/16U with dcn == 4 a single
// 4-element vector store is used; otherwise channels are written individually.
__kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset,
                       __global uchar * dstptr, int dst_step, int dst_offset,
                       int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                DATA_TYPE val = src[0];
#if dcn == 3 || defined DEPTH_5
                dst[0] = dst[1] = dst[2] = val;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
#else
                // 8U/16U with dcn == 4: one vector store of (val, val, val, alpha).
                *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(val, val, val, MAX_NUM);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB[A] <-> BGR[A] //////////////////////////////////////
// Generic RGB[A] <-> BGR[A] channel shuffle / channel-count change.
// REVERSE (build-time define) swaps the first and third channels; when
// dcn == 4 the alpha is copied from the source if present, else set opaque.
__kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
                  __global uchar* dstptr, int dst_step, int dst_offset,
                  int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
#if scn == 3
                DATA_TYPE_3 src_pix = vload3(0, src);
#else
                DATA_TYPE_4 src_pix = vload4(0, src);
#endif
#ifdef REVERSE
                dst[0] = src_pix.z;
                dst[1] = src_pix.y;
                dst[2] = src_pix.x;
#else
                dst[0] = src_pix.x;
                dst[1] = src_pix.y;
                dst[2] = src_pix.z;
#endif

#if dcn == 4
#if scn == 3
                dst[3] = MAX_NUM;
#else
                dst[3] = src[3];
#endif
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////
// Unpacks 16-bit packed RGB (565 when greenbits == 6, else 555) into 8-bit
// channels. bidx selects which output index gets the low (blue) bits.
// In 555 mode with dcn == 4 the top bit of the ushort becomes a 0/255 alpha.
__kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                ushort t = *((__global const ushort*)(src + src_index));

#if greenbits == 6
                // 5-6-5 layout: B = bits 0-4, G = bits 5-10, R = bits 11-15.
                dst[dst_index + bidx] = (uchar)(t << 3);
                dst[dst_index + 1] = (uchar)((t >> 3) & ~3);
                dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);
#else
                // 5-5-5 layout: B = bits 0-4, G = bits 5-9, R = bits 10-14.
                dst[dst_index + bidx] = (uchar)(t << 3);
                dst[dst_index + 1] = (uchar)((t >> 2) & ~7);
                dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);
#endif

#if dcn == 4
#if greenbits == 6
                dst[dst_index + 3] = 255;
#else
                dst[dst_index + 3] = t & 0x8000 ? 255 : 0;
#endif
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// Packs 8-bit RGB[A] into a 16-bit 565/555 value. In 555 mode with scn == 4
// a non-zero alpha sets the top bit.
// NOTE(review): vload4 reads 4 bytes per pixel even when scn == 3 — presumably
// the caller guarantees readable padding past the last 3-channel pixel; verify.
__kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

#if greenbits == 6
                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
#elif scn == 3
                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
#else
                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
                    ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB5x5 <-> Gray //////////////////////////////////////
// Packed 16-bit BGR (565/555) -> 8-bit gray: extracts each channel back to
// an 8-bit range via shift+mask, then applies the fixed-point BT.601 weights.
__kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offset,
                          __global uchar* dst, int dst_step, int dst_offset,
                          int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        // Destination is single-channel uchar: one byte per column.
        int dst_index = mad24(y, dst_step, dst_offset + x);

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                int t = *((__global const ushort*)(src + src_index));

#if greenbits == 6
                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift);
#else
                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// 8-bit gray -> packed 16-bit BGR: replicates the gray value into all three
// packed channels (565 keeps 6 bits of green, 555 keeps 5 bits everywhere).
__kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offset,
                          __global uchar* dst, int dst_step, int dst_offset,
                          int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        // Source is single-channel uchar: one byte per column.
        int src_index = mad24(y, src_step, src_offset + x);
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                int t = src[src_index];

#if greenbits == 6
                *((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
#else
                // 555: the same 5-bit value is placed in all three fields.
                t >>= 3;
                *((__global ushort*)(dst + dst_index)) = (ushort)(t|(t << 5)|(t << 10));
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
#ifdef DEPTH_0
// 8-bit RGBA -> premultiplied RGBA: dst.rgb = (src.rgb * alpha + 128) / 255
// (rounded fixed-point division via HALF_MAX_NUM/MAX_NUM), alpha copied as-is.
// Compiled only for DEPTH_0 (see enclosing #ifdef).
__kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, src_offset + (x << 2));
        int dst_index = mad24(y, dst_step, dst_offset + (x << 2));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = *(__global const uchar4 *)(src + src_index);

                *(__global uchar4 *)(dst + dst_index) =
                    (uchar4)(mad24(src_pix.x, src_pix.w, HALF_MAX_NUM) / MAX_NUM,
                             mad24(src_pix.y, src_pix.w, HALF_MAX_NUM) / MAX_NUM,
                             mad24(src_pix.z, src_pix.w, HALF_MAX_NUM) / MAX_NUM, src_pix.w);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// Premultiplied RGBA -> straight RGBA: dst.rgb = (src.rgb * 255 + a/2) / a,
// with a zero-alpha pixel mapped to all zeros (division guard).
// Compiled only for DEPTH_0 (see enclosing #ifdef).
__kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, 4, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, 4, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = *(__global const uchar4 *)(src + src_index);
                // v3_half provides round-to-nearest for the division by alpha.
                uchar v3 = src_pix.w, v3_half = v3 / 2;

                if (v3 == 0)
                    *(__global uchar4 *)(dst + dst_index) = (uchar4)(0, 0, 0, 0);
                else
                    *(__global uchar4 *)(dst + dst_index) =
                        (uchar4)(mad24(src_pix.x, MAX_NUM, v3_half) / v3,
                                 mad24(src_pix.y, MAX_NUM, v3_half) / v3,
                                 mad24(src_pix.z, MAX_NUM, v3_half) / v3, v3);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
......@@ -76,12 +76,9 @@
enum
{
yuv_shift = 14,
xyz_shift = 12,
hsv_shift = 12,
R2Y = 4899,
G2Y = 9617,
B2Y = 1868,
BLOCK_SIZE = 256
};
//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
......@@ -120,14 +117,6 @@ enum
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
#ifndef hscale
#define hscale 0
#endif
#ifndef hrange
#define hrange 0
#endif
#if bidx == 0
#define R_COMP z
#define G_COMP y
......@@ -156,77 +145,6 @@ enum
#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
#define DATA_TYPE_3 CAT(DATA_TYPE, 3)
///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
// (removed side of this diff) Pre-split copy of RGB2Gray from cvtcolor.cl;
// byte-identical to the extracted version earlier in this page.
__kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset,
                       __global uchar * dstptr, int dst_step, int dst_offset,
                       int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                DATA_TYPE_3 src_pix = vload3(0, src);
#ifdef DEPTH_5
                dst[0] = fma(src_pix.B_COMP, B2YF, fma(src_pix.G_COMP, G2YF, src_pix.R_COMP * R2YF));
#else
                dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);
#endif
                ++y;
                src_index += src_step;
                dst_index += dst_step;
            }
        }
    }
}
// (removed side of this diff) Pre-split copy of Gray2RGB from cvtcolor.cl;
// byte-identical to the extracted version earlier in this page.
__kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset,
                       __global uchar * dstptr, int dst_step, int dst_offset,
                       int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                DATA_TYPE val = src[0];
#if dcn == 3 || defined DEPTH_5
                dst[0] = dst[1] = dst[2] = val;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
#else
                *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(val, val, val, MAX_NUM);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
__constant float c_RGB2YUVCoeffs_f[5] = { B2YF, G2YF, R2YF, B2UF, R2VF };
......@@ -754,1450 +672,3 @@ __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
}
}
}
///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////
// RGB -> CIE XYZ: multiplies each pixel by the 3x3 matrix in `coeffs`
// (row-major, 9 values, prepared host-side with the channel order already
// folded in). Float path uses fma; 8U/16U uses fixed-point descaled by
// xyz_shift, saturating via SAT_CAST.
// NOTE(review): vload4 reads 4 elements even when scn == 3 — presumably
// padding past the last pixel is readable; verify against the host wrapper.
__kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset,
                      int rows, int cols, __constant COEFF_TYPE * coeffs)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1) * PIX_PER_WI_Y;

    if (dx < cols)
    {
        int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
        int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (dy < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);

                DATA_TYPE_4 src_pix = vload4(0, src);
                DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;

#ifdef DEPTH_5
                float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2]));
                float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5]));
                float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8]));
#else
                int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift);
                int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift);
                int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift);
#endif
                dst[0] = SAT_CAST(x);
                dst[1] = SAT_CAST(y);
                dst[2] = SAT_CAST(z);

                ++dy;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// CIE XYZ -> RGB: inverse transform via the 3x3 matrix in `coeffs` (prepared
// host-side). Fixed-point path descales by xyz_shift; when dcn == 4 the alpha
// channel is set to MAX_NUM (fully opaque).
__kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset,
                      int rows, int cols, __constant COEFF_TYPE * coeffs)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1) * PIX_PER_WI_Y;

    if (dx < cols)
    {
        int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
        int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (dy < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);

                DATA_TYPE_4 src_pix = vload4(0, src);
                DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;

#ifdef DEPTH_5
                float b = fma(x, coeffs[0], fma(y, coeffs[1], z * coeffs[2]));
                float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5]));
                float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8]));
#else
                int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift);
                int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift);
                int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift);
#endif

                DATA_TYPE dst0 = SAT_CAST(b);
                DATA_TYPE dst1 = SAT_CAST(g);
                DATA_TYPE dst2 = SAT_CAST(r);
#if dcn == 3 || defined DEPTH_5
                dst[0] = dst0;
                dst[1] = dst1;
                dst[2] = dst2;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
#else
                // 8U/16U with dcn == 4: one vector store.
                *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM);
#endif

                ++dy;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB[A] <-> BGR[A] //////////////////////////////////////
// (removed side of this diff) Pre-split copy of the RGB channel-shuffle
// kernel from cvtcolor.cl; byte-identical to the extracted version earlier.
__kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
                  __global uchar* dstptr, int dst_step, int dst_offset,
                  int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
#if scn == 3
                DATA_TYPE_3 src_pix = vload3(0, src);
#else
                DATA_TYPE_4 src_pix = vload4(0, src);
#endif
#ifdef REVERSE
                dst[0] = src_pix.z;
                dst[1] = src_pix.y;
                dst[2] = src_pix.x;
#else
                dst[0] = src_pix.x;
                dst[1] = src_pix.y;
                dst[2] = src_pix.z;
#endif

#if dcn == 4
#if scn == 3
                dst[3] = MAX_NUM;
#else
                dst[3] = src[3];
#endif
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////
// (removed side of this diff) Pre-split copy of RGB5x52RGB from cvtcolor.cl;
// byte-identical to the extracted version earlier in this page.
__kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                ushort t = *((__global const ushort*)(src + src_index));

#if greenbits == 6
                dst[dst_index + bidx] = (uchar)(t << 3);
                dst[dst_index + 1] = (uchar)((t >> 3) & ~3);
                dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);
#else
                dst[dst_index + bidx] = (uchar)(t << 3);
                dst[dst_index + 1] = (uchar)((t >> 2) & ~7);
                dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);
#endif

#if dcn == 4
#if greenbits == 6
                dst[dst_index + 3] = 255;
#else
                dst[dst_index + 3] = t & 0x8000 ? 255 : 0;
#endif
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// (removed side of this diff) Pre-split copy of RGB2RGB5x5 from cvtcolor.cl;
// byte-identical to the extracted version earlier in this page.
__kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

#if greenbits == 6
                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
#elif scn == 3
                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
#else
                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
                    ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB5x5 <-> Gray //////////////////////////////////////
// (removed side of this diff) Pre-split copy of BGR5x52Gray from cvtcolor.cl;
// byte-identical to the extracted version earlier in this page.
__kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offset,
                          __global uchar* dst, int dst_step, int dst_offset,
                          int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, dst_offset + x);

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                int t = *((__global const ushort*)(src + src_index));

#if greenbits == 6
                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift);
#else
                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// (removed side of this diff) Pre-split copy of Gray2BGR5x5 from cvtcolor.cl;
// byte-identical to the extracted version earlier in this page.
__kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offset,
                          __global uchar* dst, int dst_step, int dst_offset,
                          int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, src_offset + x);
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                int t = src[src_index];

#if greenbits == 6
                *((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
#else
                t >>= 3;
                *((__global ushort*)(dst + dst_index)) = (ushort)(t|(t << 5)|(t << 10));
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
//////////////////////////////////// RGB <-> HSV //////////////////////////////////////
__constant int sector_data[][3] = { { 1, 3, 0 },
{ 1, 0, 2 },
{ 3, 0, 1 },
{ 0, 2, 1 },
{ 0, 1, 3 },
{ 2, 1, 0 } };
#ifdef DEPTH_0
// 8U RGB -> HSV. V = max(r,g,b); S and H are computed in fixed point using
// host-prepared reciprocal tables (sdiv_table for 255/V, hdiv_table for 60/diff
// style factors), rounded and descaled by hsv_shift. vr/vg are -1/0 masks
// selecting the hue sector branchlessly; negative hue wraps by hrange
// (build-time define, e.g. 180 or 256).
__kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols,
                      __constant int * sdiv_table, __constant int * hdiv_table)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                int h, s, v = b;
                int vmin = b, diff;
                int vr, vg;

                v = max(v, g);
                v = max(v, r);
                vmin = min(vmin, g);
                vmin = min(vmin, r);

                diff = v - vmin;
                // All-ones (-1) when v came from r / g respectively, else 0.
                vr = v == r ? -1 : 0;
                vg = v == g ? -1 : 0;

                s = mad24(diff, sdiv_table[v], (1 << (hsv_shift-1))) >> hsv_shift;

                // Branchless sector selection: exactly one of the three terms survives.
                h = (vr & (g - b)) +
                    (~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g))));
                h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;
                h += h < 0 ? hrange : 0;

                dst[dst_index] = convert_uchar_sat_rte(h);
                dst[dst_index + 1] = (uchar)s;
                dst[dst_index + 2] = (uchar)v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// 8U HSV -> RGB. Hue is rescaled by hscale into [0, 6), normalized into that
// range, split into an integer sector and fractional part, then the sector_data
// table maps the four candidate values (v, p, q, t) to the b/g/r outputs.
__kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f);
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;
                    h *= hscale;
                    // Wrap hue into [0, 6).
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );
                    sector = convert_int_sat_rtn(h);
                    h -= sector;
                    // Guard against float rounding pushing sector out of range.
                    if( (unsigned)sector >= 6u )
                    {
                        sector = 0;
                        h = 0.f;
                    }

                    tab[0] = v;
                    tab[1] = v*(1.f - s);
                    tab[2] = v*(1.f - s*h);
                    tab[3] = v*(1.f - s*(1.f - h));

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = v;

                dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
                dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
                dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
                dst[dst_index + 3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// 32F RGB -> HSV. Direct float formulation: V = max channel,
// S = diff / (|V| + eps), H in degrees (0..360) scaled by hscale.
// FLT_EPSILON terms avoid division by zero for gray/black pixels.
__kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                float h, s, v;

                float vmin, diff;

                v = vmin = r;
                if( v < g ) v = g;
                if( v < b ) v = b;
                if( vmin > g ) vmin = g;
                if( vmin > b ) vmin = b;

                diff = v - vmin;
                s = diff/(float)(fabs(v) + FLT_EPSILON);
                // Reuse diff as the 60/diff hue scale factor.
                diff = (float)(60.f/(diff + FLT_EPSILON));

                if( v == r )
                    h = (g - b)*diff;
                else if( v == g )
                    h = fma(b - r, diff, 120.f);
                else
                    h = fma(r - g, diff, 240.f);

                if( h < 0 )
                    h += 360.f;

                dst[0] = h*hscale;
                dst[1] = s;
                dst[2] = v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// 32F HSV -> RGB. Same sector/fraction decomposition as the 8U variant but
// operating on raw floats; outputs are written through bidx for channel order.
__kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float h = src_pix.x, s = src_pix.y, v = src_pix.z;
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;
                    h *= hscale;
                    // Wrap hue into [0, 6).
                    if(h < 0)
                        do h += 6; while (h < 0);
                    else if (h >= 6)
                        do h -= 6; while (h >= 6);
                    sector = convert_int_sat_rtn(h);
                    h -= sector;
                    // Guard against float rounding pushing sector out of range.
                    if ((unsigned)sector >= 6u)
                    {
                        sector = 0;
                        h = 0.f;
                    }

                    tab[0] = v;
                    tab[1] = v*(1.f - s);
                    tab[2] = v*(1.f - s*h);
                    tab[3] = v*(1.f - s*(1.f - h));

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = v;

                dst[bidx] = b;
                dst[1] = g;
                dst[bidx^2] = r;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
///////////////////////////////////// RGB <-> HLS //////////////////////////////////////
#ifdef DEPTH_0
// 8U RGB -> HLS. Channels are normalized to [0,1]; L = (max+min)/2,
// S uses the lightness-dependent denominator, H is in degrees scaled by
// hscale and rounded back to uchar. Gray pixels (diff <= eps) keep H = S = 0.
__kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);
                float h = 0.f, s = 0.f, l;
                float vmin, vmax, diff;

                vmax = vmin = r;
                if (vmax < g) vmax = g;
                if (vmax < b) vmax = b;
                if (vmin > g) vmin = g;
                if (vmin > b) vmin = b;

                diff = vmax - vmin;
                l = (vmax + vmin)*0.5f;

                if (diff > FLT_EPSILON)
                {
                    s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
                    diff = 60.f/diff;

                    if( vmax == r )
                        h = (g - b)*diff;
                    else if( vmax == g )
                        h = fma(b - r, diff, 120.f);
                    else
                        h = fma(r - g, diff, 240.f);

                    if( h < 0.f )
                        h += 360.f;
                }

                dst[dst_index] = convert_uchar_sat_rte(h*hscale);
                dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f);
                dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// 8U HLS -> RGB. Standard HLS inversion: p2/p1 are the upper/lower chroma
// bounds, hue is wrapped into [0, 6) and split into sector + fraction, then
// sector_data maps the interpolated values to b/g/r. S == 0 yields gray = L.
__kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];

                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                    float p1 = 2*l - p2;

                    h *= hscale;
                    // Wrap hue into [0, 6).
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );

                    int sector = convert_int_sat_rtn(h);
                    h -= sector;

                    tab[0] = p2;
                    tab[1] = p1;
                    tab[2] = fma(p2 - p1, 1-h, p1);
                    tab[3] = fma(p2 - p1, h, p1);

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = l;

                dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
                dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
                dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
                dst[dst_index + 3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// 32F RGB -> HLS. Same formulation as the 8U variant but without the
// 1/255 normalization and rounding; L and S are written as raw floats.
__kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                float h = 0.f, s = 0.f, l;
                float vmin, vmax, diff;

                vmax = vmin = r;
                if (vmax < g) vmax = g;
                if (vmax < b) vmax = b;
                if (vmin > g) vmin = g;
                if (vmin > b) vmin = b;

                diff = vmax - vmin;
                l = (vmax + vmin)*0.5f;

                if (diff > FLT_EPSILON)
                {
                    s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
                    diff = 60.f/diff;

                    if( vmax == r )
                        h = (g - b)*diff;
                    else if( vmax == g )
                        h = fma(b - r, diff, 120.f);
                    else
                        h = fma(r - g, diff, 240.f);

                    if( h < 0.f ) h += 360.f;
                }

                dst[0] = h*hscale;
                dst[1] = l;
                dst[2] = s;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// 32F HLS -> RGB. Float counterpart of the 8U variant: p2/p1 chroma bounds,
// hue sector + fraction, sector_data channel mapping; S == 0 gives gray = L.
__kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float h = src_pix.x, l = src_pix.y, s = src_pix.z;
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;

                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                    float p1 = 2*l - p2;

                    h *= hscale;
                    // Wrap hue into [0, 6).
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );

                    sector = convert_int_sat_rtn(h);
                    h -= sector;

                    tab[0] = p2;
                    tab[1] = p1;
                    tab[2] = fma(p2 - p1, 1-h, p1);
                    tab[3] = fma(p2 - p1, h, p1);

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = l;

                dst[bidx] = b;
                dst[1] = g;
                dst[bidx^2] = r;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
#ifdef DEPTH_0
// Premultiplies the RGB channels of an 8-bit RGBA image by alpha:
// dst.rgb = (src.rgb * alpha + 128) / 255 (rounded), alpha is copied through.
// Each work item covers one column over PIX_PER_WI_Y consecutive rows.
__kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * PIX_PER_WI_Y;

    if (col >= cols)
        return;

    int sidx = mad24(row, src_step, src_offset + (col << 2));
    int didx = mad24(row, dst_step, dst_offset + (col << 2));

    #pragma unroll
    for (int i = 0; i < PIX_PER_WI_Y; ++i)
    {
        if (row >= rows)
            break;

        const uchar4 pix = *(__global const uchar4 *)(src + sidx);
        const uchar4 res = (uchar4)(mad24(pix.x, pix.w, HALF_MAX_NUM) / MAX_NUM,
                                    mad24(pix.y, pix.w, HALF_MAX_NUM) / MAX_NUM,
                                    mad24(pix.z, pix.w, HALF_MAX_NUM) / MAX_NUM,
                                    pix.w);
        *(__global uchar4 *)(dst + didx) = res;

        ++row;
        sidx += src_step;
        didx += dst_step;
    }
}
// Un-premultiplies an 8-bit mRGBA image: dst.rgb = (src.rgb * 255 + a/2) / a,
// rounded; a zero alpha produces a fully transparent black pixel (no division).
// Each work item covers one column over PIX_PER_WI_Y consecutive rows.
__kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * PIX_PER_WI_Y;

    if (col >= cols)
        return;

    int sidx = mad24(row, src_step, mad24(col, 4, src_offset));
    int didx = mad24(row, dst_step, mad24(col, 4, dst_offset));

    #pragma unroll
    for (int i = 0; i < PIX_PER_WI_Y; ++i)
    {
        if (row >= rows)
            break;

        const uchar4 pix = *(__global const uchar4 *)(src + sidx);
        const uchar alpha = pix.w;
        uchar4 res;

        if (alpha == 0)
            res = (uchar4)(0, 0, 0, 0);
        else
        {
            const uchar rounding = alpha / 2;  // round-to-nearest bias
            res = (uchar4)(mad24(pix.x, MAX_NUM, rounding) / alpha,
                           mad24(pix.y, MAX_NUM, rounding) / alpha,
                           mad24(pix.z, MAX_NUM, rounding) / alpha,
                           alpha);
        }
        *(__global uchar4 *)(dst + didx) = res;

        ++row;
        sidx += src_step;
        didx += dst_step;
    }
}
#endif
/////////////////////////////////// [l|s]RGB <-> Lab ///////////////////////////
// Fixed-point / gamma-table parameters for the integer Lab path.
// lab_shift reuses the XYZ fixed-point shift (xyz_shift is supplied by the
// build options / shared header — not visible in this file).
#define lab_shift xyz_shift
// extra bits of precision carried by the integer gamma LUT
#define gamma_shift 3
// total right-shift applied after the gamma LUT has been folded in
#define lab_shift2 (lab_shift + gamma_shift)
// number of spline intervals in the float gamma table
#define GAMMA_TAB_SIZE 1024
// maps a value in [0,1] to a gamma-table index
#define GammaTabScale (float)GAMMA_TAB_SIZE
// Evaluate a cubic spline stored as 4 coefficients per unit interval:
// tab[4*i .. 4*i+3] hold c0..c3 for interval [i, i+1); the result is
// c0 + t*(c1 + t*(c2 + t*c3)), where t is the fractional part of x and the
// interval index is clamped to [0, n-1].
inline float splineInterpolate(float x, __global const float * tab, int n)
{
    const int ix = clamp(convert_int_sat_rtn(x), 0, n - 1);
    const float t = x - ix;
    __global const float * c = tab + (ix << 2);
    return fma(fma(fma(c[3], t, c[2]), t, c[1]), t, c[0]);
}
#ifdef DEPTH_0
// 8-bit BGR -> Lab, fixed-point path.
// gammaTab     : integer gamma LUT (identity or inverse sRGB gamma, built on the host);
// LabCbrtTab_b : LUT of the cube-root-like f() curve for the descaled XYZ values;
// coeffs       : 3x3 color matrix in Q(lab_shift) fixed point (row order chosen
//                by the host to match the input channel order);
// Lscale/Lshift: host-precomputed so that L = (Lscale*fY + Lshift) >> lab_shift2.
// Output: dst[0]=L, dst[1]=a+128, dst[2]=b+128, each saturated to uchar.
__kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
                      __global const ushort * gammaTab, __global ushort * LabCbrtTab_b,
                      __constant int * coeffs, int Lscale, int Lshift)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        // scnbytes / dcnbytes are bytes-per-pixel compile-time constants
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const uchar* src_ptr = src + src_index;
                __global uchar* dst_ptr = dst + dst_index;
                uchar4 src_pix = vload4(0, src_ptr);
                int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                    C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                    C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
                // linearize the channels through the gamma LUT
                int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];
                // fX/fY/fZ = f(XYZ) looked up on the descaled matrix products
                int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)];
                int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)];
                int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)];
                int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
                // a and b are biased by +128 (in lab_shift2 fixed point) to fit uchar
                int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 );
                int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 );
                dst_ptr[0] = SAT_CAST(L);
                dst_ptr[1] = SAT_CAST(a);
                dst_ptr[2] = SAT_CAST(b);
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// 32-bit float BGR -> Lab (reference float path).
// With SRGB defined, channels are first linearized through the spline gamma
// table. coeffs is the white-point-normalized color matrix; _a = 16/116, the
// offset of the linear branch of f(). (_1_3 is passed by the host but not
// referenced in this kernel body.)
__kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _1_3, float _a)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);
                float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                      C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                      C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
                // clamp input to the valid [0,1] range before the gamma/matrix stage
                float R = clamp(src_pix.x, 0.0f, 1.0f);
                float G = clamp(src_pix.y, 0.0f, 1.0f);
                float B = clamp(src_pix.z, 0.0f, 1.0f);
#ifdef SRGB
                R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
                float X = fma(R, C0, fma(G, C1, B*C2));
                float Y = fma(R, C3, fma(G, C4, B*C5));
                float Z = fma(R, C6, fma(G, C7, B*C8));
                // f(t) = t^(1/3) above the (6/29)^3 threshold, linear segment below
                float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);
                float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);
                float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);
                float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);
                float a = 500.f * (FX - FY);
                float b = 200.f * (FY - FZ);
                dst[0] = L;
                dst[1] = a;
                dst[2] = b;
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
// Shared Lab -> RGB helper used by both the uchar and float Lab2BGR kernels.
// srcbuf = {L, a, b}; dstbuf receives the three output channels, each clamped
// to [0,1] (and gamma-encoded when SRGB is defined). lThresh / fThresh are the
// host-supplied thresholds separating the linear and cubic branches of the
// inverse f() function.
inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    float li = srcbuf[0], ai = srcbuf[1], bi = srcbuf[2];
    float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
          C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
          C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
    float y, fy;
    // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
    if (li <= lThresh)
    {
        // linear (low-luminance) branch of the inverse transform
        y = li / 903.3f;
        fy = fma(7.787f, y, 16.0f / 116.0f);
    }
    else
    {
        // cubic branch: y = ((L+16)/116)^3
        fy = (li + 16.0f) / 116.0f;
        y = fy * fy * fy;
    }
    // recover f(x) and f(z) from a and b, then invert f() per component
    float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
#pragma unroll
    for (int j = 0; j < 2; j++)
        if (fxz[j] <= fThresh)
            fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
        else
            fxz[j] = fxz[j] * fxz[j] * fxz[j];
    float x = fxz[0], z = fxz[1];
    // XYZ -> RGB matrix, clamped to the displayable [0,1] range
    float ro = clamp(fma(C0, x, fma(C1, y, C2 * z)), 0.0f, 1.0f);
    float go = clamp(fma(C3, x, fma(C4, y, C5 * z)), 0.0f, 1.0f);
    float bo = clamp(fma(C6, x, fma(C7, y, C8 * z)), 0.0f, 1.0f);
#ifdef SRGB
    ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
    go = splineInterpolate(go * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
    bo = splineInterpolate(bo * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
    dstbuf[0] = ro, dstbuf[1] = go, dstbuf[2] = bo;
}
#ifdef DEPTH_0
// 8-bit Lab -> BGR/BGRA.
// Input decoding: L = byte * 100/255, a = byte - 128, b = byte - 128; the
// shared float helper Lab2BGR_f performs the conversion and the [0,1] result
// is scaled back to bytes (alpha = MAX_NUM for 4-channel output).
__kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const uchar* src_ptr = src + src_index;
                __global uchar * dst_ptr = dst + dst_index;
                uchar4 src_pix = vload4(0, src_ptr);
                float srcbuf[3], dstbuf[3];
                // undo the byte packing used by the forward conversion
                srcbuf[0] = src_pix.x*(100.f/255.f);
                srcbuf[1] = convert_float(src_pix.y - 128);
                srcbuf[2] = convert_float(src_pix.z - 128);
                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
                          gammaTab,
#endif
                          coeffs, lThresh, fThresh);
#if dcn == 3
                dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
                dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
                dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
#else
                // 4-channel output written as a single vector store, alpha = MAX_NUM
                *(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),
                    SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// 32-bit float Lab -> BGR/BGRA: thin wrapper that feeds the pixel straight
// into the shared Lab2BGR_f helper (no decoding step needed for floats).
__kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);
                float srcbuf[3], dstbuf[3];
                srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;
                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
                          gammaTab,
#endif
                          coeffs, lThresh, fThresh);
                dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
#if dcn == 4
                dst[3] = MAX_NUM;   // opaque alpha for 4-channel output
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
/////////////////////////////////// [l|s]RGB <-> Luv ///////////////////////////
// Spline table of the cube-root-like f() curve, sampled on [0, 1.5]
#define LAB_CBRT_TAB_SIZE 1024
// size of the integer cube-root table built on the host (8-bit path)
#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
// maps a value in [0, 1.5] to a LabCbrtTab spline-interval index
__constant float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;
#ifdef DEPTH_5
// 32-bit float BGR -> Luv.
// L comes from the precomputed f() spline table; u and v follow the standard
// CIE Luv chromaticity formulas, with the constant factors (including the 13x
// multiplier and the white-point terms _un, _vn) folded in on the host —
// note 52 = 4*13 and 2.25*52 = 117 = 9*13.
__kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float R = src[0], G = src[1], B = src[2];
                // clamp to the valid input range before gamma/matrix stages
                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);
#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
                float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
                float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
                // L = 116 * f(Y) - 16, with f() read from the spline table
                float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
                L = fma(116.f, L, -16.f);
                // d = 52 / (X + 15Y + 3Z); FLT_EPSILON guards the black pixel
                float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
                float u = L*fma(X, d, -_un);
                float v = L*fma(2.25f, Y*d, -_vn);
                dst[0] = L;
                dst[1] = u;
                dst[2] = v;
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
    }
}
#elif defined DEPTH_0
// 8-bit BGR -> Luv: same math as the float kernel, plus byte packing of the
// result so that L maps to [0,255], u (range [-134,220]) and v (range
// [-140,122]) are linearly rescaled to fit a uchar.
__kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
        dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                // bytes -> [0,1] floats
                float scale = 1.0f / 255.0f;
                float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale;
#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
                float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
                float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
                float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
                L = 116.f*L - 16.f;
                // 4*13 = 52; FLT_EPSILON guards division for the black pixel
                float d = (4*13) / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
                float u = L*(X*d - _un);
                float v = L*fma(2.25f, Y*d, -_vn);
                // 2.55 = 255/100 rescales L from [0,100] to [0,255]
                dst[0] = SAT_CAST(L * 2.55f);
                //0.72033 = 255/(220+134), 96.525 = 134*255/(220+134)
                dst[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f));
                //0.9732 = 255/(140+122), 136.259 = 140*255/(140+122)
                dst[2] = SAT_CAST(fma(v, 0.9732824427480916f, 136.259541984732824f));
                ++y;
                dst += dst_step;
                src += src_step;
            }
    }
}
#endif
#ifdef DEPTH_5
// 32-bit float Luv -> BGR/BGRA.
// Inverts the forward transform: Y from L (cubic branch for L >= 8, linear
// below), then X and Z from the chromaticity terms (white-point factors _un,
// _vn are pre-scaled on the host to match the forward kernel), then the
// XYZ->RGB matrix, clamp to [0,1] and optional sRGB gamma encoding.
__kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float L = src[0], u = src[1], v = src[2], X, Y, Z;
                if(L >= 8)
                {
                    // cubic branch: Y = ((L+16)/116)^3
                    Y = fma(L, 1.f/116.f, 16.f/116.f);
                    Y = Y*Y*Y;
                }
                else
                {
                    Y = L * (1.0f/903.3f); // L*(3./29.)^3
                }
                float up = 3.f*fma(L, _un, u);
                // vp is clamped to keep X and Z finite near the singularity
                float vp = 0.25f/fma(L, _vn, v);
                vp = clamp(vp, -0.25f, 0.25f);
                X = 3.f*Y*up*vp;
                Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);
                float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
                float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
                float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);
#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                dst[0] = R;
                dst[1] = G;
                dst[2] = B;
#if dcn == 4
                dst[3] = MAX_NUM;   // opaque alpha for 4-channel output
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
    }
}
#elif defined DEPTH_0
// 8-bit Luv -> BGR/BGRA: decodes the byte packing used by the 8-bit forward
// kernel (L in [0,100], u in [-134,220], v in [-140,122]), runs the same
// float inverse transform as the 32-bit kernel, then rescales to bytes.
// Unlike the float path, X and Z are additionally clamped to [0,2] so that
// out-of-gamut byte inputs stay near the white point.
__kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
        dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                float d, X, Y, Z;
                float L = src[0]*(100.f/255.f);
                // 1.388235294117647 = (220+134)/255
                float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f);
                // 1.027450980392157 = (140+122)/255
                float v = fma(convert_float(src[2]), 1.027450980392157f, - 140.f);
                if(L >= 8)
                {
                    // cubic branch: Y = ((L+16)/116)^3
                    Y = fma(L, 1.f/116.f, 16.f/116.f);
                    Y = Y*Y*Y;
                }
                else
                {
                    Y = L * (1.0f/903.3f); // L*(3./29.)^3
                }
                float up = 3.f*fma(L, _un, u);
                // vp is clamped to keep X and Z finite near the singularity
                float vp = 0.25f/fma(L, _vn, v);
                vp = clamp(vp, -0.25f, 0.25f);
                X = 3.f*Y*up*vp;
                Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);
                //limit X, Y, Z to [0, 2] to fit white point
                X = clamp(X, 0.f, 2.f); Z = clamp(Z, 0.f, 2.f);
                float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
                float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
                float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);
#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                uchar dst0 = SAT_CAST(R * 255.0f);
                uchar dst1 = SAT_CAST(G * 255.0f);
                uchar dst2 = SAT_CAST(B * 255.0f);
#if dcn == 4
                // 4-channel output written as one vector store, alpha = MAX_NUM
                *(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM);
#else
                dst[0] = dst0;
                dst[1] = dst1;
                dst[2] = dst2;
#endif
                ++y;
                dst += dst_step;
                src += src_step;
            }
    }
}
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment