Commit 64916d3d authored by Rostislav Vasilikhin's avatar Rostislav Vasilikhin Committed by Alexander Alekhin

Merge pull request #10869 from savuor:color_cpp_split

color.cpp split (#10869)

* initial split is done

* files renamed (these names are excluded during compilation)

* IPP code moved to corresponding files

* splineBuild, splineInterpolate -> color_lab.cpp

* Lab, Luv: little refactored

* it compiles (didn't check work); Lab OCL code moved to color_lab.cpp

* cvtcolor.cl: Lab/Luv part moved to color_lab.cl

* cvtcolor.cl: color_rgb.cl extracted

* cvtcolor.cl: color_yuv.cl separated

* cvtcolor.cl: color_hsv.cl extracted

* cvtcolor.cl: extracted to color_lab.cl and color_rgb.cl

* helper functions moved to hpp file

* Lab, Luv: moved to color_lab.cpp

* CPU XYZ: to color_lab.cpp

* OCL XYZ: to color_lab.cpp

* warning fixed

* CvtHelper added

* CPU YUV: to color_yuv.cpp, helpers to color.hpp

* CPU HLS/HSV: to color_hsv.cpp

* CPU BGR2BGR: to color_rgb.cpp

* CPU RGB: to color_rgb.cpp

* extra arg removed

* CPU YUV: to color_yuv.cpp

* color code decoded

* OclHelper added, some funcs rewritten

* color_lab.cpp: refactored to use OclHelper

* OCL RGB: to color_rgb.cpp

* OCL HLS/HSV: to color_hsv.cpp

* OCL YUV: to color_yuv.cpp

* OCL YUV planes: to color_yuv.cpp

* OCL: color code reduced

* licence to demosaicing.cpp

* IPP func tables to color_rgb.cpp

* code cleanup

* HAVE_OPENCL ifdefs added

* helpers made more common

* fixed two plane YUV with separate mats

* fixed warning in gcc7.2.0

* precomp header fixed

* color space classification functions fixed

* helpers fixed

* rename: isSRGB -> is_sRGB
parent c727e8a4
This source diff could not be displayed because it is too large. You can view the blob instead.
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "opencv2/imgproc.hpp"
#include "opencv2/core/utility.hpp"
#include <limits>
#include "opencl_kernels_imgproc.hpp"
#include "hal_replacement.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/softfloat.hpp"
// Fixed-point descale with round-to-nearest: (x + 2^(n-1)) >> n
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
namespace cv
{
//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
// BT.601 luma weights: Y = R2YF*R + G2YF*G + B2YF*B (the three sum to 1.0)
const float B2YF = 0.114f;
const float G2YF = 0.587f;
const float R2YF = 0.299f;
enum
{
// fixed-point shifts used by the integer conversion paths
yuv_shift = 14,
xyz_shift = 12,
// the same BT.601 weights scaled to Q14 fixed point
R2Y = 4899, // == R2YF*16384
G2Y = 9617, // == G2YF*16384
B2Y = 1868, // == B2YF*16384
BLOCK_SIZE = 256
};
// Per-type channel traits used by the conversion kernels: the saturation
// limit ("white") and the mid-point of the channel range.
template<typename _Tp> struct ColorChannel
{
    typedef float worktype_f;
    // Full-scale value for an integer channel type (e.g. 255 for uchar).
    static _Tp max()
    {
        return std::numeric_limits<_Tp>::max();
    }
    // Mid-point of the range, rounded up (e.g. 128 for uchar).
    static _Tp half()
    {
        return (_Tp)(std::numeric_limits<_Tp>::max()/2 + 1);
    }
};
// Floating-point channels are normalized to [0, 1].
template<> struct ColorChannel<float>
{
    typedef float worktype_f;
    static float max() { return 1.f; }
    static float half() { return 0.5f; }
};
/*template<> struct ColorChannel<double>
{
typedef double worktype_f;
static double max() { return 1.; }
static double half() { return 0.5; }
};*/
//
// Helper functions
//
namespace {
// True iff `code` converts to or from HSV (both the 180-degree hue
// variant and the _FULL 256-valued one).
inline bool isHSV(int code)
{
    return code == COLOR_HSV2BGR      || code == COLOR_HSV2RGB      ||
           code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL ||
           code == COLOR_BGR2HSV      || code == COLOR_RGB2HSV      ||
           code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL;
}
// True iff `code` converts to or from CIE Lab (including the linear-RGB
// LBGR/LRGB variants).
inline bool isLab(int code)
{
    return code == COLOR_Lab2BGR  || code == COLOR_Lab2RGB  ||
           code == COLOR_Lab2LBGR || code == COLOR_Lab2LRGB ||
           code == COLOR_BGR2Lab  || code == COLOR_RGB2Lab  ||
           code == COLOR_LBGR2Lab || code == COLOR_LRGB2Lab;
}
// True for the Lab/Luv codes that treat the RGB side as gamma-corrected
// sRGB (the LBGR/LRGB codes use linear RGB and are not listed).
inline bool is_sRGB(int code)
{
    return code == COLOR_BGR2Lab || code == COLOR_RGB2Lab ||
           code == COLOR_BGR2Luv || code == COLOR_RGB2Luv ||
           code == COLOR_Lab2BGR || code == COLOR_Lab2RGB ||
           code == COLOR_Luv2BGR || code == COLOR_Luv2RGB;
}
// Returns false for the conversion codes whose 3(4)-channel side is stored
// blue-first (the BGR-ordered codes enumerated below) and true for all
// other codes; callers use it to decide whether R and B must be swapped.
inline bool swapBlue(int code)
{
switch (code)
{
case COLOR_BGR2BGRA: case COLOR_BGRA2BGR:
case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555:
case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA:
case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
case COLOR_BGR2YCrCb: case COLOR_BGR2YUV:
case COLOR_YCrCb2BGR: case COLOR_YUV2BGR:
case COLOR_BGR2XYZ: case COLOR_XYZ2BGR:
case COLOR_BGR2HSV: case COLOR_BGR2HLS: case COLOR_BGR2HSV_FULL: case COLOR_BGR2HLS_FULL:
case COLOR_YUV2BGR_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2BGRA_IYUV:
case COLOR_YUV2BGR_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2BGRA_NV12:
case COLOR_Lab2BGR: case COLOR_Luv2BGR: case COLOR_Lab2LBGR: case COLOR_Luv2LBGR:
case COLOR_BGR2Lab: case COLOR_BGR2Luv: case COLOR_LBGR2Lab: case COLOR_LBGR2Luv:
case COLOR_HSV2BGR: case COLOR_HLS2BGR: case COLOR_HSV2BGR_FULL: case COLOR_HLS2BGR_FULL:
case COLOR_YUV2BGR_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2BGR_YUY2:
case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2BGRA_YVYU:
case COLOR_BGR2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: case COLOR_BGR2YUV_YV12: case COLOR_BGRA2YUV_YV12:
return false;
default:
return true;
}
}
// True for the HSV/HLS codes that map hue to the full 0..255 range
// (the _FULL variants) rather than 0..180.
inline bool isFullRangeHSV(int code)
{
    return code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL ||
           code == COLOR_BGR2HLS_FULL || code == COLOR_RGB2HLS_FULL ||
           code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL ||
           code == COLOR_HLS2BGR_FULL || code == COLOR_HLS2RGB_FULL;
}
// Number of destination channels implied by the conversion code:
// 4 for the *2BGRA/*2RGBA family, 3 for the *2BGR/*2RGB family,
// 0 when the code does not fix the destination channel count.
inline int dstChannels(int code)
{
switch( code )
{
case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2RGBA:
case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
case COLOR_GRAY2BGRA:
case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12:
case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2:
return 4;
case COLOR_BGRA2BGR: case COLOR_RGBA2BGR: case COLOR_RGB2BGR:
case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
case COLOR_GRAY2BGR:
case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12:
case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV:
case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU:
case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2:
return 3;
default:
return 0;
}
}
// Bit width of the green field for the packed 16-bit RGB codes:
// 6 for the BGR565 family, 5 for BGR555, 0 for codes that do not
// involve a packed format.
inline int greenBits(int code)
{
switch( code )
{
case COLOR_BGR2BGR565: case COLOR_RGB2BGR565: case COLOR_BGRA2BGR565: case COLOR_RGBA2BGR565:
case COLOR_BGR5652BGR: case COLOR_BGR5652RGB: case COLOR_BGR5652BGRA: case COLOR_BGR5652RGBA:
case COLOR_BGR5652GRAY: case COLOR_GRAY2BGR565:
return 6;
case COLOR_BGR2BGR555: case COLOR_RGB2BGR555: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR555:
case COLOR_BGR5552BGR: case COLOR_BGR5552RGB: case COLOR_BGR5552BGRA: case COLOR_BGR5552RGBA:
case COLOR_BGR5552GRAY: case COLOR_GRAY2BGR555:
return 5;
default:
return 0;
}
}
// Index of the U (Cb) component within the chroma data for the YUV codes:
// 2 for the V-first YV12 targets, 1 for the layouts where U follows another
// chroma sample (YVYU, IYUV targets, NV21/YV12 sources), 0 where U comes
// first, and -1 for codes without a fixed U position.
inline int uIndex(int code)
{
switch( code )
{
case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12:
return 2;
case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21:
case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
return 1;
case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12:
case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2:
return 0;
default:
return -1;
}
}
} // namespace::
template<int i0, int i1 = -1, int i2 = -1>
struct Set
{
static bool contains(int i)
{
return (i == i0 || i == i1 || i == i2);
}
};
template<int i0, int i1>
struct Set<i0, i1, -1>
{
static bool contains(int i)
{
return (i == i0 || i == i1);
}
};
template<int i0>
struct Set<i0, -1, -1>
{
static bool contains(int i)
{
return (i == i0);
}
};
// Relation between source and destination sizes used by the helpers below:
// TO_YUV allocates a 4:2:0 planar buffer (width x height*3/2),
// FROM_YUV is the inverse, NONE keeps the source size.
enum SizePolicy
{
TO_YUV, FROM_YUV, NONE
};
// Common boilerplate for the CPU conversion paths: validates source
// channels, requested destination channels and depth against the Set<>
// lists given as template arguments, handles in-place calls, and creates
// the destination with the size implied by sizePolicy.
template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE >
struct CvtHelper
{
CvtHelper(InputArray _src, OutputArray _dst, int dcn)
{
int stype = _src.type();
scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype);
CV_Assert( VScn::contains(scn) && VDcn::contains(dcn) && VDepth::contains(depth) );
// Work on a copy of the source when the caller passed the same object
// for src and dst, so that _dst.create() below cannot clobber the
// input data (#6653).
if (_src.getObj() == _dst.getObj()) // inplace processing (#6653)
_src.copyTo(src);
else
src = _src.getMat();
Size sz = src.size();
switch (sizePolicy)
{
case TO_YUV:
// 4:2:0 subsampling requires even source dimensions
CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0);
dstSz = Size(sz.width, sz.height / 2 * 3);
break;
case FROM_YUV:
CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0);
dstSz = Size(sz.width, sz.height * 2 / 3);
break;
case NONE:
default:
dstSz = sz;
break;
}
_dst.create(dstSz, CV_MAKETYPE(depth, dcn));
dst = _dst.getMat();
}
Mat src, dst;
int depth, scn;
Size dstSz;
};
#ifdef HAVE_OPENCL
// OpenCL counterpart of CvtHelper: validates channel counts and depth,
// allocates dst according to sizePolicy, and wraps OpenCL kernel creation,
// argument binding and launch.
template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE >
struct OclHelper
{
OclHelper( InputArray _src, OutputArray _dst, int dcn)
{
src = _src.getUMat();
Size sz = src.size(), dstSz;
int scn = src.channels();
int depth = src.depth();
CV_Assert( VScn::contains(scn) && VDcn::contains(dcn) && VDepth::contains(depth) );
switch (sizePolicy)
{
case TO_YUV:
// 4:2:0 output: height grows to 3/2 of the (even) source height
CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
dstSz = Size(sz.width, sz.height / 2 * 3);
break;
case FROM_YUV:
dstSz = Size(sz.width, sz.height * 2 / 3);
break;
case NONE:
default:
dstSz = sz;
break;
}
_dst.create(dstSz, CV_MAKETYPE(depth, dcn));
dst = _dst.getUMat();
}
// Builds the kernel and binds src/dst as its first arguments; returns
// false when the kernel could not be created. The launch grid depends
// on sizePolicy.
bool createKernel(cv::String name, ocl::ProgramSource& source, cv::String options)
{
ocl::Device dev = ocl::Device::getDefault();
// process several rows per work item on Intel GPUs
int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
int pxPerWIx = 1;
cv::String baseOptions = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
src.depth(), src.channels(), pxPerWIy);
switch (sizePolicy)
{
case TO_YUV:
// widen each work item to 2 pixel pairs when all rows are 4-byte aligned
if (dev.isIntel() &&
src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 &&
dst.step % 4 == 0 && dst.offset % 4 == 0)
{
pxPerWIx = 2;
}
globalSize[0] = (size_t)dst.cols/(2*pxPerWIx);
globalSize[1] = ((size_t)dst.rows/3 + pxPerWIy - 1) / pxPerWIy;
baseOptions += format("-D PIX_PER_WI_X=%d ", pxPerWIx);
break;
case FROM_YUV:
globalSize[0] = (size_t)dst.cols/2;
globalSize[1] = ((size_t)dst.rows/2 + pxPerWIy - 1) / pxPerWIy;
break;
case NONE:
default:
globalSize[0] = (size_t)src.cols;
globalSize[1] = ((size_t)src.rows + pxPerWIy - 1) / pxPerWIy;
break;
}
k.create(name.c_str(), source, baseOptions + options);
if(k.empty())
return false;
// bind src and dst, remembering where extra arguments continue
nArgs = k.set(0, ocl::KernelArg::ReadOnlyNoSize(src));
nArgs = k.set(nArgs, ocl::KernelArg::WriteOnly(dst));
return true;
}
// Launches the kernel over the 2D grid computed in createKernel()
// ('false' = do not block waiting for completion).
bool run()
{
return k.run(2, globalSize, NULL, false);
}
// Appends one more kernel argument after src/dst.
template<typename T>
void setArg(const T& arg)
{
nArgs = k.set(nArgs, arg);
}
UMat src, dst;
ocl::Kernel k;
size_t globalSize[2];
int nArgs;
};
#endif
///////////////////////////// Top-level template function ////////////////////////////////
// parallel_for_ body: converts the rows [range.start, range.end) by
// calling the pixel converter `cvt` once per row.
template <typename Cvt>
class CvtColorLoop_Invoker : public ParallelLoopBody
{
typedef typename Cvt::channel_type _Tp;
public:
CvtColorLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt) :
ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_),
width(width_), cvt(_cvt)
{
}
virtual void operator()(const Range& range) const
{
CV_TRACE_FUNCTION();
// advance to the first row of this stripe
const uchar* yS = src_data + static_cast<size_t>(range.start) * src_step;
uchar* yD = dst_data + static_cast<size_t>(range.start) * dst_step;
for( int i = range.start; i < range.end; ++i, yS += src_step, yD += dst_step )
cvt(reinterpret_cast<const _Tp*>(yS), reinterpret_cast<_Tp*>(yD), width);
}
private:
const uchar * src_data;
const size_t src_step;
uchar * dst_data;
const size_t dst_step;
const int width;
const Cvt& cvt;
// not assignable (const members); intentionally left unimplemented
const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&);
};
template <typename Cvt>
void CvtColorLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
{
parallel_for_(Range(0, height),
CvtColorLoop_Invoker<Cvt>(src_data, src_step, dst_data, dst_step, width, cvt),
(width * height) / static_cast<double>(1<<16));
}
#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700)
# define NEED_IPP 1
#else
# define NEED_IPP 0
#endif
#if NEED_IPP
// channel maxima for the IPP pixel types
#define MAX_IPP8u 255
#define MAX_IPP16u 65535
#define MAX_IPP32f 1.0
// Signatures of the IPP entry points used by the functors below:
// reorder = channel shuffling (takes a destination channel order),
// general = plain color-space conversion,
// color2gray = conversion with explicit luma coefficients.
typedef IppStatus (CV_STDCALL* ippiReorderFunc)(const void *, int, void *, int, IppiSize, const int *);
typedef IppStatus (CV_STDCALL* ippiGeneralFunc)(const void *, int, void *, int, IppiSize);
typedef IppStatus (CV_STDCALL* ippiColor2GrayFunc)(const void *, int, void *, int, IppiSize, const Ipp32f *);
// parallel_for_ body around an IPP converter functor: processes the row
// stripe [range.start, range.end) in a single IPP call and clears *ok
// on failure.
template <typename Cvt>
class CvtColorIPPLoop_Invoker :
public ParallelLoopBody
{
public:
CvtColorIPPLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt, bool *_ok) :
ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), width(width_), cvt(_cvt), ok(_ok)
{
// optimistic: any failing stripe resets it below
*ok = true;
}
virtual void operator()(const Range& range) const
{
const void *yS = src_data + src_step * range.start;
void *yD = dst_data + dst_step * range.start;
if( !cvt(yS, static_cast<int>(src_step), yD, static_cast<int>(dst_step), width, range.end - range.start) )
*ok = false;
else
{
CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
}
}
private:
const uchar * src_data;
const size_t src_step;
uchar * dst_data;
const size_t dst_step;
const int width;
const Cvt& cvt;
// shared success flag; only ever lowered to false after construction
bool *ok;
const CvtColorIPPLoop_Invoker& operator= (const CvtColorIPPLoop_Invoker&);
};
// Parallel driver for the IPP-based converters; returns false if any
// stripe reported an IPP failure. `ok` is initialized by the invoker's
// constructor before any stripe runs.
template <typename Cvt>
bool CvtColorIPPLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
{
    bool ok;
    const double nstripes = (width * height) / (double)(1<<16);
    parallel_for_(Range(0, height),
                  CvtColorIPPLoop_Invoker<Cvt>(src_data, src_step, dst_data, dst_step, width, cvt, &ok),
                  nstripes);
    return ok;
}
// Same as CvtColorIPPLoop, but when the source and destination buffers
// alias, the source is copied to a temporary first so the conversion
// never runs in place.
template <typename Cvt>
bool CvtColorIPPLoopCopy(const uchar * src_data, size_t src_step, int src_type, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt)
{
Mat temp;
// wrap the caller's buffer without copying
Mat src(Size(width, height), src_type, const_cast<uchar*>(src_data), src_step);
Mat source = src;
if( src_data == dst_data )
{
src.copyTo(temp);
source = temp;
}
bool ok;
parallel_for_(Range(0, source.rows),
CvtColorIPPLoop_Invoker<Cvt>(source.data, source.step, dst_data, dst_step,
source.cols, cvt, &ok),
source.total()/(double)(1<<16) );
return ok;
}
// Wraps a plain IPP color conversion function; reports failure when no
// function pointer was supplied or the IPP call returns an error status.
struct IPPGeneralFunctor
{
IPPGeneralFunctor(ippiGeneralFunc _func) : ippiColorConvertGeneral(_func){}
bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
{
return ippiColorConvertGeneral ? CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0 : false;
}
private:
ippiGeneralFunc ippiColorConvertGeneral;
};
// Wraps an IPP channel-reordering function; the destination channel order
// is given by _order0.._order2, while the fourth channel index is fixed
// to 3 (alpha stays in place).
struct IPPReorderFunctor
{
IPPReorderFunctor(ippiReorderFunc _func, int _order0, int _order1, int _order2) : ippiColorConvertReorder(_func)
{
order[0] = _order0;
order[1] = _order1;
order[2] = _order2;
order[3] = 3;
}
bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
{
return ippiColorConvertReorder ? CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, dst, dstStep, ippiSize(cols, rows), order) >= 0 : false;
}
private:
ippiReorderFunc ippiColorConvertReorder;
int order[4];
};
// Two-stage IPP conversion: first reorder the channels into a temporary
// 3-channel Mat, then run the general color conversion on it.
struct IPPReorderGeneralFunctor
{
IPPReorderGeneralFunctor(ippiReorderFunc _func1, ippiGeneralFunc _func2, int _order0, int _order1, int _order2, int _depth) :
ippiColorConvertReorder(_func1), ippiColorConvertGeneral(_func2), depth(_depth)
{
order[0] = _order0;
order[1] = _order1;
order[2] = _order2;
order[3] = 3;
}
bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
{
if (ippiColorConvertReorder == 0 || ippiColorConvertGeneral == 0)
return false;
// intermediate buffer holding the reordered 3-channel image
Mat temp;
temp.create(rows, cols, CV_MAKETYPE(depth, 3));
if(CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows), order) < 0)
return false;
return CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows)) >= 0;
}
private:
ippiReorderFunc ippiColorConvertReorder;
ippiGeneralFunc ippiColorConvertGeneral;
int order[4];
int depth;
};
// Mirror of IPPReorderGeneralFunctor with the stages swapped: run the
// general color conversion into a temporary 3-channel Mat first, then
// reorder the channels into the destination.
struct IPPGeneralReorderFunctor
{
IPPGeneralReorderFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _order0, int _order1, int _order2, int _depth) :
ippiColorConvertGeneral(_func1), ippiColorConvertReorder(_func2), depth(_depth)
{
order[0] = _order0;
order[1] = _order1;
order[2] = _order2;
order[3] = 3;
}
bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
{
if (ippiColorConvertGeneral == 0 || ippiColorConvertReorder == 0)
return false;
Mat temp;
temp.create(rows, cols, CV_MAKETYPE(depth, 3));
if(CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
return false;
return CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
}
private:
ippiGeneralFunc ippiColorConvertGeneral;
ippiReorderFunc ippiColorConvertReorder;
int order[4];
int depth;
};
// Per-depth channel-swap function tables, defined in another translation
// unit (color_rgb.cpp, per the file split); indexed by CV_MAT_DEPTH.
extern ippiReorderFunc ippiSwapChannelsC3C4RTab[8];
extern ippiReorderFunc ippiSwapChannelsC4C3RTab[8];
extern ippiReorderFunc ippiSwapChannelsC3RTab[8];
#endif
#ifdef HAVE_OPENCL
// OpenCL implementations, defined in the per-family color_*.cpp files.
// Lab/Luv (color_lab.cpp)
bool oclCvtColorBGR2Luv( InputArray _src, OutputArray _dst, int bidx, bool srgb );
bool oclCvtColorBGR2Lab( InputArray _src, OutputArray _dst, int bidx, bool srgb );
bool oclCvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb);
bool oclCvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb);
// XYZ
bool oclCvtColorBGR2XYZ( InputArray _src, OutputArray _dst, int bidx );
bool oclCvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx );
// HSV/HLS
bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full );
bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full );
bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full );
bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full );
// RGB reordering, packed 16-bit formats, gray and alpha premultiplication
bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse );
bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits );
bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits );
bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits );
bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits );
bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx );
bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn );
bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst );
bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst );
// YUV / YCrCb
// NOTE(review): lower-case 'color' in the next name is inconsistent with
// the rest of the API; the definition elsewhere uses the same spelling.
bool oclCvtColorBGR2YCrCb( InputArray _src, OutputArray _dst, int bidx);
bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx);
bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx );
bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx );
// subsampled YUV layouts (packed 4:2:2 and planar/semi-planar 4:2:0)
bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx );
bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx );
bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx );
bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx );
bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst );
#endif
// CPU implementations, defined in the per-family color_*.cpp files.
// Lab/Luv
void cvtColorBGR2Lab( InputArray _src, OutputArray _dst, bool swapb, bool srgb);
void cvtColorBGR2Luv( InputArray _src, OutputArray _dst, bool swapb, bool srgb);
void cvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb );
void cvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb );
// XYZ
void cvtColorBGR2XYZ( InputArray _src, OutputArray _dst, bool swapb );
void cvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb );
// YUV / YCrCb (crcb selects the YCrCb channel order)
void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, bool crcb);
void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb);
// subsampled YUV layouts (packed 4:2:2 and planar/semi-planar 4:2:0)
void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn);
void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx );
void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx);
void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst );
void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi );
// HSV/HLS
void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange );
void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange );
void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange);
void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange);
// RGB reordering, packed 16-bit formats, gray and alpha premultiplication
void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb);
void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits);
void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits);
void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb);
void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn);
void cvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits);
void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits);
void cvtColorRGBA2mRGBA(InputArray _src, OutputArray _dst);
void cvtColormRGBA2RGBA(InputArray _src, OutputArray _dst);
} //namespace cv
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
#include "color.hpp"
namespace cv
{
////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
// Converts 8-bit BGR/RGB (srccn = 3 or 4) to 8-bit HSV using fixed-point
// arithmetic and precomputed reciprocal tables. Hue is scaled to
// [0, hrange) with hrange either 180 or 256; S and V span [0, 255].
struct RGB2HSV_b
{
typedef uchar channel_type;
RGB2HSV_b(int _srccn, int _blueIdx, int _hrange)
: srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
{
CV_Assert( hrange == 180 || hrange == 256 );
}
void operator()(const uchar* src, uchar* dst, int n) const
{
int i, bidx = blueIdx, scn = srccn;
const int hsv_shift = 12;
// reciprocal tables shared by all instances: sdiv for saturation,
// one hdiv table per hue range
static int sdiv_table[256];
static int hdiv_table180[256];
static int hdiv_table256[256];
static volatile bool initialized = false;
int hr = hrange;
const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256;
n *= 3;
// NOTE(review): lazy init is guarded only by a volatile flag; concurrent
// first calls may race, though every writer stores identical values --
// confirm this is acceptable on the targeted platforms.
if( !initialized )
{
sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
for( i = 1; i < 256; i++ )
{
sdiv_table[i] = saturate_cast<int>((255 << hsv_shift)/(1.*i));
hdiv_table180[i] = saturate_cast<int>((180 << hsv_shift)/(6.*i));
hdiv_table256[i] = saturate_cast<int>((256 << hsv_shift)/(6.*i));
}
initialized = true;
}
for( i = 0; i < n; i += 3, src += scn )
{
int b = src[bidx], g = src[1], r = src[bidx^2];
int h, s, v = b;
int vmin = b;
int vr, vg;
CV_CALC_MAX_8U( v, g );
CV_CALC_MAX_8U( v, r );
CV_CALC_MIN_8U( vmin, g );
CV_CALC_MIN_8U( vmin, r );
uchar diff = saturate_cast<uchar>(v - vmin);
// vr/vg are all-ones masks when the max channel is r (resp. g);
// they select the hue formula below without branches
vr = v == r ? -1 : 0;
vg = v == g ? -1 : 0;
s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
h = (vr & (g - b)) +
(~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
// wrap negative hues into [0, hr)
h += h < 0 ? hr : 0;
dst[i] = saturate_cast<uchar>(h);
dst[i+1] = (uchar)s;
dst[i+2] = (uchar)v;
}
}
int srccn, blueIdx, hrange;
};
// Converts float BGR/RGB (srccn = 3 or 4, values in [0, 1]) to float HSV.
// Hue is scaled from degrees to [0, hrange); S and V stay in [0, 1].
// blueIdx selects the position of the blue channel (0 for BGR, 2 for RGB).
struct RGB2HSV_f
{
    typedef float channel_type;

    RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
        : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}

    void operator()(const float* src, float* dst, int n) const
    {
        const int scn = srccn, bidx = blueIdx;
        const float hscale = hrange*(1.f/360.f);

        // one HSV triple is written per source pixel
        for( int j = 0; j < n; j++, src += scn, dst += 3 )
        {
            const float b = src[bidx];
            const float g = src[1];
            const float r = src[bidx^2];

            // max (value) and min of the three channels
            float vmax = r, vmin = r;
            if( vmax < g ) vmax = g;
            if( vmax < b ) vmax = b;
            if( vmin > g ) vmin = g;
            if( vmin > b ) vmin = b;

            float diff = vmax - vmin;
            // epsilon guards against division by zero for gray pixels
            const float s = diff/(float)(fabs(vmax) + FLT_EPSILON);
            diff = (float)(60./(diff + FLT_EPSILON));

            // hue sector is chosen by which channel holds the maximum
            float h;
            if( vmax == r )
                h = (g - b)*diff;
            else if( vmax == g )
                h = (b - r)*diff + 120.f;
            else
                h = (r - g)*diff + 240.f;
            if( h < 0 )
                h += 360.f;

            dst[0] = h*hscale;
            dst[1] = s;
            dst[2] = vmax;
        }
    }

    int srccn, blueIdx;
    float hrange;
};
// Converts float HSV back to BGR/RGB (dstcn = 3 or 4). hscale = 6/hrange
// maps the hue to sector units. The scalar tail uses the classic
// sector/table formulation; the SSE2 path emulates the same selection
// branchlessly on 8 pixels per iteration.
struct HSV2RGB_f
{
typedef float channel_type;
HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange)
: dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {
#if CV_SSE2
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
}
#if CV_SSE2
// Branchless SSE2 HSV->RGB for two groups of 4 pixels: the three output
// channels are computed in place of (v_h, v_s, v_v).
void process(__m128& v_h0, __m128& v_h1, __m128& v_s0,
__m128& v_s1, __m128& v_v0, __m128& v_v1) const
{
v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale));
v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale));
// integer part = sector index, fractional part remains in v_h
__m128 v_pre_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h0));
__m128 v_pre_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h1));
v_h0 = _mm_sub_ps(v_h0, v_pre_sector0);
v_h1 = _mm_sub_ps(v_h1, v_pre_sector1);
// tab0..tab3 mirror the scalar tab[]: v, v*(1-s), v*(1-s*h), v*(1-s*(1-h))
__m128 v_tab00 = v_v0;
__m128 v_tab01 = v_v1;
__m128 v_tab10 = _mm_mul_ps(v_v0, _mm_sub_ps(_mm_set1_ps(1.0f), v_s0));
__m128 v_tab11 = _mm_mul_ps(v_v1, _mm_sub_ps(_mm_set1_ps(1.0f), v_s1));
__m128 v_tab20 = _mm_mul_ps(v_v0, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s0, v_h0)));
__m128 v_tab21 = _mm_mul_ps(v_v1, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s1, v_h1)));
__m128 v_tab30 = _mm_mul_ps(v_v0, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s0, _mm_sub_ps(_mm_set1_ps(1.0f), v_h0))));
__m128 v_tab31 = _mm_mul_ps(v_v1, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(v_s1, _mm_sub_ps(_mm_set1_ps(1.0f), v_h1))));
// reduce the sector index modulo 6
__m128 v_sector0 = _mm_div_ps(v_pre_sector0, _mm_set1_ps(6.0f));
__m128 v_sector1 = _mm_div_ps(v_pre_sector1, _mm_set1_ps(6.0f));
v_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector0));
v_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector1));
v_sector0 = _mm_mul_ps(v_sector0, _mm_set1_ps(6.0f));
v_sector1 = _mm_mul_ps(v_sector1, _mm_set1_ps(6.0f));
v_sector0 = _mm_sub_ps(v_pre_sector0, v_sector0);
v_sector1 = _mm_sub_ps(v_pre_sector1, v_sector1);
// per-sector table selection via compare masks (cf. scalar sector_data)
v_h0 = _mm_and_ps(v_tab10, _mm_cmplt_ps(v_sector0, _mm_set1_ps(2.0f)));
v_h1 = _mm_and_ps(v_tab11, _mm_cmplt_ps(v_sector1, _mm_set1_ps(2.0f)));
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
v_s0 = _mm_and_ps(v_tab30, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
v_s1 = _mm_and_ps(v_tab31, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab00, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab01, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab10, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(3.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab11, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(3.0f))));
v_v0 = _mm_and_ps(v_tab00, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
v_v1 = _mm_and_ps(v_tab01, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
v_v0 = _mm_or_ps(v_v0, _mm_and_ps(v_tab00, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
v_v1 = _mm_or_ps(v_v1, _mm_and_ps(v_tab01, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
}
#endif
void operator()(const float* src, float* dst, int n) const
{
int i = 0, bidx = blueIdx, dcn = dstcn;
float _hscale = hscale;
float alpha = ColorChannel<float>::max();
n *= 3;
#if CV_SSE2
// vector path: deinterleave 8 HSV pixels, convert, then reinterleave
// in the channel order selected by bidx (alpha appended for dcn == 4)
if (haveSIMD)
{
for( ; i <= n - 24; i += 24, dst += dcn * 8 )
{
__m128 v_h0 = _mm_loadu_ps(src + i + 0);
__m128 v_h1 = _mm_loadu_ps(src + i + 4);
__m128 v_s0 = _mm_loadu_ps(src + i + 8);
__m128 v_s1 = _mm_loadu_ps(src + i + 12);
__m128 v_v0 = _mm_loadu_ps(src + i + 16);
__m128 v_v1 = _mm_loadu_ps(src + i + 20);
_mm_deinterleave_ps(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1);
process(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1);
if (dcn == 3)
{
if (bidx)
{
_mm_interleave_ps(v_v0, v_v1, v_s0, v_s1, v_h0, v_h1);
_mm_storeu_ps(dst + 0, v_v0);
_mm_storeu_ps(dst + 4, v_v1);
_mm_storeu_ps(dst + 8, v_s0);
_mm_storeu_ps(dst + 12, v_s1);
_mm_storeu_ps(dst + 16, v_h0);
_mm_storeu_ps(dst + 20, v_h1);
}
else
{
_mm_interleave_ps(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1);
_mm_storeu_ps(dst + 0, v_h0);
_mm_storeu_ps(dst + 4, v_h1);
_mm_storeu_ps(dst + 8, v_s0);
_mm_storeu_ps(dst + 12, v_s1);
_mm_storeu_ps(dst + 16, v_v0);
_mm_storeu_ps(dst + 20, v_v1);
}
}
else
{
__m128 v_a0 = _mm_set1_ps(alpha);
__m128 v_a1 = _mm_set1_ps(alpha);
if (bidx)
{
_mm_interleave_ps(v_v0, v_v1, v_s0, v_s1, v_h0, v_h1, v_a0, v_a1);
_mm_storeu_ps(dst + 0, v_v0);
_mm_storeu_ps(dst + 4, v_v1);
_mm_storeu_ps(dst + 8, v_s0);
_mm_storeu_ps(dst + 12, v_s1);
_mm_storeu_ps(dst + 16, v_h0);
_mm_storeu_ps(dst + 20, v_h1);
_mm_storeu_ps(dst + 24, v_a0);
_mm_storeu_ps(dst + 28, v_a1);
}
else
{
_mm_interleave_ps(v_h0, v_h1, v_s0, v_s1, v_v0, v_v1, v_a0, v_a1);
_mm_storeu_ps(dst + 0, v_h0);
_mm_storeu_ps(dst + 4, v_h1);
_mm_storeu_ps(dst + 8, v_s0);
_mm_storeu_ps(dst + 12, v_s1);
_mm_storeu_ps(dst + 16, v_v0);
_mm_storeu_ps(dst + 20, v_v1);
_mm_storeu_ps(dst + 24, v_a0);
_mm_storeu_ps(dst + 28, v_a1);
}
}
}
}
#endif
// scalar tail: classic sector/table HSV->RGB
for( ; i < n; i += 3, dst += dcn )
{
float h = src[i], s = src[i+1], v = src[i+2];
float b, g, r;
if( s == 0 )
b = g = r = v;
else
{
// maps sector -> indices of (b, g, r) within tab[]
static const int sector_data[][3]=
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
float tab[4];
int sector;
h *= _hscale;
// wrap hue into [0, 6)
if( h < 0 )
do h += 6; while( h < 0 );
else if( h >= 6 )
do h -= 6; while( h >= 6 );
sector = cvFloor(h);
h -= sector;
// guard against rounding pushing the sector out of range
if( (unsigned)sector >= 6u )
{
sector = 0;
h = 0.f;
}
tab[0] = v;
tab[1] = v*(1.f - s);
tab[2] = v*(1.f - s*h);
tab[3] = v*(1.f - s*(1.f - h));
b = tab[sector_data[sector][0]];
g = tab[sector_data[sector][1]];
r = tab[sector_data[sector][2]];
}
dst[bidx] = b;
dst[1] = g;
dst[bidx^2] = r;
if( dcn == 4 )
dst[3] = alpha;
}
}
int dstcn, blueIdx;
float hscale;
#if CV_SSE2
bool haveSIMD;
#endif
};
// 8-bit HSV -> BGR/RGB: widens pixels to float, scales S and V to [0,1]
// (H is passed through unscaled; the float converter is built with the full
// hue range), runs HSV2RGB_f block-wise, then scales results back by 255.
struct HSV2RGB_b
{
typedef uchar channel_type;
HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange)
: dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
{
#if CV_NEON
v_scale_inv = vdupq_n_f32(1.f/255.f);
v_scale = vdupq_n_f32(255.f);
v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
#elif CV_SSE2
v_scale = _mm_set1_ps(255.0f);
v_alpha = _mm_set1_ps(ColorChannel<uchar>::max());
v_zero = _mm_setzero_si128();
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
}
#if CV_SSE2
// Widen 8 interleaved HSV pixels (given as three u16x8 halves) to float and
// multiply by a rotating coefficient vector whose lane pattern is
// (1, 1/255, 1/255, 1, ...): H lanes stay unscaled, S/V lanes get 1/255.
// The 0x49 shuffle rotates the coefficients so the per-channel phase stays
// aligned with the interleaved data across all six output registers.
void process(__m128i v_r, __m128i v_g, __m128i v_b,
const __m128& v_coeffs_,
float * buf) const
{
__m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
__m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
__m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
__m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
__m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
__m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
__m128 v_coeffs = v_coeffs_;
v_r0 = _mm_mul_ps(v_r0, v_coeffs);
v_g1 = _mm_mul_ps(v_g1, v_coeffs);
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49));
v_r1 = _mm_mul_ps(v_r1, v_coeffs);
v_b0 = _mm_mul_ps(v_b0, v_coeffs);
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49))
;
v_g0 = _mm_mul_ps(v_g0, v_coeffs);
v_b1 = _mm_mul_ps(v_b1, v_coeffs);
_mm_store_ps(buf, v_r0);
_mm_store_ps(buf + 4, v_r1);
_mm_store_ps(buf + 8, v_g0);
_mm_store_ps(buf + 12, v_g1);
_mm_store_ps(buf + 16, v_b0);
_mm_store_ps(buf + 20, v_b1);
}
#endif
// Process n pixels in BLOCK_SIZE chunks through an aligned float buffer.
void operator()(const uchar* src, uchar* dst, int n) const
{
int i, j, dcn = dstcn;
uchar alpha = ColorChannel<uchar>::max();
float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
#if CV_SSE2
// Lane pattern (reversed by _mm_set_ps): 1, 1/255, 1/255, 1 -> H unscaled, S/V scaled.
__m128 v_coeffs = _mm_set_ps(1.f, 1.f/255.f, 1.f/255.f, 1.f);
#endif
for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
{
int dn = std::min(n - i, (int)BLOCK_SIZE);
j = 0;
#if CV_NEON
// Widen 8 pixels at a time: H as-is, S and V scaled by 1/255.
for ( ; j <= (dn - 8) * 3; j += 24)
{
uint8x8x3_t v_src = vld3_u8(src + j);
uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
v_t1 = vmovl_u8(v_src.val[1]),
v_t2 = vmovl_u8(v_src.val[2]);
float32x4x3_t v_dst;
v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
vst3q_f32(buf + j, v_dst);
v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
vst3q_f32(buf + j + 12, v_dst);
}
#elif CV_SSE2
if (haveSIMD)
{
for ( ; j <= (dn - 8) * 3; j += 24)
{
__m128i v_src0 = _mm_loadu_si128((__m128i const *)(src + j));
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src + j + 16));
process(_mm_unpacklo_epi8(v_src0, v_zero),
_mm_unpackhi_epi8(v_src0, v_zero),
_mm_unpacklo_epi8(v_src1, v_zero),
v_coeffs,
buf + j);
}
}
#endif
// Scalar tail of the widening step.
for( ; j < dn*3; j += 3 )
{
buf[j] = src[j];
buf[j+1] = src[j+1]*(1.f/255.f);
buf[j+2] = src[j+2]*(1.f/255.f);
}
// Float conversion in place.
cvt(buf, buf, dn);
j = 0;
#if CV_NEON
// Narrow back to u8 with saturation, scaling all channels by 255.
for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
{
float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
if (dcn == 4)
{
uint8x8x4_t v_dst;
v_dst.val[0] = v_dst0;
v_dst.val[1] = v_dst1;
v_dst.val[2] = v_dst2;
v_dst.val[3] = v_alpha;
vst4_u8(dst, v_dst);
}
else
{
uint8x8x3_t v_dst;
v_dst.val[0] = v_dst0;
v_dst.val[1] = v_dst1;
v_dst.val[2] = v_dst2;
vst3_u8(dst, v_dst);
}
}
#elif CV_SSE2
if (dcn == 3 && haveSIMD)
{
// Pack 16 floats -> 16 bytes per iteration (channel-agnostic copy).
for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
{
__m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
__m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
__m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
__m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
__m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
_mm_cvtps_epi32(v_src1));
__m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
_mm_cvtps_epi32(v_src3));
_mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
}
// Step back to a multiple of 3 so the scalar tail starts on a pixel boundary.
int jr = j % 3;
if (jr)
dst -= jr, j -= jr;
}
else if (dcn == 4 && haveSIMD)
{
// 4 pixels (12 floats) in -> 16 bytes out, splicing in the alpha lane.
for ( ; j <= (dn * 3 - 12); j += 12, dst += 16)
{
__m128 v_buf0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
__m128 v_buf1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
__m128 v_buf2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
__m128 v_ba0 = _mm_unpackhi_ps(v_buf0, v_alpha);
__m128 v_ba1 = _mm_unpacklo_ps(v_buf2, v_alpha);
__m128i v_src0 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf0, v_ba0, 0x44));
__m128i v_src1 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba0, v_buf1, 0x4e)), 0x78);
__m128i v_src2 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf1, v_ba1, 0x4e));
__m128i v_src3 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba1, v_buf2, 0xee)), 0x78);
__m128i v_dst0 = _mm_packs_epi32(v_src0, v_src1);
__m128i v_dst1 = _mm_packs_epi32(v_src2, v_src3);
_mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
}
int jr = j % 3;
if (jr)
dst -= jr, j -= jr;
}
#endif
// Scalar tail of the narrowing step.
for( ; j < dn*3; j += 3, dst += dcn )
{
dst[0] = saturate_cast<uchar>(buf[j]*255.f);
dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
if( dcn == 4 )
dst[3] = alpha;
}
}
}
int dstcn;       // output channel count (3 or 4)
HSV2RGB_f cvt;   // float converter doing the actual HSV->RGB math
#if CV_NEON
float32x4_t v_scale, v_scale_inv;
uint8x8_t v_alpha;
#elif CV_SSE2
__m128 v_scale;
__m128 v_alpha;
__m128i v_zero;
bool haveSIMD;
#endif
};
///////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
// Float RGB/BGR -> HLS. Hue is computed in degrees then scaled by
// hscale = hrange/360 into the requested output range; L and S are in [0,1].
struct RGB2HLS_f
{
typedef float channel_type;
RGB2HLS_f(int _srccn, int _blueIdx, float _hrange)
: srccn(_srccn), blueIdx(_blueIdx), hscale(_hrange/360.f) {
#if CV_SSE2
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
}
#if CV_SSE2
// Branch-free HLS for 8 pixels in planar registers. Results are written back
// into the inputs: v_b* <- H, v_g* <- L, v_r* <- S. Comparison masks select
// the correct saturation denominator and hue formula per lane; lanes with
// diff <= FLT_EPSILON (gray) get H = S = 0 via the v_gteps masks.
void process(__m128& v_b0, __m128& v_b1, __m128& v_g0,
__m128& v_g1, __m128& v_r0, __m128& v_r1) const
{
__m128 v_max0 = _mm_max_ps(_mm_max_ps(v_b0, v_g0), v_r0);
__m128 v_max1 = _mm_max_ps(_mm_max_ps(v_b1, v_g1), v_r1);
__m128 v_min0 = _mm_min_ps(_mm_min_ps(v_b0, v_g0), v_r0);
__m128 v_min1 = _mm_min_ps(_mm_min_ps(v_b1, v_g1), v_r1);
__m128 v_diff0 = _mm_sub_ps(v_max0, v_min0);
__m128 v_diff1 = _mm_sub_ps(v_max1, v_min1);
__m128 v_sum0 = _mm_add_ps(v_max0, v_min0);
__m128 v_sum1 = _mm_add_ps(v_max1, v_min1);
__m128 v_l0 = _mm_mul_ps(v_sum0, _mm_set1_ps(0.5f));
__m128 v_l1 = _mm_mul_ps(v_sum1, _mm_set1_ps(0.5f));
// S denominator: sum when L < 0.5, (2 - sum) otherwise.
__m128 v_gel0 = _mm_cmpge_ps(v_l0, _mm_set1_ps(0.5f));
__m128 v_gel1 = _mm_cmpge_ps(v_l1, _mm_set1_ps(0.5f));
__m128 v_s0 = _mm_and_ps(v_gel0, _mm_sub_ps(_mm_set1_ps(2.0f), v_sum0));
__m128 v_s1 = _mm_and_ps(v_gel1, _mm_sub_ps(_mm_set1_ps(2.0f), v_sum1));
v_s0 = _mm_or_ps(v_s0, _mm_andnot_ps(v_gel0, v_sum0));
v_s1 = _mm_or_ps(v_s1, _mm_andnot_ps(v_gel1, v_sum1));
v_s0 = _mm_div_ps(v_diff0, v_s0);
v_s1 = _mm_div_ps(v_diff1, v_s1);
__m128 v_gteps0 = _mm_cmpgt_ps(v_diff0, _mm_set1_ps(FLT_EPSILON));
__m128 v_gteps1 = _mm_cmpgt_ps(v_diff1, _mm_set1_ps(FLT_EPSILON));
v_diff0 = _mm_div_ps(_mm_set1_ps(60.f), v_diff0);
v_diff1 = _mm_div_ps(_mm_set1_ps(60.f), v_diff1);
// Hue: pick the formula for whichever channel equals the max (R, else G, else B).
__m128 v_eqr0 = _mm_cmpeq_ps(v_max0, v_r0);
__m128 v_eqr1 = _mm_cmpeq_ps(v_max1, v_r1);
__m128 v_h0 = _mm_and_ps(v_eqr0, _mm_mul_ps(_mm_sub_ps(v_g0, v_b0), v_diff0));
__m128 v_h1 = _mm_and_ps(v_eqr1, _mm_mul_ps(_mm_sub_ps(v_g1, v_b1), v_diff1));
__m128 v_eqg0 = _mm_cmpeq_ps(v_max0, v_g0);
__m128 v_eqg1 = _mm_cmpeq_ps(v_max1, v_g1);
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(_mm_andnot_ps(v_eqr0, v_eqg0), _mm_add_ps(_mm_mul_ps(_mm_sub_ps(v_b0, v_r0), v_diff0), _mm_set1_ps(120.f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(_mm_andnot_ps(v_eqr1, v_eqg1), _mm_add_ps(_mm_mul_ps(_mm_sub_ps(v_b1, v_r1), v_diff1), _mm_set1_ps(120.f))));
v_h0 = _mm_or_ps(v_h0, _mm_andnot_ps(_mm_or_ps(v_eqr0, v_eqg0), _mm_add_ps(_mm_mul_ps(_mm_sub_ps(v_r0, v_g0), v_diff0), _mm_set1_ps(240.f))));
v_h1 = _mm_or_ps(v_h1, _mm_andnot_ps(_mm_or_ps(v_eqr1, v_eqg1), _mm_add_ps(_mm_mul_ps(_mm_sub_ps(v_r1, v_g1), v_diff1), _mm_set1_ps(240.f))));
// Wrap negative hues into [0, 360) and scale to the output range.
v_h0 = _mm_add_ps(v_h0, _mm_and_ps(_mm_cmplt_ps(v_h0, _mm_setzero_ps()), _mm_set1_ps(360.f)));
v_h1 = _mm_add_ps(v_h1, _mm_and_ps(_mm_cmplt_ps(v_h1, _mm_setzero_ps()), _mm_set1_ps(360.f)));
v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale));
v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale));
v_b0 = _mm_and_ps(v_gteps0, v_h0);
v_b1 = _mm_and_ps(v_gteps1, v_h1);
v_g0 = v_l0;
v_g1 = v_l1;
v_r0 = _mm_and_ps(v_gteps0, v_s0);
v_r1 = _mm_and_ps(v_gteps1, v_s1);
}
#endif
// Convert n scn-channel float pixels to interleaved HLS.
void operator()(const float* src, float* dst, int n) const
{
int i = 0, bidx = blueIdx, scn = srccn;
n *= 3;
#if CV_SSE2
if (haveSIMD)
{
for( ; i <= n - 24; i += 24, src += scn * 8 )
{
__m128 v_b0 = _mm_loadu_ps(src + 0);
__m128 v_b1 = _mm_loadu_ps(src + 4);
__m128 v_g0 = _mm_loadu_ps(src + 8);
__m128 v_g1 = _mm_loadu_ps(src + 12);
__m128 v_r0 = _mm_loadu_ps(src + 16);
__m128 v_r1 = _mm_loadu_ps(src + 20);
if (scn == 3)
{
_mm_deinterleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
}
else
{
// 4-channel input: deinterleave with alpha, then discard it.
__m128 v_a0 = _mm_loadu_ps(src + 24);
__m128 v_a1 = _mm_loadu_ps(src + 28);
_mm_deinterleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1, v_a0, v_a1);
}
if (bidx)
{
__m128 v_tmp0 = v_b0;
__m128 v_tmp1 = v_b1;
v_b0 = v_r0;
v_b1 = v_r1;
v_r0 = v_tmp0;
v_r1 = v_tmp1;
}
process(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
_mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
_mm_storeu_ps(dst + i + 0, v_b0);
_mm_storeu_ps(dst + i + 4, v_b1);
_mm_storeu_ps(dst + i + 8, v_g0);
_mm_storeu_ps(dst + i + 12, v_g1);
_mm_storeu_ps(dst + i + 16, v_r0);
_mm_storeu_ps(dst + i + 20, v_r1);
}
}
#endif
// Scalar tail (and the only path when SSE2 is unavailable).
for( ; i < n; i += 3, src += scn )
{
float b = src[bidx], g = src[1], r = src[bidx^2];
float h = 0.f, s = 0.f, l;
float vmin, vmax, diff;
vmax = vmin = r;
if( vmax < g ) vmax = g;
if( vmax < b ) vmax = b;
if( vmin > g ) vmin = g;
if( vmin > b ) vmin = b;
diff = vmax - vmin;
l = (vmax + vmin)*0.5f;
if( diff > FLT_EPSILON )
{
s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
diff = 60.f/diff;
if( vmax == r )
h = (g - b)*diff;
else if( vmax == g )
h = (b - r)*diff + 120.f;
else
h = (r - g)*diff + 240.f;
if( h < 0.f ) h += 360.f;
}
dst[i] = h*hscale;
dst[i+1] = l;
dst[i+2] = s;
}
}
int srccn, blueIdx;  // input channel count (3 or 4); 0 = BGR, 2 = RGB input
float hscale;        // hrange/360: degrees -> output hue units
#if CV_SSE2
bool haveSIMD;
#endif
};
// 8-bit RGB/BGR -> HLS: widens pixels to float scaled by 1/255, runs
// RGB2HLS_f block-wise, then narrows back: H is stored as-is (already in
// [0,hrange]) while L and S are scaled by 255.
struct RGB2HLS_b
{
typedef uchar channel_type;
RGB2HLS_b(int _srccn, int _blueIdx, int _hrange)
: srccn(_srccn), cvt(3, _blueIdx, (float)_hrange)
{
#if CV_NEON
v_scale_inv = vdupq_n_f32(1.f/255.f);
v_scale = vdupq_n_f32(255.f);
v_alpha = vdup_n_u8(ColorChannel<uchar>::max())
;
#elif CV_SSE2
v_scale_inv = _mm_set1_ps(1.f/255.f);
v_zero = _mm_setzero_si128();
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
}
#if CV_SSE2
// Scale 16 HLS floats by a rotating coefficient vector with lane pattern
// (1, 255, 255, 1, ...) — H lanes stay as-is, L/S lanes get 255 — then pack
// to 16 saturated bytes. The 0x92 shuffle rotates the coefficient phase;
// v_coeffs is passed by reference so the rotation carries across calls.
void process(const float * buf,
__m128 & v_coeffs, uchar * dst) const
{
__m128 v_l0f = _mm_load_ps(buf);
__m128 v_l1f = _mm_load_ps(buf + 4);
__m128 v_u0f = _mm_load_ps(buf + 8);
__m128 v_u1f = _mm_load_ps(buf + 12);
v_l0f = _mm_mul_ps(v_l0f, v_coeffs);
v_u1f = _mm_mul_ps(v_u1f, v_coeffs);
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92));
v_u0f = _mm_mul_ps(v_u0f, v_coeffs);
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92));
v_l1f = _mm_mul_ps(v_l1f, v_coeffs);
__m128i v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f));
__m128i v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f));
__m128i v_l0 = _mm_packus_epi16(v_l, v_u);
_mm_storeu_si128((__m128i *)(dst), v_l0);
}
#endif
// Process n pixels in BLOCK_SIZE chunks through an aligned float buffer.
void operator()(const uchar* src, uchar* dst, int n) const
{
int i, j, scn = srccn;
float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
#if CV_SSE2
// Lane pattern (reversed by _mm_set_ps): 1, 255, 255, 1.
__m128 v_coeffs = _mm_set_ps(1.f, 255.f, 255.f, 1.f);
#endif
for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
{
int dn = std::min(n - i, (int)BLOCK_SIZE);
j = 0;
#if CV_NEON
for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
{
uint16x8_t v_t0, v_t1, v_t2;
if (scn == 3)
{
uint8x8x3_t v_src = vld3_u8(src);
v_t0 = vmovl_u8(v_src.val[0]);
v_t1 = vmovl_u8(v_src.val[1]);
v_t2 = vmovl_u8(v_src.val[2]);
}
else
{
// 4-channel input: load with alpha, keep only the first three planes.
uint8x8x4_t v_src = vld4_u8(src);
v_t0 = vmovl_u8(v_src.val[0]);
v_t1 = vmovl_u8(v_src.val[1]);
v_t2 = vmovl_u8(v_src.val[2]);
}
float32x4x3_t v_dst;
v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
vst3q_f32(buf + j, v_dst);
v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
vst3q_f32(buf + j + 12, v_dst);
}
#elif CV_SSE2
if (scn == 3 && haveSIMD)
{
// 16 bytes in -> 16 scaled floats out (channel-agnostic copy).
for ( ; j <= (dn * 3 - 16); j += 16, src += 16)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)src);
__m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
_mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
_mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
_mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
_mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
}
// Step back to a multiple of 3 so the scalar tail starts on a pixel boundary.
int jr = j % 3;
if (jr)
src -= jr, j -= jr;
}
else if (scn == 4 && haveSIMD)
{
// 4 BGRA pixels in -> 12 floats out, dropping the alpha lane.
for ( ; j <= (dn * 3 - 12); j += 12, src += 16)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)src);
__m128i v_src_lo = _mm_unpacklo_epi8(v_src, v_zero);
__m128i v_src_hi = _mm_unpackhi_epi8(v_src, v_zero);
_mm_storeu_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_lo, v_zero)), v_scale_inv));
_mm_storeu_ps(buf + j + 3, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_lo, v_zero)), v_scale_inv))
;
_mm_storeu_ps(buf + j + 6, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_hi, v_zero)), v_scale_inv));
// The last 4-wide store writes one float past the 3 channels needed;
// save and restore buf[j+8] so the preceding pixel is not clobbered.
float tmp = buf[j + 8];
_mm_storeu_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_unpackhi_epi16(v_src_hi, v_zero), 0x90)), v_scale_inv));
buf[j + 8] = tmp;
}
int jr = j % 3;
if (jr)
src -= jr, j -= jr;
}
#endif
// Scalar tail of the widening step.
for( ; j < dn*3; j += 3, src += scn )
{
buf[j] = src[0]*(1.f/255.f);
buf[j+1] = src[1]*(1.f/255.f);
buf[j+2] = src[2]*(1.f/255.f);
}
cvt(buf, buf, dn);
j = 0;
#if CV_NEON
// Narrow back: H stored directly, L and S scaled by 255.
for ( ; j <= (dn - 8) * 3; j += 24)
{
float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
uint8x8x3_t v_dst;
v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])),
vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0]))));
v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
vst3_u8(dst + j, v_dst);
}
#elif CV_SSE2
if (haveSIMD)
{
// 48 floats -> 48 bytes; three process() calls keep the coefficient
// rotation in phase across the batch.
for ( ; j <= (dn - 16) * 3; j += 48)
{
process(buf + j,
v_coeffs, dst + j);
process(buf + j + 16,
v_coeffs, dst + j + 16);
process(buf + j + 32,
v_coeffs, dst + j + 32);
}
}
#endif
// Scalar tail of the narrowing step.
for( ; j < dn*3; j += 3 )
{
dst[j] = saturate_cast<uchar>(buf[j]);
dst[j+1] = saturate_cast<uchar>(buf[j+1]*255.f);
dst[j+2] = saturate_cast<uchar>(buf[j+2]*255.f);
}
}
}
int srccn;       // input channel count (3 or 4)
RGB2HLS_f cvt;   // float converter doing the actual RGB->HLS math
#if CV_NEON
float32x4_t v_scale, v_scale_inv;
uint8x8_t v_alpha;
#elif CV_SSE2
__m128 v_scale_inv;
__m128i v_zero;
bool haveSIMD;
#endif
};
// Float HLS -> RGB/BGR. _hrange is the hue range of the input (360 for float
// pipelines); hscale maps it onto [0,6) hue sectors. _blueIdx selects BGR (0)
// vs RGB (2) output channel order; _dstcn is 3 or 4 (alpha filled with max).
struct HLS2RGB_f
{
    typedef float channel_type;

    HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {
#if CV_SSE2
        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
    }

#if CV_SSE2
    // Branch-free HLS->RGB for 8 pixels in planar registers. Computes the two
    // interpolation endpoints p1/p2 and the tab2/tab3 blends, then selects the
    // per-channel result by hue sector using comparison masks. Results are
    // written back into the inputs: v_h* <- B, v_l* <- G, v_s* <- R.
    void process(__m128& v_h0, __m128& v_h1, __m128& v_l0,
                 __m128& v_l1, __m128& v_s0, __m128& v_s1) const
    {
        // p2 = L<=0.5 ? L*(1+S) : L+S-L*S;  p1 = 2*L - p2
        __m128 v_lel0 = _mm_cmple_ps(v_l0, _mm_set1_ps(0.5f));
        __m128 v_lel1 = _mm_cmple_ps(v_l1, _mm_set1_ps(0.5f));
        __m128 v_p20 = _mm_andnot_ps(v_lel0, _mm_sub_ps(_mm_add_ps(v_l0, v_s0), _mm_mul_ps(v_l0, v_s0)));
        __m128 v_p21 = _mm_andnot_ps(v_lel1, _mm_sub_ps(_mm_add_ps(v_l1, v_s1), _mm_mul_ps(v_l1, v_s1)));
        v_p20 = _mm_or_ps(v_p20, _mm_and_ps(v_lel0, _mm_mul_ps(v_l0, _mm_add_ps(_mm_set1_ps(1.0f), v_s0))));
        v_p21 = _mm_or_ps(v_p21, _mm_and_ps(v_lel1, _mm_mul_ps(v_l1, _mm_add_ps(_mm_set1_ps(1.0f), v_s1))));
        __m128 v_p10 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l0), v_p20);
        __m128 v_p11 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l1), v_p21);
        // Scale hue into sector units; split into integer sector + fraction.
        v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale));
        v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale));
        __m128 v_pre_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h0));
        __m128 v_pre_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h1));
        v_h0 = _mm_sub_ps(v_h0, v_pre_sector0);
        v_h1 = _mm_sub_ps(v_h1, v_pre_sector1);
        __m128 v_p2_p10 = _mm_sub_ps(v_p20, v_p10);
        __m128 v_p2_p11 = _mm_sub_ps(v_p21, v_p11);
        __m128 v_tab20 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, _mm_sub_ps(_mm_set1_ps(1.0f), v_h0)));
        __m128 v_tab21 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, _mm_sub_ps(_mm_set1_ps(1.0f), v_h1)));
        __m128 v_tab30 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, v_h0));
        __m128 v_tab31 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, v_h1));
        // sector = pre_sector mod 6 (computed as pre_sector - 6*trunc(pre_sector/6)).
        __m128 v_sector0 = _mm_div_ps(v_pre_sector0, _mm_set1_ps(6.0f));
        __m128 v_sector1 = _mm_div_ps(v_pre_sector1, _mm_set1_ps(6.0f));
        v_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector0));
        v_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector1));
        v_sector0 = _mm_mul_ps(v_sector0, _mm_set1_ps(6.0f));
        v_sector1 = _mm_mul_ps(v_sector1, _mm_set1_ps(6.0f));
        v_sector0 = _mm_sub_ps(v_pre_sector0, v_sector0);
        v_sector1 = _mm_sub_ps(v_pre_sector1, v_sector1);
        // Per-sector channel selection (mirrors the scalar sector_data table).
        v_h0 = _mm_and_ps(v_p10, _mm_cmplt_ps(v_sector0, _mm_set1_ps(2.0f)));
        v_h1 = _mm_and_ps(v_p11, _mm_cmplt_ps(v_sector1, _mm_set1_ps(2.0f)));
        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
        v_l0 = _mm_and_ps(v_tab30, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
        v_l1 = _mm_and_ps(v_tab31, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p10, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(3.0f))));
        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p11, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(3.0f))));
        v_s0 = _mm_and_ps(v_p20, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
        v_s1 = _mm_and_ps(v_p21, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
    }
#endif

    // Convert n interleaved HLS float pixels to dcn-channel BGR/RGB pixels.
    void operator()(const float* src, float* dst, int n) const
    {
        int i = 0, bidx = blueIdx, dcn = dstcn;
        float _hscale = hscale;
        float alpha = ColorChannel<float>::max();
        n *= 3;
#if CV_SSE2
        // Vector path: 8 pixels (24 floats) per iteration.
        if (haveSIMD)
        {
            for( ; i <= n - 24; i += 24, dst += dcn * 8 )
            {
                __m128 v_h0 = _mm_loadu_ps(src + i + 0);
                __m128 v_h1 = _mm_loadu_ps(src + i + 4);
                __m128 v_l0 = _mm_loadu_ps(src + i + 8);
                __m128 v_l1 = _mm_loadu_ps(src + i + 12);
                __m128 v_s0 = _mm_loadu_ps(src + i + 16);
                __m128 v_s1 = _mm_loadu_ps(src + i + 20);
                _mm_deinterleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
                process(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
                if (dcn == 3)
                {
                    // bidx != 0 means blue goes last -> swap B and R planes.
                    if (bidx)
                    {
                        _mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1);
                        _mm_storeu_ps(dst + 0, v_s0);
                        _mm_storeu_ps(dst + 4, v_s1);
                        _mm_storeu_ps(dst + 8, v_l0);
                        _mm_storeu_ps(dst + 12, v_l1);
                        _mm_storeu_ps(dst + 16, v_h0);
                        _mm_storeu_ps(dst + 20, v_h1);
                    }
                    else
                    {
                        _mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
                        _mm_storeu_ps(dst + 0, v_h0);
                        _mm_storeu_ps(dst + 4, v_h1);
                        _mm_storeu_ps(dst + 8, v_l0);
                        _mm_storeu_ps(dst + 12, v_l1);
                        _mm_storeu_ps(dst + 16, v_s0);
                        _mm_storeu_ps(dst + 20, v_s1);
                    }
                }
                else
                {
                    // dcn == 4: append a constant alpha plane.
                    __m128 v_a0 = _mm_set1_ps(alpha);
                    __m128 v_a1 = _mm_set1_ps(alpha);
                    if (bidx)
                    {
                        _mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1, v_a0, v_a1);
                        _mm_storeu_ps(dst + 0, v_s0);
                        _mm_storeu_ps(dst + 4, v_s1);
                        _mm_storeu_ps(dst + 8, v_l0);
                        _mm_storeu_ps(dst + 12, v_l1);
                        _mm_storeu_ps(dst + 16, v_h0);
                        _mm_storeu_ps(dst + 20, v_h1);
                        _mm_storeu_ps(dst + 24, v_a0);
                        _mm_storeu_ps(dst + 28, v_a1);
                    }
                    else
                    {
                        _mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1, v_a0, v_a1);
                        _mm_storeu_ps(dst + 0, v_h0);
                        _mm_storeu_ps(dst + 4, v_h1);
                        _mm_storeu_ps(dst + 8, v_l0);
                        _mm_storeu_ps(dst + 12, v_l1);
                        _mm_storeu_ps(dst + 16, v_s0);
                        _mm_storeu_ps(dst + 20, v_s1);
                        _mm_storeu_ps(dst + 24, v_a0);
                        _mm_storeu_ps(dst + 28, v_a1);
                    }
                }
            }
        }
#endif
        // Scalar tail (and the only path when SSE2 is unavailable).
        for( ; i < n; i += 3, dst += dcn )
        {
            float h = src[i], l = src[i+1], s = src[i+2];
            float b, g, r;
            if( s == 0 )
                b = g = r = l;
            else
            {
                // sector_data maps hue sector -> which of tab[0..3] feeds B, G, R.
                static const int sector_data[][3]=
                {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
                float tab[4];
                int sector;
                float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                float p1 = 2*l - p2;
                h *= _hscale;
                // Wrap hue into [0,6) one sector-width at a time.
                if( h < 0 )
                    do h += 6; while( h < 0 );
                else if( h >= 6 )
                    do h -= 6; while( h >= 6 );
                sector = cvFloor(h);
                h -= sector;
                // Guard against floating-point edge cases (e.g. NaN input) that
                // could leave the sector index out of [0,6) and overrun
                // sector_data; clamp like HSV2RGB_f instead of the old
                // debug-only assert.
                if( (unsigned)sector >= 6u )
                {
                    sector = 0;
                    h = 0.f;
                }
                tab[0] = p2;
                tab[1] = p1;
                tab[2] = p1 + (p2 - p1)*(1-h);
                tab[3] = p1 + (p2 - p1)*h;
                b = tab[sector_data[sector][0]];
                g = tab[sector_data[sector][1]];
                r = tab[sector_data[sector][2]];
            }
            dst[bidx] = b;
            dst[1] = g;
            dst[bidx^2] = r;
            if( dcn == 4 )
                dst[3] = alpha;
        }
    }

    int dstcn, blueIdx;  // output channel count (3 or 4); 0 = BGR, 2 = RGB output
    float hscale;        // 6/hrange: input hue units -> sector units
#if CV_SSE2
    bool haveSIMD;       // runtime SSE2 availability
#endif
};
// 8-bit HLS -> BGR/RGB: widens pixels to float, scales L and S to [0,1]
// (H passed through unscaled; the float converter is built with the full
// hue range), runs HLS2RGB_f block-wise, then scales results back by 255.
struct HLS2RGB_b
{
typedef uchar channel_type;
HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange)
: dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
{
#if CV_NEON
v_scale_inv = vdupq_n_f32(1.f/255.f);
v_scale = vdupq_n_f32(255.f);
v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
#elif CV_SSE2
v_scale = _mm_set1_ps(255.f);
v_alpha = _mm_set1_ps(ColorChannel<uchar>::max());
v_zero = _mm_setzero_si128();
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
}
#if CV_SSE2
// Widen 8 interleaved HLS pixels (as three u16x8 halves) to float and
// multiply by a rotating coefficient vector with lane pattern
// (1, 1/255, 1/255, 1, ...): H lanes stay unscaled, L/S lanes get 1/255.
// The 0x49 shuffle keeps the per-channel coefficient phase aligned.
void process(__m128i v_r, __m128i v_g, __m128i v_b,
const __m128& v_coeffs_,
float * buf) const
{
__m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
__m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
__m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
__m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
__m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
__m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
__m128 v_coeffs = v_coeffs_;
v_r0 = _mm_mul_ps(v_r0, v_coeffs);
v_g1 = _mm_mul_ps(v_g1, v_coeffs);
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49));
v_r1 = _mm_mul_ps(v_r1, v_coeffs);
v_b0 = _mm_mul_ps(v_b0, v_coeffs);
v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49));
v_g0 = _mm_mul_ps(v_g0, v_coeffs);
v_b1 = _mm_mul_ps(v_b1, v_coeffs);
_mm_store_ps(buf, v_r0);
_mm_store_ps(buf + 4, v_r1);
_mm_store_ps(buf + 8, v_g0);
_mm_store_ps(buf + 12, v_g1);
_mm_store_ps(buf + 16, v_b0);
_mm_store_ps(buf + 20, v_b1);
}
#endif
// Process n pixels in BLOCK_SIZE chunks through an aligned float buffer.
void operator()(const uchar* src, uchar* dst, int n) const
{
int i, j, dcn = dstcn;
uchar alpha = ColorChannel<uchar>::max();
float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
#if CV_SSE2
// Lane pattern (reversed by _mm_set_ps): 1, 1/255, 1/255, 1.
__m128 v_coeffs = _mm_set_ps(1.f, 1.f/255.f, 1.f/255.f, 1.f);
#endif
for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
{
int dn = std::min(n - i, (int)BLOCK_SIZE);
j = 0;
#if CV_NEON
// Widen 8 pixels at a time: H as-is, L and S scaled by 1/255.
for ( ; j <= (dn - 8) * 3; j += 24)
{
uint8x8x3_t v_src = vld3_u8(src + j);
uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
v_t1 = vmovl_u8(v_src.val[1]),
v_t2 = vmovl_u8(v_src.val[2]);
float32x4x3_t v_dst;
v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
vst3q_f32(buf + j, v_dst);
v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
vst3q_f32(buf + j + 12, v_dst);
}
#elif CV_SSE2
if (haveSIMD)
{
for ( ; j <= (dn - 8) * 3; j += 24)
{
__m128i v_src0 = _mm_loadu_si128((__m128i const *)(src + j));
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src + j + 16));
process(_mm_unpacklo_epi8(v_src0, v_zero),
_mm_unpackhi_epi8(v_src0, v_zero),
_mm_unpacklo_epi8(v_src1, v_zero),
v_coeffs,
buf + j);
}
}
#endif
// Scalar tail of the widening step.
for( ; j < dn*3; j += 3 )
{
buf[j] = src[j];
buf[j+1] = src[j+1]*(1.f/255.f);
buf[j+2] = src[j+2]*(1.f/255.f);
}
// Float conversion in place.
cvt(buf, buf, dn);
j = 0;
#if CV_NEON
// Narrow back to u8 with saturation, scaling all channels by 255.
for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
{
float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
if (dcn == 4)
{
uint8x8x4_t v_dst;
v_dst.val[0] = v_dst0;
v_dst.val[1] = v_dst1;
v_dst.val[2] = v_dst2;
v_dst.val[3] = v_alpha;
vst4_u8(dst, v_dst);
}
else
{
uint8x8x3_t v_dst;
v_dst.val[0] = v_dst0;
v_dst.val[1] = v_dst1;
v_dst.val[2] = v_dst2;
vst3_u8(dst, v_dst);
}
}
#elif CV_SSE2
if (dcn == 3 && haveSIMD)
{
// Pack 16 floats -> 16 bytes per iteration (channel-agnostic copy).
for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
{
__m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
__m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
__m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
__m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
__m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
_mm_cvtps_epi32(v_src1));
__m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
_mm_cvtps_epi32(v_src3));
_mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
}
// Step back to a multiple of 3 so the scalar tail starts on a pixel boundary.
int jr = j % 3;
if (jr)
dst -= jr, j -= jr;
}
else if (dcn == 4 && haveSIMD)
{
// 4 pixels (12 floats) in -> 16 bytes out, splicing in the alpha lane.
for ( ; j <= (dn * 3 - 12); j += 12, dst += 16)
{
__m128 v_buf0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
__m128 v_buf1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
__m128 v_buf2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
__m128 v_ba0 = _mm_unpackhi_ps(v_buf0, v_alpha);
__m128 v_ba1 = _mm_unpacklo_ps(v_buf2, v_alpha);
__m128i v_src0 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf0, v_ba0, 0x44));
__m128i v_src1 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba0, v_buf1, 0x4e)), 0x78);
__m128i v_src2 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf1, v_ba1, 0x4e));
__m128i v_src3 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba1, v_buf2, 0xee)), 0x78);
__m128i v_dst0 = _mm_packs_epi32(v_src0, v_src1);
__m128i v_dst1 = _mm_packs_epi32(v_src2, v_src3);
_mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
}
int jr = j % 3;
if (jr)
dst -= jr, j -= jr;
}
#endif
// Scalar tail of the narrowing step.
for( ; j < dn*3; j += 3, dst += dcn )
{
dst[0] = saturate_cast<uchar>(buf[j]*255.f);
dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
if( dcn == 4 )
dst[3] = alpha;
}
}
}
int dstcn;       // output channel count (3 or 4)
HLS2RGB_f cvt;   // float converter doing the actual HLS->RGB math
#if CV_NEON
float32x4_t v_scale, v_scale_inv;
uint8x8_t v_alpha;
#elif CV_SSE2
__m128 v_scale;
__m128 v_alpha;
__m128i v_zero;
bool haveSIMD;
#endif
};
//
// IPP functions
//
#if NEED_IPP
// IPP fast-path function tables, indexed by OpenCV depth (CV_8U=0 ... CV_64F=6);
// zero entries mark depths the IPP conversion does not support.
#if !IPP_DISABLE_RGB_HSV
// Guarded: IPP RGB->HSV results differ enough to break OCL accuracy tests.
static ippiGeneralFunc ippiRGB2HSVTab[] =
{
(ippiGeneralFunc)ippiRGBToHSV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHSV_16u_C3R, 0,
0, 0, 0, 0
};
#endif
static ippiGeneralFunc ippiHSV2RGBTab[] =
{
(ippiGeneralFunc)ippiHSVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHSVToRGB_16u_C3R, 0,
0, 0, 0, 0
};
static ippiGeneralFunc ippiRGB2HLSTab[] =
{
(ippiGeneralFunc)ippiRGBToHLS_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHLS_16u_C3R, 0,
0, (ippiGeneralFunc)ippiRGBToHLS_32f_C3R, 0, 0
};
static ippiGeneralFunc ippiHLS2RGBTab[] =
{
(ippiGeneralFunc)ippiHLSToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHLSToRGB_16u_C3R, 0,
0, (ippiGeneralFunc)ippiHLSToRGB_32f_C3R, 0, 0
};
#endif
//
// HAL functions
//
namespace hal
{
// 8u, 32f
// 8u, 32f
// HAL entry point for BGR/RGB -> HSV/HLS. Dispatch order: custom HAL hook,
// then IPP fast paths (8u, full hue range only), then the generic C++/SIMD
// CvtColorLoop implementations above.
// swapBlue: source is RGB rather than BGR; isFullRange: 8u hue spans 0..255
// instead of 0..179; isHSV selects HSV vs HLS output.
void cvtBGRtoHSV(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int width, int height,
int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV)
{
CV_INSTRUMENT_REGION()
CALL_HAL(cvtBGRtoHSV, cv_hal_cvtBGRtoHSV, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isFullRange, isHSV);
#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
CV_IPP_CHECK()
{
// IPP paths only cover 8-bit full-range hue; IPP expects RGB order, so
// channel-reorder functors bridge from BGR/BGRA inputs.
if(depth == CV_8U && isFullRange)
{
if (isHSV)
{
#if !IPP_DISABLE_RGB_HSV // breaks OCL accuracy tests
if(scn == 3 && !swapBlue)
{
if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height,
IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
return;
}
else if(scn == 4 && !swapBlue)
{
if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
return;
}
else if(scn == 4 && swapBlue)
{
if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 0, 1, 2, depth)) )
return;
}
#endif
}
else
{
if(scn == 3 && !swapBlue)
{
if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height,
IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
return;
}
else if(scn == 4 && !swapBlue)
{
if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
return;
}
else if(scn == 3 && swapBlue)
{
if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height,
IPPGeneralFunctor(ippiRGB2HLSTab[depth])) )
return;
}
else if(scn == 4 && swapBlue)
{
if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 0, 1, 2, depth)) )
return;
}
}
}
}
#endif
// Hue range: 360 for float output; for 8u, 256 (full range) or 180 (fits a byte).
int hrange = depth == CV_32F ? 360 : isFullRange ? 256 : 180;
int blueIdx = swapBlue ? 2 : 0;
if(isHSV)
{
if(depth == CV_8U)
CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HSV_b(scn, blueIdx, hrange));
else
CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HSV_f(scn, blueIdx, static_cast<float>(hrange)));
}
else
{
if( depth == CV_8U )
CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HLS_b(scn, blueIdx, hrange));
else
CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HLS_f(scn, blueIdx, static_cast<float>(hrange)));
}
}
// 8u, 32f
// HAL entry point: converts HSV or HLS pixels back to packed BGR/RGB.
// src_data/src_step, dst_data/dst_step - pixel buffers with row steps in bytes
// depth       - CV_8U or CV_32F
// dcn         - destination channel count (3 or 4); the source always has 3
// swapBlue    - if true the destination channel order is RGB(A) rather than BGR(A)
// isFullRange - for CV_8U only: hue stored in 0..255 instead of 0..180
// isHSV       - source is HSV (true) or HLS (false)
// Dispatch order mirrors cvtBGRtoHSV: HAL hook, IPP (8-bit full-range), then
// the generic CvtColorLoop fallback.
void cvtHSVtoBGR(const uchar * src_data, size_t src_step,
                 uchar * dst_data, size_t dst_step,
                 int width, int height,
                 int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtHSVtoBGR, cv_hal_cvtHSVtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isFullRange, isHSV);

#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
    CV_IPP_CHECK()
    {
        // IPP produces RGB order, so BGR outputs append a channel-swap functor
        // (indices 2,1,0); 4-channel variants also add an alpha channel.
        if (depth == CV_8U && isFullRange)
        {
            if (isHSV)
            {
                if(dcn == 3 && !swapBlue)
                {
                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height,
                                            IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
                        return;
                }
                else if(dcn == 4 && !swapBlue)
                {
                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                        IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
                        return;
                }
                else if(dcn == 3 && swapBlue)
                {
                    // RGB destination matches IPP's native order: no reorder step
                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height,
                                            IPPGeneralFunctor(ippiHSV2RGBTab[depth])) )
                        return;
                }
                else if(dcn == 4 && swapBlue)
                {
                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                        IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
                        return;
                }
            }
            else
            {
                if(dcn == 3 && !swapBlue)
                {
                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height,
                                            IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
                        return;
                }
                else if(dcn == 4 && !swapBlue)
                {
                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                        IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
                        return;
                }
                else if(dcn == 3 && swapBlue)
                {
                    if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height,
                                            IPPGeneralFunctor(ippiHLS2RGBTab[depth])) )
                        return;
                }
                else if(dcn == 4 && swapBlue)
                {
                    if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                        IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
                        return;
                }
            }
        }
    }
#endif

    // Generic fallback. Note: full-range 8-bit hue uses 255 in the backward
    // direction (vs 256 in cvtBGRtoHSV above).
    int hrange = depth == CV_32F ? 360 : isFullRange ? 255 : 180;
    int blueIdx = swapBlue ? 2 : 0;
    if(isHSV)
    {
        if( depth == CV_8U )
            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HSV2RGB_b(dcn, blueIdx, hrange));
        else
            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HSV2RGB_f(dcn, blueIdx, static_cast<float>(hrange)));
    }
    else
    {
        if( depth == CV_8U )
            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HLS2RGB_b(dcn, blueIdx, hrange));
        else
            CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HLS2RGB_f(dcn, blueIdx, static_cast<float>(hrange)));
    }
}
} // namespace hal
//
// OCL calls
//
#ifdef HAVE_OPENCL
// OpenCL path for HSV -> BGR/RGB conversion. Returns false if the kernel
// cannot be created or run, letting the caller fall back to the CPU path.
bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full )
{
    OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > helper(_src, _dst, dcn);

    // Hue range of the stored data: 360 for float, 255 (full) or 180 otherwise.
    int hueRange;
    if (_src.depth() == CV_32F)
        hueRange = 360;
    else
        hueRange = full ? 255 : 180;

    cv::String buildOpts = format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff",
                                  dcn, bidx, hueRange, 6.f/hueRange);
    return helper.createKernel("HSV2RGB", ocl::imgproc::color_hsv_oclsrc, buildOpts)
           && helper.run();
}
// OpenCL path for HLS -> BGR/RGB conversion. Returns false if the kernel
// cannot be created or run, letting the caller fall back to the CPU path.
bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full )
{
    OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > helper(_src, _dst, dcn);

    // Hue range of the stored data: 360 for float, 255 (full) or 180 otherwise.
    int hueRange;
    if (_src.depth() == CV_32F)
        hueRange = 360;
    else
        hueRange = full ? 255 : 180;

    cv::String buildOpts = format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff",
                                  dcn, bidx, hueRange, 6.f/hueRange);
    return helper.createKernel("HLS2RGB", ocl::imgproc::color_hsv_oclsrc, buildOpts)
           && helper.run();
}
// OpenCL path for BGR/RGB -> HLS conversion. Returns false if the kernel
// cannot be created or run, letting the caller fall back to the CPU path.
bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full )
{
    OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > helper(_src, _dst, 3);

    // hscale maps hue in degrees (0..360) into the destination range:
    // 360 for float, 256 for 8-bit full range, 180 otherwise.
    float destRange;
    if (_src.depth() == CV_32F)
        destRange = 360.f;
    else
        destRange = full ? 256.f : 180.f;
    float hscale = destRange / 360.f;

    cv::String buildOpts = format("-D hscale=%ff -D bidx=%d -D dcn=3", hscale, bidx);
    return helper.createKernel("RGB2HLS", ocl::imgproc::color_hsv_oclsrc, buildOpts)
           && helper.run();
}
// OpenCL path for BGR/RGB -> HSV conversion. The 8-bit kernel uses precomputed
// fixed-point division tables (uploaded once per process as UMats); the float
// kernel only needs a compile-time hue scale. Returns false on kernel
// creation/run failure so the caller can fall back to the CPU path.
bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full )
{
    OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > h(_src, _dst, 3);

    int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 256);

    cv::String options = (_src.depth() == CV_8U ?
                          format("-D hrange=%d -D bidx=%d -D dcn=3", hrange, bidx) :
                          format("-D hscale=%ff -D bidx=%d -D dcn=3", hrange*(1.f/360.f), bidx));

    if(!h.createKernel("RGB2HSV", ocl::imgproc::color_hsv_oclsrc, options))
    {
        return false;
    }

    if(_src.depth() == CV_8U)
    {
        // Lazily-initialized lookup tables shared by all calls:
        // sdiv_table[i] ~ (255<<hsv_shift)/i (saturation divisor),
        // hdiv_table*[i] ~ (hrange<<hsv_shift)/(6*i) (hue divisor), with
        // separate hue tables for the 180 and 256 ranges.
        static UMat sdiv_data;
        static UMat hdiv_data180;
        static UMat hdiv_data256;
        static int sdiv_table[256];
        static int hdiv_table180[256];
        static int hdiv_table256[256];
        // Guarded init: each hue range is initialized at most once; note the
        // saturation table is shared and filled on the first init of either.
        static volatile bool initialized180 = false, initialized256 = false;
        volatile bool & initialized = hrange == 180 ? initialized180 : initialized256;

        if (!initialized)
        {
            int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12;
            UMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256;

            sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;

            int v = 255 << hsv_shift;
            if (!initialized180 && !initialized256)
            {
                for(int i = 1; i < 256; i++ )
                    sdiv_table[i] = saturate_cast<int>(v/(1.*i));
                Mat(1, 256, CV_32SC1, sdiv_table).copyTo(sdiv_data);
            }

            v = hrange << hsv_shift;
            for (int i = 1; i < 256; i++ )
                hdiv_table[i] = saturate_cast<int>(v/(6.*i));

            Mat(1, 256, CV_32SC1, hdiv_table).copyTo(hdiv_data);
            initialized = true;
        }

        h.setArg(ocl::KernelArg::PtrReadOnly(sdiv_data));
        h.setArg(hrange == 256 ? ocl::KernelArg::PtrReadOnly(hdiv_data256) :
                                 ocl::KernelArg::PtrReadOnly(hdiv_data180));
    }

    return h.run();
}
#endif
//
// HAL calls
//
// CPU wrapper: BGR/RGB (3/4-channel, 8U/32F) -> 3-channel HLS via the HAL.
void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange )
{
    CvtHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > helper(_src, _dst, 3);
    hal::cvtBGRtoHSV(helper.src.data, helper.src.step,
                     helper.dst.data, helper.dst.step,
                     helper.src.cols, helper.src.rows,
                     helper.depth, helper.scn,
                     swapb, fullRange, /*isHSV=*/false);
}
// CPU wrapper: BGR/RGB (3/4-channel, 8U/32F) -> 3-channel HSV via the HAL.
void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange )
{
    CvtHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > helper(_src, _dst, 3);
    hal::cvtBGRtoHSV(helper.src.data, helper.src.step,
                     helper.dst.data, helper.dst.step,
                     helper.src.cols, helper.src.rows,
                     helper.depth, helper.scn,
                     swapb, fullRange, /*isHSV=*/true);
}
// CPU wrapper: 3-channel HLS -> BGR/RGB (dcn channels, default 3) via the HAL.
void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange)
{
    if (dcn <= 0)
        dcn = 3; // unspecified destination channel count defaults to 3
    CvtHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > helper(_src, _dst, dcn);
    hal::cvtHSVtoBGR(helper.src.data, helper.src.step,
                     helper.dst.data, helper.dst.step,
                     helper.src.cols, helper.src.rows,
                     helper.depth, dcn,
                     swapb, fullRange, /*isHSV=*/false);
}
// CPU wrapper: 3-channel HSV -> BGR/RGB (dcn channels, default 3) via the HAL.
void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange)
{
    if (dcn <= 0)
        dcn = 3; // unspecified destination channel count defaults to 3
    CvtHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > helper(_src, _dst, dcn);
    hal::cvtHSVtoBGR(helper.src.data, helper.src.step,
                     helper.dst.data, helper.dst.step,
                     helper.src.cols, helper.src.rows,
                     helper.depth, dcn,
                     swapb, fullRange, /*isHSV=*/true);
}
} // namespace cv
This source diff could not be displayed because it is too large. You can view the blob instead.
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
#include "color.hpp"
namespace cv
{
////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
// Generic 3/4-channel <-> 3/4-channel reshuffle: optionally swaps R and B
// (blueIdx is 0 or 2; bidx^2 gives the opposite channel) and adds/drops alpha.
// n is the pixel count. Note the pointer-advance asymmetry: whichever side has
// the variable channel count is advanced by a pointer, the other is indexed.
template<typename _Tp> struct RGB2RGB
{
    typedef _Tp channel_type;

    RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) {}

    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
        int scn = srccn, dcn = dstcn, bidx = blueIdx;
        if( dcn == 3 )
        {
            // 3- or 4-channel source to 3-channel dest (alpha dropped if scn==4)
            n *= 3;
            for( int i = 0; i < n; i += 3, src += scn )
            {
                _Tp t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
                dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
            }
        }
        else if( scn == 3 )
        {
            // 3-channel source to 4-channel dest: alpha filled with channel max
            n *= 3;
            _Tp alpha = ColorChannel<_Tp>::max();
            for( int i = 0; i < n; i += 3, dst += 4 )
            {
                _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2];
                dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
            }
        }
        else
        {
            // 4-channel to 4-channel: swap R/B, alpha passed through
            n *= 4;
            for( int i = 0; i < n; i += 4 )
            {
                _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
                dst[i+bidx] = t0; dst[i+1] = t1; dst[i+(bidx^2)] = t2; dst[i+3] = t3;
            }
        }
    }

    int srccn, dstcn, blueIdx;
};
#if CV_NEON
// NEON specialization of RGB2RGB for 8-bit pixels. Processes 16 pixels per
// iteration with de-interleaving loads (vld3q/vld4q), then 8 pixels, then a
// scalar tail; each branch matches the generic version's semantics exactly.
template<> struct RGB2RGB<uchar>
{
    typedef uchar channel_type;

    RGB2RGB(int _srccn, int _dstcn, int _blueIdx) :
        srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx)
    {
        // Pre-broadcast alpha fill value for the 3 -> 4 channel branch
        v_alpha = vdupq_n_u8(ColorChannel<uchar>::max());
        v_alpha2 = vget_low_u8(v_alpha);
    }

    void operator()(const uchar * src, uchar * dst, int n) const
    {
        int scn = srccn, dcn = dstcn, bidx = blueIdx, i = 0;
        if (dcn == 3)
        {
            n *= 3;
            if (scn == 3)
            {
                // 3 -> 3: pure R/B swap, 16 then 8 pixels per vector iteration
                for ( ; i <= n - 48; i += 48, src += 48 )
                {
                    uint8x16x3_t v_src = vld3q_u8(src), v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3q_u8(dst + i, v_dst);
                }

                for ( ; i <= n - 24; i += 24, src += 24 )
                {
                    uint8x8x3_t v_src = vld3_u8(src), v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3_u8(dst + i, v_dst);
                }

                for ( ; i < n; i += 3, src += 3 )
                {
                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
                }
            }
            else
            {
                // 4 -> 3: drop alpha while swapping (src advances 4 per pixel)
                for ( ; i <= n - 48; i += 48, src += 64 )
                {
                    uint8x16x4_t v_src = vld4q_u8(src);
                    uint8x16x3_t v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3q_u8(dst + i, v_dst);
                }

                for ( ; i <= n - 24; i += 24, src += 32 )
                {
                    uint8x8x4_t v_src = vld4_u8(src);
                    uint8x8x3_t v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3_u8(dst + i, v_dst);
                }

                for ( ; i < n; i += 3, src += 4 )
                {
                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
                }
            }
        }
        else if (scn == 3)
        {
            // 3 -> 4: insert constant alpha channel
            n *= 3;
            for ( ; i <= n - 48; i += 48, dst += 64 )
            {
                uint8x16x3_t v_src = vld3q_u8(src + i);
                uint8x16x4_t v_dst;
                v_dst.val[bidx] = v_src.val[0];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[bidx ^ 2] = v_src.val[2];
                v_dst.val[3] = v_alpha;
                vst4q_u8(dst, v_dst);
            }

            for ( ; i <= n - 24; i += 24, dst += 32 )
            {
                uint8x8x3_t v_src = vld3_u8(src + i);
                uint8x8x4_t v_dst;
                v_dst.val[bidx] = v_src.val[0];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[bidx ^ 2] = v_src.val[2];
                v_dst.val[3] = v_alpha2;
                vst4_u8(dst, v_dst);
            }

            uchar alpha = ColorChannel<uchar>::max();
            for (; i < n; i += 3, dst += 4 )
            {
                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2];
                dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
            }
        }
        else
        {
            // 4 -> 4: swap R/B, pass alpha through
            n *= 4;
            for ( ; i <= n - 64; i += 64 )
            {
                uint8x16x4_t v_src = vld4q_u8(src + i), v_dst;
                v_dst.val[0] = v_src.val[bidx];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[2] = v_src.val[bidx^2];
                v_dst.val[3] = v_src.val[3];
                vst4q_u8(dst + i, v_dst);
            }

            for ( ; i <= n - 32; i += 32 )
            {
                uint8x8x4_t v_src = vld4_u8(src + i), v_dst;
                v_dst.val[0] = v_src.val[bidx];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[2] = v_src.val[bidx^2];
                v_dst.val[3] = v_src.val[3];
                vst4_u8(dst + i, v_dst);
            }

            for ( ; i < n; i += 4)
            {
                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
                dst[i+bidx] = t0; dst[i+1] = t1; dst[i+(bidx^2)] = t2; dst[i+3] = t3;
            }
        }
    }

    int srccn, dstcn, blueIdx;
    uint8x16_t v_alpha;
    uint8x8_t v_alpha2;
};
#endif
/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
// Unpacks 16-bit packed RGB into 8-bit BGR/BGRA pixels.
// greenBits == 6 selects RGB565 (bbbbb gggggg rrrrr); otherwise RGB555, where
// the top bit acts as a 1-bit alpha when dcn == 4. Each 5/6-bit field is
// expanded to 8 bits by left-shifting (low bits are left zero, not replicated).
struct RGB5x52RGB
{
    typedef uchar channel_type;

    RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits)
        : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits)
    {
#if CV_NEON
        v_n3 = vdupq_n_u16(~3);
        v_n7 = vdupq_n_u16(~7);
        v_255 = vdupq_n_u8(255);
        v_0 = vdupq_n_u8(0);
        v_mask = vdupq_n_u16(0x8000); // RGB555 alpha bit
#endif
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx, i = 0;
        if( greenBits == 6 )
        {
#if CV_NEON
            // 16 pixels per iteration; same bit extraction as the scalar tail
            for ( ; i <= n - 16; i += 16, dst += dcn * 16)
            {
                uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
                uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
                uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 3), v_n3)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 3), v_n3)));
                uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 8), v_n7)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 8), v_n7)));
                if (dcn == 3)
                {
                    uint8x16x3_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    vst3q_u8(dst, v_dst);
                }
                else
                {
                    uint8x16x4_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    v_dst.val[3] = v_255; // 565 has no alpha bit: fully opaque
                    vst4q_u8(dst, v_dst);
                }
            }
#endif
            for( ; i < n; i++, dst += dcn )
            {
                unsigned t = ((const ushort*)src)[i];
                dst[bidx] = (uchar)(t << 3);
                dst[1] = (uchar)((t >> 3) & ~3);
                dst[bidx ^ 2] = (uchar)((t >> 8) & ~7);
                if( dcn == 4 )
                    dst[3] = 255;
            }
        }
        else
        {
#if CV_NEON
            for ( ; i <= n - 16; i += 16, dst += dcn * 16)
            {
                uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
                uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
                uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 2), v_n7)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 2), v_n7)));
                uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 7), v_n7)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 7), v_n7)));
                if (dcn == 3)
                {
                    uint8x16x3_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    vst3q_u8(dst, v_dst);
                }
                else
                {
                    uint8x16x4_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    // alpha = 255 where the 0x8000 bit is set, else 0
                    v_dst.val[3] = vbslq_u8(vcombine_u8(vqmovn_u16(vandq_u16(v_src0, v_mask)),
                                                        vqmovn_u16(vandq_u16(v_src1, v_mask))), v_255, v_0);
                    vst4q_u8(dst, v_dst);
                }
            }
#endif
            for( ; i < n; i++, dst += dcn )
            {
                unsigned t = ((const ushort*)src)[i];
                dst[bidx] = (uchar)(t << 3);
                dst[1] = (uchar)((t >> 2) & ~7);
                dst[bidx ^ 2] = (uchar)((t >> 7) & ~7);
                if( dcn == 4 )
                    dst[3] = t & 0x8000 ? 255 : 0;
            }
        }
    }

    int dstcn, blueIdx, greenBits;
#if CV_NEON
    uint16x8_t v_n3, v_n7, v_mask;
    uint8x16_t v_255, v_0;
#endif
};
// Packs 8-bit BGR/BGRA pixels into 16-bit RGB565 or RGB555.
// greenBits == 6 selects RGB565; otherwise RGB555, where for 4-channel input a
// non-zero alpha sets the top (0x8000) bit. Channels are truncated to their
// top 5 (or 6 for green in 565) bits.
struct RGB2RGB5x5
{
    typedef uchar channel_type;

    RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits)
        : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits)
    {
#if CV_NEON
        v_n3 = vdup_n_u8(~3);
        v_n7 = vdup_n_u8(~7);
        v_mask = vdupq_n_u16(0x8000); // RGB555 alpha bit
        v_0 = vdupq_n_u16(0);
        v_full = vdupq_n_u16(0xffff);
#endif
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int scn = srccn, bidx = blueIdx, i = 0;
        if (greenBits == 6)
        {
            if (scn == 3)
            {
#if CV_NEON
                // 8 pixels per iteration; same bit packing as the scalar tail
                for ( ; i <= n - 8; i += 8, src += 24 )
                {
                    uint8x8x3_t v_src = vld3_u8(src);
                    uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
                    v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
                    v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
                    vst1q_u16((ushort *)dst + i, v_dst);
                }
#endif
                for ( ; i < n; i++, src += 3 )
                    ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
            }
            else
            {
#if CV_NEON
                // 4-channel input: alpha is ignored in 565 mode
                for ( ; i <= n - 8; i += 8, src += 32 )
                {
                    uint8x8x4_t v_src = vld4_u8(src);
                    uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
                    v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
                    v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
                    vst1q_u16((ushort *)dst + i, v_dst);
                }
#endif
                for ( ; i < n; i++, src += 4 )
                    ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
            }
        }
        else if (scn == 3)
        {
#if CV_NEON
            for ( ; i <= n - 8; i += 8, src += 24 )
            {
                uint8x8x3_t v_src = vld3_u8(src);
                uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7));
                vst1q_u16((ushort *)dst + i, v_dst);
            }
#endif
            for ( ; i < n; i++, src += 3 )
                ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7));
        }
        else
        {
#if CV_NEON
            for ( ; i <= n - 8; i += 8, src += 32 )
            {
                uint8x8x4_t v_src = vld4_u8(src);
                uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
                // set the 0x8000 bit for pixels whose alpha is non-zero
                v_dst = vorrq_u16(v_dst, vorrq_u16(vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7),
                                                   vbslq_u16(veorq_u16(vceqq_u16(vmovl_u8(v_src.val[3]), v_0), v_full), v_mask, v_0)));
                vst1q_u16((ushort *)dst + i, v_dst);
            }
#endif
            for ( ; i < n; i++, src += 4 )
                ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|
                    ((src[bidx^2]&~7) << 7)|(src[3] ? 0x8000 : 0));
        }
    }

    int srccn, blueIdx, greenBits;
#if CV_NEON
    uint8x8_t v_n3, v_n7;
    uint16x8_t v_mask, v_0, v_full;
#endif
};
///////////////////////////////// Color to/from Grayscale ////////////////////////////////
// Replicates a single gray channel into a 3- or 4-channel image; the alpha of
// a 4-channel destination is filled with the channel type's maximum value.
template<typename _Tp>
struct Gray2RGB
{
    typedef _Tp channel_type;

    Gray2RGB(int _dstcn) : dstcn(_dstcn) {}

    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
        if (dstcn == 3)
        {
            for (int i = 0; i < n; i++, dst += 3)
            {
                const _Tp g = src[i];
                dst[0] = g;
                dst[1] = g;
                dst[2] = g;
            }
        }
        else
        {
            const _Tp alpha = ColorChannel<_Tp>::max();
            for (int i = 0; i < n; i++, dst += 4)
            {
                const _Tp g = src[i];
                dst[0] = g;
                dst[1] = g;
                dst[2] = g;
                dst[3] = alpha;
            }
        }
    }

    int dstcn;
};
// Converts a single-channel 8-bit gray image to packed 16-bit RGB565/RGB555.
// greenBits == 6 -> RGB565: g mapped to (g>>3) | ((g&~3)<<3) | ((g&~7)<<8);
// otherwise RGB555: t = g>>3 replicated as t | (t<<5) | (t<<10).
// Fix: the RGB555 SSE2 loop produced 16 outputs per iteration (two 8-lane
// halves) but advanced i by only 8, redundantly recomputing half of every
// batch; the step now matches the RGB565 loop. The low-half shift also used
// _mm_slli_epi32 where the high half used _mm_slli_epi16 - equivalent here
// (lane values <= 31, so shifted bits never cross a 16-bit lane), but it is
// normalized to epi16 for consistency.
struct Gray2RGB5x5
{
    typedef uchar channel_type;

    Gray2RGB5x5(int _greenBits) : greenBits(_greenBits)
    {
#if CV_NEON
        v_n7 = vdup_n_u8(~7);
        v_n3 = vdup_n_u8(~3);
#elif CV_SSE2
        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
        v_n7 = _mm_set1_epi16(~7);
        v_n3 = _mm_set1_epi16(~3);
        v_zero = _mm_setzero_si128();
#endif
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int i = 0;
        if( greenBits == 6 )
        {
#if CV_NEON
            for ( ; i <= n - 8; i += 8 )
            {
                uint8x8_t v_src = vld1_u8(src + i);
                uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src, 3));
                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n3)), 3));
                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8));
                vst1q_u16((ushort *)dst + i, v_dst);
            }
#elif CV_SSE2
            if (haveSIMD)
            {
                // 16 pixels per iteration: low and high byte halves are
                // widened to 16-bit lanes and packed independently
                for ( ; i <= n - 16; i += 16 )
                {
                    __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i));

                    __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
                    __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3),
                                                 _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3),
                                                              _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8)));
                    _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst);

                    v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
                    v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3),
                                         _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3),
                                                      _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8)));
                    _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst);
                }
            }
#endif
            for ( ; i < n; i++ )
            {
                int t = src[i];
                ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8));
            }
        }
        else
        {
#if CV_NEON
            for ( ; i <= n - 8; i += 8 )
            {
                uint16x8_t v_src = vmovl_u8(vshr_n_u8(vld1_u8(src + i), 3));
                uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10));
                vst1q_u16((ushort *)dst + i, v_dst);
            }
#elif CV_SSE2
            if (haveSIMD)
            {
                // 16 pixels per iteration (was i += 8, which reprocessed half
                // of every batch; output was identical, just twice the work)
                for ( ; i <= n - 16; i += 16 )
                {
                    __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i));

                    __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3);
                    __m128i v_dst = _mm_or_si128(v_src_p,
                                                 _mm_or_si128(_mm_slli_epi16(v_src_p, 5),
                                                              _mm_slli_epi16(v_src_p, 10)));
                    _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst);

                    v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3);
                    v_dst = _mm_or_si128(v_src_p,
                                         _mm_or_si128(_mm_slli_epi16(v_src_p, 5),
                                                      _mm_slli_epi16(v_src_p, 10)));
                    _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst);
                }
            }
#endif
            for( ; i < n; i++ )
            {
                int t = src[i] >> 3;
                ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10));
            }
        }
    }

    int greenBits;
#if CV_NEON
    uint8x8_t v_n7, v_n3;
#elif CV_SSE2
    __m128i v_n7, v_n3, v_zero;
    bool haveSIMD;
#endif
};
// Converts packed 16-bit RGB565/RGB555 to 8-bit gray using the fixed-point
// BT.601 weights B2Y/G2Y/R2Y with yuv_shift fractional bits and round-to-
// nearest (the CV_DESCALE delta). Channels are first expanded to 8 bits by
// shifting, matching RGB5x52RGB's expansion.
struct RGB5x52Gray
{
    typedef uchar channel_type;

    RGB5x52Gray(int _greenBits) : greenBits(_greenBits)
    {
#if CV_NEON
        v_b2y = vdup_n_u16(B2Y);
        v_g2y = vdup_n_u16(G2Y);
        v_r2y = vdup_n_u16(R2Y);
        v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); // rounding term
        v_f8 = vdupq_n_u16(0xf8);
        v_fc = vdupq_n_u16(0xfc);
#elif CV_SSE2
        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
        // Interleave weights for _mm_madd_epi16: (b,g) pairs and (r,1) pairs;
        // the trailing 1 multiplies the rounding delta into the same madd.
        const __m128i v_b2y = _mm_set1_epi16(B2Y);
        const __m128i v_g2y = _mm_set1_epi16(G2Y);
        v_bg2y = _mm_unpacklo_epi16(v_b2y, v_g2y);
        const __m128i v_r2y = _mm_set1_epi16(R2Y);
        const __m128i v_one = _mm_set1_epi16(1);
        v_rd2y = _mm_unpacklo_epi16(v_r2y, v_one);
        v_delta = _mm_slli_epi16(v_one, yuv_shift - 1);
#endif
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int i = 0;
        if( greenBits == 6 )
        {
#if CV_NEON
            // 8 pixels per iteration; widen to 32-bit accumulators for the dot
            // product, then round, shift and narrow back to 8 bits
            for ( ; i <= n - 8; i += 8)
            {
                uint16x8_t v_src = vld1q_u16((ushort *)src + i);
                uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
                           v_t1 = vandq_u16(vshrq_n_u16(v_src, 3), v_fc),
                           v_t2 = vandq_u16(vshrq_n_u16(v_src, 8), v_f8);

                uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
                                              vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
                uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
                                              vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
                v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
                v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);

                vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
            }
#elif CV_SSE2
            if (haveSIMD)
            {
                for ( ; i <= n - 8; i += 8)
                {
                    __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i));
                    // extract and expand b, g (6 bits), r fields to 8 bits
                    __m128i v_b = _mm_srli_epi16(_mm_slli_epi16(v_src, 11), 8),
                            v_g = _mm_srli_epi16(_mm_slli_epi16(_mm_srli_epi16(v_src, 5), 10),8),
                            v_r = _mm_slli_epi16(_mm_srli_epi16(v_src, 11), 3);

                    __m128i v_bg_lo = _mm_unpacklo_epi16(v_b, v_g);
                    __m128i v_rd_lo = _mm_unpacklo_epi16(v_r, v_delta);
                    __m128i v_bg_hi = _mm_unpackhi_epi16(v_b, v_g);
                    __m128i v_rd_hi = _mm_unpackhi_epi16(v_r, v_delta);
                    // madd computes b*B2Y+g*G2Y and r*R2Y+delta*1 per 32-bit lane
                    v_bg_lo = _mm_madd_epi16(v_bg_lo, v_bg2y);
                    v_rd_lo = _mm_madd_epi16(v_rd_lo, v_rd2y);
                    v_bg_hi = _mm_madd_epi16(v_bg_hi, v_bg2y);
                    v_rd_hi = _mm_madd_epi16(v_rd_hi, v_rd2y);
                    __m128i v_bgr_lo = _mm_add_epi32(v_bg_lo, v_rd_lo);
                    __m128i v_bgr_hi = _mm_add_epi32(v_bg_hi, v_rd_hi);
                    v_bgr_lo = _mm_srli_epi32(v_bgr_lo, yuv_shift);
                    v_bgr_hi = _mm_srli_epi32(v_bgr_hi, yuv_shift);

                    __m128i v_dst = _mm_packs_epi32(v_bgr_lo, v_bgr_hi);
                    v_dst = _mm_packus_epi16(v_dst, v_dst);
                    _mm_storel_epi64((__m128i *)(dst + i), v_dst);
                }
            }
#endif
            for ( ; i < n; i++)
            {
                int t = ((ushort*)src)[i];
                dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
                                           ((t >> 3) & 0xfc)*G2Y +
                                           ((t >> 8) & 0xf8)*R2Y, yuv_shift);
            }
        }
        else
        {
#if CV_NEON
            for ( ; i <= n - 8; i += 8)
            {
                uint16x8_t v_src = vld1q_u16((ushort *)src + i);
                uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
                           v_t1 = vandq_u16(vshrq_n_u16(v_src, 2), v_f8),
                           v_t2 = vandq_u16(vshrq_n_u16(v_src, 7), v_f8);

                uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
                                              vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
                uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
                                              vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
                v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
                v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);

                vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
            }
#elif CV_SSE2
            if (haveSIMD)
            {
                for ( ; i <= n - 8; i += 8)
                {
                    __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i));
                    // extract and expand the three 5-bit fields to 8 bits
                    __m128i v_b = _mm_srli_epi16(_mm_slli_epi16(v_src, 11), 8),
                            v_g = _mm_srli_epi16(_mm_slli_epi16(_mm_srli_epi16(v_src, 5), 11),8),
                            v_r = _mm_srli_epi16(_mm_slli_epi16(_mm_srli_epi16(v_src, 10), 11),8);

                    __m128i v_bg_lo = _mm_unpacklo_epi16(v_b, v_g);
                    __m128i v_rd_lo = _mm_unpacklo_epi16(v_r, v_delta);
                    __m128i v_bg_hi = _mm_unpackhi_epi16(v_b, v_g);
                    __m128i v_rd_hi = _mm_unpackhi_epi16(v_r, v_delta);
                    v_bg_lo = _mm_madd_epi16(v_bg_lo, v_bg2y);
                    v_rd_lo = _mm_madd_epi16(v_rd_lo, v_rd2y);
                    v_bg_hi = _mm_madd_epi16(v_bg_hi, v_bg2y);
                    v_rd_hi = _mm_madd_epi16(v_rd_hi, v_rd2y);
                    __m128i v_bgr_lo = _mm_add_epi32(v_bg_lo, v_rd_lo);
                    __m128i v_bgr_hi = _mm_add_epi32(v_bg_hi, v_rd_hi);
                    v_bgr_lo = _mm_srli_epi32(v_bgr_lo, yuv_shift);
                    v_bgr_hi = _mm_srli_epi32(v_bgr_hi, yuv_shift);

                    __m128i v_dst = _mm_packs_epi32(v_bgr_lo, v_bgr_hi);
                    v_dst = _mm_packus_epi16(v_dst, v_dst);
                    _mm_storel_epi64((__m128i *)(dst + i), v_dst);
                }
            }
#endif
            for ( ; i < n; i++)
            {
                int t = ((ushort*)src)[i];
                dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
                                           ((t >> 2) & 0xf8)*G2Y +
                                           ((t >> 7) & 0xf8)*R2Y, yuv_shift);
            }
        }
    }

    int greenBits;
#if CV_NEON
    uint16x4_t v_b2y, v_g2y, v_r2y;
    uint32x4_t v_delta;
    uint16x8_t v_f8, v_fc;
#elif CV_SSE2
    bool haveSIMD;
    __m128i v_bg2y, v_rd2y;
    __m128i v_delta;
#endif
};
template<typename _Tp> struct RGB2Gray
{
typedef _Tp channel_type;
RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
{
static const float coeffs0[] = { R2YF, G2YF, B2YF };
memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
if(blueIdx == 0)
std::swap(coeffs[0], coeffs[2]);
}
void operator()(const _Tp* src, _Tp* dst, int n) const
{
int scn = srccn;
float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
for(int i = 0; i < n; i++, src += scn)
dst[i] = saturate_cast<_Tp>(src[0]*cb + src[1]*cg + src[2]*cr);
}
int srccn;
float coeffs[3];
};
// 8-bit specialization of RGB2Gray: precomputes one 256-entry fixed-point
// product table per channel (tab[v], tab[256+v], tab[512+v]); the rounding
// term of CV_DESCALE is folded into the third table so the per-pixel work is
// three lookups, two adds and a shift.
template<> struct RGB2Gray<uchar>
{
    typedef uchar channel_type;

    RGB2Gray(int _srccn, int blueIdx, const int* coeffs) : srccn(_srccn)
    {
        const int coeffs0[] = { R2Y, G2Y, B2Y };
        if(!coeffs) coeffs = coeffs0;

        // blueIdx selects which coefficient multiplies channel 0 vs channel 2
        const int db = coeffs[blueIdx^2], dg = coeffs[1], dr = coeffs[blueIdx];
        int b = 0, g = 0, r = (1 << (yuv_shift-1)); // rounding bias in r table
        for(int i = 0; i < 256; i++)
        {
            tab[i]     = b;
            tab[i+256] = g;
            tab[i+512] = r;
            b += db;
            g += dg;
            r += dr;
        }
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        const int scn = srccn;
        const int* lut = tab;
        for(int i = 0; i < n; i++, src += scn)
        {
            int acc = lut[src[0]] + lut[src[1]+256] + lut[src[2]+512];
            dst[i] = (uchar)(acc >> yuv_shift);
        }
    }

    int srccn;
    int tab[256*3];
};
#if CV_NEON
// NEON specialization of RGB2Gray for 16-bit pixels: integer dot product with
// B2Y/G2Y/R2Y in 32-bit accumulators, rounded via the yuv_shift delta.
// Processes 8, then 4 pixels per iteration, then a scalar tail.
// Note the 4-pixel loop omits the rounding delta addition before the shift
// (it only adds it inside vshrq; see vaddq there) - behavior kept as-is.
template <>
struct RGB2Gray<ushort>
{
    typedef ushort channel_type;

    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
        srccn(_srccn)
    {
        static const int coeffs0[] = { R2Y, G2Y, B2Y };
        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
        if( blueIdx == 0 )
            std::swap(coeffs[0], coeffs[2]); // BGR order: swap B and R weights

        v_cb = vdup_n_u16(coeffs[0]);
        v_cg = vdup_n_u16(coeffs[1]);
        v_cr = vdup_n_u16(coeffs[2]);
        v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
    }

    void operator()(const ushort* src, ushort* dst, int n) const
    {
        int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;

        for ( ; i <= n - 8; i += 8, src += scn * 8)
        {
            uint16x8_t v_b, v_r, v_g;
            if (scn == 3)
            {
                uint16x8x3_t v_src = vld3q_u16(src);
                v_b = v_src.val[0];
                v_g = v_src.val[1];
                v_r = v_src.val[2];
            }
            else
            {
                uint16x8x4_t v_src = vld4q_u16(src);
                v_b = v_src.val[0];
                v_g = v_src.val[1];
                v_r = v_src.val[2];
            }

            uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16(
                                           vmull_u16(vget_low_u16(v_b), v_cb),
                                                     vget_low_u16(v_g), v_cg),
                                                     vget_low_u16(v_r), v_cr);
            uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16(
                                           vmull_u16(vget_high_u16(v_b), v_cb),
                                                     vget_high_u16(v_g), v_cg),
                                                     vget_high_u16(v_r), v_cr);

            uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift));
            uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift));

            vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1));
        }

        for ( ; i <= n - 4; i += 4, src += scn * 4)
        {
            uint16x4_t v_b, v_r, v_g;
            if (scn == 3)
            {
                uint16x4x3_t v_src = vld3_u16(src);
                v_b = v_src.val[0];
                v_g = v_src.val[1];
                v_r = v_src.val[2];
            }
            else
            {
                uint16x4x4_t v_src = vld4_u16(src);
                v_b = v_src.val[0];
                v_g = v_src.val[1];
                v_r = v_src.val[2];
            }

            uint32x4_t v_dst = vmlal_u16(vmlal_u16(
                                         vmull_u16(v_b, v_cb),
                                                   v_g, v_cg),
                                                   v_r, v_cr);

            vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift)));
        }

        for( ; i < n; i++, src += scn)
            dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
    }

    int srccn, coeffs[3];
    uint16x4_t v_cb, v_cg, v_cr;
    uint32x4_t v_delta;
};
// NEON specialization of RGB2Gray for float pixels: fused multiply-add dot
// product with the float coefficients; 8 then 4 pixels per iteration, scalar
// tail for the remainder.
template <>
struct RGB2Gray<float>
{
    typedef float channel_type;

    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
    {
        static const float coeffs0[] = { R2YF, G2YF, B2YF };
        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
        if(blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]); // BGR order: swap B and R weights

        v_cb = vdupq_n_f32(coeffs[0]);
        v_cg = vdupq_n_f32(coeffs[1]);
        v_cr = vdupq_n_f32(coeffs[2]);
    }

    void operator()(const float * src, float * dst, int n) const
    {
        int scn = srccn, i = 0;
        float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];

        if (scn == 3)
        {
            for ( ; i <= n - 8; i += 8, src += scn * 8)
            {
                float32x4x3_t v_src = vld3q_f32(src);
                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));

                v_src = vld3q_f32(src + scn * 4);
                vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
            }

            for ( ; i <= n - 4; i += 4, src += scn * 4)
            {
                float32x4x3_t v_src = vld3q_f32(src);
                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
            }
        }
        else
        {
            // 4-channel input: alpha (val[3]) is simply ignored
            for ( ; i <= n - 8; i += 8, src += scn * 8)
            {
                float32x4x4_t v_src = vld4q_f32(src);
                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));

                v_src = vld4q_f32(src + scn * 4);
                vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
            }

            for ( ; i <= n - 4; i += 4, src += scn * 4)
            {
                float32x4x4_t v_src = vld4q_f32(src);
                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
            }
        }

        for ( ; i < n; i++, src += scn)
            dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
    }

    int srccn;
    float coeffs[3];
    float32x4_t v_cb, v_cg, v_cr;
};
#elif CV_SSE2
#if CV_SSE4_1
// RGB2Gray<ushort>: fixed-point 16-bit gray conversion, vectorized with SSE4.1.
template <>
struct RGB2Gray<ushort>
{
    typedef ushort channel_type;

    // gray = (src[0]*cb + src[1]*cg + src[2]*cr + round) >> yuv_shift.
    // Default weights are the fixed-point R2Y/G2Y/B2Y constants; when
    // blueIdx == 0 (BGR-ordered input) the blue and red weights are swapped
    // so coeffs[] is always in source-channel order.
    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
        srccn(_srccn)
    {
        static const int coeffs0[] = { R2Y, G2Y, B2Y };
        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
        if( blueIdx == 0 )
            std::swap(coeffs[0], coeffs[2]);

        v_delta = _mm_set1_epi32(1 << (yuv_shift - 1));  // rounding bias
        v_zero = _mm_setzero_si128();

        haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
    }

    // 16s x 8
    // Produces 8 gray values from 4 registers holding channel data for 2
    // pixels each.  _mm_madd_epi16 treats lanes as *signed* 16-bit, so values
    // >= 0x8000 come out 2^16*coeff too small; the v_rgb_hi path masks those
    // lanes and adds back the compensation term prepared by the caller in
    // v_coeffs[1] (coeff << 2 -- equal to coeff*2^16 >> yuv_shift assuming
    // yuv_shift == 14; verify against the yuv_shift definition).
    void process(__m128i* v_rgb, __m128i* v_coeffs,
                 __m128i & v_gray) const
    {
        __m128i v_rgb_hi[4];
        // mask of lanes negative when viewed as signed 16-bit
        v_rgb_hi[0] = _mm_cmplt_epi16(v_rgb[0], v_zero);
        v_rgb_hi[1] = _mm_cmplt_epi16(v_rgb[1], v_zero);
        v_rgb_hi[2] = _mm_cmplt_epi16(v_rgb[2], v_zero);
        v_rgb_hi[3] = _mm_cmplt_epi16(v_rgb[3], v_zero);

        // keep correction coefficients only for the wrapped lanes
        v_rgb_hi[0] = _mm_and_si128(v_rgb_hi[0], v_coeffs[1]);
        v_rgb_hi[1] = _mm_and_si128(v_rgb_hi[1], v_coeffs[1]);
        v_rgb_hi[2] = _mm_and_si128(v_rgb_hi[2], v_coeffs[1]);
        v_rgb_hi[3] = _mm_and_si128(v_rgb_hi[3], v_coeffs[1]);

        // horizontally reduce the corrections into one 8 x 16-bit register
        v_rgb_hi[0] = _mm_hadd_epi16(v_rgb_hi[0], v_rgb_hi[1]);
        v_rgb_hi[2] = _mm_hadd_epi16(v_rgb_hi[2], v_rgb_hi[3]);
        v_rgb_hi[0] = _mm_hadd_epi16(v_rgb_hi[0], v_rgb_hi[2]);

        // per-pixel weighted sums ...
        v_rgb[0] = _mm_madd_epi16(v_rgb[0], v_coeffs[0]);
        v_rgb[1] = _mm_madd_epi16(v_rgb[1], v_coeffs[0]);
        v_rgb[2] = _mm_madd_epi16(v_rgb[2], v_coeffs[0]);
        v_rgb[3] = _mm_madd_epi16(v_rgb[3], v_coeffs[0]);

        v_rgb[0] = _mm_hadd_epi32(v_rgb[0], v_rgb[1]);
        v_rgb[2] = _mm_hadd_epi32(v_rgb[2], v_rgb[3]);

        // ... rounded descale and repack to 16-bit, then apply correction
        v_rgb[0] = _mm_add_epi32(v_rgb[0], v_delta);
        v_rgb[2] = _mm_add_epi32(v_rgb[2], v_delta);
        v_rgb[0] = _mm_srai_epi32(v_rgb[0], yuv_shift);
        v_rgb[2] = _mm_srai_epi32(v_rgb[2], yuv_shift);

        v_gray = _mm_packs_epi32(v_rgb[0], v_rgb[2]);
        v_gray = _mm_add_epi16(v_gray, v_rgb_hi[0]);
    }

    void operator()(const ushort* src, ushort* dst, int n) const
    {
        int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;

        if (scn == 3 && haveSIMD)
        {
            // Weight layout matches the padded-pixel layout produced by the
            // realignment below (one zero lane per pixel).
            __m128i v_coeffs[2];
            v_coeffs[0] = _mm_set_epi16(0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], 0);
            v_coeffs[1] = _mm_slli_epi16(v_coeffs[0], 2);  // correction term, see process()

            for ( ; i <= n - 8; i += 8, src += scn * 8)
            {
                __m128i v_src[3];
                v_src[0] = _mm_loadu_si128((__m128i const *)(src));
                v_src[1] = _mm_loadu_si128((__m128i const *)(src + 8));
                v_src[2] = _mm_loadu_si128((__m128i const *)(src + 16));

                // repack 8 channel triples into 4 registers of 2 pixels each
                __m128i v_rgb[4];
                v_rgb[0] = _mm_slli_si128(v_src[0], 2);
                v_rgb[1] = _mm_alignr_epi8(v_src[1], v_src[0], 10);
                v_rgb[2] = _mm_alignr_epi8(v_src[2], v_src[1], 6);
                v_rgb[3] = _mm_srli_si128(v_src[2], 2);

                __m128i v_gray;
                process(v_rgb, v_coeffs,
                        v_gray);

                _mm_storeu_si128((__m128i *)(dst + i), v_gray);
            }
        }
        else if (scn == 4 && haveSIMD)
        {
            __m128i v_coeffs[2];
            v_coeffs[0] = _mm_set_epi16(0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0], 0, (short)coeffs[2], (short)coeffs[1], (short)coeffs[0]);
            v_coeffs[1] = _mm_slli_epi16(v_coeffs[0], 2);

            for ( ; i <= n - 8; i += 8, src += scn * 8)
            {
                // 4-channel pixels already occupy 64-bit halves; load directly
                __m128i v_rgb[4];
                v_rgb[0] = _mm_loadu_si128((__m128i const *)(src));
                v_rgb[1] = _mm_loadu_si128((__m128i const *)(src + 8));
                v_rgb[2] = _mm_loadu_si128((__m128i const *)(src + 16));
                v_rgb[3] = _mm_loadu_si128((__m128i const *)(src + 24));

                __m128i v_gray;
                process(v_rgb, v_coeffs,
                        v_gray);

                _mm_storeu_si128((__m128i *)(dst + i), v_gray);
            }
        }

        // scalar tail, and full fallback when SSE4.1 is unavailable
        for( ; i < n; i++, src += scn)
            dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
    }

    int srccn, coeffs[3];
    __m128i v_delta;
    __m128i v_zero;
    bool haveSIMD;
};
#endif // CV_SSE4_1
// RGB2Gray<float>: floating-point gray conversion, vectorized with SSE2.
template <>
struct RGB2Gray<float>
{
    typedef float channel_type;

    // gray = src[0]*coeffs[0] + src[1]*coeffs[1] + src[2]*coeffs[2].
    // Default weights are R2YF/G2YF/B2YF; blueIdx == 0 (BGR input) swaps the
    // blue/red weights so coeffs[] stays in source-channel order.
    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
    {
        static const float coeffs0[] = { R2YF, G2YF, B2YF };
        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
        if(blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]);

        v_cb = _mm_set1_ps(coeffs[0]);
        v_cg = _mm_set1_ps(coeffs[1]);
        v_cr = _mm_set1_ps(coeffs[2]);

        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
    }

    // Weighted sum of 4 pixels' channels. Parameters are positional channel
    // registers (channel 0, 1, 2) -- the names only match B/G/R nominally,
    // since the constructor already reordered the coefficients.
    void process(__m128 v_b, __m128 v_g, __m128 v_r,
                 __m128 & v_gray) const
    {
        v_gray = _mm_mul_ps(v_r, v_cr);
        v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_g, v_cg));
        v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_b, v_cb));
    }

    void operator()(const float * src, float * dst, int n) const
    {
        int scn = srccn, i = 0;
        float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];

        if (scn == 3 && haveSIMD)
        {
            for ( ; i <= n - 8; i += 8, src += scn * 8)
            {
                // load 8 interleaved 3-channel pixels (24 floats) ...
                __m128 v_r0 = _mm_loadu_ps(src);
                __m128 v_r1 = _mm_loadu_ps(src + 4);
                __m128 v_g0 = _mm_loadu_ps(src + 8);
                __m128 v_g1 = _mm_loadu_ps(src + 12);
                __m128 v_b0 = _mm_loadu_ps(src + 16);
                __m128 v_b1 = _mm_loadu_ps(src + 20);

                // ... and split into per-channel registers
                // (after this call v_r* holds channel 0, v_g* channel 1, v_b* channel 2)
                _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

                __m128 v_gray0;
                process(v_r0, v_g0, v_b0,
                        v_gray0);

                __m128 v_gray1;
                process(v_r1, v_g1, v_b1,
                        v_gray1);

                _mm_storeu_ps(dst + i, v_gray0);
                _mm_storeu_ps(dst + i + 4, v_gray1);
            }
        }
        else if (scn == 4 && haveSIMD)
        {
            for ( ; i <= n - 8; i += 8, src += scn * 8)
            {
                // 8 interleaved 4-channel pixels (32 floats); alpha is ignored
                __m128 v_r0 = _mm_loadu_ps(src);
                __m128 v_r1 = _mm_loadu_ps(src + 4);
                __m128 v_g0 = _mm_loadu_ps(src + 8);
                __m128 v_g1 = _mm_loadu_ps(src + 12);
                __m128 v_b0 = _mm_loadu_ps(src + 16);
                __m128 v_b1 = _mm_loadu_ps(src + 20);
                __m128 v_a0 = _mm_loadu_ps(src + 24);
                __m128 v_a1 = _mm_loadu_ps(src + 28);

                _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);

                __m128 v_gray0;
                process(v_r0, v_g0, v_b0,
                        v_gray0);

                __m128 v_gray1;
                process(v_r1, v_g1, v_b1,
                        v_gray1);

                _mm_storeu_ps(dst + i, v_gray0);
                _mm_storeu_ps(dst + i + 4, v_gray1);
            }
        }

        // scalar tail, and full fallback when SSE2 is unavailable
        for ( ; i < n; i++, src += scn)
            dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
    }

    int srccn;
    float coeffs[3];
    __m128 v_cb, v_cg, v_cr;
    bool haveSIMD;
};
#endif // CV_SSE2
#if !CV_NEON && !CV_SSE4_1
// Scalar fallback RGB2Gray<ushort> (used when neither NEON nor SSE4.1 builds
// the vectorized specialization).
template<> struct RGB2Gray<ushort>
{
    typedef ushort channel_type;

    // Fixed-point gray: dst = (src[0]*cb + src[1]*cg + src[2]*cr) >> yuv_shift,
    // rounded.  blueIdx == 0 means BGR ordering, so blue/red weights swap.
    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
    {
        static const int coeffs0[] = { R2Y, G2Y, B2Y };
        const int* c = _coeffs ? _coeffs : coeffs0;
        coeffs[0] = c[0];
        coeffs[1] = c[1];
        coeffs[2] = c[2];
        if( blueIdx == 0 )
            std::swap(coeffs[0], coeffs[2]);
    }

    void operator()(const ushort* src, ushort* dst, int n) const
    {
        int scn = srccn;
        int c0 = coeffs[0], c1 = coeffs[1], c2 = coeffs[2];
        for( int i = 0; i < n; i++, src += scn )
        {
            unsigned acc = (unsigned)(src[0]*c0 + src[1]*c1 + src[2]*c2);
            dst[i] = (ushort)CV_DESCALE(acc, yuv_shift);
        }
    }

    int srccn;
    int coeffs[3];
};
#endif // !CV_NEON && !CV_SSE4_1
/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
// RGBA -> premultiplied RGBA: out = (v * alpha + half) / max for the three
// color channels; alpha is copied through unchanged.
template<typename _Tp>
struct RGBA2mRGBA
{
    typedef _Tp channel_type;

    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
        const _Tp max_val  = ColorChannel<_Tp>::max();
        const _Tp half_val = ColorChannel<_Tp>::half();

        for( int i = 0; i < n; i++, src += 4, dst += 4 )
        {
            // read the whole pixel before writing: safe for in-place operation
            _Tp v0 = src[0], v1 = src[1], v2 = src[2], a = src[3];
            dst[0] = (v0 * a + half_val) / max_val;
            dst[1] = (v1 * a + half_val) / max_val;
            dst[2] = (v2 * a + half_val) / max_val;
            dst[3] = a;
        }
    }
};
// Premultiplied RGBA -> RGBA: out = (v * max + alpha/2) / alpha, rounded;
// fully transparent pixels (alpha == 0) map to zero color channels.
template<typename _Tp>
struct mRGBA2RGBA
{
    typedef _Tp channel_type;

    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
        const _Tp max_val = ColorChannel<_Tp>::max();

        for( int i = 0; i < n; i++, src += 4, dst += 4 )
        {
            // read the whole pixel before writing: safe for in-place operation
            _Tp v0 = src[0], v1 = src[1], v2 = src[2], a = src[3];
            if( a == 0 )
            {
                dst[0] = 0;
                dst[1] = 0;
                dst[2] = 0;
            }
            else
            {
                _Tp a_half = a / 2;   // rounding term
                dst[0] = (v0 * max_val + a_half) / a;
                dst[1] = (v1 * max_val + a_half) / a;
                dst[2] = (v2 * max_val + a_half) / a;
            }
            dst[3] = a;
        }
    }
};
//
// IPP functions
//
#if NEED_IPP
// Per-depth IPP dispatch tables, indexed by CV_MAT_DEPTH:
// slots are { 8u, 8s, 16u, 16s, 32s, 32f, 64f, - }; 0 = no IPP routine.
// "Color2Gray" variants take explicit channel weights; "RGB2Gray" variants
// use IPP's built-in RGB weights.
static ippiColor2GrayFunc ippiColor2GrayC3Tab[] =
{
    (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0,
    0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0
};

static ippiColor2GrayFunc ippiColor2GrayC4Tab[] =
{
    (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0,
    0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0
};

static ippiGeneralFunc ippiRGB2GrayC3Tab[] =
{
    (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0,
    0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0
};

static ippiGeneralFunc ippiRGB2GrayC4Tab[] =
{
    (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0,
    0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0
};
// C++ overloads that expose the per-type ippiGrayToRGB_* entry points under a
// single name, so the templated IPPGray2BGR(A)Functor below can call them
// uniformly.  The C1C4R variants additionally fill the alpha channel with
// 'aval'.
static IppStatus ippiGrayToRGB_C1C3R(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, IppiSize roiSize)
{
    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_8u_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize);
}
static IppStatus ippiGrayToRGB_C1C3R(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, IppiSize roiSize)
{
    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_16u_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize);
}
static IppStatus ippiGrayToRGB_C1C3R(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, IppiSize roiSize)
{
    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_32f_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize);
}

static IppStatus ippiGrayToRGB_C1C4R(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, IppiSize roiSize, Ipp8u aval)
{
    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_8u_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval);
}
static IppStatus ippiGrayToRGB_C1C4R(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, IppiSize roiSize, Ipp16u aval)
{
    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_16u_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval);
}
static IppStatus ippiGrayToRGB_C1C4R(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, IppiSize roiSize, Ipp32f aval)
{
    return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_32f_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval);
}
// Adapter that calls an ippiColorToGray_* function with explicit weights.
// Note the weight order {B2YF, G2YF, R2YF}: these functors are used on the
// non-swapped (BGR-ordered) paths of cvtBGRtoGray, so the weights must be in
// source-channel order.
struct IPPColor2GrayFunctor
{
    IPPColor2GrayFunctor(ippiColor2GrayFunc _func) :
        ippiColorToGray(_func)
    {
        coeffs[0] = B2YF;
        coeffs[1] = G2YF;
        coeffs[2] = R2YF;
    }
    // Returns false if no function was supplied or the IPP call failed,
    // allowing the caller to fall back to the generic implementation.
    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    {
        return ippiColorToGray ? CV_INSTRUMENT_FUN_IPP(ippiColorToGray, src, srcStep, dst, dstStep, ippiSize(cols, rows), coeffs) >= 0 : false;
    }
private:
    ippiColor2GrayFunc ippiColorToGray;
    Ipp32f coeffs[3];
};
// Adapter over the ippiGrayToRGB_C1C3R overloads: replicates the single gray
// channel into three channels.
template <typename T>
struct IPPGray2BGRFunctor
{
    IPPGray2BGRFunctor(){}

    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    {
        return ippiGrayToRGB_C1C3R((T*)src, srcStep, (T*)dst, dstStep, ippiSize(cols, rows)) >= 0;
    }
};

// Same as above for four-channel output; the alpha channel is filled with the
// type's maximum value (fully opaque).
template <typename T>
struct IPPGray2BGRAFunctor
{
    IPPGray2BGRAFunctor()
    {
        alpha = ColorChannel<T>::max();
    }

    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    {
        return ippiGrayToRGB_C1C4R((T*)src, srcStep, (T*)dst, dstStep, ippiSize(cols, rows), alpha) >= 0;
    }

    T alpha;
};
// The C3C4R swap-channel wrappers fix the "value for the added channel"
// argument to the type's maximum, so they match the common ippiReorderFunc
// signature used by the dispatch tables below.
static IppStatus CV_STDCALL ippiSwapChannels_8u_C3C4Rf(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep,
         IppiSize roiSize, const int *dstOrder)
{
    return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_8u_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP8u);
}

static IppStatus CV_STDCALL ippiSwapChannels_16u_C3C4Rf(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep,
         IppiSize roiSize, const int *dstOrder)
{
    return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_16u_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP16u);
}

static IppStatus CV_STDCALL ippiSwapChannels_32f_C3C4Rf(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep,
         IppiSize roiSize, const int *dstOrder)
{
    return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_32f_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP32f);
}

// Per-depth dispatch tables (same slot layout as the gray tables above).
// The ones marked "shared" are deliberately non-static: other translation
// units of the color-conversion split use them too.
// shared
ippiReorderFunc ippiSwapChannelsC3C4RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C3C4Rf, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3C4Rf, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C3C4Rf, 0, 0
};

static ippiGeneralFunc ippiCopyAC4C3RTab[] =
{
    (ippiGeneralFunc)ippiCopy_8u_AC4C3R, 0, (ippiGeneralFunc)ippiCopy_16u_AC4C3R, 0,
    0, (ippiGeneralFunc)ippiCopy_32f_AC4C3R, 0, 0
};

// shared
ippiReorderFunc ippiSwapChannelsC4C3RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C4C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4C3R, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C4C3R, 0, 0
};

// shared
ippiReorderFunc ippiSwapChannelsC3RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3R, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C3R, 0, 0
};

#if IPP_VERSION_X100 >= 810
// in-place C4 channel swap: only available starting with IPP 8.1
static ippiReorderFunc ippiSwapChannelsC4RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C4R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4R, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C4R, 0, 0
};
#endif
#endif
//
// HAL functions
//
namespace hal
{
// 8u, 16u, 32f
// Generic BGR<->RGB(A) conversion: channel reorder and/or alpha add/drop.
// Dispatch order: custom HAL hook, IPP specializations, generic row loop.
void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
                 uchar * dst_data, size_t dst_step,
                 int width, int height,
                 int depth, int scn, int dcn, bool swapBlue)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtBGRtoBGR, cv_hal_cvtBGRtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue);

#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
    CV_IPP_CHECK()
    {
        if(scn == 3 && dcn == 4 && !swapBlue)
        {
            if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                 IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) )
                return;
        }
        else if(scn == 4 && dcn == 3 && !swapBlue)
        {
            if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                 IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) )
                return;
        }
        else if(scn == 3 && dcn == 4 && swapBlue)
        {
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) )
                return;
        }
        else if(scn == 4 && dcn == 3 && swapBlue)
        {
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) )
                return;
        }
        else if(scn == 3 && dcn == 3 && swapBlue)
        {
            if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height,
                                    IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) )
                return;
        }
#if IPP_VERSION_X100 >= 810
        else if(scn == 4 && dcn == 4 && swapBlue)
        {
            if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height,
                                    IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) )
                return;
        }
#endif
        // FIX: the '#endif' above must come before this closing brace.
        // Previously the brace closing the CV_IPP_CHECK() block sat inside the
        // IPP_VERSION_X100 >= 810 guard, leaving the braces unbalanced when
        // building against IPP versions older than 8.1.
    }
#endif

    // Generic fallback: per-element reorder over each row.
    int blueIdx = swapBlue ? 2 : 0;
    if( depth == CV_8U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<uchar>(scn, dcn, blueIdx));
    else if( depth == CV_16U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<ushort>(scn, dcn, blueIdx));
    else
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB<float>(scn, dcn, blueIdx));
}
// only 8u
// 8u only: pack 3/4-channel BGR(A)/RGB(A) into 16-bit 5-5-5 or 5-6-5.
void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step,
                    uchar * dst_data, size_t dst_step,
                    int width, int height,
                    int scn, bool swapBlue, int greenBits)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtBGRtoBGR5x5, cv_hal_cvtBGRtoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits);

    // Position of the blue channel in the source layout: 0 = BGR, 2 = RGB.
    int blueIdx = swapBlue ? 2 : 0;
    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB5x5(scn, blueIdx, greenBits));
}
// only 8u
// 8u only: unpack 16-bit 5-5-5 / 5-6-5 pixels into 3/4-channel BGR(A)/RGB(A).
void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step,
                    uchar * dst_data, size_t dst_step,
                    int width, int height,
                    int dcn, bool swapBlue, int greenBits)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtBGR5x5toBGR, cv_hal_cvtBGR5x5toBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, greenBits);

    // Position of the blue channel in the destination layout: 0 = BGR, 2 = RGB.
    int blueIdx = swapBlue ? 2 : 0;
    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52RGB(dcn, blueIdx, greenBits));
}
// 8u, 16u, 32f
// BGR/RGB(A) -> single-channel gray.
// Dispatch order: custom HAL hook, IPP (32f only), generic row loop.
void cvtBGRtoGray(const uchar * src_data, size_t src_step,
                  uchar * dst_data, size_t dst_step,
                  int width, int height,
                  int depth, int scn, bool swapBlue)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtBGRtoGray, cv_hal_cvtBGRtoGray, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue);

#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
    CV_IPP_CHECK()
    {
        // Only the 32f cases go to IPP here: BGR input uses ippiColorToGray
        // with explicit (B,G,R) weights, RGB input uses ippiRGBToGray with
        // IPP's built-in weights.
        if(depth == CV_32F && scn == 3 && !swapBlue)
        {
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPColor2GrayFunctor(ippiColor2GrayC3Tab[depth])) )
                return;
        }
        else if(depth == CV_32F && scn == 3 && swapBlue)
        {
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPGeneralFunctor(ippiRGB2GrayC3Tab[depth])) )
                return;
        }
        else if(depth == CV_32F && scn == 4 && !swapBlue)
        {
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPColor2GrayFunctor(ippiColor2GrayC4Tab[depth])) )
                return;
        }
        else if(depth == CV_32F && scn == 4 && swapBlue)
        {
            if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                                IPPGeneralFunctor(ippiRGB2GrayC4Tab[depth])) )
                return;
        }
    }
#endif

    // Generic fallback; a null coefficient pointer selects the defaults.
    int blueIdx = swapBlue ? 2 : 0;
    if( depth == CV_8U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<uchar>(scn, blueIdx, 0));
    else if( depth == CV_16U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<ushort>(scn, blueIdx, 0));
    else
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray<float>(scn, blueIdx, 0));
}
// 8u, 16u, 32f
// Gray -> 3/4-channel: replicates the gray value; alpha (if any) is set to
// the type's maximum.  Dispatch: custom HAL hook, IPP, generic row loop.
void cvtGraytoBGR(const uchar * src_data, size_t src_step,
                  uchar * dst_data, size_t dst_step,
                  int width, int height,
                  int depth, int dcn)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtGraytoBGR, cv_hal_cvtGraytoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn);

#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700
    CV_IPP_CHECK()
    {
        bool ippres = false;
        if(dcn == 3)
        {
            if( depth == CV_8U )
                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor<Ipp8u>());
            else if( depth == CV_16U )
                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor<Ipp16u>());
            else
                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor<Ipp32f>());
        }
        else if(dcn == 4)
        {
            if( depth == CV_8U )
                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor<Ipp8u>());
            else if( depth == CV_16U )
                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor<Ipp16u>());
            else
                ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor<Ipp32f>());
        }
        if(ippres)
            return;
    }
#endif

    // Generic fallback.
    if( depth == CV_8U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB<uchar>(dcn));
    else if( depth == CV_16U )
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB<ushort>(dcn));
    else
        CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB<float>(dcn));
}
// only 8u
// 16-bit 5-5-5 / 5-6-5 packed pixels -> gray.
void cvtBGR5x5toGray(const uchar * src_data, size_t src_step,
                     uchar * dst_data, size_t dst_step,
                     int width, int height,
                     int greenBits)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtBGR5x5toGray, cv_hal_cvtBGR5x5toGray, src_data, src_step, dst_data, dst_step, width, height, greenBits);
    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52Gray(greenBits));
}

// only 8u
// Gray -> 16-bit 5-5-5 / 5-6-5 packed pixels.
void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step,
                     uchar * dst_data, size_t dst_step,
                     int width, int height,
                     int greenBits)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtGraytoBGR5x5, cv_hal_cvtGraytoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, greenBits);
    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB5x5(greenBits));
}

// 8u only: RGBA -> alpha-premultiplied RGBA; IPP path first, then generic loop.
void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step,
                             uchar * dst_data, size_t dst_step,
                             int width, int height)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtRGBAtoMultipliedRGBA, cv_hal_cvtRGBAtoMultipliedRGBA, src_data, src_step, dst_data, dst_step, width, height);

#ifdef HAVE_IPP
    CV_IPP_CHECK()
    {
        if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height,
                            IPPGeneralFunctor((ippiGeneralFunc)ippiAlphaPremul_8u_AC4R)))
            return;
    }
#endif

    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGBA2mRGBA<uchar>());
}

// 8u only: alpha-premultiplied RGBA -> RGBA (no IPP equivalent used here).
void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step,
                             uchar * dst_data, size_t dst_step,
                             int width, int height)
{
    CV_INSTRUMENT_REGION()

    CALL_HAL(cvtMultipliedRGBAtoRGBA, cv_hal_cvtMultipliedRGBAtoRGBA, src_data, src_step, dst_data, dst_step, width, height);
    CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, mRGBA2RGBA<uchar>());
}
} // namespace hal
//
// OCL calls
//
#ifdef HAVE_OPENCL
// OpenCL BGR<->RGB(A) reorder; 'reverse' selects the REVERSE kernel mode.
bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse )
{
    OclHelper< Set<3, 4>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);

    bool built = h.createKernel("RGB", ocl::imgproc::color_rgb_oclsrc,
                                format("-D dcn=%d -D bidx=0 -D %s", dcn, reverse ? "REVERSE" : "ORDER"));
    return built && h.run();
}
// OpenCL BGR/RGB(A) -> packed 5x5; gbits selects 5-5-5 vs 5-6-5.
bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits )
{
    OclHelper< Set<3, 4>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);

    bool built = h.createKernel("RGB2RGB5x5", ocl::imgproc::color_rgb_oclsrc,
                                format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, gbits));
    return built && h.run();
}
// OpenCL packed 5x5 -> BGR/RGB(A).
bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits)
{
    OclHelper< Set<2>, Set<3, 4>, Set<CV_8U> > h(_src, _dst, dcn);

    bool built = h.createKernel("RGB5x52RGB", ocl::imgproc::color_rgb_oclsrc,
                                format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, gbits));
    return built && h.run();
}
// OpenCL packed 5x5 -> gray.
bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits)
{
    OclHelper< Set<2>, Set<1>, Set<CV_8U> > h(_src, _dst, 1);

    bool built = h.createKernel("BGR5x52Gray", ocl::imgproc::color_rgb_oclsrc,
                                format("-D dcn=1 -D bidx=0 -D greenbits=%d", gbits));
    return built && h.run();
}
// OpenCL gray -> packed 5x5.
bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits)
{
    OclHelper< Set<1>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);

    bool built = h.createKernel("Gray2BGR5x5", ocl::imgproc::color_rgb_oclsrc,
                                format("-D dcn=2 -D bidx=0 -D greenbits=%d", gbits));
    return built && h.run();
}
// OpenCL BGR/RGB(A) -> gray; each work item covers a horizontal stripe of
// stripeSize pixels, so the global size along x is scaled accordingly.
bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx)
{
    OclHelper< Set<3, 4>, Set<1>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 1);

    const int stripeSize = 1;
    bool built = h.createKernel("RGB2Gray", ocl::imgproc::color_rgb_oclsrc,
                                format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d", bidx, stripeSize));
    if (!built)
        return false;

    h.globalSize[0] = (h.src.cols + stripeSize - 1)/stripeSize;
    return h.run();
}
// OpenCL gray -> BGR/RGB(A).
bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn)
{
    OclHelper< Set<1>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);

    bool built = h.createKernel("Gray2RGB", ocl::imgproc::color_rgb_oclsrc,
                                format("-D bidx=0 -D dcn=%d", dcn));
    return built && h.run();
}
// OpenCL RGBA -> alpha-premultiplied RGBA.
bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst)
{
    OclHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);

    bool built = h.createKernel("RGBA2mRGBA", ocl::imgproc::color_rgb_oclsrc,
                                "-D dcn=4 -D bidx=3");
    return built && h.run();
}
// OpenCL alpha-premultiplied RGBA -> RGBA.
bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst)
{
    OclHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);

    bool built = h.createKernel("mRGBA2RGBA", ocl::imgproc::color_rgb_oclsrc,
                                "-D dcn=4 -D bidx=3");
    return built && h.run();
}
#endif
//
// HAL calls
//
// The cvtColor* wrappers below validate src/dst through CvtHelper (allowed
// source/destination channel counts and depths are encoded in the Set<>
// template arguments, which also create the destination) and then forward to
// the corresponding hal:: implementation.
void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb)
{
    CvtHelper< Set<3, 4>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);

    hal::cvtBGRtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
                     h.depth, h.scn, dcn, swapb);
}

void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits)
{
    CvtHelper< Set<3, 4>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);

    hal::cvtBGRtoBGR5x5(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
                        h.scn, swapb, gbits);
}

void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits)
{
    // dcn <= 0 means "default": 3-channel output
    if(dcn <= 0) dcn = 3;
    CvtHelper< Set<2>, Set<3, 4>, Set<CV_8U> > h(_src, _dst, dcn);

    hal::cvtBGR5x5toBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
                        dcn, swapb, gbits);
}

void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb)
{
    CvtHelper< Set<3, 4>, Set<1>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 1);

    hal::cvtBGRtoGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
                      h.depth, h.scn, swapb);
}

void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn)
{
    // dcn <= 0 means "default": 3-channel output
    if(dcn <= 0) dcn = 3;
    CvtHelper< Set<1>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);

    hal::cvtGraytoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, h.depth, dcn);
}

void cvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits)
{
    CvtHelper< Set<2>, Set<1>, Set<CV_8U> > h(_src, _dst, 1);

    hal::cvtBGR5x5toGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, gbits);
}

void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits)
{
    CvtHelper< Set<1>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);

    hal::cvtGraytoBGR5x5(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, gbits);
}

void cvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst)
{
    CvtHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);

    hal::cvtRGBAtoMultipliedRGBA(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows);
}

void cvtColormRGBA2RGBA( InputArray _src, OutputArray _dst)
{
    CvtHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);

    hal::cvtMultipliedRGBAtoRGBA(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows);
}
} // namespace cv
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -41,6 +41,50 @@
//
//M*/
/********************************* COPYRIGHT NOTICE *******************************\
Original code for Bayer->BGR/RGB conversion is provided by Dirk Schaefer
from MD-Mathematische Dienste GmbH. Below is the copyright notice:
IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
By downloading, copying, installing or using the software you agree
to this license. If you do not agree to this license, do not download,
install, copy or use the software.
Contributors License Agreement:
Copyright (c) 2002,
MD-Mathematische Dienste GmbH
Im Defdahl 5-10
44141 Dortmund
Germany
www.md-it.de
Redistribution and use in source and binary forms,
with or without modification, are permitted provided
that the following conditions are met:
Redistributions of source code must retain
the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
The name of Contributor may not be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.
\**********************************************************************************/
#include "precomp.hpp"
#include <limits>
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
#if depth == 0
#define DATA_TYPE uchar
#define MAX_NUM 255
#define HALF_MAX_NUM 128
#define COEFF_TYPE int
#define SAT_CAST(num) convert_uchar_sat(num)
#define DEPTH_0
#elif depth == 2
#define DATA_TYPE ushort
#define MAX_NUM 65535
#define HALF_MAX_NUM 32768
#define COEFF_TYPE int
#define SAT_CAST(num) convert_ushort_sat(num)
#define DEPTH_2
#elif depth == 5
#define DATA_TYPE float
#define MAX_NUM 1.0f
#define HALF_MAX_NUM 0.5f
#define COEFF_TYPE float
#define SAT_CAST(num) (num)
#define DEPTH_5
#else
#error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
#endif
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
enum
{
hsv_shift = 12
};
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
#ifndef hscale
#define hscale 0
#endif
#ifndef hrange
#define hrange 0
#endif
#if bidx == 0
#define R_COMP z
#define G_COMP y
#define B_COMP x
#else
#define R_COMP x
#define G_COMP y
#define B_COMP z
#endif
//////////////////////////////////// RGB <-> HSV //////////////////////////////////////
// For each of the six hue sectors (0..5), indices into tab[] that select the
// b, g, r output values respectively, where
// tab = { v, v*(1-s), v*(1-s*h), v*(1-s*(1-h)) } (see HSV2RGB below).
__constant int sector_data[][3] = { { 1, 3, 0 },
                                    { 1, 0, 2 },
                                    { 3, 0, 1 },
                                    { 0, 2, 1 },
                                    { 0, 1, 3 },
                                    { 2, 1, 0 } };
#ifdef DEPTH_0
// 8-bit RGB/BGR -> HSV: H in [0, hrange), S and V in [0, 255].
// sdiv_table and hdiv_table are host-precomputed fixed-point divisor tables
// (scaled by 2^hsv_shift) used to replace the per-pixel divisions by v and by
// diff (presumably sdiv_table[v] ~ 255*2^hsv_shift/v and hdiv_table[d] the
// matching hue divisor -- verify against the host-side setup).
__kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols,
                      __constant int * sdiv_table, __constant int * hdiv_table)
{
    int x = get_global_id(0);
    // each work item processes PIX_PER_WI_Y consecutive rows of one column
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                // NOTE(review): vload4 reads 4 bytes even for 3-channel input;
                // assumes the byte past the pixel is readable.
                uchar4 src_pix = vload4(0, src + src_index);

                int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                int h, s, v = b;
                int vmin = b, diff;
                int vr, vg;

                // v = max channel, vmin = min channel
                v = max(v, g);
                v = max(v, r);
                vmin = min(vmin, g);
                vmin = min(vmin, r);

                diff = v - vmin;
                // all-ones masks for which channel supplied the maximum
                vr = v == r ? -1 : 0;
                vg = v == g ? -1 : 0;

                // s = diff / v via table lookup, rounded
                s = mad24(diff, sdiv_table[v], (1 << (hsv_shift-1))) >> hsv_shift;

                // branchless sector arithmetic:
                // g-b (max=r), 2*diff+b-r (max=g) or 4*diff+r-g (max=b)
                h = (vr & (g - b)) +
                    (~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g))));
                h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;
                h += h < 0 ? hrange : 0;   // wrap negative hue around

                dst[dst_index] = convert_uchar_sat_rte(h);
                dst[dst_index + 1] = (uchar)s;
                dst[dst_index + 2] = (uchar)v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// 8-bit HSV -> RGB/BGR.  H is rescaled to [0, 6) via the build-time 'hscale'
// constant; S and V are normalized to [0, 1].  The hue sector selects, via
// sector_data, which of { v, v*(1-s), v*(1-s*h), v*(1-s*(1-h)) } goes to each
// output channel.  When dcn == 4 the alpha channel is set to MAX_NUM (opaque).
__kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    // each work item processes PIX_PER_WI_Y consecutive rows of one column
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                // NOTE(review): vload4 reads 4 bytes even for 3-channel input;
                // assumes the byte past the pixel is readable.
                uchar4 src_pix = vload4(0, src + src_index);

                float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f);
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;
                    h *= hscale;
                    // bring h into [0, 6)
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );
                    sector = convert_int_sat_rtn(h);
                    h -= sector;                 // fractional position inside the sector
                    if( (unsigned)sector >= 6u ) // numeric safety net
                    {
                        sector = 0;
                        h = 0.f;
                    }

                    tab[0] = v;
                    tab[1] = v*(1.f - s);
                    tab[2] = v*(1.f - s*h);
                    tab[3] = v*(1.f - s*(1.f - h));

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = v;   // zero saturation: pure gray

                dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
                dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
                dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
                dst[dst_index + 3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// RGB -> HSV for 32-bit float images; H is computed in degrees then scaled
// by the build-time hscale factor, S and V stay in [0,1].
__kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                float h, s, v;

                // v = max(r,g,b), vmin = min(r,g,b)
                float vmin, diff;
                v = vmin = r;
                if( v < g ) v = g;
                if( v < b ) v = b;
                if( vmin > g ) vmin = g;
                if( vmin > b ) vmin = b;

                diff = v - vmin;
                // FLT_EPSILON guards both divisions against zero
                s = diff/(float)(fabs(v) + FLT_EPSILON);
                diff = (float)(60.f/(diff + FLT_EPSILON));

                // pick the hue sector from the dominant channel
                if( v == r )
                    h = (g - b)*diff;
                else if( v == g )
                    h = fma(b - r, diff, 120.f);
                else
                    h = fma(r - g, diff, 240.f);

                if( h < 0 )
                    h += 360.f;

                dst[0] = h*hscale;
                dst[1] = s;
                dst[2] = v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// HSV -> RGB for 32-bit float images; same sector interpolation scheme as
// the 8-bit variant, but values stay in their native float ranges.
__kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float h = src_pix.x, s = src_pix.y, v = src_pix.z;
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;
                    h *= hscale;
                    // normalize hue into [0, 6)
                    if(h < 0)
                        do h += 6; while (h < 0);
                    else if (h >= 6)
                        do h -= 6; while (h >= 6);
                    sector = convert_int_sat_rtn(h);
                    h -= sector;  // fractional position within the sector
                    // defensive clamp against FP edge cases
                    if ((unsigned)sector >= 6u)
                    {
                        sector = 0;
                        h = 0.f;
                    }

                    // candidate channel values for this sector
                    tab[0] = v;
                    tab[1] = v*(1.f - s);
                    tab[2] = v*(1.f - s*h);
                    tab[3] = v*(1.f - s*(1.f - h));

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = v;  // achromatic: all channels equal V

                dst[bidx] = b;
                dst[1] = g;
                dst[bidx^2] = r;
#if dcn == 4
                dst[3] = MAX_NUM;  // opaque alpha for 4-channel output
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
///////////////////////////////////// RGB <-> HLS //////////////////////////////////////
#ifdef DEPTH_0
// RGB -> HLS for 8-bit images: channels are normalized to [0,1], H/L/S
// computed in float and rescaled back to the 8-bit range on store.
__kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);
                float h = 0.f, s = 0.f, l;
                float vmin, vmax, diff;

                // vmax = max(r,g,b), vmin = min(r,g,b)
                vmax = vmin = r;
                if (vmax < g) vmax = g;
                if (vmax < b) vmax = b;
                if (vmin > g) vmin = g;
                if (vmin > b) vmin = b;

                diff = vmax - vmin;
                l = (vmax + vmin)*0.5f;  // lightness = midpoint of extremes

                if (diff > FLT_EPSILON)  // chromatic pixel: compute S and H
                {
                    // saturation formula differs below/above L = 0.5
                    s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
                    diff = 60.f/diff;

                    // hue sector from the dominant channel, in degrees
                    if( vmax == r )
                        h = (g - b)*diff;
                    else if( vmax == g )
                        h = fma(b - r, diff, 120.f);
                    else
                        h = fma(r - g, diff, 240.f);

                    if( h < 0.f )
                        h += 360.f;
                }

                dst[dst_index] = convert_uchar_sat_rte(h*hscale);
                dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f);
                dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// HLS -> RGB for 8-bit images: L and S are normalized to [0,1], hue is
// scaled into [0,6) and interpolated within its sector via sector_data.
__kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];

                    // p2/p1 bracket the channel values for this (l, s) pair
                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                    float p1 = 2*l - p2;

                    h *= hscale;
                    // normalize hue into [0, 6)
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );

                    int sector = convert_int_sat_rtn(h);
                    h -= sector;  // fractional position within the sector

                    // candidate channel values, linearly interpolated by h
                    tab[0] = p2;
                    tab[1] = p1;
                    tab[2] = fma(p2 - p1, 1-h, p1);
                    tab[3] = fma(p2 - p1, h, p1);

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = l;  // achromatic: all channels equal L

                dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
                dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
                dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
                dst[dst_index + 3] = MAX_NUM;  // opaque alpha for 4-channel output
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// RGB -> HLS for 32-bit float images; identical math to the 8-bit variant
// but without the [0,255] normalization/denormalization steps.
__kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                float h = 0.f, s = 0.f, l;
                float vmin, vmax, diff;

                // vmax = max(r,g,b), vmin = min(r,g,b)
                vmax = vmin = r;
                if (vmax < g) vmax = g;
                if (vmax < b) vmax = b;
                if (vmin > g) vmin = g;
                if (vmin > b) vmin = b;

                diff = vmax - vmin;
                l = (vmax + vmin)*0.5f;  // lightness = midpoint of extremes

                if (diff > FLT_EPSILON)  // chromatic pixel: compute S and H
                {
                    // saturation formula differs below/above L = 0.5
                    s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
                    diff = 60.f/diff;

                    // hue sector from the dominant channel, in degrees
                    if( vmax == r )
                        h = (g - b)*diff;
                    else if( vmax == g )
                        h = fma(b - r, diff, 120.f);
                    else
                        h = fma(r - g, diff, 240.f);

                    if( h < 0.f ) h += 360.f;
                }

                dst[0] = h*hscale;
                dst[1] = l;
                dst[2] = s;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// HLS -> RGB for 32-bit float images; identical sector interpolation to the
// 8-bit variant but values stay in their native float ranges.
__kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float h = src_pix.x, l = src_pix.y, s = src_pix.z;
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;

                    // p2/p1 bracket the channel values for this (l, s) pair
                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                    float p1 = 2*l - p2;

                    h *= hscale;
                    // normalize hue into [0, 6)
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );

                    sector = convert_int_sat_rtn(h);
                    h -= sector;  // fractional position within the sector

                    // candidate channel values, linearly interpolated by h
                    tab[0] = p2;
                    tab[1] = p1;
                    tab[2] = fma(p2 - p1, 1-h, p1);
                    tab[3] = fma(p2 - p1, h, p1);

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = l;  // achromatic: all channels equal L

                dst[bidx] = b;
                dst[1] = g;
                dst[bidx^2] = r;
#if dcn == 4
                dst[3] = MAX_NUM;  // opaque alpha for 4-channel output
#endif

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Per-depth type configuration. 'depth' (0 = CV_8U, 2 = CV_16U, 5 = CV_32F),
// 'scn' and 'dcn' (source/destination channel counts) are supplied at kernel
// build time via -D options.
#if depth == 0
    #define DATA_TYPE uchar
    #define MAX_NUM  255
    #define HALF_MAX_NUM 128
    #define COEFF_TYPE int
    #define SAT_CAST(num) convert_uchar_sat(num)
    #define DEPTH_0
#elif depth == 2
    #define DATA_TYPE ushort
    #define MAX_NUM  65535
    #define HALF_MAX_NUM 32768
    #define COEFF_TYPE int
    #define SAT_CAST(num) convert_ushort_sat(num)
    #define DEPTH_2
#elif depth == 5
    #define DATA_TYPE float
    #define MAX_NUM  1.0f
    #define HALF_MAX_NUM 0.5f
    #define COEFF_TYPE float
    #define SAT_CAST(num) (num)
    #define DEPTH_5
#else
    #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
#endif

// Rounded right shift by n bits (fixed-point descale with rounding).
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))

enum
{
    xyz_shift  = 12,  // fractional bits of the fixed-point XYZ coefficients
};

// Bytes per source / destination pixel.
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)

// Token pasting helpers to build vector type names (e.g. uchar4, float3).
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)

#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
#define DATA_TYPE_3 CAT(DATA_TYPE, 3)
///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////
// RGB -> XYZ. coeffs is a row-major 3x3 conversion matrix (float for
// CV_32F, fixed-point with xyz_shift fractional bits otherwise); any
// channel-order adjustment is assumed to be pre-applied to coeffs by the
// host — TODO confirm against the caller.
__kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset,
                      int rows, int cols, __constant COEFF_TYPE * coeffs)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1) * PIX_PER_WI_Y;

    if (dx < cols)
    {
        int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
        int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (dy < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);

                DATA_TYPE_4 src_pix = vload4(0, src);
                DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;

                // 3x3 matrix multiply; integer path descales with rounding
#ifdef DEPTH_5
                float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2]));
                float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5]));
                float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8]));
#else
                int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift);
                int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift);
                int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift);
#endif
                dst[0] = SAT_CAST(x);
                dst[1] = SAT_CAST(y);
                dst[2] = SAT_CAST(z);

                ++dy;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// XYZ -> RGB. Inverse of RGB2XYZ with the inverse 3x3 matrix in coeffs;
// 4-channel integer output is stored as a single vector write with alpha
// set to MAX_NUM.
__kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset,
                      int rows, int cols, __constant COEFF_TYPE * coeffs)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1) * PIX_PER_WI_Y;

    if (dx < cols)
    {
        int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
        int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (dy < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);

                DATA_TYPE_4 src_pix = vload4(0, src);
                DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;

                // 3x3 matrix multiply; integer path descales with rounding
#ifdef DEPTH_5
                float b = fma(x, coeffs[0], fma(y, coeffs[1], z * coeffs[2]));
                float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5]));
                float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8]));
#else
                int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift);
                int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift);
                int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift);
#endif

                DATA_TYPE dst0 = SAT_CAST(b);
                DATA_TYPE dst1 = SAT_CAST(g);
                DATA_TYPE dst2 = SAT_CAST(r);
#if dcn == 3 || defined DEPTH_5
                dst[0] = dst0;
                dst[1] = dst1;
                dst[2] = dst2;
#if dcn == 4
                dst[3] = MAX_NUM;  // opaque alpha for 4-channel float output
#endif
#else
                // 4-channel integer: one vector store, alpha = MAX_NUM
                *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM);
#endif
                ++dy;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
/////////////////////////////////// [l|s]RGB <-> Lab ///////////////////////////

// Fixed-point shifts and gamma-table geometry shared by the Lab/Luv kernels.
#define lab_shift xyz_shift
#define gamma_shift 3
#define lab_shift2 (lab_shift + gamma_shift)
#define GAMMA_TAB_SIZE 1024
#define GammaTabScale (float)GAMMA_TAB_SIZE
// Evaluate a cubic spline at x. tab stores 4 polynomial coefficients per
// integer interval (c0..c3, lowest degree first); the interval index is
// clamped to [0, n-1], so out-of-range x evaluates the nearest segment.
inline float splineInterpolate(float x, __global const float * tab, int n)
{
    const int ival = clamp(convert_int_sat_rtn(x), 0, n-1);
    const float t = x - ival;  // fractional position inside the interval
    __global const float * c = tab + (ival << 2);
    // Horner evaluation: c[0] + t*(c[1] + t*(c[2] + t*c[3]))
    return fma(fma(fma(c[3], t, c[2]), t, c[1]), t, c[0]);
}
#ifdef DEPTH_0
// BGR -> Lab for 8-bit images, fully fixed-point. gammaTab linearizes the
// input (gamma-corrected values, gamma_shift extra precision bits),
// LabCbrtTab_b is a precomputed cube-root table, and coeffs is the 3x3
// RGB->XYZ matrix. Lscale/Lshift fold the L = 116*fY - 16 mapping into one
// fixed-point multiply-add.
__kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
                      __global const ushort * gammaTab, __global ushort * LabCbrtTab_b,
                      __constant int * coeffs, int Lscale, int Lshift)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const uchar* src_ptr = src + src_index;
                __global uchar* dst_ptr = dst + dst_index;
                uchar4 src_pix = vload4(0, src_ptr);

                int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                    C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                    C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

                // gamma-linearize, then f(X), f(Y), f(Z) via cube-root table
                int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];
                int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)];
                int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)];
                int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)];

                // L from fY; a/b offset by 128 to fit the unsigned range
                int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
                int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 );
                int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 );

                dst_ptr[0] = SAT_CAST(L);
                dst_ptr[1] = SAT_CAST(a);
                dst_ptr[2] = SAT_CAST(b);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// BGR -> Lab for 32-bit float images. When SRGB is defined the input is
// linearized through the spline gammaTab; coeffs is the 3x3 RGB->XYZ
// matrix, _1_3 = 1/3 and _a = 16/116 are the standard Lab constants
// (passed in precomputed; _1_3 is unused here since rootn(., 3) is called
// directly).
__kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _1_3, float _a)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                      C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                      C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

                // clamp input to the valid [0,1] range before linearization
                float R = clamp(src_pix.x, 0.0f, 1.0f);
                float G = clamp(src_pix.y, 0.0f, 1.0f);
                float B = clamp(src_pix.z, 0.0f, 1.0f);

#ifdef SRGB
                R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

                // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
                float X = fma(R, C0, fma(G, C1, B*C2));
                float Y = fma(R, C3, fma(G, C4, B*C5));
                float Z = fma(R, C6, fma(G, C7, B*C8));

                // f(t): cube root above the threshold, linear segment below
                float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);
                float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);
                float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);

                float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);
                float a = 500.f * (FX - FY);
                float b = 200.f * (FY - FZ);

                dst[0] = L;
                dst[1] = a;
                dst[2] = b;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
// Convert one float Lab pixel (srcbuf = {L, a, b}) to a [0,1]-clamped
// color triple in dstbuf, using the inverse 3x3 XYZ->RGB matrix in coeffs
// (channel order is baked into coeffs by the host). lThresh/fThresh are
// the cutoffs selecting the linear vs. cubic branch of the inverse f
// function. Shared by both the 8-bit and float Lab2BGR kernels.
inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    float li = srcbuf[0], ai = srcbuf[1], bi = srcbuf[2];

    float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
          C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
          C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

    float y, fy;
    // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
    // invert L -> Y: linear segment for small L, cubic otherwise
    if (li <= lThresh)
    {
        y = li / 903.3f;
        fy = fma(7.787f, y, 16.0f / 116.0f);
    }
    else
    {
        fy = (li + 16.0f) / 116.0f;
        y = fy * fy * fy;
    }

    // recover f(X) and f(Z) from a and b, then invert each
    float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };

    #pragma unroll
    for (int j = 0; j < 2; j++)
        if (fxz[j] <= fThresh)
            fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
        else
            fxz[j] = fxz[j] * fxz[j] * fxz[j];
    float x = fxz[0], z = fxz[1];

    // XYZ -> RGB matrix multiply, clamped to the displayable range
    float ro = clamp(fma(C0, x, fma(C1, y, C2 * z)), 0.0f, 1.0f);
    float go = clamp(fma(C3, x, fma(C4, y, C5 * z)), 0.0f, 1.0f);
    float bo = clamp(fma(C6, x, fma(C7, y, C8 * z)), 0.0f, 1.0f);

#ifdef SRGB
    // re-apply gamma (inverse of the linearization in BGR2Lab)
    ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
    go = splineInterpolate(go * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
    bo = splineInterpolate(bo * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

    dstbuf[0] = ro, dstbuf[1] = go, dstbuf[2] = bo;
}
#ifdef DEPTH_0
// Lab -> BGR for 8-bit images: decodes the packed 8-bit Lab encoding
// (L in 0..255 maps to 0..100, a/b offset by 128), delegates the math to
// Lab2BGR_f, and rescales the result back to 8 bits.
__kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const uchar* src_ptr = src + src_index;
                __global uchar * dst_ptr = dst + dst_index;
                uchar4 src_pix = vload4(0, src_ptr);

                // undo the 8-bit Lab packing: L *= 100/255, a/b -= 128
                float srcbuf[3], dstbuf[3];
                srcbuf[0] = src_pix.x*(100.f/255.f);
                srcbuf[1] = convert_float(src_pix.y - 128);
                srcbuf[2] = convert_float(src_pix.z - 128);

                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
                    gammaTab,
#endif
                    coeffs, lThresh, fThresh);

#if dcn == 3
                dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
                dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
                dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
#else
                // 4-channel: single vector store with opaque alpha
                *(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),
                    SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// Lab -> BGR for 32-bit float images: passes the Lab triple straight to
// Lab2BGR_f and writes the [0,1] result, with alpha for 4-channel output.
__kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float srcbuf[3], dstbuf[3];
                srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;

                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
                    gammaTab,
#endif
                    coeffs, lThresh, fThresh);

                dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
#if dcn == 4
                dst[3] = MAX_NUM;  // opaque alpha for 4-channel output
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
/////////////////////////////////// [l|s]RGB <-> Luv ///////////////////////////

#define LAB_CBRT_TAB_SIZE 1024
#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
// Scale factor mapping Y in [0, 1.5] onto the cube-root table index range.
__constant float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;
#ifdef DEPTH_5
// BGR -> Luv for 32-bit float images. LabCbrtTab is a spline table of the
// Lab f() function used to get L from Y; coeffs is the 3x3 RGB->XYZ matrix;
// _un/_vn are the white-point chromaticity terms folded into the u/v
// formulas (note u = L*(X*d - _un) via fma with -_un).
__kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);

                float R = src[0], G = src[1], B = src[2];

                // clamp input to [0,1] before optional sRGB linearization
                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);

#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
                float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
                float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));

                // L = 116*f(Y) - 16 via the precomputed cube-root spline
                float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
                L = fma(116.f, L, -16.f);

                // d = 52 / (X + 15Y + 3Z), guarded against division by zero
                float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
                float u = L*fma(X, d, -_un);
                float v = L*fma(2.25f, Y*d, -_vn);

                dst[0] = L;
                dst[1] = u;
                dst[2] = v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
    }
}
#elif defined DEPTH_0
// BGR -> Luv for 8-bit images: normalizes input to [0,1], computes float
// Luv as in the float variant, then packs L into 0..255 (L*2.55) and maps
// u/v affinely into the 8-bit range with the documented constants.
__kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        // advance the base pointers once; the loop then steps row by row
        src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
        dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                float scale = 1.0f / 255.0f;
                float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale;

#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
                float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
                float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));

                // L = 116*f(Y) - 16 via the precomputed cube-root spline
                float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
                L = 116.f*L - 16.f;

                // d = 52 / (X + 15Y + 3Z), guarded against division by zero
                float d = (4*13) / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
                float u = L*(X*d - _un);
                float v = L*fma(2.25f, Y*d, -_vn);

                dst[0] = SAT_CAST(L * 2.55f);
                //0.72033 = 255/(220+134), 96.525 = 134*255/(220+134)
                dst[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f));
                //0.9732 = 255/(140+122), 136.259 = 140*255/(140+122)
                dst[2] = SAT_CAST(fma(v, 0.9732824427480916f, 136.259541984732824f));

                ++y;
                dst += dst_step;
                src += src_step;
            }
    }
}
#endif
#ifdef DEPTH_5
// Luv -> BGR for 32-bit float images. Inverts L -> Y (linear branch below
// L = 8, cubic above), reconstructs X and Z from u', v', applies the
// inverse XYZ->RGB matrix in coeffs, and optionally re-applies sRGB gamma.
__kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);

                float L = src[0], u = src[1], v = src[2], X, Y, Z;
                // invert L -> Y: cubic branch for L >= 8, linear below
                if(L >= 8)
                {
                    Y = fma(L, 1.f/116.f, 16.f/116.f);
                    Y = Y*Y*Y;
                }
                else
                {
                    Y = L * (1.0f/903.3f); // L*(3./29.)^3
                }

                // reconstruct X, Z from the u'/v' chromaticity terms;
                // vp is clamped to avoid blow-up near v' = 0
                float up = 3.f*fma(L, _un, u);
                float vp = 0.25f/fma(L, _vn, v);
                vp = clamp(vp, -0.25f, 0.25f);
                X = 3.f*Y*up*vp;
                Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);

                // inverse XYZ -> RGB matrix multiply, clamped to [0,1]
                float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
                float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
                float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));

                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);

#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

                dst[0] = R;
                dst[1] = G;
                dst[2] = B;
#if dcn == 4
                dst[3] = MAX_NUM;  // opaque alpha for 4-channel output
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
    }
}
#elif defined DEPTH_0
// Luv -> BGR for 8-bit images: unpacks the 8-bit Luv encoding (L*100/255,
// affine u/v with the documented constants), runs the same inverse math
// as the float variant, and saturates back to 8 bits.
__kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        // advance the base pointers once; the loop then steps row by row
        src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
        dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                float d, X, Y, Z;
                float L = src[0]*(100.f/255.f);
                // 1.388235294117647 = (220+134)/255
                float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f);
                // 1.027450980392157 = (140+122)/255
                float v = fma(convert_float(src[2]), 1.027450980392157f, - 140.f);

                // invert L -> Y: cubic branch for L >= 8, linear below
                if(L >= 8)
                {
                    Y = fma(L, 1.f/116.f, 16.f/116.f);
                    Y = Y*Y*Y;
                }
                else
                {
                    Y = L * (1.0f/903.3f); // L*(3./29.)^3
                }

                // reconstruct X, Z from the u'/v' chromaticity terms;
                // vp is clamped to avoid blow-up near v' = 0
                float up = 3.f*fma(L, _un, u);
                float vp = 0.25f/fma(L, _vn, v);
                vp = clamp(vp, -0.25f, 0.25f);
                X = 3.f*Y*up*vp;
                Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);

                //limit X, Y, Z to [0, 2] to fit white point
                X = clamp(X, 0.f, 2.f); Z = clamp(Z, 0.f, 2.f);

                // inverse XYZ -> RGB matrix multiply, clamped to [0,1]
                float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
                float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
                float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));

                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);

#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

                uchar dst0 = SAT_CAST(R * 255.0f);
                uchar dst1 = SAT_CAST(G * 255.0f);
                uchar dst2 = SAT_CAST(B * 255.0f);

#if dcn == 4
                // 4-channel: single vector store with opaque alpha
                *(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM);
#else
                dst[0] = dst0;
                dst[1] = dst1;
                dst[2] = dst2;
#endif
                ++y;
                dst += dst_step;
                src += src_step;
            }
    }
}
#endif
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/

// Per-depth type configuration (mirrors the Lab section). 'depth'
// (0 = CV_8U, 2 = CV_16U, 5 = CV_32F), 'scn', 'dcn' and 'bidx' are
// supplied at kernel build time via -D options.
#if depth == 0
    #define DATA_TYPE uchar
    #define MAX_NUM  255
    #define HALF_MAX_NUM 128
    #define COEFF_TYPE int
    #define SAT_CAST(num) convert_uchar_sat(num)
    #define DEPTH_0
#elif depth == 2
    #define DATA_TYPE ushort
    #define MAX_NUM  65535
    #define HALF_MAX_NUM 32768
    #define COEFF_TYPE int
    #define SAT_CAST(num) convert_ushort_sat(num)
    #define DEPTH_2
#elif depth == 5
    #define DATA_TYPE float
    #define MAX_NUM  1.0f
    #define HALF_MAX_NUM 0.5f
    #define COEFF_TYPE float
    #define SAT_CAST(num) (num)
    #define DEPTH_5
#else
    #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
#endif

// Rounded right shift by n bits (fixed-point descale with rounding).
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))

enum
{
    yuv_shift  = 14,  // fractional bits of the fixed-point gray coefficients
    R2Y        = 4899,
    G2Y        = 9617,
    B2Y        = 1868
};

//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
#define B2YF 0.114f
#define G2YF 0.587f
#define R2YF 0.299f

// Bytes per source / destination pixel.
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)

// Map logical R/G/B to pixel vector components; bidx is the index of the
// blue channel within a pixel (0 for BGR input, otherwise RGB).
#if bidx == 0
    #define R_COMP z
    #define G_COMP y
    #define B_COMP x
#else
    #define R_COMP x
    #define G_COMP y
    #define B_COMP z
#endif

// Token pasting helpers to build vector type names (e.g. uchar4, float3).
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)

#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
#define DATA_TYPE_3 CAT(DATA_TYPE, 3)
///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
// RGB/BGR -> single-channel gray using BT.601 weights.
// Float path (DEPTH_5) uses B2YF/G2YF/R2YF; 8U/16U path uses the fixed-point
// B2Y/G2Y/R2Y constants, rounded and descaled by yuv_shift.
// depth, scn/dcn, bidx (via B_COMP/R_COMP) and PIX_PER_WI_Y are build-time
// -D defines; each work item handles one column over PIX_PER_WI_Y rows.
__kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset,
                       __global uchar * dstptr, int dst_step, int dst_offset,
                       int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                // Only the first 3 channels participate; alpha (if any) is ignored.
                DATA_TYPE_3 src_pix = vload3(0, src);
#ifdef DEPTH_5
                dst[0] = fma(src_pix.B_COMP, B2YF, fma(src_pix.G_COMP, G2YF, src_pix.R_COMP * R2YF));
#else
                dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);
#endif
                ++y;
                src_index += src_step;
                dst_index += dst_step;
            }
        }
    }
}
// Gray -> RGB[A]: replicates the single gray value into the 3 color channels,
// setting alpha to MAX_NUM when dcn == 4. For 8U/16U with dcn == 4 a single
// 4-element vector store is used; otherwise channels are written individually.
__kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset,
                       __global uchar * dstptr, int dst_step, int dst_offset,
                       int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                DATA_TYPE val = src[0];
#if dcn == 3 || defined DEPTH_5
                dst[0] = dst[1] = dst[2] = val;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
#else
                // 8U/16U with dcn == 4: one vector store of (val, val, val, alpha).
                *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(val, val, val, MAX_NUM);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB[A] <-> BGR[A] //////////////////////////////////////
// Generic RGB[A] <-> BGR[A] channel shuffle / channel-count change.
// REVERSE (build-time define) swaps the first and third channels; when
// dcn == 4 the alpha is copied from the source if present, else set opaque.
__kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
                  __global uchar* dstptr, int dst_step, int dst_offset,
                  int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
#if scn == 3
                DATA_TYPE_3 src_pix = vload3(0, src);
#else
                DATA_TYPE_4 src_pix = vload4(0, src);
#endif
#ifdef REVERSE
                dst[0] = src_pix.z;
                dst[1] = src_pix.y;
                dst[2] = src_pix.x;
#else
                dst[0] = src_pix.x;
                dst[1] = src_pix.y;
                dst[2] = src_pix.z;
#endif

#if dcn == 4
#if scn == 3
                dst[3] = MAX_NUM;
#else
                dst[3] = src[3];
#endif
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////
// Unpacks 16-bit packed RGB (565 when greenbits == 6, else 555) into 8-bit
// channels. bidx selects which output index gets the low (blue) bits.
// In 555 mode with dcn == 4 the top bit of the ushort becomes a 0/255 alpha.
__kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                ushort t = *((__global const ushort*)(src + src_index));

#if greenbits == 6
                // 5-6-5 layout: B = bits 0-4, G = bits 5-10, R = bits 11-15.
                dst[dst_index + bidx] = (uchar)(t << 3);
                dst[dst_index + 1] = (uchar)((t >> 3) & ~3);
                dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);
#else
                // 5-5-5 layout: B = bits 0-4, G = bits 5-9, R = bits 10-14.
                dst[dst_index + bidx] = (uchar)(t << 3);
                dst[dst_index + 1] = (uchar)((t >> 2) & ~7);
                dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);
#endif

#if dcn == 4
#if greenbits == 6
                dst[dst_index + 3] = 255;
#else
                dst[dst_index + 3] = t & 0x8000 ? 255 : 0;
#endif
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// Packs 8-bit RGB[A] into a 16-bit 565/555 value. In 555 mode with scn == 4
// a non-zero alpha sets the top bit.
// NOTE(review): vload4 reads 4 bytes per pixel even when scn == 3 — presumably
// the caller guarantees readable padding past the last 3-channel pixel; verify.
__kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

#if greenbits == 6
                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
#elif scn == 3
                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
#else
                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
                    ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB5x5 <-> Gray //////////////////////////////////////
// Packed 16-bit BGR (565/555) -> 8-bit gray: extracts each channel back to
// an 8-bit range via shift+mask, then applies the fixed-point BT.601 weights.
__kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offset,
                          __global uchar* dst, int dst_step, int dst_offset,
                          int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        // Destination is single-channel uchar: one byte per column.
        int dst_index = mad24(y, dst_step, dst_offset + x);

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                int t = *((__global const ushort*)(src + src_index));

#if greenbits == 6
                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift);
#else
                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// 8-bit gray -> packed 16-bit BGR: replicates the gray value into all three
// packed channels (565 keeps 6 bits of green, 555 keeps 5 bits everywhere).
__kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offset,
                          __global uchar* dst, int dst_step, int dst_offset,
                          int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        // Source is single-channel uchar: one byte per column.
        int src_index = mad24(y, src_step, src_offset + x);
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                int t = src[src_index];

#if greenbits == 6
                *((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
#else
                // 555: the same 5-bit value is placed in all three fields.
                t >>= 3;
                *((__global ushort*)(dst + dst_index)) = (ushort)(t|(t << 5)|(t << 10));
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
#ifdef DEPTH_0
// 8-bit RGBA -> premultiplied RGBA: dst.rgb = (src.rgb * alpha + 128) / 255
// (rounded fixed-point division via HALF_MAX_NUM/MAX_NUM), alpha copied as-is.
// Compiled only for DEPTH_0 (see enclosing #ifdef).
__kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, src_offset + (x << 2));
        int dst_index = mad24(y, dst_step, dst_offset + (x << 2));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = *(__global const uchar4 *)(src + src_index);

                *(__global uchar4 *)(dst + dst_index) =
                    (uchar4)(mad24(src_pix.x, src_pix.w, HALF_MAX_NUM) / MAX_NUM,
                             mad24(src_pix.y, src_pix.w, HALF_MAX_NUM) / MAX_NUM,
                             mad24(src_pix.z, src_pix.w, HALF_MAX_NUM) / MAX_NUM, src_pix.w);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// Premultiplied RGBA -> straight RGBA: dst.rgb = (src.rgb * 255 + a/2) / a,
// with a zero-alpha pixel mapped to all zeros (division guard).
// Compiled only for DEPTH_0 (see enclosing #ifdef).
__kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, 4, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, 4, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = *(__global const uchar4 *)(src + src_index);
                // v3_half provides round-to-nearest for the division by alpha.
                uchar v3 = src_pix.w, v3_half = v3 / 2;

                if (v3 == 0)
                    *(__global uchar4 *)(dst + dst_index) = (uchar4)(0, 0, 0, 0);
                else
                    *(__global uchar4 *)(dst + dst_index) =
                        (uchar4)(mad24(src_pix.x, MAX_NUM, v3_half) / v3,
                                 mad24(src_pix.y, MAX_NUM, v3_half) / v3,
                                 mad24(src_pix.z, MAX_NUM, v3_half) / v3, v3);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
......@@ -76,12 +76,9 @@
enum
{
yuv_shift = 14,
xyz_shift = 12,
hsv_shift = 12,
R2Y = 4899,
G2Y = 9617,
B2Y = 1868,
BLOCK_SIZE = 256
};
//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
......@@ -120,14 +117,6 @@ enum
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
#ifndef hscale
#define hscale 0
#endif
#ifndef hrange
#define hrange 0
#endif
#if bidx == 0
#define R_COMP z
#define G_COMP y
......@@ -156,77 +145,6 @@ enum
#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
#define DATA_TYPE_3 CAT(DATA_TYPE, 3)
///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
// (removed side of this diff) Pre-split copy of RGB2Gray from cvtcolor.cl;
// byte-identical to the extracted version earlier in this page.
__kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset,
                       __global uchar * dstptr, int dst_step, int dst_offset,
                       int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                DATA_TYPE_3 src_pix = vload3(0, src);
#ifdef DEPTH_5
                dst[0] = fma(src_pix.B_COMP, B2YF, fma(src_pix.G_COMP, G2YF, src_pix.R_COMP * R2YF));
#else
                dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);
#endif
                ++y;
                src_index += src_step;
                dst_index += dst_step;
            }
        }
    }
}
// (removed side of this diff) Pre-split copy of Gray2RGB from cvtcolor.cl;
// byte-identical to the extracted version earlier in this page.
__kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset,
                       __global uchar * dstptr, int dst_step, int dst_offset,
                       int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                DATA_TYPE val = src[0];
#if dcn == 3 || defined DEPTH_5
                dst[0] = dst[1] = dst[2] = val;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
#else
                *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(val, val, val, MAX_NUM);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
__constant float c_RGB2YUVCoeffs_f[5] = { B2YF, G2YF, R2YF, B2UF, R2VF };
......@@ -754,1450 +672,3 @@ __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
}
}
}
///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////
// RGB -> CIE XYZ: multiplies each pixel by the 3x3 matrix in `coeffs`
// (row-major, 9 values, prepared host-side with the channel order already
// folded in). Float path uses fma; 8U/16U uses fixed-point descaled by
// xyz_shift, saturating via SAT_CAST.
// NOTE(review): vload4 reads 4 elements even when scn == 3 — presumably
// padding past the last pixel is readable; verify against the host wrapper.
__kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset,
                      int rows, int cols, __constant COEFF_TYPE * coeffs)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1) * PIX_PER_WI_Y;

    if (dx < cols)
    {
        int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
        int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (dy < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);

                DATA_TYPE_4 src_pix = vload4(0, src);
                DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;

#ifdef DEPTH_5
                float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2]));
                float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5]));
                float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8]));
#else
                int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift);
                int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift);
                int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift);
#endif
                dst[0] = SAT_CAST(x);
                dst[1] = SAT_CAST(y);
                dst[2] = SAT_CAST(z);

                ++dy;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// CIE XYZ -> RGB: inverse transform via the 3x3 matrix in `coeffs` (prepared
// host-side). Fixed-point path descales by xyz_shift; when dcn == 4 the alpha
// channel is set to MAX_NUM (fully opaque).
__kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset,
                      int rows, int cols, __constant COEFF_TYPE * coeffs)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1) * PIX_PER_WI_Y;

    if (dx < cols)
    {
        int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
        int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (dy < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);

                DATA_TYPE_4 src_pix = vload4(0, src);
                DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;

#ifdef DEPTH_5
                float b = fma(x, coeffs[0], fma(y, coeffs[1], z * coeffs[2]));
                float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5]));
                float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8]));
#else
                int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift);
                int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift);
                int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift);
#endif

                DATA_TYPE dst0 = SAT_CAST(b);
                DATA_TYPE dst1 = SAT_CAST(g);
                DATA_TYPE dst2 = SAT_CAST(r);
#if dcn == 3 || defined DEPTH_5
                dst[0] = dst0;
                dst[1] = dst1;
                dst[2] = dst2;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
#else
                // 8U/16U with dcn == 4: one vector store.
                *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM);
#endif

                ++dy;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB[A] <-> BGR[A] //////////////////////////////////////
// (removed side of this diff) Pre-split copy of the RGB channel-shuffle
// kernel from cvtcolor.cl; byte-identical to the extracted version earlier.
__kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
                  __global uchar* dstptr, int dst_step, int dst_offset,
                  int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
#if scn == 3
                DATA_TYPE_3 src_pix = vload3(0, src);
#else
                DATA_TYPE_4 src_pix = vload4(0, src);
#endif
#ifdef REVERSE
                dst[0] = src_pix.z;
                dst[1] = src_pix.y;
                dst[2] = src_pix.x;
#else
                dst[0] = src_pix.x;
                dst[1] = src_pix.y;
                dst[2] = src_pix.z;
#endif

#if dcn == 4
#if scn == 3
                dst[3] = MAX_NUM;
#else
                dst[3] = src[3];
#endif
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////
// (removed side of this diff) Pre-split copy of RGB5x52RGB from cvtcolor.cl;
// byte-identical to the extracted version earlier in this page.
__kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                ushort t = *((__global const ushort*)(src + src_index));

#if greenbits == 6
                dst[dst_index + bidx] = (uchar)(t << 3);
                dst[dst_index + 1] = (uchar)((t >> 3) & ~3);
                dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);
#else
                dst[dst_index + bidx] = (uchar)(t << 3);
                dst[dst_index + 1] = (uchar)((t >> 2) & ~7);
                dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);
#endif

#if dcn == 4
#if greenbits == 6
                dst[dst_index + 3] = 255;
#else
                dst[dst_index + 3] = t & 0x8000 ? 255 : 0;
#endif
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// (removed side of this diff) Pre-split copy of RGB2RGB5x5 from cvtcolor.cl;
// byte-identical to the extracted version earlier in this page.
__kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

#if greenbits == 6
                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
#elif scn == 3
                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
#else
                *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
                    ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
///////////////////////////////////// RGB5x5 <-> Gray //////////////////////////////////////
// (removed side of this diff) Pre-split copy of BGR5x52Gray from cvtcolor.cl;
// byte-identical to the extracted version earlier in this page.
__kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offset,
                          __global uchar* dst, int dst_step, int dst_offset,
                          int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, dst_offset + x);

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                int t = *((__global const ushort*)(src + src_index));

#if greenbits == 6
                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift);
#else
                dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// (removed side of this diff) Pre-split copy of Gray2BGR5x5 from cvtcolor.cl;
// byte-identical to the extracted version earlier in this page.
__kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offset,
                          __global uchar* dst, int dst_step, int dst_offset,
                          int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, src_offset + x);
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                int t = src[src_index];

#if greenbits == 6
                *((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
#else
                t >>= 3;
                *((__global ushort*)(dst + dst_index)) = (ushort)(t|(t << 5)|(t << 10));
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
//////////////////////////////////// RGB <-> HSV //////////////////////////////////////
__constant int sector_data[][3] = { { 1, 3, 0 },
{ 1, 0, 2 },
{ 3, 0, 1 },
{ 0, 2, 1 },
{ 0, 1, 3 },
{ 2, 1, 0 } };
#ifdef DEPTH_0
// 8U RGB -> HSV. V = max(r,g,b); S and H are computed in fixed point using
// host-prepared reciprocal tables (sdiv_table for 255/V, hdiv_table for 60/diff
// style factors), rounded and descaled by hsv_shift. vr/vg are -1/0 masks
// selecting the hue sector branchlessly; negative hue wraps by hrange
// (build-time define, e.g. 180 or 256).
__kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols,
                      __constant int * sdiv_table, __constant int * hdiv_table)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                int h, s, v = b;
                int vmin = b, diff;
                int vr, vg;

                v = max(v, g);
                v = max(v, r);
                vmin = min(vmin, g);
                vmin = min(vmin, r);

                diff = v - vmin;
                // All-ones (-1) when v came from r / g respectively, else 0.
                vr = v == r ? -1 : 0;
                vg = v == g ? -1 : 0;

                s = mad24(diff, sdiv_table[v], (1 << (hsv_shift-1))) >> hsv_shift;

                // Branchless sector selection: exactly one of the three terms survives.
                h = (vr & (g - b)) +
                    (~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g))));
                h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;
                h += h < 0 ? hrange : 0;

                dst[dst_index] = convert_uchar_sat_rte(h);
                dst[dst_index + 1] = (uchar)s;
                dst[dst_index + 2] = (uchar)v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// 8U HSV -> RGB. Hue is rescaled by hscale into [0, 6), normalized into that
// range, split into an integer sector and fractional part, then the sector_data
// table maps the four candidate values (v, p, q, t) to the b/g/r outputs.
__kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f);
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;
                    h *= hscale;
                    // Wrap hue into [0, 6).
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );
                    sector = convert_int_sat_rtn(h);
                    h -= sector;
                    // Guard against float rounding pushing sector out of range.
                    if( (unsigned)sector >= 6u )
                    {
                        sector = 0;
                        h = 0.f;
                    }

                    tab[0] = v;
                    tab[1] = v*(1.f - s);
                    tab[2] = v*(1.f - s*h);
                    tab[3] = v*(1.f - s*(1.f - h));

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = v;

                dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
                dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
                dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
                dst[dst_index + 3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// 32F RGB -> HSV. Direct float formulation: V = max channel,
// S = diff / (|V| + eps), H in degrees (0..360) scaled by hscale.
// FLT_EPSILON terms avoid division by zero for gray/black pixels.
__kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                float h, s, v;

                float vmin, diff;

                v = vmin = r;
                if( v < g ) v = g;
                if( v < b ) v = b;
                if( vmin > g ) vmin = g;
                if( vmin > b ) vmin = b;

                diff = v - vmin;
                s = diff/(float)(fabs(v) + FLT_EPSILON);
                // Reuse diff as the 60/diff hue scale factor.
                diff = (float)(60.f/(diff + FLT_EPSILON));

                if( v == r )
                    h = (g - b)*diff;
                else if( v == g )
                    h = fma(b - r, diff, 120.f);
                else
                    h = fma(r - g, diff, 240.f);

                if( h < 0 )
                    h += 360.f;

                dst[0] = h*hscale;
                dst[1] = s;
                dst[2] = v;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// 32F HSV -> RGB. Same sector/fraction decomposition as the 8U variant but
// operating on raw floats; outputs are written through bidx for channel order.
__kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float h = src_pix.x, s = src_pix.y, v = src_pix.z;
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;
                    h *= hscale;
                    // Wrap hue into [0, 6).
                    if(h < 0)
                        do h += 6; while (h < 0);
                    else if (h >= 6)
                        do h -= 6; while (h >= 6);
                    sector = convert_int_sat_rtn(h);
                    h -= sector;
                    // Guard against float rounding pushing sector out of range.
                    if ((unsigned)sector >= 6u)
                    {
                        sector = 0;
                        h = 0.f;
                    }

                    tab[0] = v;
                    tab[1] = v*(1.f - s);
                    tab[2] = v*(1.f - s*h);
                    tab[3] = v*(1.f - s*(1.f - h));

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = v;

                dst[bidx] = b;
                dst[1] = g;
                dst[bidx^2] = r;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
///////////////////////////////////// RGB <-> HLS //////////////////////////////////////
#ifdef DEPTH_0
// 8U RGB -> HLS. Channels are normalized to [0,1]; L = (max+min)/2,
// S uses the lightness-dependent denominator, H is in degrees scaled by
// hscale and rounded back to uchar. Gray pixels (diff <= eps) keep H = S = 0.
__kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);
                float h = 0.f, s = 0.f, l;
                float vmin, vmax, diff;

                vmax = vmin = r;
                if (vmax < g) vmax = g;
                if (vmax < b) vmax = b;
                if (vmin > g) vmin = g;
                if (vmin > b) vmin = b;

                diff = vmax - vmin;
                l = (vmax + vmin)*0.5f;

                if (diff > FLT_EPSILON)
                {
                    s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
                    diff = 60.f/diff;

                    if( vmax == r )
                        h = (g - b)*diff;
                    else if( vmax == g )
                        h = fma(b - r, diff, 120.f);
                    else
                        h = fma(r - g, diff, 240.f);

                    if( h < 0.f )
                        h += 360.f;
                }

                dst[dst_index] = convert_uchar_sat_rte(h*hscale);
                dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f);
                dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f);

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// 8U HLS -> RGB. Standard HLS inversion: p2/p1 are the upper/lower chroma
// bounds, hue is wrapped into [0, 6) and split into sector + fraction, then
// sector_data maps the interpolated values to b/g/r. S == 0 yields gray = L.
__kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                uchar4 src_pix = vload4(0, src + src_index);

                float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];

                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                    float p1 = 2*l - p2;

                    h *= hscale;
                    // Wrap hue into [0, 6).
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );

                    int sector = convert_int_sat_rtn(h);
                    h -= sector;

                    tab[0] = p2;
                    tab[1] = p1;
                    tab[2] = fma(p2 - p1, 1-h, p1);
                    tab[3] = fma(p2 - p1, h, p1);

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = l;

                dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
                dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
                dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
                dst[dst_index + 3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// 32F RGB -> HLS. Same formulation as the 8U variant but without the
// 1/255 normalization and rounding; L and S are written as raw floats.
__kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
                float h = 0.f, s = 0.f, l;
                float vmin, vmax, diff;

                vmax = vmin = r;
                if (vmax < g) vmax = g;
                if (vmax < b) vmax = b;
                if (vmin > g) vmin = g;
                if (vmin > b) vmin = b;

                diff = vmax - vmin;
                l = (vmax + vmin)*0.5f;

                if (diff > FLT_EPSILON)
                {
                    s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
                    diff = 60.f/diff;

                    if( vmax == r )
                        h = (g - b)*diff;
                    else if( vmax == g )
                        h = fma(b - r, diff, 120.f);
                    else
                        h = fma(r - g, diff, 240.f);

                    if( h < 0.f ) h += 360.f;
                }

                dst[0] = h*hscale;
                dst[1] = l;
                dst[2] = s;

                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
// 32F HLS -> RGB. Float counterpart of the 8U variant: p2/p1 chroma bounds,
// hue sector + fraction, sector_data channel mapping; S == 0 gives gray = L.
__kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

        #pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);

                float h = src_pix.x, l = src_pix.y, s = src_pix.z;
                float b, g, r;

                if (s != 0)
                {
                    float tab[4];
                    int sector;

                    float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                    float p1 = 2*l - p2;

                    h *= hscale;
                    // Wrap hue into [0, 6).
                    if( h < 0 )
                        do h += 6; while( h < 0 );
                    else if( h >= 6 )
                        do h -= 6; while( h >= 6 );

                    sector = convert_int_sat_rtn(h);
                    h -= sector;

                    tab[0] = p2;
                    tab[1] = p1;
                    tab[2] = fma(p2 - p1, 1-h, p1);
                    tab[3] = fma(p2 - p1, h, p1);

                    b = tab[sector_data[sector][0]];
                    g = tab[sector_data[sector][1]];
                    r = tab[sector_data[sector][2]];
                }
                else
                    b = g = r = l;

                dst[bidx] = b;
                dst[1] = g;
                dst[bidx^2] = r;
#if dcn == 4
                dst[3] = MAX_NUM;
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
#ifdef DEPTH_0
// Premultiplies the RGB channels of an 8-bit RGBA image by alpha:
// dst.rgb = (src.rgb * alpha + 128) / 255 (rounded), alpha is copied through.
// Each work item covers one column over PIX_PER_WI_Y consecutive rows.
__kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * PIX_PER_WI_Y;

    if (col >= cols)
        return;

    int sidx = mad24(row, src_step, src_offset + (col << 2));
    int didx = mad24(row, dst_step, dst_offset + (col << 2));

    #pragma unroll
    for (int i = 0; i < PIX_PER_WI_Y; ++i)
    {
        if (row >= rows)
            break;

        const uchar4 pix = *(__global const uchar4 *)(src + sidx);
        const uchar4 res = (uchar4)(mad24(pix.x, pix.w, HALF_MAX_NUM) / MAX_NUM,
                                    mad24(pix.y, pix.w, HALF_MAX_NUM) / MAX_NUM,
                                    mad24(pix.z, pix.w, HALF_MAX_NUM) / MAX_NUM,
                                    pix.w);
        *(__global uchar4 *)(dst + didx) = res;

        ++row;
        sidx += src_step;
        didx += dst_step;
    }
}
// Un-premultiplies an 8-bit mRGBA image: dst.rgb = (src.rgb * 255 + a/2) / a,
// rounded; a zero alpha produces a fully transparent black pixel (no division).
// Each work item covers one column over PIX_PER_WI_Y consecutive rows.
__kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * PIX_PER_WI_Y;

    if (col >= cols)
        return;

    int sidx = mad24(row, src_step, mad24(col, 4, src_offset));
    int didx = mad24(row, dst_step, mad24(col, 4, dst_offset));

    #pragma unroll
    for (int i = 0; i < PIX_PER_WI_Y; ++i)
    {
        if (row >= rows)
            break;

        const uchar4 pix = *(__global const uchar4 *)(src + sidx);
        const uchar alpha = pix.w;
        uchar4 res;

        if (alpha == 0)
            res = (uchar4)(0, 0, 0, 0);
        else
        {
            const uchar rounding = alpha / 2;  // round-to-nearest bias
            res = (uchar4)(mad24(pix.x, MAX_NUM, rounding) / alpha,
                           mad24(pix.y, MAX_NUM, rounding) / alpha,
                           mad24(pix.z, MAX_NUM, rounding) / alpha,
                           alpha);
        }
        *(__global uchar4 *)(dst + didx) = res;

        ++row;
        sidx += src_step;
        didx += dst_step;
    }
}
#endif
/////////////////////////////////// [l|s]RGB <-> Lab ///////////////////////////
// Fixed-point / gamma-table parameters for the integer Lab path.
// lab_shift reuses the XYZ fixed-point shift (xyz_shift is supplied by the
// build options / shared header — not visible in this file).
#define lab_shift xyz_shift
// extra bits of precision carried by the integer gamma LUT
#define gamma_shift 3
// total right-shift applied after the gamma LUT has been folded in
#define lab_shift2 (lab_shift + gamma_shift)
// number of spline intervals in the float gamma table
#define GAMMA_TAB_SIZE 1024
// maps a value in [0,1] to a gamma-table index
#define GammaTabScale (float)GAMMA_TAB_SIZE
// Evaluate a cubic spline stored as 4 coefficients per unit interval:
// tab[4*i .. 4*i+3] hold c0..c3 for interval [i, i+1); the result is
// c0 + t*(c1 + t*(c2 + t*c3)), where t is the fractional part of x and the
// interval index is clamped to [0, n-1].
inline float splineInterpolate(float x, __global const float * tab, int n)
{
    const int ix = clamp(convert_int_sat_rtn(x), 0, n - 1);
    const float t = x - ix;
    __global const float * c = tab + (ix << 2);
    return fma(fma(fma(c[3], t, c[2]), t, c[1]), t, c[0]);
}
#ifdef DEPTH_0
// 8-bit BGR -> Lab, fixed-point path.
// gammaTab     : integer gamma LUT (identity or inverse sRGB gamma, built on the host);
// LabCbrtTab_b : LUT of the cube-root-like f() curve for the descaled XYZ values;
// coeffs       : 3x3 color matrix in Q(lab_shift) fixed point (row order chosen
//                by the host to match the input channel order);
// Lscale/Lshift: host-precomputed so that L = (Lscale*fY + Lshift) >> lab_shift2.
// Output: dst[0]=L, dst[1]=a+128, dst[2]=b+128, each saturated to uchar.
__kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
                      __global const ushort * gammaTab, __global ushort * LabCbrtTab_b,
                      __constant int * coeffs, int Lscale, int Lshift)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        // scnbytes / dcnbytes are bytes-per-pixel compile-time constants
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const uchar* src_ptr = src + src_index;
                __global uchar* dst_ptr = dst + dst_index;
                uchar4 src_pix = vload4(0, src_ptr);
                int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                    C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                    C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
                // linearize the channels through the gamma LUT
                int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];
                // fX/fY/fZ = f(XYZ) looked up on the descaled matrix products
                int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)];
                int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)];
                int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)];
                int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
                // a and b are biased by +128 (in lab_shift2 fixed point) to fit uchar
                int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 );
                int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 );
                dst_ptr[0] = SAT_CAST(L);
                dst_ptr[1] = SAT_CAST(a);
                dst_ptr[2] = SAT_CAST(b);
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// 32-bit float BGR -> Lab (reference float path).
// With SRGB defined, channels are first linearized through the spline gamma
// table. coeffs is the white-point-normalized color matrix; _a = 16/116, the
// offset of the linear branch of f(). (_1_3 is passed by the host but not
// referenced in this kernel body.)
__kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _1_3, float _a)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);
                float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                      C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                      C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
                // clamp input to the valid [0,1] range before the gamma/matrix stage
                float R = clamp(src_pix.x, 0.0f, 1.0f);
                float G = clamp(src_pix.y, 0.0f, 1.0f);
                float B = clamp(src_pix.z, 0.0f, 1.0f);
#ifdef SRGB
                R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
                float X = fma(R, C0, fma(G, C1, B*C2));
                float Y = fma(R, C3, fma(G, C4, B*C5));
                float Z = fma(R, C6, fma(G, C7, B*C8));
                // f(t) = t^(1/3) above the (6/29)^3 threshold, linear segment below
                float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);
                float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);
                float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);
                float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);
                float a = 500.f * (FX - FY);
                float b = 200.f * (FY - FZ);
                dst[0] = L;
                dst[1] = a;
                dst[2] = b;
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
// Shared Lab -> RGB helper used by both the uchar and float Lab2BGR kernels.
// srcbuf = {L, a, b}; dstbuf receives the three output channels, each clamped
// to [0,1] (and gamma-encoded when SRGB is defined). lThresh / fThresh are the
// host-supplied thresholds separating the linear and cubic branches of the
// inverse f() function.
inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    float li = srcbuf[0], ai = srcbuf[1], bi = srcbuf[2];
    float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
          C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
          C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
    float y, fy;
    // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
    if (li <= lThresh)
    {
        // linear (low-luminance) branch of the inverse transform
        y = li / 903.3f;
        fy = fma(7.787f, y, 16.0f / 116.0f);
    }
    else
    {
        // cubic branch: y = ((L+16)/116)^3
        fy = (li + 16.0f) / 116.0f;
        y = fy * fy * fy;
    }
    // recover f(x) and f(z) from a and b, then invert f() per component
    float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
#pragma unroll
    for (int j = 0; j < 2; j++)
        if (fxz[j] <= fThresh)
            fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
        else
            fxz[j] = fxz[j] * fxz[j] * fxz[j];
    float x = fxz[0], z = fxz[1];
    // XYZ -> RGB matrix, clamped to the displayable [0,1] range
    float ro = clamp(fma(C0, x, fma(C1, y, C2 * z)), 0.0f, 1.0f);
    float go = clamp(fma(C3, x, fma(C4, y, C5 * z)), 0.0f, 1.0f);
    float bo = clamp(fma(C6, x, fma(C7, y, C8 * z)), 0.0f, 1.0f);
#ifdef SRGB
    ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
    go = splineInterpolate(go * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
    bo = splineInterpolate(bo * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
    dstbuf[0] = ro, dstbuf[1] = go, dstbuf[2] = bo;
}
#ifdef DEPTH_0
// 8-bit Lab -> BGR/BGRA.
// Input decoding: L = byte * 100/255, a = byte - 128, b = byte - 128; the
// shared float helper Lab2BGR_f performs the conversion and the [0,1] result
// is scaled back to bytes (alpha = MAX_NUM for 4-channel output).
__kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const uchar* src_ptr = src + src_index;
                __global uchar * dst_ptr = dst + dst_index;
                uchar4 src_pix = vload4(0, src_ptr);
                float srcbuf[3], dstbuf[3];
                // undo the byte packing used by the forward conversion
                srcbuf[0] = src_pix.x*(100.f/255.f);
                srcbuf[1] = convert_float(src_pix.y - 128);
                srcbuf[2] = convert_float(src_pix.z - 128);
                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
                          gammaTab,
#endif
                          coeffs, lThresh, fThresh);
#if dcn == 3
                dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
                dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
                dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
#else
                // 4-channel output written as a single vector store, alpha = MAX_NUM
                *(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),
                    SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#elif defined DEPTH_5
// 32-bit float Lab -> BGR/BGRA: thin wrapper that feeds the pixel straight
// into the shared Lab2BGR_f helper (no decoding step needed for floats).
__kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
        {
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float4 src_pix = vload4(0, src);
                float srcbuf[3], dstbuf[3];
                srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;
                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
                          gammaTab,
#endif
                          coeffs, lThresh, fThresh);
                dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
#if dcn == 4
                dst[3] = MAX_NUM;   // opaque alpha for 4-channel output
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
        }
    }
}
#endif
/////////////////////////////////// [l|s]RGB <-> Luv ///////////////////////////
// Spline table of the cube-root-like f() curve, sampled on [0, 1.5]
#define LAB_CBRT_TAB_SIZE 1024
// size of the integer cube-root table built on the host (8-bit path)
#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
// maps a value in [0, 1.5] to a LabCbrtTab spline-interval index
__constant float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;
#ifdef DEPTH_5
// 32-bit float BGR -> Luv.
// L comes from the precomputed f() spline table; u and v follow the standard
// CIE Luv chromaticity formulas, with the constant factors (including the 13x
// multiplier and the white-point terms _un, _vn) folded in on the host —
// note 52 = 4*13 and 2.25*52 = 117 = 9*13.
__kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float R = src[0], G = src[1], B = src[2];
                // clamp to the valid input range before gamma/matrix stages
                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);
#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
                float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
                float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
                // L = 116 * f(Y) - 16, with f() read from the spline table
                float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
                L = fma(116.f, L, -16.f);
                // d = 52 / (X + 15Y + 3Z); FLT_EPSILON guards the black pixel
                float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
                float u = L*fma(X, d, -_un);
                float v = L*fma(2.25f, Y*d, -_vn);
                dst[0] = L;
                dst[1] = u;
                dst[2] = v;
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
    }
}
#elif defined DEPTH_0
// 8-bit BGR -> Luv: same math as the float kernel, plus byte packing of the
// result so that L maps to [0,255], u (range [-134,220]) and v (range
// [-140,122]) are linearly rescaled to fit a uchar.
__kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
        dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                // bytes -> [0,1] floats
                float scale = 1.0f / 255.0f;
                float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale;
#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
                float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
                float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
                float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
                L = 116.f*L - 16.f;
                // 4*13 = 52; FLT_EPSILON guards division for the black pixel
                float d = (4*13) / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
                float u = L*(X*d - _un);
                float v = L*fma(2.25f, Y*d, -_vn);
                // 2.55 = 255/100 rescales L from [0,100] to [0,255]
                dst[0] = SAT_CAST(L * 2.55f);
                //0.72033 = 255/(220+134), 96.525 = 134*255/(220+134)
                dst[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f));
                //0.9732 = 255/(140+122), 136.259 = 140*255/(140+122)
                dst[2] = SAT_CAST(fma(v, 0.9732824427480916f, 136.259541984732824f));
                ++y;
                dst += dst_step;
                src += src_step;
            }
    }
}
#endif
#ifdef DEPTH_5
// 32-bit float Luv -> BGR/BGRA.
// Inverts the forward transform: Y from L (cubic branch for L >= 8, linear
// below), then X and Z from the chromaticity terms (white-point factors _un,
// _vn are pre-scaled on the host to match the forward kernel), then the
// XYZ->RGB matrix, clamp to [0,1] and optional sRGB gamma encoding.
__kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                __global const float * src = (__global const float *)(srcptr + src_index);
                __global float * dst = (__global float *)(dstptr + dst_index);
                float L = src[0], u = src[1], v = src[2], X, Y, Z;
                if(L >= 8)
                {
                    // cubic branch: Y = ((L+16)/116)^3
                    Y = fma(L, 1.f/116.f, 16.f/116.f);
                    Y = Y*Y*Y;
                }
                else
                {
                    Y = L * (1.0f/903.3f); // L*(3./29.)^3
                }
                float up = 3.f*fma(L, _un, u);
                // vp is clamped to keep X and Z finite near the singularity
                float vp = 0.25f/fma(L, _vn, v);
                vp = clamp(vp, -0.25f, 0.25f);
                X = 3.f*Y*up*vp;
                Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);
                float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
                float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
                float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);
#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                dst[0] = R;
                dst[1] = G;
                dst[2] = B;
#if dcn == 4
                dst[3] = MAX_NUM;   // opaque alpha for 4-channel output
#endif
                ++y;
                dst_index += dst_step;
                src_index += src_step;
            }
    }
}
#elif defined DEPTH_0
// 8-bit Luv -> BGR/BGRA: decodes the byte packing used by the 8-bit forward
// kernel (L in [0,100], u in [-134,220], v in [-140,122]), runs the same
// float inverse transform as the 32-bit kernel, then rescales to bytes.
// Unlike the float path, X and Z are additionally clamped to [0,2] so that
// out-of-gamut byte inputs stay near the white point.
__kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _un, float _vn)
{
    int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;
    if (x < cols)
    {
        src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
        dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
#pragma unroll
        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
            if (y < rows)
            {
                float d, X, Y, Z;
                float L = src[0]*(100.f/255.f);
                // 1.388235294117647 = (220+134)/255
                float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f);
                // 1.027450980392157 = (140+122)/255
                float v = fma(convert_float(src[2]), 1.027450980392157f, - 140.f);
                if(L >= 8)
                {
                    // cubic branch: Y = ((L+16)/116)^3
                    Y = fma(L, 1.f/116.f, 16.f/116.f);
                    Y = Y*Y*Y;
                }
                else
                {
                    Y = L * (1.0f/903.3f); // L*(3./29.)^3
                }
                float up = 3.f*fma(L, _un, u);
                // vp is clamped to keep X and Z finite near the singularity
                float vp = 0.25f/fma(L, _vn, v);
                vp = clamp(vp, -0.25f, 0.25f);
                X = 3.f*Y*up*vp;
                Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);
                //limit X, Y, Z to [0, 2] to fit white point
                X = clamp(X, 0.f, 2.f); Z = clamp(Z, 0.f, 2.f);
                float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
                float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
                float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
                R = clamp(R, 0.f, 1.f);
                G = clamp(G, 0.f, 1.f);
                B = clamp(B, 0.f, 1.f);
#ifdef SRGB
                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
                uchar dst0 = SAT_CAST(R * 255.0f);
                uchar dst1 = SAT_CAST(G * 255.0f);
                uchar dst2 = SAT_CAST(B * 255.0f);
#if dcn == 4
                // 4-channel output written as one vector store, alpha = MAX_NUM
                *(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM);
#else
                dst[0] = dst0;
                dst[1] = dst1;
                dst[2] = dst2;
#endif
                ++y;
                dst += dst_step;
                src += src_step;
            }
    }
}
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment