/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Matrix arithmetic and logical operations: +, -, *, /, &, |, ^, ~, abs ...
//
// */

#include "precomp.hpp"

namespace cv
{

#if CV_SSE2

// CPU feature required at run time by the vectorized kernels in this branch;
// the bitwise kernels pass it to checkHardwareSupport() before using them.
enum { ARITHM_SIMD = CV_CPU_SSE2 };
    
template<class Op8> struct VBinOp8
{
    int operator()(const uchar* src1, const uchar* src2, uchar* dst, int len) const
    {
        int x = 0;
        for( ; x <= len - 32; x += 32 )
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)(src1 + x));
            __m128i r1 = _mm_loadu_si128((const __m128i*)(src1 + x + 16));
            r0 = op(r0,_mm_loadu_si128((const __m128i*)(src2 + x)));
            r1 = op(r1,_mm_loadu_si128((const __m128i*)(src2 + x + 16)));
            _mm_storeu_si128((__m128i*)(dst + x), r0);
            _mm_storeu_si128((__m128i*)(dst + x + 16), r1);
        }
        for( ; x <= len - 8; x += 8 )
        {
            __m128i r0 = _mm_loadl_epi64((const __m128i*)(src1 + x));
            r0 = op(r0,_mm_loadl_epi64((const __m128i*)(src2 + x)));
            _mm_storel_epi64((__m128i*)(dst + x), r0);
        }
        return x;
    }
    Op8 op;
};

template<typename T, class Op16> struct VBinOp16
{
    int operator()(const T* src1, const T* src2, T* dst, int len) const
    {
        int x = 0;
        for( ; x <= len - 16; x += 16 )
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)(src1 + x));
            __m128i r1 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
            r0 = op(r0,_mm_loadu_si128((const __m128i*)(src2 + x)));
            r1 = op(r1,_mm_loadu_si128((const __m128i*)(src2 + x + 8)));
            _mm_storeu_si128((__m128i*)(dst + x), r0);
            _mm_storeu_si128((__m128i*)(dst + x + 8), r1);
        }
        for( ; x <= len - 4; x += 4 )
        {
            __m128i r0 = _mm_loadl_epi64((const __m128i*)(src1 + x));
            r0 = op(r0,_mm_loadl_epi64((const __m128i*)(src2 + x)));
            _mm_storel_epi64((__m128i*)(dst + x), r0);
        }
        return x;
    }
    Op16 op;
};

template<class Op32f> struct VBinOp32f
{
    int operator()(const float* src1, const float* src2, float* dst, int len) const
    {
        int x = 0;
        if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            for( ; x <= len - 8; x += 8 )
            {
                __m128 r0 = _mm_load_ps(src1 + x);
                __m128 r1 = _mm_load_ps(src1 + x + 4);
                r0 = op(r0,_mm_load_ps(src2 + x));
                r1 = op(r1,_mm_load_ps(src2 + x + 4));
                _mm_store_ps(dst + x, r0);
                _mm_store_ps(dst + x + 4, r1);
            }
        else
            for( ; x <= len - 8; x += 8 )
            {
                __m128 r0 = _mm_loadu_ps(src1 + x);
                __m128 r1 = _mm_loadu_ps(src1 + x + 4);
                r0 = op(r0,_mm_loadu_ps(src2 + x));
                r1 = op(r1,_mm_loadu_ps(src2 + x + 4));
                _mm_storeu_ps(dst + x, r0);
                _mm_storeu_ps(dst + x + 4, r1);
            }
        return x;
    }
    Op32f op;
};

// Register-level kernels for unsigned 8-bit lanes.

// saturating unsigned addition
struct _VAdd8u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_adds_epu8(a,b); }
};

// saturating unsigned subtraction
struct _VSub8u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_subs_epu8(a,b); }
};

// per-lane unsigned minimum
struct _VMin8u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_min_epu8(a,b); }
};

// per-lane unsigned maximum
struct _VMax8u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_max_epu8(a,b); }
};

// unsigned "a > b": the byte comparison intrinsic is signed, so the top bit of
// every byte of both operands is flipped first
struct _VCmpGT8u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        const __m128i signflip = _mm_set1_epi32(0x80808080);
        const __m128i a_biased = _mm_xor_si128(a,signflip);
        const __m128i b_biased = _mm_xor_si128(b,signflip);
        return _mm_cmpgt_epi8(a_biased,b_biased);
    }
};

// per-lane equality
struct _VCmpEQ8u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_cmpeq_epi8(a,b); }
};

// |a - b|: the two saturating differences are summed (one of them is zero)
struct _VAbsDiff8u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        const __m128i a_minus_b = _mm_subs_epu8(a,b);
        const __m128i b_minus_a = _mm_subs_epu8(b,a);
        return _mm_add_epi8(a_minus_b,b_minus_a);
    }
};
// Register-level kernels for unsigned 16-bit lanes.

// saturating unsigned addition
struct _VAdd16u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_adds_epu16(a,b); }
};

// saturating unsigned subtraction
struct _VSub16u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_subs_epu16(a,b); }
};

// min(a,b) built from saturating subtraction: a - max(a - b, 0)
struct _VMin16u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        const __m128i excess = _mm_subs_epu16(a,b);
        return _mm_subs_epu16(a,excess);
    }
};

// max(a,b) built from saturating arithmetic: max(a - b, 0) + b
struct _VMax16u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        const __m128i excess = _mm_subs_epu16(a,b);
        return _mm_adds_epu16(excess,b);
    }
};

// |a - b|: the two saturating differences are summed (one of them is zero)
struct _VAbsDiff16u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        const __m128i a_minus_b = _mm_subs_epu16(a,b);
        const __m128i b_minus_a = _mm_subs_epu16(b,a);
        return _mm_add_epi16(a_minus_b,b_minus_a);
    }
};
// Register-level kernels for signed 16-bit lanes.

// saturating signed addition
struct _VAdd16s
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_adds_epi16(a,b); }
};

// saturating signed subtraction
struct _VSub16s
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_subs_epi16(a,b); }
};

// per-lane signed minimum
struct _VMin16s
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_min_epi16(a,b); }
};

// per-lane signed maximum
struct _VMax16s
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_max_epi16(a,b); }
};

// |a - b| computed as a saturating (larger - smaller)
struct _VAbsDiff16s
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        const __m128i larger = _mm_max_epi16(a,b);
        const __m128i smaller = _mm_min_epi16(a,b);
        return _mm_subs_epi16(larger, smaller);
    }
};
// Register-level kernels for packed single-precision floats.

struct _VAdd32f
{
    __m128 operator()(const __m128& a, const __m128& b) const
    { return _mm_add_ps(a,b); }
};

struct _VSub32f
{
    __m128 operator()(const __m128& a, const __m128& b) const
    { return _mm_sub_ps(a,b); }
};

struct _VMin32f
{
    __m128 operator()(const __m128& a, const __m128& b) const
    { return _mm_min_ps(a,b); }
};

struct _VMax32f
{
    __m128 operator()(const __m128& a, const __m128& b) const
    { return _mm_max_ps(a,b); }
};

// 16-byte aligned mask with every bit set except the top bit of each 32-bit lane
static int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };

// |a - b|: the difference is ANDed with v32f_absmask, clearing the top bit of each lane
struct _VAbsDiff32f
{
    __m128 operator()(const __m128& a, const __m128& b) const
    {
        const __m128 diff = _mm_sub_ps(a,b);
        return _mm_and_ps(diff, *(const __m128*)v32f_absmask);
    }
};

// Register-level bitwise kernels; they act on all 128 bits regardless of lane width.
struct _VAnd8u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_and_si128(a,b); }
};

struct _VOr8u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_or_si128(a,b); }
};

struct _VXor8u
{
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_xor_si128(a,b); }
};

// Row-level vectorized operations: each typedef binds one of the row drivers
// (VBinOp8 / VBinOp16 / VBinOp32f) to the matching register-level kernel.
// The same names are defined as NoVec in the non-SSE2 branch.

// 8-bit unsigned
typedef VBinOp8<_VAdd8u> VAdd8u;
typedef VBinOp8<_VSub8u> VSub8u;
typedef VBinOp8<_VMin8u> VMin8u;
typedef VBinOp8<_VMax8u> VMax8u;
typedef VBinOp8<_VAbsDiff8u> VAbsDiff8u;
typedef VBinOp8<_VCmpEQ8u> VCmpEQ8u;
typedef VBinOp8<_VCmpGT8u> VCmpGT8u;

// 16-bit unsigned
typedef VBinOp16<ushort, _VAdd16u> VAdd16u;
typedef VBinOp16<ushort, _VSub16u> VSub16u;
typedef VBinOp16<ushort, _VMin16u> VMin16u;
typedef VBinOp16<ushort, _VMax16u> VMax16u;
typedef VBinOp16<ushort, _VAbsDiff16u> VAbsDiff16u;

// 16-bit signed
typedef VBinOp16<short, _VAdd16s> VAdd16s;
typedef VBinOp16<short, _VSub16s> VSub16s;
typedef VBinOp16<short, _VMin16s> VMin16s;
typedef VBinOp16<short, _VMax16s> VMax16s;
typedef VBinOp16<short, _VAbsDiff16s> VAbsDiff16s;

// 32-bit float
typedef VBinOp32f<_VAdd32f> VAdd32f;
typedef VBinOp32f<_VSub32f> VSub32f;
typedef VBinOp32f<_VMin32f> VMin32f;
typedef VBinOp32f<_VMax32f> VMax32f;
typedef VBinOp32f<_VAbsDiff32f> VAbsDiff32f;

// bitwise operations, driven byte-wise
typedef VBinOp8<_VAnd8u> VAnd8u;
typedef VBinOp8<_VOr8u> VOr8u;
typedef VBinOp8<_VXor8u> VXor8u;

#else

// Fallback when CV_SSE2 is not enabled at compile time: no SIMD feature is requested.
enum { ARITHM_SIMD = CV_CPU_NONE };    
    
// Without SSE2 every vectorized row operation is an alias of NoVec, so code
// using these names compiles unchanged in both configurations.
// NOTE(review): NoVec is declared outside this chunk; presumably it processes
// zero elements so the scalar loops do all the work - confirm.

// 8-bit unsigned
typedef NoVec VAdd8u;
typedef NoVec VSub8u;
typedef NoVec VMin8u;
typedef NoVec VMax8u;
typedef NoVec VAbsDiff8u;
typedef NoVec VCmpEQ8u;
typedef NoVec VCmpGT8u;

// 16-bit unsigned
typedef NoVec VAdd16u;
typedef NoVec VSub16u;
typedef NoVec VMin16u;
typedef NoVec VMax16u;
typedef NoVec VAbsDiff16u;

// 16-bit signed
typedef NoVec VAdd16s;
typedef NoVec VSub16s;
typedef NoVec VMin16s;
typedef NoVec VMax16s;
typedef NoVec VAbsDiff16s;

// 32-bit float
typedef NoVec VAdd32f;
typedef NoVec VSub32f;
typedef NoVec VMin32f;
typedef NoVec VMax32f;
typedef NoVec VAbsDiff32f;

// bitwise operations
typedef NoVec VAnd8u;
typedef NoVec VOr8u;
typedef NoVec VXor8u;

#endif

/****************************************************************************************\
*                                   logical operations                                   *
\****************************************************************************************/

// Functor returning the bitwise AND of its two arguments.
// type1/type2 name the operand types and rtype the result type (all T).
template<typename T> struct AndOp
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;

    T operator()( T a, T b ) const
    {
        const T conjunction = a & b;
        return conjunction;
    }
};

// Functor returning the bitwise OR of its two arguments.
// type1/type2 name the operand types and rtype the result type (all T).
template<typename T> struct OrOp
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;

    T operator()( T a, T b ) const
    {
        const T disjunction = a | b;
        return disjunction;
    }
};

// Functor returning the bitwise exclusive OR of its two arguments.
// type1/type2 name the operand types and rtype the result type (all T).
template<typename T> struct XorOp
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;

    T operator()( T a, T b ) const
    {
        const T difference = a ^ b;
        return difference;
    }
};

template<class OPB, class OPI, class OPV> static void
bitwiseOp_( const Mat& srcmat1, const Mat& srcmat2, Mat& dstmat )
{
    OPB opb; OPI opi; OPV opv;
    const uchar* src1 = srcmat1.data;
    const uchar* src2 = srcmat2.data;
    uchar* dst = dstmat.data;
    size_t step1 = srcmat1.step, step2 = srcmat2.step, step = dstmat.step;
    Size size = getContinuousSize( srcmat1, srcmat2, dstmat, (int)srcmat1.elemSize() );
    bool useSIMD = checkHardwareSupport(ARITHM_SIMD);

    for( ; size.height--; src1 += step1, src2 += step2, dst += step )
    {
        int i = useSIMD ? opv(src1, src2, dst, size.width) : 0;

        if( (((size_t)src1 | (size_t)src2 | (size_t)dst) & 3) == 0 )
        {
            for( ; i <= size.width - 16; i += 16 )
            {
                int t0 = opi(((const int*)(src1+i))[0], ((const int*)(src2+i))[0]);
                int t1 = opi(((const int*)(src1+i))[1], ((const int*)(src2+i))[1]);

                ((int*)(dst+i))[0] = t0;
                ((int*)(dst+i))[1] = t1;

                t0 = opi(((const int*)(src1+i))[2], ((const int*)(src2+i))[2]);
                t1 = opi(((const int*)(src1+i))[3], ((const int*)(src2+i))[3]);

                ((int*)(dst+i))[2] = t0;
                ((int*)(dst+i))[3] = t1;
            }

            for( ; i <= size.width - 4; i += 4 )
            {
                int t = opi(*(const int*)(src1+i), *(const int*)(src2+i));
                *(int*)(dst+i) = t;
            }
        }

        for( ; i < size.width; i++ )
            dst[i] = opb(src1[i], src2[i]);
    }
}


template<class OPB, class OPI, class OPV> static void
bitwiseSOp_( const Mat& srcmat, Mat& dstmat, const Scalar& _scalar )
{
    OPB opb; OPI opi; OPV opv;
    const uchar* src0 = srcmat.data;
    uchar* dst0 = dstmat.data;
    size_t step1 = srcmat.step, step = dstmat.step;
    Size size = getContinuousSize( srcmat, dstmat, (int)srcmat.elemSize() );
    const int delta = 96;
    uchar scalar[delta];
    scalarToRawData(_scalar, scalar, srcmat.type(), (int)(delta/srcmat.elemSize1()) );
    bool useSIMD = checkHardwareSupport(ARITHM_SIMD);

    for( ; size.height--; src0 += step1, dst0 += step )
    {
        const uchar* src = (const uchar*)src0;
        uchar* dst = dst0;
        int i, len = size.width;

        if( (((size_t)src|(size_t)dst) & 3) == 0 )
        {
            while( (len -= delta) >= 0 )
            {
                i = useSIMD ? opv(src, scalar, dst, delta) : 0;
                for( ; i < delta; i += 16 )
                {
                    int t0 = opi(((const int*)(src+i))[0], ((const int*)(scalar+i))[0]);
                    int t1 = opi(((const int*)(src+i))[1], ((const int*)(scalar+i))[1]);
                    ((int*)(dst+i))[0] = t0;
                    ((int*)(dst+i))[1] = t1;

                    t0 = opi(((const int*)(src+i))[2], ((const int*)(scalar+i))[2]);
                    t1 = opi(((const int*)(src+i))[3], ((const int*)(scalar+i))[3]);
                    ((int*)(dst+i))[2] = t0;
                    ((int*)(dst+i))[3] = t1;
                }
                src += delta;
                dst += delta;
            }
        }
        else
        {
            while( (len -= delta) >= 0 )
            {
                for( i = 0; i < delta; i += 4 )
                {
                    uchar t0 = opb(src[i], scalar[i]);
                    uchar t1 = opb(src[i+1], scalar[i+1]);
                    dst[i] = t0; dst[i+1] = t1;

                    t0 = opb(src[i+2], scalar[i+2]);
                    t1 = opb(src[i+3], scalar[i+3]);
                    dst[i+2] = t0; dst[i+3] = t1;
                }
                src += delta;
                dst += delta;
            }
        }

        for( len += delta, i = 0; i < len; i++ )
            dst[i] = opb(src[i],scalar[i]);
    }
}

static void
binaryMaskOp( const Mat& src1, const Mat& src2, Mat& dst,
              const Mat& mask, BinaryFunc func )
{
    CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() && func != 0 );
    dst.create( src1.size(), src1.type() );

    if( !mask.data )
        func(src1, src2, dst);
    else
    {
        AutoBuffer<uchar> buf;
        size_t esz = dst.elemSize(), buf_step = dst.cols*esz;
        CopyMaskFunc copym_func = getCopyMaskFunc((int)esz);
        int y, dy;

        CV_Assert(mask.type() == CV_8UC1 && mask.size() == dst.size());
        dy = std::min(std::max((int)(CV_MAX_LOCAL_SIZE/buf_step), 1), dst.rows);
        buf.allocate( buf_step*dy );

        for( y = 0; y < dst.rows; y += dy )
        {
            dy = std::min(dy, dst.rows - y);
            Mat dstpart = dst.rowRange(y, y + dy);
            Mat temp(dy, dst.cols, dst.type(), (uchar*)buf );
            func( src1.rowRange(y, y + dy), src2.rowRange(y, y + dy), temp );
            copym_func( temp, dstpart, mask.rowRange(y, y + dy) );
        }
    }
}


static void
binarySMaskOp( const Mat& src1, const Scalar& s, Mat& dst,
               const Mat& mask, BinarySFuncCn func )
{
    CV_Assert( func != 0 );
    dst.create( src1.size(), src1.type() );

    if( !mask.data )
        func(src1, dst, s);
    else
    {
        AutoBuffer<uchar> buf;
        size_t esz = dst.elemSize(), buf_step = dst.cols*esz;
        CopyMaskFunc copym_func = getCopyMaskFunc((int)esz);
        int y, dy;

        CV_Assert(mask.type() == CV_8UC1 && mask.size() == dst.size());
        dy = std::min(std::max((int)(CV_MAX_LOCAL_SIZE/buf_step), 1), dst.rows);
        buf.allocate( buf_step*dy );

        for( y = 0; y < dst.rows; y += dy )
        {
            dy = std::min(dy, dst.rows - y);
            Mat dstpart = dst.rowRange(y, y + dy);
            Mat temp(dy, dst.cols, dst.type(), (uchar*)buf);
            func( src1.rowRange(y, y + dy), temp, s );
            copym_func( temp, dstpart, mask.rowRange(y, y + dy) );
        }
    }
}


void bitwise_and(const Mat& a, const Mat& b, Mat& c, const Mat& mask)
{
    binaryMaskOp(a, b, c, mask, bitwiseOp_<AndOp<uchar>, AndOp<int>, VAnd8u>);
}

void bitwise_or(const Mat& a, const Mat& b, Mat& c, const Mat& mask)
{
    binaryMaskOp(a, b, c, mask, bitwiseOp_<OrOp<uchar>, OrOp<int>, VOr8u>);
}

void bitwise_xor(const Mat& a, const Mat& b, Mat& c, const Mat& mask)
{
    binaryMaskOp(a, b, c, mask, bitwiseOp_<XorOp<uchar>, XorOp<int>, VXor8u>);
}

void bitwise_and(const Mat& a, const Scalar& s, Mat& c, const Mat& mask)
{
    binarySMaskOp(a, s, c, mask,
        bitwiseSOp_<AndOp<uchar>, AndOp<int>, VAnd8u>);
}

void bitwise_or(const Mat& a, const Scalar& s, Mat& c, const Mat& mask)
{
    binarySMaskOp(a, s, c, mask,
        bitwiseSOp_<OrOp<uchar>, OrOp<int>, VOr8u>);
}

void bitwise_xor(const Mat& a, const Scalar& s, Mat& c, const Mat& mask)
{
    binarySMaskOp(a, s, c, mask,
        bitwiseSOp_<XorOp<uchar>, XorOp<int>, VXor8u>);
}


// Inverts every bit of src into dst (dst is created with the size and type of src).
// Rows are handled as raw bytes: 32-bit words are used when both row pointers are
// 4-byte aligned, and the remaining bytes are inverted one by one.
void bitwise_not(const Mat& src, Mat& dst)
{
    // the source pointer is taken before dst is (re)created, as in-place calls share data
    const uchar* sptr = src.data;
    dst.create( src.size(), src.type() );
    uchar* dptr = dst.data;
    Size size = getContinuousSize( src, dst, (int)src.elemSize() );

    while( size.height-- )
    {
        int pos = 0;
        const bool wordAligned = (((size_t)sptr | (size_t)dptr) & 3) == 0;

        if( wordAligned )
        {
            // four words per iteration
            while( pos <= size.width - 16 )
            {
                const int* in = (const int*)(sptr+pos);
                int* out = (int*)(dptr+pos);

                int w0 = ~in[0];
                int w1 = ~in[1];
                out[0] = w0;
                out[1] = w1;

                w0 = ~in[2];
                w1 = ~in[3];
                out[2] = w0;
                out[3] = w1;

                pos += 16;
            }

            // single words
            while( pos <= size.width - 4 )
            {
                *(int*)(dptr+pos) = ~*(const int*)(sptr+pos);
                pos += 4;
            }
        }

        // trailing bytes (or the whole row when it is not word aligned)
        while( pos < size.width )
        {
            dptr[pos] = (uchar)(~sptr[pos]);
            pos++;
        }

        sptr += src.step;
        dptr += dst.step;
    }
}

/****************************************************************************************\
*                                      add/subtract                                      *
\****************************************************************************************/

// 8-bit specializations of the add/subtract functors: the sum or difference is
// formed in int (integral promotion of uchar) and mapped back to uchar by
// CV_FAST_CAST_8U.
// NOTE(review): CV_FAST_CAST_8U is defined outside this chunk; presumably a
// table-based saturating cast valid for the range a+b / a-b can take - confirm.
template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a + b); }
template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a - b); }

// Addition kernels indexed by matrix depth code (8 entries).
// Entries 1 and 7 are null: callers must reject a null kernel before calling it.
// Only the 8u, 16u, 16s and 32f kernels get a vectorized helper; int and double use NoVec.
static BinaryFunc addTab[] =
{
    binaryOpC1_<OpAdd<uchar>,VAdd8u>, 0,
    binaryOpC1_<OpAdd<ushort>,VAdd16u>,
    binaryOpC1_<OpAdd<short>,VAdd16s>,
    binaryOpC1_<OpAdd<int>,NoVec>,
    binaryOpC1_<OpAdd<float>,VAdd32f>,
    binaryOpC1_<OpAdd<double>,NoVec>, 0
};

// Subtraction kernels indexed by matrix depth code (8 entries).
// Entries 1 and 7 are null: callers must reject a null kernel before calling it.
// Only the 8u, 16u, 16s and 32f kernels get a vectorized helper; int and double use NoVec.
static BinaryFunc subTab[] =
{
    binaryOpC1_<OpSub<uchar>,VSub8u>, 0,
    binaryOpC1_<OpSub<ushort>,VSub16u>,
    binaryOpC1_<OpSub<short>,VSub16s>,
    binaryOpC1_<OpSub<int>,NoVec>,
    binaryOpC1_<OpSub<float>,VSub32f>,
    binaryOpC1_<OpSub<double>,NoVec>, 0
};


// Adds two matrices element by element into dst (no mask).
// src1 and src2 must have the same size and type; the kernel is looked up in addTab
// by the depth of src1 and a null table entry fails the assertion.
// dst is created with the size and type of the inputs before the kernel runs.
void add( const Mat& src1, const Mat& src2, Mat& dst )
{
    Size size = src1.size(); int type = src1.type();
    BinaryFunc func = addTab[CV_MAT_DEPTH(type)];
    CV_Assert( size == src2.size() && type == src2.type() && func != 0 );
    dst.create( size, type );
    func(src1, src2, dst);
}

// Subtracts src2 from src1 element by element into dst (no mask).
// src1 and src2 must have the same size and type; the kernel is looked up in subTab
// by the depth of src1 and a null table entry fails the assertion.
// dst is created with the size and type of the inputs before the kernel runs.
void subtract( const Mat& src1, const Mat& src2, Mat& dst )
{
    Size size = src1.size(); int type = src1.type();
    BinaryFunc func = subTab[CV_MAT_DEPTH(type)];
    CV_Assert( size == src2.size() && type == src2.type() && func != 0 );
    dst.create( size, type );
    func(src1, src2, dst);
}

// Matrix minus scalar, implemented as matrix plus the negated scalar;
// allocation of c and mask handling happen in the add() overload it forwards to.
void subtract(const Mat& a, const Scalar& s, Mat& c, const Mat& mask)
{
    add(a, -s, c, mask);
}

void add(const Mat& src1, const Mat& src2, Mat& dst, const Mat& mask)
{
    binaryMaskOp(src1, src2, dst, mask, addTab[src1.depth()] );
}

void subtract(const Mat& src1, const Mat& src2, Mat& dst, const Mat& mask)
{
    binaryMaskOp(src1, src2, dst, mask, subTab[src1.depth()] );
}

// Adds a scalar to a matrix, optionally through a mask.
// The kernel is chosen by the depth of src1; binarySMaskOp rejects a null kernel,
// creates dst and applies the mask.
void add(const Mat& src1, const Scalar& s, Mat& dst, const Mat& mask)
{
    // one kernel per depth code; null entries mark depths without an implementation
    static BinarySFuncCn addSTab[] =
    {
        binarySOpCn_<OpAdd<uchar, int, uchar> >,
        0,
        binarySOpCn_<OpAdd<ushort, int, ushort> >,
        binarySOpCn_<OpAdd<short, int, short> >,
        binarySOpCn_<OpAdd<int> >,
        binarySOpCn_<OpAdd<float> >,
        binarySOpCn_<OpAdd<double> >,
        0
    };

    binarySMaskOp(src1, s, dst, mask, addSTab[src1.depth()]);
}

// Scalar-minus-matrix ("reverse subtraction"), optionally through a mask.
// The kernel is chosen by the depth of src1; binarySMaskOp rejects a null kernel,
// creates dst and applies the mask.
void subtract(const Scalar& s, const Mat& src1, Mat& dst, const Mat& mask)
{
    // one kernel per depth code; null entries mark depths without an implementation
    static BinarySFuncCn rsubSTab[] =
    {
        binarySOpCn_<OpRSub<uchar, int, uchar> >,
        0,
        binarySOpCn_<OpRSub<ushort, int, ushort> >,
        binarySOpCn_<OpRSub<short, int, short> >,
        binarySOpCn_<OpRSub<int> >,
        binarySOpCn_<OpRSub<float> >,
        binarySOpCn_<OpRSub<double> >,
        0
    };

    binarySMaskOp(src1, s, dst, mask, rsubSTab[src1.depth()]);
}

/****************************************************************************************\
*                                    multiply/divide                                     *
\****************************************************************************************/

// Element-wise scaled product: dst = saturate_cast<T>(scale * src1 * src2).
//   T  - element type of all three matrices
//   WT - wider floating-point type in which the products are formed
// The row width from getContinuousSize() is multiplied by the channel count, so
// every channel value is treated as an independent element.
// A scale equal to 1 (within DBL_EPSILON) takes a branch that skips the extra multiply.
template<typename T, typename WT> static void
mul_( const Mat& srcmat1, const Mat& srcmat2, Mat& dstmat, double _scale )
{
    const T* src1 = (const T*)srcmat1.data;
    const T* src2 = (const T*)srcmat2.data;
    T* dst = (T*)dstmat.data;
    // steps are stored in bytes; convert them to element units for pointer arithmetic
    size_t step1 = srcmat1.step/sizeof(src1[0]);
    size_t step2 = srcmat2.step/sizeof(src2[0]);
    size_t step = dstmat.step/sizeof(dst[0]);
    Size size = getContinuousSize( srcmat1, srcmat2, dstmat, dstmat.channels() );

    if( fabs(_scale - 1.) < DBL_EPSILON )
    {
        // The product is formed in WT, exactly like the scaled branch below.
        // Multiplying the raw T values would be done in (promoted) int, which
        // overflows for large ushort or int operands before saturate_cast sees it.
        for( ; size.height--; src1+=step1, src2+=step2, dst+=step )
        {
            int i;
            for( i = 0; i <= size.width - 4; i += 4 )
            {
                T t0 = saturate_cast<T>((WT)src1[i] * src2[i]);
                T t1 = saturate_cast<T>((WT)src1[i+1] * src2[i+1]);
                dst[i] = t0; dst[i+1] = t1;

                t0 = saturate_cast<T>((WT)src1[i+2] * src2[i+2]);
                t1 = saturate_cast<T>((WT)src1[i+3] * src2[i+3]);
                dst[i+2] = t0; dst[i+3] = t1;
            }

            for( ; i < size.width; i++ )
                dst[i] = saturate_cast<T>((WT)src1[i] * src2[i]);
        }
    }
    else
    {
        WT scale = (WT)_scale;
        for( ; size.height--; src1+=step1, src2+=step2, dst+=step )
        {
            int i;
            for( i = 0; i <= size.width - 4; i += 4 )
            {
                T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
                T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
                dst[i] = t0; dst[i+1] = t1;

                t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
                t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
                dst[i+2] = t0; dst[i+3] = t1;
            }

            for( ; i < size.width; i++ )
                dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
        }
    }
}

// Signature shared by the per-depth multiply and divide kernels:
// two source matrices, the destination and a scale factor.
typedef void (*MulDivFunc)( const Mat& src1, const Mat& src2,
                            Mat& dst, double scale );

// Element-wise scaled product of two matrices of identical size and type.
// The kernel is selected by the depth of src1; a null table entry (depth codes 1 and 7)
// fails the assertion. dst is created with the size and type of src1.
void multiply(const Mat& src1, const Mat& src2, Mat& dst, double scale)
{
    // kernels indexed by depth code; the second template argument is the working type
    static MulDivFunc tab[] =
    {
        mul_<uchar, float>, 0, mul_<ushort, float>, mul_<short, float>,
        mul_<int, double>, mul_<float, float>, mul_<double, double>, 0
    };

    MulDivFunc func = tab[src1.depth()];
    CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() && func != 0 );
    dst.create( src1.size(), src1.type() );
    func( src1, src2, dst, scale );
}


// Element-wise scaled division: dst = saturate_cast<T>(src1 * scale / src2),
// with dst = 0 wherever src2 is zero.
// The row width from getContinuousSize() is multiplied by the channel count, so
// every channel value is treated as an independent element.
//
// Elements are processed four at a time. When all four divisors are non-zero a single
// division is shared between them: with a = s0*s1, b = s2*s3 and d = scale/(a*b),
//   b*d = scale/(s0*s1)   and   a*d = scale/(s2*s3),
// so e.g. src1[0]*scale/s0 == s1*src1[0]*(b*d).
// NOTE(review): for T = double the product a*b of four divisors can leave the double
// range although each individual quotient is representable - worth confirming whether
// that case matters to callers.
template<typename T> static void
div_( const Mat& srcmat1, const Mat& srcmat2, Mat& dstmat, double scale )
{
    const T* src1 = (const T*)srcmat1.data;
    const T* src2 = (const T*)srcmat2.data;
    T* dst = (T*)dstmat.data;
    // steps are stored in bytes; convert them to element units for pointer arithmetic
    size_t step1 = srcmat1.step/sizeof(src1[0]);
    size_t step2 = srcmat2.step/sizeof(src2[0]);
    size_t step = dstmat.step/sizeof(dst[0]);
    Size size = getContinuousSize( srcmat1, srcmat2, dstmat, dstmat.channels() );

    for( ; size.height--; src1+=step1, src2+=step2, dst+=step )
    {
        int i = 0;
        for( ; i <= size.width - 4; i += 4 )
        {
            if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
            {
                double a = (double)src2[i] * src2[i+1];
                double b = (double)src2[i+2] * src2[i+3];
                double d = scale/(a * b);
                b *= d;
                a *= d;

                // Promote to double before multiplying divisor and dividend: done in T
                // (or promoted int) the product overflows for large ushort/int values
                // and becomes infinite for large floats.
                T z0 = saturate_cast<T>((double)src2[i+1] * src1[i] * b);
                T z1 = saturate_cast<T>((double)src2[i] * src1[i+1] * b);
                T z2 = saturate_cast<T>((double)src2[i+3] * src1[i+2] * a);
                T z3 = saturate_cast<T>((double)src2[i+2] * src1[i+3] * a);

                dst[i] = z0; dst[i+1] = z1;
                dst[i+2] = z2; dst[i+3] = z3;
            }
            else
            {
                // at least one zero divisor in the group: divide individually
                T z0 = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
                T z1 = src2[i+1] != 0 ? saturate_cast<T>(src1[i+1]*scale/src2[i+1]) : 0;
                T z2 = src2[i+2] != 0 ? saturate_cast<T>(src1[i+2]*scale/src2[i+2]) : 0;
                T z3 = src2[i+3] != 0 ? saturate_cast<T>(src1[i+3]*scale/src2[i+3]) : 0;

                dst[i] = z0; dst[i+1] = z1;
                dst[i+2] = z2; dst[i+3] = z3;
            }
        }

        for( ; i < size.width; i++ )
            dst[i] = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
    }
}


// Element-wise scaled division of two matrices of identical size and type.
// The kernel is selected by the depth of src1; a null table entry (depth codes 1 and 7)
// fails the assertion. dst is created with the size and type of src1.
void divide(const Mat& src1, const Mat& src2, Mat& dst, double scale)
{
    // kernels indexed by depth code
    static MulDivFunc tab[] =
    {
        div_<uchar>, 0, div_<ushort>, div_<short>,
        div_<int>, div_<float>, div_<double>, 0
    };

    MulDivFunc func = tab[src1.depth()];
    CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() && func != 0 );
    dst.create( src1.size(), src1.type() );
    func( src1, src2, dst, scale );
}

// Element-wise scaled reciprocal: dst = saturate_cast<T>(scale / src2),
// with dst = 0 wherever src2 is zero.
// The row width from getContinuousSize() is multiplied by the channel count, so
// every channel value is treated as an independent element.
template<typename T> static void
recip_( double scale, const Mat& srcmat2, Mat& dstmat )
{
    const T* src2 = (const T*)srcmat2.data;
    T* dst = (T*)dstmat.data;
    // steps are stored in bytes; convert them to element units for pointer arithmetic
    size_t step2 = srcmat2.step/sizeof(src2[0]);
    size_t step = dstmat.step/sizeof(dst[0]);
    Size size = getContinuousSize( srcmat2, dstmat, dstmat.channels() );

    for( ; size.height--; src2+=step2, dst+=step )
    {
        int i = 0;
        for( ; i <= size.width - 4; i += 4 )
        {
            if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
            {
                // One division shared by four elements: with a = s0*s1 and b = s2*s3,
                // d = scale/(a*b) gives b*d = scale/(s0*s1) and a*d = scale/(s2*s3),
                // so scale/s0 == s1*(b*d), scale/s1 == s0*(b*d), and likewise for s2, s3.
                double a = (double)src2[i] * src2[i+1];
                double b = (double)src2[i+2] * src2[i+3];
                double d = scale/(a * b);
                b *= d;
                a *= d;

                T z0 = saturate_cast<T>(src2[i+1] * b);
                T z1 = saturate_cast<T>(src2[i] * b);
                T z2 = saturate_cast<T>(src2[i+3] * a);
                T z3 = saturate_cast<T>(src2[i+2] * a);

                dst[i] = z0; dst[i+1] = z1;
                dst[i+2] = z2; dst[i+3] = z3;
            }
            else
            {
                // at least one zero in the group: handle each element on its own
                T z0 = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
                T z1 = src2[i+1] != 0 ? saturate_cast<T>(scale/src2[i+1]) : 0;
                T z2 = src2[i+2] != 0 ? saturate_cast<T>(scale/src2[i+2]) : 0;
                T z3 = src2[i+3] != 0 ? saturate_cast<T>(scale/src2[i+3]) : 0;

                dst[i] = z0; dst[i+1] = z1;
                dst[i+2] = z2; dst[i+3] = z3;
            }
        }

        // remaining (fewer than four) elements of the row
        for( ; i < size.width; i++ )
            dst[i] = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
    }
}

// Signature of the per-depth reciprocal kernels: scale factor, source, destination.
typedef void (*RecipFunc)( double scale, const Mat& src, Mat& dst );

// Element-wise scaled reciprocal of a matrix (scale / src).
// The kernel is selected by the depth of src; a null table entry (depth codes 1 and 7)
// fails the assertion. dst is created with the size and type of src.
void divide(double scale, const Mat& src, Mat& dst)
{
    // kernels indexed by depth code
    static RecipFunc tab[] =
    {
        recip_<uchar>, 0, recip_<ushort>, recip_<short>,
        recip_<int>, recip_<float>, recip_<double>, 0
    };

    RecipFunc func = tab[src.depth()];
    CV_Assert( func != 0 );
    dst.create( src.size(), src.type() );
    func( scale, src, dst );
}

/****************************************************************************************\
*                                      addWeighted                                       *
\****************************************************************************************/

template<typename T, typename WT> static void
addWeighted_( const Mat& srcmat1, double _alpha, const Mat& srcmat2,
              double _beta, double _gamma, Mat& dstmat )
{
    const T* src1 = (const T*)srcmat1.data;
    const T* src2 = (const T*)srcmat2.data;
    T* dst = (T*)dstmat.data;
    size_t step1 = srcmat1.step/sizeof(src1[0]);
    size_t step2 = srcmat2.step/sizeof(src2[0]);
    size_t step = dstmat.step/sizeof(dst[0]);
    Size size = getContinuousSize( srcmat1, srcmat2, dstmat, dstmat.channels() );
    WT alpha = (WT)_alpha, beta = (WT)_beta, gamma = (WT)_gamma;

    for( ; size.height--; src1+=step1, src2+=step2, dst+=step )
    {
        int i = 0;
        for( ; i <= size.width - 4; i += 4 )
        {
            T t0 = saturate_cast<T>(src1[i]*alpha + src2[i]*beta + gamma);
            T t1 = saturate_cast<T>(src1[i+1]*alpha + src2[i+1]*beta + gamma);
            dst[i] = t0; dst[i+1] = t1;

            t0 = saturate_cast<T>(src1[i+2]*alpha + src2[i+2]*beta + gamma);
            t1 = saturate_cast<T>(src1[i+3]*alpha + src2[i+3]*beta + gamma);
            dst[i+2] = t0; dst[i+3] = t1;
        }

        for( ; i < size.width; i++ )
            dst[i] = saturate_cast<T>(src1[i]*alpha + src2[i]*beta + gamma);
    }
}


// 8-bit weighted sum: dst = src1*alpha + src2*beta + gamma, converted back to 8u.
//
// Small inputs (at most 256 elements) and large weights are forwarded to the
// generic float kernel addWeighted_<uchar, float>. Otherwise the sum is done in
// fixed point with `shift` fractional bits using two 256-entry lookup tables:
//     tab1[v] ~ v*alpha * 2^shift
//     tab2[v] ~ (v*beta + gamma) * 2^shift + 2^(shift-1)   (rounding term included)
// so that each output element is (tab1[src1] + tab2[src2]) >> shift.
//
// NOTE(review): with |alpha| and |beta| close to 256 and |gamma| close to 256*256
// the sum tab1[255] + tab2[255] appears able to exceed the int range -- confirm
// whether the thresholds below should be tightened.
static void
addWeighted8u( const Mat& srcmat1, double alpha,
               const Mat& srcmat2, double beta,
               double gamma, Mat& dstmat )
{
    // number of fractional bits of the fixed-point representation
    const int shift = 14;
    // too few elements to amortize building the tables, or weights too large
    // for the fixed-point tables: use the floating-point kernel instead
    if( srcmat1.rows*srcmat1.cols*srcmat1.channels() <= 256 ||
        fabs(alpha) > 256 || fabs(beta) > 256 || fabs(gamma) > 256*256 )
    {
        addWeighted_<uchar, float>(srcmat1, alpha, srcmat2, beta, gamma, dstmat);
        return;
    }
    const uchar* src1 = srcmat1.data;
    const uchar* src2 = srcmat2.data;
    uchar* dst = dstmat.data;
    // strides are in bytes, which equals elements for 8-bit data
    size_t step1 = srcmat1.step;
    size_t step2 = srcmat2.step;
    size_t step = dstmat.step;
    Size size = getContinuousSize( srcmat1, srcmat2, dstmat, dstmat.channels() );

    int tab1[256], tab2[256];
    double t = 0;
    int j, t0, t1, t2, t3;

    // scale the weights to fixed point; gamma also receives the 0.5 rounding
    // term that makes the final ">> shift" round to nearest
    alpha *= 1 << shift;
    gamma = gamma*(1 << shift) + (1 << (shift - 1));
    beta *= 1 << shift;

    // build both tables incrementally: t walks j*alpha, gamma walks gamma + j*beta
    for( j = 0; j < 256; j++ )
    {
        tab1[j] = cvRound(t);
        tab2[j] = cvRound(gamma);
        t += alpha;
        gamma += beta;
    }

    // results for the four extreme input combinations (0/255 for each source)
    t0 = (tab1[0] + tab2[0]) >> shift;
    t1 = (tab1[0] + tab2[255]) >> shift;
    t2 = (tab1[255] + tab2[0]) >> shift;
    t3 = (tab1[255] + tab2[255]) >> shift;

    // each check is equivalent to -256 <= t < 512
    if( (unsigned)(t0+256) < 768 && (unsigned)(t1+256) < 768 &&
        (unsigned)(t2+256) < 768 && (unsigned)(t3+256) < 768 )
    {
        // use faster table-based conversion back to 8u
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int i;

            // four elements per iteration, processed as two pairs
            for( i = 0; i <= size.width - 4; i += 4 )
            {
                t0 = CV_FAST_CAST_8U((tab1[src1[i]] + tab2[src2[i]]) >> shift);
                t1 = CV_FAST_CAST_8U((tab1[src1[i+1]] + tab2[src2[i+1]]) >> shift);

                dst[i] = (uchar)t0;
                dst[i+1] = (uchar)t1;

                t0 = CV_FAST_CAST_8U((tab1[src1[i+2]] + tab2[src2[i+2]]) >> shift);
                t1 = CV_FAST_CAST_8U((tab1[src1[i+3]] + tab2[src2[i+3]]) >> shift);

                dst[i+2] = (uchar)t0;
                dst[i+3] = (uchar)t1;
            }

            // tail of the row
            for( ; i < size.width; i++ )
            {
                t0 = CV_FAST_CAST_8U((tab1[src1[i]] + tab2[src2[i]]) >> shift);
                dst[i] = (uchar)t0;
            }
        }
    }
    else
    {
        // use universal macro for conversion back to 8u
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int i;

            // four elements per iteration, processed as two pairs
            for( i = 0; i <= size.width - 4; i += 4 )
            {
                t0 = (tab1[src1[i]] + tab2[src2[i]]) >> shift;
                t1 = (tab1[src1[i+1]] + tab2[src2[i+1]]) >> shift;

                dst[i] = CV_CAST_8U( t0 );
                dst[i+1] = CV_CAST_8U( t1 );

                t0 = (tab1[src1[i+2]] + tab2[src2[i+2]]) >> shift;
                t1 = (tab1[src1[i+3]] + tab2[src2[i+3]]) >> shift;

                dst[i+2] = CV_CAST_8U( t0 );
                dst[i+3] = CV_CAST_8U( t1 );
            }

            // tail of the row
            for( ; i < size.width; i++ )
            {
                t0 = (tab1[src1[i]] + tab2[src2[i]]) >> shift;
                dst[i] = CV_CAST_8U( t0 );
            }
        }
    }
}

typedef void (*AddWeightedFunc)( const Mat& src1, double alpha, const Mat& src2,
                                 double beta, double gamma, Mat& dst );

void addWeighted( const Mat& src1, double alpha, const Mat& src2,
                  double beta, double gamma, Mat& dst )
{
    static AddWeightedFunc tab[]=
    {
        addWeighted8u, 0, addWeighted_<ushort, float>, addWeighted_<short, float>,
        addWeighted_<int, double>, addWeighted_<float, float>, addWeighted_<double, double>, 0
    };

    AddWeightedFunc func = tab[src1.depth()];
    CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() && func != 0 );
    dst.create( src1.size(), src1.type() );
    func( src1, alpha, src2, beta, gamma, dst );
}


/****************************************************************************************\
*                                      absdiff                                           *
\****************************************************************************************/

// Binary functor returning |a - b| converted back to T.
// type1/type2 are the operand types and rtype the result type.
template<typename T> struct OpAbsDiff
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;

    T operator()(T a, T b)
    {
        return static_cast<T>(std::abs(a - b));
    }
};

// For short operands the int difference can be as large as 65535, which does
// not fit into short, so the result goes through saturate_cast<short> instead
// of the plain cast used by the generic version.
template<> inline short OpAbsDiff<short>::operator ()(short a, short b)
{
    return saturate_cast<short>(std::abs(a - b));
}

// Array-versus-scalar variant of the absolute difference functor: the array
// element has type T, the scalar operand has type WT (T by default), and the
// result |a - b| is passed through saturate_cast<T>.
template<typename T, typename WT=T> struct OpAbsDiffS
{
    typedef T type1;
    typedef WT type2;
    typedef T rtype;

    T operator()(T a, WT b)
    {
        return saturate_cast<T>(std::abs(a - b));
    }
};

// Per-element absolute difference of two arrays: dst = |src1 - src2|.
//
// src1 and src2 must have the same size and type; the kernel is selected by
// src1.depth() and depths with a null table slot are rejected.
// dst is (re)created with the size and type of src1.
void absdiff( const Mat& src1, const Mat& src2, Mat& dst )
{
    static BinaryFunc tab[] =
    {
        binaryOpC1_<OpAbsDiff<uchar>,VAbsDiff8u>, 0,
        binaryOpC1_<OpAbsDiff<ushort>,VAbsDiff16u>,
        binaryOpC1_<OpAbsDiff<short>,VAbsDiff16s>,
        binaryOpC1_<OpAbsDiff<int>,NoVec>,
        binaryOpC1_<OpAbsDiff<float>,VAbsDiff32f>,
        binaryOpC1_<OpAbsDiff<double>,NoVec>, 0
    };

    BinaryFunc func = tab[src1.depth()];

    // Validate the arguments before touching dst. Allocating first would
    // modify dst even when the call fails, and if dst is the same object as
    // src2 the reallocation could make a mismatching src2 pass the check.
    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type() && func != 0);

    dst.create(src1.size(), src1.type());
    func( src1, src2, dst );
}


// Per-element absolute difference between an array and a scalar: dst = |src1 - s|.
//
// src1 may have at most 4 channels; the kernel is selected by src1.depth() and
// depths with a null table slot are rejected.
// dst is (re)created with the size and type of src1.
void absdiff( const Mat& src1, const Scalar& s, Mat& dst )
{
    static BinarySFuncCn tab[] =
    {
        binarySOpCn_<OpAbsDiffS<uchar, int> >, 0,
        binarySOpCn_<OpAbsDiffS<ushort, int> >,
        binarySOpCn_<OpAbsDiffS<short, int> >,
        binarySOpCn_<OpAbsDiffS<int> >,
        binarySOpCn_<OpAbsDiffS<float> >,
        binarySOpCn_<OpAbsDiffS<double> >, 0
    };

    BinarySFuncCn func = tab[src1.depth()];

    // Validate before allocating so that a rejected call leaves dst untouched.
    CV_Assert(src1.channels() <= 4 && func != 0);

    dst.create(src1.size(), src1.type());
    func( src1, dst, s );
}

/****************************************************************************************\
*                                      inRange[S]                                        *
\****************************************************************************************/

// Single-channel range test: yields 255 when a <= x < b and 0 otherwise.
// xtype is the element type, btype the type of the two bounds.
template<typename T, typename WT> struct InRangeC1
{
    typedef T xtype;
    typedef WT btype;

    uchar operator()(xtype x, btype a, btype b) const
    {
        const bool inside = a <= x && x < b;
        return inside ? (uchar)255 : (uchar)0;
    }
};

// Two-channel range test: yields 255 when a[k] <= x[k] < b[k] holds for both
// channels and 0 otherwise.
template<typename T, typename WT> struct InRangeC2
{
    typedef Vec<T,2> xtype;
    typedef Vec<WT,2> btype;

    uchar operator()(const xtype& x, const btype& a, const btype& b) const
    {
        // Channels are tested in order; the first failing one decides.
        for( int k = 0; k < 2; k++ )
        {
            if( !(a[k] <= x[k] && x[k] < b[k]) )
                return (uchar)0;
        }
        return (uchar)255;
    }
};

// Three-channel range test: yields 255 when a[k] <= x[k] < b[k] holds for all
// three channels and 0 otherwise.
template<typename T, typename WT> struct InRangeC3
{
    typedef Vec<T,3> xtype;
    typedef Vec<WT,3> btype;

    uchar operator()(const xtype& x, const btype& a, const btype& b) const
    {
        // Channels are tested in order; the first failing one decides.
        for( int k = 0; k < 3; k++ )
        {
            if( !(a[k] <= x[k] && x[k] < b[k]) )
                return (uchar)0;
        }
        return (uchar)255;
    }
};

// Four-channel range test: yields 255 when a[k] <= x[k] < b[k] holds for all
// four channels and 0 otherwise.
template<typename T, typename WT> struct InRangeC4
{
    typedef Vec<T,4> xtype;
    typedef Vec<WT,4> btype;

    uchar operator()(const xtype& x, const btype& a, const btype& b) const
    {
        // Channels are tested in order; the first failing one decides.
        for( int k = 0; k < 4; k++ )
        {
            if( !(a[k] <= x[k] && x[k] < b[k]) )
                return (uchar)0;
        }
        return (uchar)255;
    }
};

template<class Op> static void
inRange_( const Mat& srcmat1, const Mat& srcmat2, const Mat& srcmat3, Mat& dstmat )
{
    Op op;
    uchar* dst = dstmat.data;
    size_t dstep = dstmat.step;
    Size size = getContinuousSize( srcmat1, srcmat2, srcmat3, dstmat );

    for( int y = 0; y < size.height; y++, dst += dstep )
    {
        const typename Op::xtype* src1 = (const typename Op::xtype*)(srcmat1.data + srcmat1.step*y);
        const typename Op::xtype* src2 = (const typename Op::xtype*)(srcmat2.data + srcmat2.step*y);
        const typename Op::xtype* src3 = (const typename Op::xtype*)(srcmat3.data + srcmat3.step*y);
        for( int x = 0; x < size.width; x++ )
            dst[x] = op( src1[x], src2[x], src3[x] );
    }
}

template<class Op> static void
inRangeS_( const Mat& srcmat1, const Scalar& _a, const Scalar& _b, Mat& dstmat )
{
    Op op;
    typedef typename Op::btype WT;
    typedef typename DataType<WT>::channel_type WT1;
    WT a, b;
    uchar* dst = dstmat.data;
    size_t dstep = dstmat.step;
    Size size = getContinuousSize( srcmat1, dstmat );
    int cn = srcmat1.channels();
    _a.convertTo((WT1*)&a, cn);
    _b.convertTo((WT1*)&b, cn);

    for( int y = 0; y < size.height; y++, dst += dstep )
    {
        const typename Op::xtype* src1 = (const typename Op::xtype*)(srcmat1.data + srcmat1.step*y);
        for( int x = 0; x < size.width; x++ )
            dst[x] = op( src1[x], a, b );
    }
}

// Kernel signature for the range check with per-element bounds (src2 = lower, src3 = upper).
typedef void (*InRangeFunc)( const Mat& src1, const Mat& src2, const Mat& src3, Mat& dst );
// Kernel signature for the range check with constant (Scalar) bounds a and b.
typedef void (*InRangeSFunc)( const Mat& src1, const Scalar& a, const Scalar& b, Mat& dst );

// Per-element range check with array bounds: each dst element is set to 255
// when lowerb <= src < upperb holds in every channel and to 0 otherwise.
//
// src, lowerb and upperb must share size and type, with at most 4 channels.
// dst is (re)created as a single-channel CV_8U array of the same size.
void inRange(const Mat& src, const Mat& lowerb,
             const Mat& upperb, Mat& dst)
{
    // Indexed by src.type(): eight depth slots per channel count.
    // Null slots mark combinations that have no kernel.
    static InRangeFunc kernels[] =
    {
        // 1 channel
        inRange_<InRangeC1<uchar, uchar> >,
        0,
        inRange_<InRangeC1<ushort, ushort> >,
        inRange_<InRangeC1<short, short> >,
        inRange_<InRangeC1<int, int> >,
        inRange_<InRangeC1<float, float> >,
        inRange_<InRangeC1<double, double> >,
        0,

        // 2 channels
        inRange_<InRangeC2<uchar, uchar> >,
        0,
        inRange_<InRangeC2<ushort, ushort> >,
        inRange_<InRangeC2<short, short> >,
        inRange_<InRangeC2<int, int> >,
        inRange_<InRangeC2<float, float> >,
        inRange_<InRangeC2<double, double> >,
        0,

        // 3 channels
        inRange_<InRangeC3<uchar, uchar> >,
        0,
        inRange_<InRangeC3<ushort, ushort> >,
        inRange_<InRangeC3<short, short> >,
        inRange_<InRangeC3<int, int> >,
        inRange_<InRangeC3<float, float> >,
        inRange_<InRangeC3<double, double> >,
        0,

        // 4 channels
        inRange_<InRangeC4<uchar, uchar> >,
        0,
        inRange_<InRangeC4<ushort, ushort> >,
        inRange_<InRangeC4<short, short> >,
        inRange_<InRangeC4<int, int> >,
        inRange_<InRangeC4<float, float> >,
        inRange_<InRangeC4<double, double> >,
        0
    };

    // The channel limit also keeps the table index below within bounds.
    CV_Assert( src.size() == lowerb.size() && src.size() == upperb.size() &&
        src.type() == lowerb.type() && src.type() == upperb.type() &&
        src.channels() <= 4 );

    const int type = src.type();
    InRangeFunc func = kernels[type];
    CV_Assert( func != 0 );

    dst.create(src.size(), CV_8U);
    func( src, lowerb, upperb, dst );
}

void inRange(const Mat& src, const Scalar& lowerb,
             const Scalar& upperb, Mat& dst)
{
    static InRangeSFunc tab[] =
    {
        inRangeS_<InRangeC1<uchar, int> >, 0,
        inRangeS_<InRangeC1<ushort, int> >,
        inRangeS_<InRangeC1<short, int> >,
        inRangeS_<InRangeC1<int, int> >,
        inRangeS_<InRangeC1<float, float> >,
        inRangeS_<InRangeC1<double, double> >, 0,

        inRangeS_<InRangeC2<uchar, int> >, 0,
        inRangeS_<InRangeC2<ushort, int> >,
        inRangeS_<InRangeC2<short, int> >,
        inRangeS_<InRangeC2<int, int> >,
        inRangeS_<InRangeC2<float, float> >,
        inRangeS_<InRangeC2<double, double> >, 0,

        inRangeS_<InRangeC3<uchar, int> >, 0,
        inRangeS_<InRangeC3<ushort, int> >,
        inRangeS_<InRangeC3<short, int> >,
        inRangeS_<InRangeC3<int, int> >,
        inRangeS_<InRangeC3<float, float> >,
        inRangeS_<InRangeC3<double, double> >, 0,

        inRangeS_<InRangeC4<uchar, int> >, 0,
        inRangeS_<InRangeC4<ushort, int> >,
        inRangeS_<InRangeC4<short, int> >,
        inRangeS_<InRangeC4<int, int> >,
        inRangeS_<InRangeC4<float, float> >,
        inRangeS_<InRangeC4<double, double> >, 0
    };

    CV_Assert( src.channels() <= 4 );

    InRangeSFunc func = tab[src.type()];
    CV_Assert( func != 0 );

    dst.create(src.size(), CV_8U);
    func( src, lowerb, upperb, dst );
}

/****************************************************************************************\
*                                          compare                                       *
\****************************************************************************************/

// Equality functor: yields 255 when a == b and 0 otherwise.
// T is the type of the first operand, WT (T by default) of the second.
template<typename T, typename WT=T> struct CmpEQ
{
    typedef T type1;
    typedef WT type2;
    typedef uchar rtype;

    uchar operator()(T a, WT b) const
    {
        return a == b ? (uchar)255 : (uchar)0;
    }
};

// "Greater than" functor: yields 255 when a > b and 0 otherwise.
// T is the type of the first operand, WT (T by default) of the second.
template<typename T, typename WT=T> struct CmpGT
{
    typedef T type1;
    typedef WT type2;
    typedef uchar rtype;

    uchar operator()(T a, WT b) const
    {
        return a > b ? (uchar)255 : (uchar)0;
    }
};

// "Greater than or equal" functor: yields 255 when a >= b and 0 otherwise.
// T is the type of the first operand, WT (T by default) of the second.
template<typename T, typename WT=T> struct CmpGE
{
    typedef T type1;
    typedef WT type2;
    typedef uchar rtype;

    uchar operator()(T a, WT b) const
    {
        return a >= b ? (uchar)255 : (uchar)0;
    }
};

// Per-element comparison of two single-channel arrays.
//
// dst is (re)created as CV_8U with the size of src1; each element is 255 where
// "src1 cmpOp src2" holds and 0 elsewhere. Only "greater than" and "equal"
// kernels exist; the other operators are derived from them by swapping the
// operands and/or inverting the result:
//     GE: !(src2 > src1)    LT: src2 > src1
//     LE: !(src1 > src2)    NE: !(src1 == src2)
//
// src1 and src2 must have the same size and type and exactly one channel.
// An unknown cmpOp raises CV_StsBadArg.
void compare( const Mat& src1, const Mat& src2, Mat& dst, int cmpOp )
{
    static BinaryFunc tab[][8] =
    {
        {binaryOpC1_<CmpGT<uchar>,VCmpGT8u>, 0,
        binaryOpC1_<CmpGT<ushort>,NoVec>,
        binaryOpC1_<CmpGT<short>,NoVec>,
        binaryOpC1_<CmpGT<int>,NoVec>,
        binaryOpC1_<CmpGT<float>,NoVec>,
        binaryOpC1_<CmpGT<double>,NoVec>, 0},

        {binaryOpC1_<CmpEQ<uchar>,VCmpEQ8u>, 0,
        binaryOpC1_<CmpEQ<ushort>,NoVec>,
        binaryOpC1_<CmpEQ<ushort>,NoVec>, // same function as for ushort's
        binaryOpC1_<CmpEQ<int>,NoVec>,
        binaryOpC1_<CmpEQ<float>,NoVec>,
        binaryOpC1_<CmpEQ<double>,NoVec>, 0},
    };

    // Validate everything before dst is touched: allocating first would modify
    // dst even when the call is rejected, and if dst is the same object as src2
    // the reallocation could alter what this check sees.
    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type() && src1.channels() == 1);

    int depth = src1.depth();
    const Mat *psrc1 = &src1, *psrc2 = &src2;
    bool invflag = false;

    switch( cmpOp )
    {
    case CMP_GT:
    case CMP_EQ:
        break;
    case CMP_GE:
        std::swap( psrc1, psrc2 );
        invflag = true;
        break;
    case CMP_LT:
        std::swap( psrc1, psrc2 );
        break;
    case CMP_LE:
        invflag = true;
        break;
    case CMP_NE:
        cmpOp = CMP_EQ;
        invflag = true;
        break;
    default:
        CV_Error(CV_StsBadArg, "Unknown comparison method");
    }

    // Row 0 holds the "greater than" kernels, row 1 the "equal" kernels.
    BinaryFunc func = tab[cmpOp == CMP_EQ][depth];
    CV_Assert( func != 0 );

    dst.create(src1.rows, src1.cols, CV_8U);
    func( *psrc1, *psrc2, dst );
    if( invflag )
        bitwise_not(dst, dst);
}


// Per-element comparison of a single-channel array with a scalar.
//
// dst is (re)created as CV_8U with the size of src1; each element is 255 where
// "src1 cmpOp value" holds and 0 elsewhere. Kernels exist for EQ, GT and GE;
// the remaining operators are obtained by inverting one of them:
//     LT: !(src1 >= value)   LE: !(src1 > value)   NE: !(src1 == value)
//
// src1 must have exactly one channel. An unknown cmpOp raises CV_StsBadArg.
void compare( const Mat& src1, double value, Mat& dst, int cmpOp )
{
    static BinarySFuncC1 tab[][8] =
    {
        {binarySOpC1_<CmpEQ<uchar, int> >, 0,
        binarySOpC1_<CmpEQ<ushort, int> >,
        binarySOpC1_<CmpEQ<short, int> >,
        binarySOpC1_<CmpEQ<int> >,
        binarySOpC1_<CmpEQ<float> >,
        binarySOpC1_<CmpEQ<double> >, 0},

        {binarySOpC1_<CmpGT<uchar, int> >, 0,
        binarySOpC1_<CmpGT<ushort, int> >,
        binarySOpC1_<CmpGT<short, int> >,
        binarySOpC1_<CmpGT<int> >,
        binarySOpC1_<CmpGT<float> >,
        binarySOpC1_<CmpGT<double> >, 0},

        {binarySOpC1_<CmpGE<uchar, int> >, 0,
        binarySOpC1_<CmpGE<ushort, int> >,
        binarySOpC1_<CmpGE<short, int> >,
        binarySOpC1_<CmpGE<int> >,
        binarySOpC1_<CmpGE<float> >,
        binarySOpC1_<CmpGE<double> >, 0},
    };

    // Validate before allocating so that a rejected call leaves dst untouched.
    CV_Assert(src1.channels() == 1);
    int depth = src1.depth();
    bool invflag = false;

    switch( cmpOp )
    {
    case CMP_GT:
    case CMP_EQ:
    case CMP_GE:
        break;
    case CMP_LT:
        invflag = true;
        cmpOp = CMP_GE;
        break;
    case CMP_LE:
        invflag = true;
        cmpOp = CMP_GT;
        break;
    case CMP_NE:
        invflag = true;
        cmpOp = CMP_EQ;
        break;
    default:
        CV_Error(CV_StsBadArg, "Unknown comparison method");
    }

    // Table rows: 0 = EQ, 1 = GT, 2 = GE.
    BinarySFuncC1 func = tab[cmpOp == CMP_EQ ? 0 : cmpOp == CMP_GT ? 1 : 2][depth];
    CV_Assert( func != 0 );

    dst.create(src1.rows, src1.cols, CV_8U);
    func( src1, dst, value );
    if( invflag )
        bitwise_not(dst, dst);
}

/****************************************************************************************\
*                                       min/max                                          *
\****************************************************************************************/

// Binary functor returning the smaller of its two operands.
// type1/type2 are the operand types and rtype the result type.
template<typename T> struct MinOp
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;

    T operator ()(T a, T b) const
    {
        return std::min<T>(a, b);
    }
};

// Binary functor returning the larger of its two operands.
// type1/type2 are the operand types and rtype the result type.
template<typename T> struct MaxOp
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;

    T operator ()(T a, T b) const
    {
        return std::max<T>(a, b);
    }
};

// 8-bit specializations: use the CV_MIN_8U / CV_MAX_8U macros instead of
// std::min / std::max.
template<> inline uchar MinOp<uchar>::operator ()(uchar a, uchar b) const
{
    return CV_MIN_8U(a, b);
}

template<> inline uchar MaxOp<uchar>::operator ()(uchar a, uchar b) const
{
    return CV_MAX_8U(a, b);
}

void min( const Mat& src1, const Mat& src2, Mat& dst )
{
    static BinaryFunc tab[] =
    {
        binaryOpC1_<MinOp<uchar>,VMin8u>, 0, binaryOpC1_<MinOp<ushort>,VMin16u>,
        binaryOpC1_<MinOp<short>,VMin16s>, binaryOpC1_<MinOp<int>,NoVec>,
        binaryOpC1_<MinOp<float>,VMin32f>, binaryOpC1_<MinOp<double>,NoVec>, 0
    };

    BinaryFunc func = tab[src1.depth()];
    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type() && func != 0);
    dst.create(src1.size(), src1.type());

    return func( src1, src2, dst );
}

void max( const Mat& src1, const Mat& src2, Mat& dst )
{
    static BinaryFunc tab[] =
    {
        binaryOpC1_<MaxOp<uchar>,VMax8u>, 0, binaryOpC1_<MaxOp<ushort>,VMax16u>,
        binaryOpC1_<MaxOp<short>,VMax16s>, binaryOpC1_<MaxOp<int>,NoVec>,
        binaryOpC1_<MaxOp<float>,VMax32f>, binaryOpC1_<MaxOp<double>,NoVec>, 0
    };

    BinaryFunc func = tab[src1.depth()];
    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type() && func != 0);
    dst.create(src1.size(), src1.type());

    return func( src1, src2, dst );
}

void min( const Mat& src1, double value, Mat& dst )
{
    static BinarySFuncC1 tab[] =
    {
        binarySOpC1_<MinOp<uchar> >, 0,
        binarySOpC1_<MinOp<ushort> >,
        binarySOpC1_<MinOp<short> >,
        binarySOpC1_<MinOp<int> >,
        binarySOpC1_<MinOp<float> >,
        binarySOpC1_<MinOp<double> >, 0
    };

    BinarySFuncC1 func = tab[src1.depth()];
    CV_Assert(func != 0);
    dst.create(src1.size(), src1.type());
    return func( src1, dst, value );
}

void max( const Mat& src1, double value, Mat& dst )
{
    static BinarySFuncC1 tab[] =
    {
        binarySOpC1_<MaxOp<uchar> >, 0,
        binarySOpC1_<MaxOp<ushort> >,
        binarySOpC1_<MaxOp<short> >,
        binarySOpC1_<MaxOp<int> >,
        binarySOpC1_<MaxOp<float> >,
        binarySOpC1_<MaxOp<double> >, 0
    };

    BinarySFuncC1 func = tab[src1.depth()];
    CV_Assert(func != 0);
    dst.create(src1.size(), src1.type());
    return func( src1, dst, value );
}

}

/****************************************************************************************\
*                                Earlier API: cvAdd etc.                                 *
\****************************************************************************************/

// C API wrapper around cv::bitwise_not.
// The destination must already have the size and type of the source.
CV_IMPL void
cvNot( const CvArr* srcarr, CvArr* dstarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );

    cv::bitwise_not( src, dst );
}


// C API wrapper around cv::bitwise_and for two arrays.
// The destination must already have the size and type of the first source;
// maskarr is optional (an empty mask is passed when it is null).
CV_IMPL void
cvAnd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat src2 = cv::cvarrToMat(srcarr2);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    cv::Mat mask = maskarr ? cv::cvarrToMat(maskarr) : cv::Mat();
    cv::bitwise_and( src1, src2, dst, mask );
}

// C API wrapper around cv::bitwise_or for two arrays.
// The destination must already have the size and type of the first source;
// maskarr is optional (an empty mask is passed when it is null).
CV_IMPL void
cvOr( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat src2 = cv::cvarrToMat(srcarr2);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    cv::Mat mask = maskarr ? cv::cvarrToMat(maskarr) : cv::Mat();
    cv::bitwise_or( src1, src2, dst, mask );
}


// C API wrapper around cv::bitwise_xor for two arrays.
// The destination must already have the size and type of the first source;
// maskarr is optional (an empty mask is passed when it is null).
CV_IMPL void
cvXor( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat src2 = cv::cvarrToMat(srcarr2);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    cv::Mat mask = maskarr ? cv::cvarrToMat(maskarr) : cv::Mat();
    cv::bitwise_xor( src1, src2, dst, mask );
}


// C API wrapper around cv::bitwise_and for an array and a scalar.
// The destination must already have the size and type of the source;
// maskarr is optional (an empty mask is passed when it is null).
CV_IMPL void
cvAndS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );

    cv::Mat mask = maskarr ? cv::cvarrToMat(maskarr) : cv::Mat();
    cv::bitwise_and( src, s, dst, mask );
}


// C API wrapper around cv::bitwise_or for an array and a scalar.
// The destination must already have the size and type of the source;
// maskarr is optional (an empty mask is passed when it is null).
CV_IMPL void
cvOrS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );

    cv::Mat mask = maskarr ? cv::cvarrToMat(maskarr) : cv::Mat();
    cv::bitwise_or( src, s, dst, mask );
}


// C API wrapper around cv::bitwise_xor for an array and a scalar.
// The destination must already have the size and type of the source;
// maskarr is optional (an empty mask is passed when it is null).
CV_IMPL void
cvXorS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );

    cv::Mat mask = maskarr ? cv::cvarrToMat(maskarr) : cv::Mat();
    cv::bitwise_xor( src, s, dst, mask );
}

// C API wrapper around cv::add for two arrays.
// The destination must already have the size and type of the first source;
// maskarr is optional (an empty mask is passed when it is null).
CV_IMPL void cvAdd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat src2 = cv::cvarrToMat(srcarr2);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    cv::Mat mask = maskarr ? cv::cvarrToMat(maskarr) : cv::Mat();
    cv::add( src1, src2, dst, mask );
}

// C API wrapper around cv::subtract for two arrays.
// The destination must already have the size and type of the first source;
// maskarr is optional (an empty mask is passed when it is null).
CV_IMPL void cvSub( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat src2 = cv::cvarrToMat(srcarr2);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    cv::Mat mask = maskarr ? cv::cvarrToMat(maskarr) : cv::Mat();
    cv::subtract( src1, src2, dst, mask );
}

// C API wrapper around cv::add for an array and a scalar.
// The destination must already have the size and type of the source;
// maskarr is optional (an empty mask is passed when it is null).
CV_IMPL void cvAddS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    cv::Mat mask = maskarr ? cv::cvarrToMat(maskarr) : cv::Mat();
    cv::add( src1, value, dst, mask );
}

// C API wrapper around the scalar-first form of cv::subtract
// (the scalar is passed as the first operand, the array as the second).
// The destination must already have the size and type of the source;
// maskarr is optional (an empty mask is passed when it is null).
CV_IMPL void cvSubRS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    cv::Mat mask = maskarr ? cv::cvarrToMat(maskarr) : cv::Mat();
    cv::subtract( value, src1, dst, mask );
}

// C API wrapper around cv::multiply with a scale factor.
// The destination must already have the size and type of the first source.
CV_IMPL void cvMul( const CvArr* srcarr1, const CvArr* srcarr2,
                    CvArr* dstarr, double scale )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat src2 = cv::cvarrToMat(srcarr2);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    cv::multiply( src1, src2, dst, scale );
}

// C API wrapper around cv::divide.
//
// When srcarr1 is non-null the two-array form cv::divide(src1, src2, dst, scale)
// is used; when it is null the reciprocal form cv::divide(scale, src2, dst) is
// used instead. The destination must already have the size and type of the
// second source (the divisor).
CV_IMPL void cvDiv( const CvArr* srcarr1, const CvArr* srcarr2,
                    CvArr* dstarr, double scale )
{
    cv::Mat src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr);
    CV_Assert( src2.size() == dst.size() && src2.type() == dst.type() );

    if( srcarr1 )
        cv::divide( cv::cvarrToMat(srcarr1), src2, dst, scale );
    else
        cv::divide( scale, src2, dst );
}


// C API wrapper around cv::addWeighted.
// The destination must already have the size and type of the first source.
CV_IMPL void
cvAddWeighted( const CvArr* srcarr1, double alpha,
               const CvArr* srcarr2, double beta,
               double gamma, CvArr* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat src2 = cv::cvarrToMat(srcarr2);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    cv::addWeighted( src1, alpha, src2, beta, gamma, dst );
}


// C API wrapper around cv::absdiff for two arrays.
// The destination must already have the size and type of the first source.
CV_IMPL  void
cvAbsDiff( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    // The second source is converted only after the destination check passed.
    cv::Mat src2 = cv::cvarrToMat(srcarr2);
    cv::absdiff( src1, src2, dst );
}


// C API wrapper around cv::absdiff for an array and a scalar.
// The destination must already have the size and type of the source.
CV_IMPL void
cvAbsDiffS( const CvArr* srcarr1, CvArr* dstarr, CvScalar scalar )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    cv::absdiff( src1, scalar, dst );
}

// C API wrapper around cv::inRange with array bounds
// (srcarr2 = lower bounds, srcarr3 = upper bounds).
// The destination must already be a CV_8U array with the size of the source.
CV_IMPL void
cvInRange( const void* srcarr1, const void* srcarr2,
           const void* srcarr3, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && dst.type() == CV_8U );

    // The bounds are converted only after the destination check passed.
    cv::Mat lowerb = cv::cvarrToMat(srcarr2);
    cv::Mat upperb = cv::cvarrToMat(srcarr3);
    cv::inRange( src1, lowerb, upperb, dst );
}

// C API wrapper around cv::inRange with constant (scalar) bounds.
// The destination must already be a CV_8U array with the size of the source.
CV_IMPL void
cvInRangeS( const void* srcarr1, CvScalar lowerb, CvScalar upperb, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && dst.type() == CV_8U );

    cv::inRange( src1, lowerb, upperb, dst );
}


// C API wrapper around cv::compare for two arrays.
// The destination must already be a CV_8U array with the size of the first source.
CV_IMPL void
cvCmp( const void* srcarr1, const void* srcarr2, void* dstarr, int cmp_op )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && dst.type() == CV_8U );

    // The second source is converted only after the destination check passed.
    cv::Mat src2 = cv::cvarrToMat(srcarr2);
    cv::compare( src1, src2, dst, cmp_op );
}


// C API wrapper around cv::compare for an array and a scalar.
// The destination must already be a CV_8U array with the size of the source.
CV_IMPL void
cvCmpS( const void* srcarr1, double value, void* dstarr, int cmp_op )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && dst.type() == CV_8U );

    cv::compare( src1, value, dst, cmp_op );
}


// C API wrapper around cv::min for two arrays.
// The destination must already have the size and type of the first source.
CV_IMPL void
cvMin( const void* srcarr1, const void* srcarr2, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    // The second source is converted only after the destination check passed.
    cv::Mat src2 = cv::cvarrToMat(srcarr2);
    cv::min( src1, src2, dst );
}


// C API wrapper around cv::max for two arrays.
// The destination must already have the size and type of the first source.
CV_IMPL void
cvMax( const void* srcarr1, const void* srcarr2, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    // The second source is converted only after the destination check passed.
    cv::Mat src2 = cv::cvarrToMat(srcarr2);
    cv::max( src1, src2, dst );
}

// C API wrapper around cv::min for an array and a scalar.
// The destination must already have the size and type of the source.
CV_IMPL void
cvMinS( const void* srcarr1, double value, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    cv::min( src1, value, dst );
}


// C API wrapper around cv::max for an array and a scalar.
// The destination must already have the size and type of the source.
CV_IMPL void
cvMaxS( const void* srcarr1, double value, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1);
    cv::Mat dst = cv::cvarrToMat(dstarr);

    CV_Assert( src1.size() == dst.size() && src1.type() == dst.type() );

    cv::max( src1, value, dst );
}


/* End of file. */