stat.cpp 153 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
Ilya Lavrenov's avatar
Ilya Lavrenov committed
15
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
45
#include <climits>
46
#include <limits>
47
#include "opencv2/core/hal/intrin.hpp"
48

Alexander Alekhin's avatar
Alexander Alekhin committed
49
#include "opencl_kernels_core.hpp"
50

51
#include "opencv2/core/openvx/ovx_defs.hpp"
52

53 54 55 56 57 58 59
namespace cv
{

/****************************************************************************************\
*                                        sum                                             *
\****************************************************************************************/

60 61 62 63 64 65 66 67 68
template <typename T, typename ST>
struct Sum_SIMD
{
    int operator () (const T *, const uchar *, ST *, int, int) const
    {
        return 0;
    }
};

69 70 71 72 73 74 75
template <typename ST, typename DT>
inline void addChannels(DT * dst, ST * buf, int cn)
{
    for (int i = 0; i < 4; ++i)
        dst[i % cn] += buf[i];
}

Ilya Lavrenov's avatar
Ilya Lavrenov committed
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
#if CV_SSE2

template <>
struct Sum_SIMD<schar, int>
{
    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero;

        for ( ; x <= len - 16; x += 16)
        {
            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
            __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);

            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));

            v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
        }

        for ( ; x <= len - 8; x += 8)
        {
            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);

            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
        }

        int CV_DECL_ALIGNED(16) ar[4];
        _mm_store_si128((__m128i*)ar, v_sum);

113
        addChannels(dst, ar, cn);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140

        return x / cn;
    }
};

template <>
struct Sum_SIMD<int, double>
{
    int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;

        for ( ; x <= len - 4; x += 4)
        {
            __m128i v_src = _mm_loadu_si128((__m128i const *)(src0 + x));
            v_sum0 = _mm_add_pd(v_sum0, _mm_cvtepi32_pd(v_src));
            v_sum1 = _mm_add_pd(v_sum1, _mm_cvtepi32_pd(_mm_srli_si128(v_src, 8)));
        }

        double CV_DECL_ALIGNED(16) ar[4];
        _mm_store_pd(ar, v_sum0);
        _mm_store_pd(ar + 2, v_sum1);

141
        addChannels(dst, ar, cn);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169

        return x / cn;
    }
};

template <>
struct Sum_SIMD<float, double>
{
    int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;

        for ( ; x <= len - 4; x += 4)
        {
            __m128 v_src = _mm_loadu_ps(src0 + x);
            v_sum0 = _mm_add_pd(v_sum0, _mm_cvtps_pd(v_src));
            v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
            v_sum1 = _mm_add_pd(v_sum1, _mm_cvtps_pd(v_src));
        }

        double CV_DECL_ALIGNED(16) ar[4];
        _mm_store_pd(ar, v_sum0);
        _mm_store_pd(ar + 2, v_sum1);

170
        addChannels(dst, ar, cn);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
171 172 173 174 175 176 177

        return x / cn;
    }
};


#elif CV_NEON
178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194

template <>
struct Sum_SIMD<uchar, int>
{
    int operator () (const uchar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        uint32x4_t v_sum = vdupq_n_u32(0u);

        for ( ; x <= len - 16; x += 16)
        {
            uint8x16_t v_src = vld1q_u8(src0 + x);
            uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));

Ilya Lavrenov's avatar
Ilya Lavrenov committed
195 196
            v_sum = vaddw_u16(v_sum, vget_low_u16(v_half));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_half));
197 198

            v_half = vmovl_u8(vget_high_u8(v_src));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
199 200
            v_sum = vaddw_u16(v_sum, vget_low_u16(v_half));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_half));
201 202 203 204 205 206
        }

        for ( ; x <= len - 8; x += 8)
        {
            uint16x8_t v_src = vmovl_u8(vld1_u8(src0 + x));

Ilya Lavrenov's avatar
Ilya Lavrenov committed
207 208
            v_sum = vaddw_u16(v_sum, vget_low_u16(v_src));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_src));
209 210 211 212 213
        }

        unsigned int CV_DECL_ALIGNED(16) ar[4];
        vst1q_u32(ar, v_sum);

214
        addChannels(dst, ar, cn);
215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235

        return x / cn;
    }
};

template <>
struct Sum_SIMD<schar, int>
{
    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        int32x4_t v_sum = vdupq_n_s32(0);

        for ( ; x <= len - 16; x += 16)
        {
            int8x16_t v_src = vld1q_s8(src0 + x);
            int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));

Ilya Lavrenov's avatar
Ilya Lavrenov committed
236 237
            v_sum = vaddw_s16(v_sum, vget_low_s16(v_half));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_half));
238 239

            v_half = vmovl_s8(vget_high_s8(v_src));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
240 241
            v_sum = vaddw_s16(v_sum, vget_low_s16(v_half));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_half));
242 243 244 245 246 247
        }

        for ( ; x <= len - 8; x += 8)
        {
            int16x8_t v_src = vmovl_s8(vld1_s8(src0 + x));

Ilya Lavrenov's avatar
Ilya Lavrenov committed
248 249
            v_sum = vaddw_s16(v_sum, vget_low_s16(v_src));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_src));
250 251 252 253 254
        }

        int CV_DECL_ALIGNED(16) ar[4];
        vst1q_s32(ar, v_sum);

255
        addChannels(dst, ar, cn);
256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275

        return x / cn;
    }
};

template <>
struct Sum_SIMD<ushort, int>
{
    int operator () (const ushort * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        uint32x4_t v_sum = vdupq_n_u32(0u);

        for ( ; x <= len - 8; x += 8)
        {
            uint16x8_t v_src = vld1q_u16(src0 + x);

Ilya Lavrenov's avatar
Ilya Lavrenov committed
276 277
            v_sum = vaddw_u16(v_sum, vget_low_u16(v_src));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_src));
278 279 280
        }

        for ( ; x <= len - 4; x += 4)
Ilya Lavrenov's avatar
Ilya Lavrenov committed
281
            v_sum = vaddw_u16(v_sum, vld1_u16(src0 + x));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
282

283 284 285
        unsigned int CV_DECL_ALIGNED(16) ar[4];
        vst1q_u32(ar, v_sum);

286
        addChannels(dst, ar, cn);
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306

        return x / cn;
    }
};

template <>
struct Sum_SIMD<short, int>
{
    int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        int32x4_t v_sum = vdupq_n_s32(0u);

        for ( ; x <= len - 8; x += 8)
        {
            int16x8_t v_src = vld1q_s16(src0 + x);

Ilya Lavrenov's avatar
Ilya Lavrenov committed
307 308
            v_sum = vaddw_s16(v_sum, vget_low_s16(v_src));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_src));
309 310 311
        }

        for ( ; x <= len - 4; x += 4)
Ilya Lavrenov's avatar
Ilya Lavrenov committed
312
            v_sum = vaddw_s16(v_sum, vld1_s16(src0 + x));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
313

314 315 316
        int CV_DECL_ALIGNED(16) ar[4];
        vst1q_s32(ar, v_sum);

317
        addChannels(dst, ar, cn);
318 319 320 321 322 323 324

        return x / cn;
    }
};

#endif

325
template<typename T, typename ST>
326
static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
327
{
328 329
    const T* src = src0;
    if( !mask )
330
    {
331 332 333 334
        Sum_SIMD<T, ST> vop;
        int i = vop(src0, mask, dst, len, cn), k = cn % 4;
        src += i * cn;

335 336 337
        if( k == 1 )
        {
            ST s0 = dst[0];
Victoria Zhislina's avatar
Victoria Zhislina committed
338

339
            #if CV_ENABLE_UNROLLED
Victoria Zhislina's avatar
Victoria Zhislina committed
340
            for(; i <= len - 4; i += 4, src += cn*4 )
341
                s0 += src[0] + src[cn] + src[cn*2] + src[cn*3];
Victoria Zhislina's avatar
Victoria Zhislina committed
342
            #endif
343 344 345 346 347
            for( ; i < len; i++, src += cn )
                s0 += src[0];
            dst[0] = s0;
        }
        else if( k == 2 )
348
        {
349
            ST s0 = dst[0], s1 = dst[1];
350
            for( ; i < len; i++, src += cn )
351
            {
352 353
                s0 += src[0];
                s1 += src[1];
354
            }
355 356 357 358 359 360
            dst[0] = s0;
            dst[1] = s1;
        }
        else if( k == 3 )
        {
            ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
361
            for( ; i < len; i++, src += cn )
362
            {
363 364 365
                s0 += src[0];
                s1 += src[1];
                s2 += src[2];
366
            }
367 368 369
            dst[0] = s0;
            dst[1] = s1;
            dst[2] = s2;
370
        }
371

372
        for( ; k < cn; k += 4 )
373
        {
374
            src = src0 + i*cn + k;
375
            ST s0 = dst[k], s1 = dst[k+1], s2 = dst[k+2], s3 = dst[k+3];
376
            for( ; i < len; i++, src += cn )
377 378 379 380 381 382 383 384
            {
                s0 += src[0]; s1 += src[1];
                s2 += src[2]; s3 += src[3];
            }
            dst[k] = s0;
            dst[k+1] = s1;
            dst[k+2] = s2;
            dst[k+3] = s3;
385
        }
386
        return len;
387
    }
388

389 390
    int i, nzm = 0;
    if( cn == 1 )
391
    {
392 393 394 395 396 397 398 399 400 401
        ST s = dst[0];
        for( i = 0; i < len; i++ )
            if( mask[i] )
            {
                s += src[i];
                nzm++;
            }
        dst[0] = s;
    }
    else if( cn == 3 )
402
    {
403 404 405 406 407 408 409 410 411 412 413 414
        ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
        for( i = 0; i < len; i++, src += 3 )
            if( mask[i] )
            {
                s0 += src[0];
                s1 += src[1];
                s2 += src[2];
                nzm++;
            }
        dst[0] = s0;
        dst[1] = s1;
        dst[2] = s2;
415
    }
416 417 418 419 420 421
    else
    {
        for( i = 0; i < len; i++, src += cn )
            if( mask[i] )
            {
                int k = 0;
422
                #if CV_ENABLE_UNROLLED
423 424 425 426 427 428 429 430 431 432
                for( ; k <= cn - 4; k += 4 )
                {
                    ST s0, s1;
                    s0 = dst[k] + src[k];
                    s1 = dst[k+1] + src[k+1];
                    dst[k] = s0; dst[k+1] = s1;
                    s0 = dst[k+2] + src[k+2];
                    s1 = dst[k+3] + src[k+3];
                    dst[k+2] = s0; dst[k+3] = s1;
                }
Victoria Zhislina's avatar
Victoria Zhislina committed
433
                #endif
434 435 436 437 438 439
                for( ; k < cn; k++ )
                    dst[k] += src[k];
                nzm++;
            }
    }
    return nzm;
440 441
}

442

443 444
static int sum8u( const uchar* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
445

446 447
static int sum8s( const schar* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
448

449 450
static int sum16u( const ushort* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
451

452 453
static int sum16s( const short* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
454

455 456
static int sum32s( const int* src, const uchar* mask, double* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
457

458 459
static int sum32f( const float* src, const uchar* mask, double* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
460

461 462
static int sum64f( const double* src, const uchar* mask, double* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
463

464 465
typedef int (*SumFunc)(const uchar*, const uchar* mask, uchar*, int, int);

466
static SumFunc getSumFunc(int depth)
467
{
468 469 470 471 472 473 474 475 476 477 478
    static SumFunc sumTab[] =
    {
        (SumFunc)GET_OPTIMIZED(sum8u), (SumFunc)sum8s,
        (SumFunc)sum16u, (SumFunc)sum16s,
        (SumFunc)sum32s,
        (SumFunc)GET_OPTIMIZED(sum32f), (SumFunc)sum64f,
        0
    };

    return sumTab[depth];
}
479

480 481 482
template<typename T>
static int countNonZero_(const T* src, int len )
{
Victoria Zhislina's avatar
Victoria Zhislina committed
483
    int i=0, nz = 0;
484
    #if CV_ENABLE_UNROLLED
Victoria Zhislina's avatar
Victoria Zhislina committed
485
    for(; i <= len - 4; i += 4 )
486
        nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0);
Victoria Zhislina's avatar
Victoria Zhislina committed
487
    #endif
488 489 490
    for( ; i < len; i++ )
        nz += src[i] != 0;
    return nz;
491 492
}

493
static int countNonZero8u( const uchar* src, int len )
494
{
495 496 497 498
    int i=0, nz = 0;
#if CV_SSE2
    if(USE_SSE2)//5x-6x
    {
499 500
        __m128i v_zero = _mm_setzero_si128();
        __m128i sum = _mm_setzero_si128();
501

502 503 504
        for (; i<=len-16; i+=16)
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)(src+i));
505
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi8(r0, v_zero)), v_zero));
506
        }
507
        nz = i - _mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum)));
508
    }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540
#elif CV_NEON
    int len0 = len & -16, blockSize1 = (1 << 8) - 16, blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    uint8x16_t v_zero = vdupq_n_u8(0), v_1 = vdupq_n_u8(1);
    const uchar * src0 = src;

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint8x16_t v_pz = v_zero;

            for( ; k <= blockSizej - 16; k += 16 )
                v_pz = vaddq_u8(v_pz, vandq_u8(vceqq_u8(vld1q_u8(src0 + k), v_zero), v_1));

            uint16x8_t v_p1 = vmovl_u8(vget_low_u8(v_pz)), v_p2 = vmovl_u8(vget_high_u8(v_pz));
            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p1), vget_high_u16(v_p1)), v_nz);
            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p2), vget_high_u16(v_p2)), v_nz);

            src0 += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
541 542 543
#endif
    for( ; i < len; i++ )
        nz += src[i] != 0;
544 545 546
    return nz;
}

547
static int countNonZero16u( const ushort* src, int len )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
548 549
{
    int i = 0, nz = 0;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
550 551 552 553
#if CV_SSE2
    if (USE_SSE2)
    {
        __m128i v_zero = _mm_setzero_si128 ();
554
        __m128i sum = _mm_setzero_si128();
Ilya Lavrenov's avatar
Ilya Lavrenov committed
555 556 557

        for ( ; i <= len - 8; i += 8)
        {
558 559
            __m128i r0 = _mm_loadu_si128((const __m128i*)(src + i));
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi16(r0, v_zero)), v_zero));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
560 561
        }

562
        nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 1);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
563 564 565
        src += i;
    }
#elif CV_NEON
Ilya Lavrenov's avatar
Ilya Lavrenov committed
566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596
    int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1);

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint16x8_t v_pz = v_zero;

            for( ; k <= blockSizej - 8; k += 8 )
                v_pz = vaddq_u16(v_pz, vandq_u16(vceqq_u16(vld1q_u16(src + k), v_zero), v_1));

            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);

            src += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
#endif
    return nz + countNonZero_(src, len - i);
}
597

598
static int countNonZero32s( const int* src, int len )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
599 600
{
    int i = 0, nz = 0;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
601 602 603 604
#if CV_SSE2
    if (USE_SSE2)
    {
        __m128i v_zero = _mm_setzero_si128 ();
605
        __m128i sum = _mm_setzero_si128();
Ilya Lavrenov's avatar
Ilya Lavrenov committed
606

607
        for ( ; i <= len - 4; i += 4)
Ilya Lavrenov's avatar
Ilya Lavrenov committed
608
        {
609 610
            __m128i r0 = _mm_loadu_si128((const __m128i*)(src + i));
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi32(r0, v_zero)), v_zero));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
611 612
        }

613
        nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 2);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
614 615 616
        src += i;
    }
#elif CV_NEON
Ilya Lavrenov's avatar
Ilya Lavrenov committed
617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649
    int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    int32x4_t v_zero = vdupq_n_s32(0.0f);
    uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint16x8_t v_pz = v_zerou;

            for( ; k <= blockSizej - 8; k += 8 )
                v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_s32(vld1q_s32(src + k), v_zero)),
                                                              vmovn_u32(vceqq_s32(vld1q_s32(src + k + 4), v_zero))), v_1));

            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);

            src += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
#endif
    return nz + countNonZero_(src, len - i);
}
650 651

static int countNonZero32f( const float* src, int len )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
652 653
{
    int i = 0, nz = 0;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
654 655 656 657
#if CV_SSE2
    if (USE_SSE2)
    {
        __m128 v_zero_f = _mm_setzero_ps();
658 659
        __m128i v_zero = _mm_setzero_si128 ();
        __m128i sum = _mm_setzero_si128();
Ilya Lavrenov's avatar
Ilya Lavrenov committed
660

661
        for ( ; i <= len - 4; i += 4)
Ilya Lavrenov's avatar
Ilya Lavrenov committed
662
        {
663 664
            __m128 r0 = _mm_loadu_ps(src + i);
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_castps_si128(_mm_cmpeq_ps(r0, v_zero_f))), v_zero));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
665 666
        }

667
        nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 2);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
668 669 670
        src += i;
    }
#elif CV_NEON
Ilya Lavrenov's avatar
Ilya Lavrenov committed
671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703
    int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    float32x4_t v_zero = vdupq_n_f32(0.0f);
    uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint16x8_t v_pz = v_zerou;

            for( ; k <= blockSizej - 8; k += 8 )
                v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_f32(vld1q_f32(src + k), v_zero)),
                                                              vmovn_u32(vceqq_f32(vld1q_f32(src + k + 4), v_zero))), v_1));

            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);

            src += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
#endif
    return nz + countNonZero_(src, len - i);
}
704

705
static int countNonZero64f( const double* src, int len )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
706
{
707
    return countNonZero_(src, len);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
708
}
709

710
typedef int (*CountNonZeroFunc)(const uchar*, int);
711

712
static CountNonZeroFunc getCountNonZeroTab(int depth)
713
{
714 715 716 717 718 719 720
    static CountNonZeroFunc countNonZeroTab[] =
    {
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u),
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u),
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32s), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32f),
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero64f), 0
    };
721

722 723
    return countNonZeroTab[depth];
}
724

Ilya Lavrenov's avatar
Ilya Lavrenov committed
725 726 727 728 729 730 731 732 733
template <typename T, typename ST, typename SQT>
struct SumSqr_SIMD
{
    int operator () (const T *, const uchar *, ST *, SQT *, int, int) const
    {
        return 0;
    }
};

734 735 736 737 738 739 740 741 742 743
template <typename T>
inline void addSqrChannels(T * sum, T * sqsum, T * buf, int cn)
{
    for (int i = 0; i < 4; ++i)
    {
        sum[i % cn] += buf[i];
        sqsum[i % cn] += buf[4 + i];
    }
}

Ilya Lavrenov's avatar
Ilya Lavrenov committed
744 745 746 747 748 749 750 751 752 753 754 755
#if CV_SSE2

template <>
struct SumSqr_SIMD<uchar, int, int>
{
    int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
756
        const int len_16 = len & ~15;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
757

758
        for ( ; x <= len_16 - 16; )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
759
        {
760 761 762 763 764 765 766 767 768 769 770 771 772 773 774
            const int len_tmp = min(x + 2048, len_16);
            __m128i v_sum_tmp = v_zero;
            for ( ; x <= len_tmp - 16; x += 16)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
                __m128i v_half_0 = _mm_unpacklo_epi8(v_src, v_zero);
                __m128i v_half_1 = _mm_unpackhi_epi8(v_src, v_zero);
                v_sum_tmp = _mm_add_epi16(v_sum_tmp, _mm_add_epi16(v_half_0, v_half_1));
                __m128i v_half_2 = _mm_unpacklo_epi16(v_half_0, v_half_1);
                __m128i v_half_3 = _mm_unpackhi_epi16(v_half_0, v_half_1);
                v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_2, v_half_2));
                v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_3, v_half_3));
            }
            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_sum_tmp, v_zero));
            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_sum_tmp, v_zero));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
775 776 777 778 779
        }

        for ( ; x <= len - 8; x += 8)
        {
            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src0 + x)), v_zero);
780 781 782
            __m128i v_half_0 = _mm_unpackhi_epi64(v_src, v_src);
            __m128i v_sum_tmp = _mm_add_epi16(v_src, v_half_0);
            __m128i v_half_1 = _mm_unpacklo_epi16(v_src, v_half_0);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
783

784 785
            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_sum_tmp, v_zero));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_1, v_half_1));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
786 787 788 789 790 791
        }

        int CV_DECL_ALIGNED(16) ar[8];
        _mm_store_si128((__m128i*)ar, v_sum);
        _mm_store_si128((__m128i*)(ar + 4), v_sqsum);

792
        addSqrChannels(sum, sqsum, ar, cn);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
793 794 795 796 797 798 799 800 801 802 803 804 805 806 807

        return x / cn;
    }
};

template <>
struct SumSqr_SIMD<schar, int, int>
{
    int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
808
        const int len_16 = len & ~15;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
809

810
        for ( ; x <= len_16 - 16; )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
811
        {
812 813 814 815 816 817 818 819 820 821 822 823 824 825 826
            const int len_tmp = min(x + 2048, len_16);
            __m128i v_sum_tmp = v_zero;
            for ( ; x <= len_tmp - 16; x += 16)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
                __m128i v_half_0 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);
                __m128i v_half_1 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
                v_sum_tmp = _mm_add_epi16(v_sum_tmp, _mm_add_epi16(v_half_0, v_half_1));
                __m128i v_half_2 = _mm_unpacklo_epi16(v_half_0, v_half_1);
                __m128i v_half_3 = _mm_unpackhi_epi16(v_half_0, v_half_1);
                v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_2, v_half_2));
                v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_3, v_half_3));
            }
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_sum_tmp), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_sum_tmp), 16));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
827 828 829 830 831
        }

        for ( ; x <= len - 8; x += 8)
        {
            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);
832 833 834
            __m128i v_half_0 = _mm_unpackhi_epi64(v_src, v_src);
            __m128i v_sum_tmp = _mm_add_epi16(v_src, v_half_0);
            __m128i v_half_1 = _mm_unpacklo_epi16(v_src, v_half_0);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
835

836 837
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_sum_tmp), 16));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_1, v_half_1));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
838 839 840 841 842 843
        }

        int CV_DECL_ALIGNED(16) ar[8];
        _mm_store_si128((__m128i*)ar, v_sum);
        _mm_store_si128((__m128i*)(ar + 4), v_sqsum);

844
        addSqrChannels(sum, sqsum, ar, cn);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
845 846 847 848 849 850 851

        return x / cn;
    }
};

#endif

852 853
template<typename T, typename ST, typename SQT>
static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn )
854
{
855
    const T* src = src0;
856

857
    if( !mask )
858
    {
Ilya Lavrenov's avatar
Ilya Lavrenov committed
859 860 861
        SumSqr_SIMD<T, ST, SQT> vop;
        int i = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4;
        src += i * cn;
862

863 864 865 866
        if( k == 1 )
        {
            ST s0 = sum[0];
            SQT sq0 = sqsum[0];
Ilya Lavrenov's avatar
Ilya Lavrenov committed
867
            for( ; i < len; i++, src += cn )
868 869 870 871 872 873 874 875 876 877 878
            {
                T v = src[0];
                s0 += v; sq0 += (SQT)v*v;
            }
            sum[0] = s0;
            sqsum[0] = sq0;
        }
        else if( k == 2 )
        {
            ST s0 = sum[0], s1 = sum[1];
            SQT sq0 = sqsum[0], sq1 = sqsum[1];
Ilya Lavrenov's avatar
Ilya Lavrenov committed
879
            for( ; i < len; i++, src += cn )
880 881 882 883 884 885 886 887 888 889 890 891
            {
                T v0 = src[0], v1 = src[1];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
            }
            sum[0] = s0; sum[1] = s1;
            sqsum[0] = sq0; sqsum[1] = sq1;
        }
        else if( k == 3 )
        {
            ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
            SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
Ilya Lavrenov's avatar
Ilya Lavrenov committed
892
            for( ; i < len; i++, src += cn )
893 894 895 896 897 898 899 900 901
            {
                T v0 = src[0], v1 = src[1], v2 = src[2];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
                s2 += v2; sq2 += (SQT)v2*v2;
            }
            sum[0] = s0; sum[1] = s1; sum[2] = s2;
            sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
        }
902

903
        for( ; k < cn; k += 4 )
904
        {
905 906 907
            src = src0 + k;
            ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
            SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
Ilya Lavrenov's avatar
Ilya Lavrenov committed
908
            for( ; i < len; i++, src += cn )
909 910 911 912 913 914 915 916 917 918 919 920 921
            {
                T v0, v1;
                v0 = src[0], v1 = src[1];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
                v0 = src[2], v1 = src[3];
                s2 += v0; sq2 += (SQT)v0*v0;
                s3 += v1; sq3 += (SQT)v1*v1;
            }
            sum[k] = s0; sum[k+1] = s1;
            sum[k+2] = s2; sum[k+3] = s3;
            sqsum[k] = sq0; sqsum[k+1] = sq1;
            sqsum[k+2] = sq2; sqsum[k+3] = sq3;
922
        }
923
        return len;
924
    }
925

926
    int i, nzm = 0;
927

928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973
    if( cn == 1 )
    {
        ST s0 = sum[0];
        SQT sq0 = sqsum[0];
        for( i = 0; i < len; i++ )
            if( mask[i] )
            {
                T v = src[i];
                s0 += v; sq0 += (SQT)v*v;
                nzm++;
            }
        sum[0] = s0;
        sqsum[0] = sq0;
    }
    else if( cn == 3 )
    {
        ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
        SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
        for( i = 0; i < len; i++, src += 3 )
            if( mask[i] )
            {
                T v0 = src[0], v1 = src[1], v2 = src[2];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
                s2 += v2; sq2 += (SQT)v2*v2;
                nzm++;
            }
        sum[0] = s0; sum[1] = s1; sum[2] = s2;
        sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
    }
    else
    {
        for( i = 0; i < len; i++, src += cn )
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
                {
                    T v = src[k];
                    ST s = sum[k] + v;
                    SQT sq = sqsum[k] + (SQT)v*v;
                    sum[k] = s; sqsum[k] = sq;
                }
                nzm++;
            }
    }
    return nzm;
974
}
975 976


977 978
static int sqsum8u( const uchar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
979

980 981
static int sqsum8s( const schar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
982

983 984
static int sqsum16u( const ushort* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
985

986 987
static int sqsum16s( const short* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
988

989 990
static int sqsum32s( const int* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
991

992 993
static int sqsum32f( const float* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
994

995 996
static int sqsum64f( const double* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
997

998
typedef int (*SumSqrFunc)(const uchar*, const uchar* mask, uchar*, uchar*, int, int);
999

1000
static SumSqrFunc getSumSqrTab(int depth)
1001
{
1002 1003 1004 1005 1006 1007 1008 1009
    static SumSqrFunc sumSqrTab[] =
    {
        (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s,
        (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0
    };

    return sumSqrTab[depth];
}
1010

1011 1012
#ifdef HAVE_OPENCL

Ilya Lavrenov's avatar
Ilya Lavrenov committed
1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027
template <typename T> Scalar ocl_part_sum(Mat m)
{
    CV_Assert(m.rows == 1);

    Scalar s = Scalar::all(0);
    int cn = m.channels();
    const T * const ptr = m.ptr<T>(0);

    for (int x = 0, w = m.cols * cn; x < w; )
        for (int c = 0; c < cn; ++c, ++x)
            s[c] += ptr[x];

    return s;
}

1028
enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS =  1, OCL_OP_SUM_SQR = 2 };
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1029

1030 1031
static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray(),
                     InputArray _src2 = noArray(), bool calc2 = false, const Scalar & res2 = Scalar() )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1032
{
1033
    CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1034

1035 1036 1037 1038
    const ocl::Device & dev = ocl::Device::getDefault();
    bool doubleSupport = dev.doubleFPConfig() > 0,
        haveMask = _mask.kind() != _InputArray::NONE,
        haveSrc2 = _src2.kind() != _InputArray::NONE;
1039
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
Elena Gvozdeva's avatar
Elena Gvozdeva committed
1040
            kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src, _src2) : 1,
1041
            mcn = std::max(cn, kercn);
1042
    CV_Assert(!haveSrc2 || _src2.type() == type);
Elena Gvozdeva's avatar
Elena Gvozdeva committed
1043
    int convert_cn = haveSrc2 ? mcn : cn;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1044

1045
    if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1046 1047
        return false;

1048 1049
    int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1);
    size_t wgs = dev.maxWorkGroupSize();
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1050

1051 1052
    int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
            dtype = CV_MAKE_TYPE(ddepth, cn);
1053
    CV_Assert(!haveMask || _mask.type() == CV_8UC1);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1054 1055 1056 1057 1058 1059 1060

    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

    static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
1061
    char cvt[2][40];
1062
    String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
1063
                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s -D convertFromU=%s",
1064 1065 1066
                         ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
                         ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
                         ocl::typeToStr(ddepth), ddepth, cn,
1067
                         ocl::convertTypeStr(depth, ddepth, mcn, cvt[0]),
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1068
                         opMap[sum_op], (int)wgs, wgs2_aligned,
1069
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
1070 1071
                         haveMask ? " -D HAVE_MASK" : "",
                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
1072 1073
                         haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
                         haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
1074
                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "",
Elena Gvozdeva's avatar
Elena Gvozdeva committed
1075
                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, convert_cn, cvt[1]) : "noconvert");
1076 1077

    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1078 1079 1080
    if (k.empty())
        return false;

1081 1082
    UMat src = _src.getUMat(), src2 = _src2.getUMat(),
        db(1, dbsize, dtype), mask = _mask.getUMat();
1083 1084 1085

    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
            dbarg = ocl::KernelArg::PtrWriteOnly(db),
1086 1087
            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2);
1088 1089

    if (haveMask)
1090 1091 1092 1093 1094 1095
    {
        if (haveSrc2)
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg);
        else
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg);
    }
1096
    else
1097 1098 1099 1100 1101 1102
    {
        if (haveSrc2)
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg);
        else
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg);
    }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1103

1104
    size_t globalsize = ngroups * wgs;
1105
    if (k.run(1, &globalsize, &wgs, false))
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1106 1107 1108 1109
    {
        typedef Scalar (*part_sum)(Mat m);
        part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
                func = funcs[ddepth - CV_32S];
1110 1111 1112

        Mat mres = db.getMat(ACCESS_READ);
        if (calc2)
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1113
            const_cast<Scalar &>(res2) = func(mres.colRange(ngroups, dbsize));
1114

Ilya Lavrenov's avatar
Ilya Lavrenov committed
1115
        res = func(mres.colRange(0, ngroups));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1116 1117 1118 1119 1120
        return true;
    }
    return false;
}

1121 1122
#endif

1123 1124
#ifdef HAVE_IPP
static bool ipp_sum(Mat &src, Scalar &_res)
1125
{
1126 1127
    CV_INSTRUMENT_REGION_IPP()

1128
#if IPP_VERSION_X100 >= 700
1129
    int cn = src.channels();
1130 1131
    if (cn > 4)
        return false;
1132
    size_t total_size = src.total();
1133 1134
    int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
    if( src.dims == 2 || (src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
1135 1136 1137
    {
        IppiSize sz = { cols, rows };
        int type = src.type();
1138 1139
        typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
        typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *);
1140
        ippiSumFuncHint ippiSumHint =
1141 1142 1143 1144
            type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R :
            type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R :
            type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R :
            0;
1145
        ippiSumFuncNoHint ippiSum =
1146 1147 1148 1149 1150 1151 1152 1153 1154 1155
            type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R :
            type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R :
            type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R :
            type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R :
            type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R :
            type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R :
            type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R :
            type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R :
            type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R :
            0;
1156 1157
        CV_Assert(!ippiSumHint || !ippiSum);
        if( ippiSumHint || ippiSum )
1158 1159
        {
            Ipp64f res[4];
1160 1161 1162
            IppStatus ret = ippiSumHint ?
                            CV_INSTRUMENT_FUN_IPP(ippiSumHint, src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
                            CV_INSTRUMENT_FUN_IPP(ippiSum, src.ptr(), (int)src.step[0], sz, res);
1163
            if( ret >= 0 )
1164 1165 1166 1167 1168 1169 1170
            {
                for( int i = 0; i < cn; i++ )
                    _res[i] = res[i];
                return true;
            }
        }
    }
1171 1172
#else
    CV_UNUSED(src); CV_UNUSED(_res);
1173 1174 1175 1176 1177
#endif
    return false;
}
#endif

1178 1179
}

1180
cv::Scalar cv::sum( InputArray _src )
1181
{
1182 1183
    CV_INSTRUMENT_REGION()

1184
#if defined HAVE_OPENCL || defined HAVE_IPP
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1185
    Scalar _res;
1186 1187
#endif

1188
#ifdef HAVE_OPENCL
1189
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
1190 1191
                ocl_sum(_src, _res, OCL_OP_SUM),
                _res)
1192
#endif
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1193

1194
    Mat src = _src.getMat();
1195
    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_sum(src, _res), _res);
1196

1197
    int k, cn = src.channels(), depth = src.depth();
1198
    SumFunc func = getSumFunc(depth);
1199
    CV_Assert( cn <= 4 && func != 0 );
1200

1201 1202 1203 1204 1205 1206 1207 1208 1209 1210
    const Mat* arrays[] = {&src, 0};
    uchar* ptrs[1];
    NAryMatIterator it(arrays, ptrs);
    Scalar s;
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0;
    AutoBuffer<int> _buf;
    int* buf = (int*)&s[0];
    size_t esz = 0;
    bool blockSum = depth < CV_32S;
1211

1212
    if( blockSum )
1213
    {
1214 1215 1216 1217
        intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
        blockSize = std::min(blockSize, intSumBlockSize);
        _buf.allocate(cn);
        buf = _buf;
1218

1219 1220 1221 1222
        for( k = 0; k < cn; k++ )
            buf[k] = 0;
        esz = src.elemSize();
    }
1223

1224 1225 1226
    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
1227
        {
1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240
            int bsz = std::min(total - j, blockSize);
            func( ptrs[0], 0, (uchar*)buf, bsz, cn );
            count += bsz;
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                for( k = 0; k < cn; k++ )
                {
                    s[k] += buf[k];
                    buf[k] = 0;
                }
                count = 0;
            }
            ptrs[0] += bsz*esz;
1241 1242
        }
    }
1243
    return s;
1244 1245
}

1246 1247
#ifdef HAVE_OPENCL

1248 1249 1250 1251
namespace cv {

static bool ocl_countNonZero( InputArray _src, int & res )
{
1252
    int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

    if (depth == CV_64F && !doubleSupport)
        return false;

    int dbsize = ocl::Device::getDefault().maxComputeUnits();
    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();

    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

Ilya Lavrenov's avatar
Ilya Lavrenov committed
1266
    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
1267 1268
                  format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
                         " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
1269 1270
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
                         ocl::typeToStr(depth), (int)wgs, kercn,
1271 1272
                         wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1273 1274 1275 1276
    if (k.empty())
        return false;

    UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
1277 1278 1279 1280 1281
    k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
           dbsize, ocl::KernelArg::PtrWriteOnly(db));

    size_t globalsize = dbsize * wgs;
    if (k.run(1, &globalsize, &wgs, true))
1282
        return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
1283 1284 1285 1286 1287
    return false;
}

}

1288 1289
#endif

1290 1291 1292
#if defined HAVE_IPP
namespace cv {

1293
static bool ipp_countNonZero( Mat &src, int &res )
1294
{
1295 1296
    CV_INSTRUMENT_REGION_IPP()

1297 1298 1299 1300 1301 1302
#if IPP_VERSION_X100 < 201801
    // Poor performance of SSE42
    if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)
        return false;
#endif

1303 1304
    Ipp32s  count = 0;
    int     depth = src.depth();
1305

1306
    if(src.dims <= 2)
1307
    {
1308 1309 1310 1311 1312 1313 1314 1315 1316
        IppStatus status;
        IppiSize  size = {src.cols*src.channels(), src.rows};

        if(depth == CV_8U)
            status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, (const Ipp8u *)src.ptr(), (int)src.step, size, &count, 0, 0);
        else if(depth == CV_32F)
            status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, (const Ipp32f *)src.ptr(), (int)src.step, size, &count, 0, 0);
        else
            return false;
1317

1318 1319
        if(status < 0)
            return false;
1320

1321 1322 1323
        res = size.width*size.height - count;
    }
    else
1324
    {
1325 1326
        IppStatus       status;
        const Mat      *arrays[] = {&src, NULL};
1327 1328
        Mat            planes[1];
        NAryMatIterator it(arrays, planes, 1);
1329
        IppiSize        size  = {(int)it.size*src.channels(), 1};
1330
        res = 0;
1331 1332 1333
        for (size_t i = 0; i < it.nplanes; i++, ++it)
        {
            if(depth == CV_8U)
1334
                status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, it.planes->ptr<Ipp8u>(), (int)it.planes->step, size, &count, 0, 0);
1335
            else if(depth == CV_32F)
1336
                status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, it.planes->ptr<Ipp32f>(), (int)it.planes->step, size, &count, 0, 0);
1337 1338 1339
            else
                return false;

1340
            if(status < 0 || (int)it.planes->total()*src.channels() < count)
1341 1342
                return false;

1343
            res += (int)it.planes->total()*src.channels() - count;
1344
        }
1345
    }
1346 1347

    return true;
1348 1349 1350 1351 1352
}
}
#endif


1353
int cv::countNonZero( InputArray _src )
1354
{
1355 1356
    CV_INSTRUMENT_REGION()

Ilya Lavrenov's avatar
Ilya Lavrenov committed
1357 1358
    int type = _src.type(), cn = CV_MAT_CN(type);
    CV_Assert( cn == 1 );
1359

1360
#if defined HAVE_OPENCL || defined HAVE_IPP
1361
    int res = -1;
1362 1363 1364
#endif

#ifdef HAVE_OPENCL
1365
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
1366 1367
                ocl_countNonZero(_src, res),
                res)
1368
#endif
1369

1370
    Mat src = _src.getMat();
1371
    CV_IPP_RUN_FAST(ipp_countNonZero(src, res), res);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1372 1373

    CountNonZeroFunc func = getCountNonZeroTab(src.depth());
1374
    CV_Assert( func != 0 );
1375

1376 1377 1378 1379
    const Mat* arrays[] = {&src, 0};
    uchar* ptrs[1];
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, nz = 0;
1380

1381 1382
    for( size_t i = 0; i < it.nplanes; i++, ++it )
        nz += func( ptrs[0], total );
1383

1384
    return nz;
1385
}
1386

1387 1388
#if defined HAVE_IPP
namespace cv
1389
{
1390 1391
static bool ipp_mean( Mat &src, Mat &mask, Scalar &ret )
{
1392 1393
    CV_INSTRUMENT_REGION_IPP()

1394 1395
#if IPP_VERSION_X100 >= 700
    size_t total_size = src.total();
1396 1397 1398
    int cn = src.channels();
    if (cn > 4)
        return false;
1399 1400
    int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
    if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
1401
    {
1402 1403 1404
        IppiSize sz = { cols, rows };
        int type = src.type();
        if( !mask.empty() )
1405
        {
1406
            typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
1407
            ippiMaskMeanFuncC1 ippiMean_C1MR =
1408 1409 1410 1411
            type == CV_8UC1 ? (ippiMaskMeanFuncC1)ippiMean_8u_C1MR :
            type == CV_16UC1 ? (ippiMaskMeanFuncC1)ippiMean_16u_C1MR :
            type == CV_32FC1 ? (ippiMaskMeanFuncC1)ippiMean_32f_C1MR :
            0;
1412
            if( ippiMean_C1MR )
1413
            {
1414
                Ipp64f res;
1415
                if( CV_INSTRUMENT_FUN_IPP(ippiMean_C1MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &res) >= 0 )
1416
                {
1417 1418
                    ret = Scalar(res);
                    return true;
1419
                }
1420 1421
            }
            typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
1422
            ippiMaskMeanFuncC3 ippiMean_C3MR =
1423 1424 1425 1426
            type == CV_8UC3 ? (ippiMaskMeanFuncC3)ippiMean_8u_C3CMR :
            type == CV_16UC3 ? (ippiMaskMeanFuncC3)ippiMean_16u_C3CMR :
            type == CV_32FC3 ? (ippiMaskMeanFuncC3)ippiMean_32f_C3CMR :
            0;
1427
            if( ippiMean_C3MR )
1428 1429
            {
                Ipp64f res1, res2, res3;
1430 1431 1432
                if( CV_INSTRUMENT_FUN_IPP(ippiMean_C3MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &res1) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiMean_C3MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &res2) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiMean_C3MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &res3) >= 0 )
1433
                {
1434 1435
                    ret = Scalar(res1, res2, res3);
                    return true;
1436 1437
                }
            }
1438 1439 1440 1441 1442
        }
        else
        {
            typedef IppStatus (CV_STDCALL* ippiMeanFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
            typedef IppStatus (CV_STDCALL* ippiMeanFuncNoHint)(const void*, int, IppiSize, double *);
1443
            ippiMeanFuncHint ippiMeanHint =
1444 1445 1446 1447
                type == CV_32FC1 ? (ippiMeanFuncHint)ippiMean_32f_C1R :
                type == CV_32FC3 ? (ippiMeanFuncHint)ippiMean_32f_C3R :
                type == CV_32FC4 ? (ippiMeanFuncHint)ippiMean_32f_C4R :
                0;
1448
            ippiMeanFuncNoHint ippiMean =
1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459
                type == CV_8UC1 ? (ippiMeanFuncNoHint)ippiMean_8u_C1R :
                type == CV_8UC3 ? (ippiMeanFuncNoHint)ippiMean_8u_C3R :
                type == CV_8UC4 ? (ippiMeanFuncNoHint)ippiMean_8u_C4R :
                type == CV_16UC1 ? (ippiMeanFuncNoHint)ippiMean_16u_C1R :
                type == CV_16UC3 ? (ippiMeanFuncNoHint)ippiMean_16u_C3R :
                type == CV_16UC4 ? (ippiMeanFuncNoHint)ippiMean_16u_C4R :
                type == CV_16SC1 ? (ippiMeanFuncNoHint)ippiMean_16s_C1R :
                type == CV_16SC3 ? (ippiMeanFuncNoHint)ippiMean_16s_C3R :
                type == CV_16SC4 ? (ippiMeanFuncNoHint)ippiMean_16s_C4R :
                0;
            // Make sure only zero or one version of the function pointer is valid
1460 1461
            CV_Assert(!ippiMeanHint || !ippiMean);
            if( ippiMeanHint || ippiMean )
1462
            {
1463
                Ipp64f res[4];
1464 1465
                IppStatus status = ippiMeanHint ? CV_INSTRUMENT_FUN_IPP(ippiMeanHint, src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
                                CV_INSTRUMENT_FUN_IPP(ippiMean, src.ptr(), (int)src.step[0], sz, res);
1466
                if( status >= 0 )
1467
                {
1468
                    for( int i = 0; i < cn; i++ )
1469 1470
                        ret[i] = res[i];
                    return true;
1471 1472 1473 1474
                }
            }
        }
    }
1475 1476 1477 1478 1479 1480
    return false;
#else
    return false;
#endif
}
}
1481
#endif
1482

1483 1484
cv::Scalar cv::mean( InputArray _src, InputArray _mask )
{
1485 1486
    CV_INSTRUMENT_REGION()

1487 1488 1489 1490 1491 1492 1493 1494
    Mat src = _src.getMat(), mask = _mask.getMat();
    CV_Assert( mask.empty() || mask.type() == CV_8U );

    int k, cn = src.channels(), depth = src.depth();
    Scalar s;

    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_mean(src, mask, s), s)

1495
    SumFunc func = getSumFunc(depth);
1496

1497
    CV_Assert( cn <= 4 && func != 0 );
1498

1499 1500 1501 1502 1503 1504 1505 1506 1507
    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0;
    AutoBuffer<int> _buf;
    int* buf = (int*)&s[0];
    bool blockSum = depth <= CV_16S;
    size_t esz = 0, nz0 = 0;
1508

1509 1510 1511 1512 1513 1514
    if( blockSum )
    {
        intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
        blockSize = std::min(blockSize, intSumBlockSize);
        _buf.allocate(cn);
        buf = _buf;
1515

1516 1517 1518 1519
        for( k = 0; k < cn; k++ )
            buf[k] = 0;
        esz = src.elemSize();
    }
1520

1521
    for( size_t i = 0; i < it.nplanes; i++, ++it )
1522
    {
1523 1524 1525 1526 1527 1528 1529
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            int nz = func( ptrs[0], ptrs[1], (uchar*)buf, bsz, cn );
            count += nz;
            nz0 += nz;
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
1530
            {
1531 1532 1533 1534 1535 1536
                for( k = 0; k < cn; k++ )
                {
                    s[k] += buf[k];
                    buf[k] = 0;
                }
                count = 0;
1537
            }
1538 1539 1540 1541
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
                ptrs[1] += bsz;
        }
1542
    }
1543
    return s*(nz0 ? 1./nz0 : 0);
1544
}
1545

1546 1547
#ifdef HAVE_OPENCL

1548 1549
namespace cv {

1550
static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
1551
{
1552 1553
    CV_INSTRUMENT_REGION_OPENCL()

1554
    bool haveMask = _mask.kind() != _InputArray::NONE;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1555
    int nz = haveMask ? -1 : (int)_src.total();
1556
    Scalar mean(0), stddev(0);
1557 1558 1559
    const int cn = _src.channels();
    if (cn > 4)
        return false;
1560

Ilya Lavrenov's avatar
Ilya Lavrenov committed
1561
    {
1562
        int type = _src.type(), depth = CV_MAT_DEPTH(type);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1563
        bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
1564 1565
                isContinuous = _src.isContinuous(),
                isMaskContinuous = _mask.isContinuous();
1566 1567 1568 1569 1570 1571 1572 1573
        const ocl::Device &defDev = ocl::Device::getDefault();
        int groups = defDev.maxComputeUnits();
        if (defDev.isIntel())
        {
            static const int subSliceEUCount = 10;
            groups = (groups / subSliceEUCount) * 2;
        }
        size_t wgs = defDev.maxWorkGroupSize();
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584

        int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth),
                dtype = CV_MAKE_TYPE(ddepth, cn),
                sqdtype = CV_MAKETYPE(sqddepth, cn);
        CV_Assert(!haveMask || _mask.type() == CV_8UC1);

        int wgs2_aligned = 1;
        while (wgs2_aligned < (int)wgs)
            wgs2_aligned <<= 1;
        wgs2_aligned >>= 1;

1585
        if ( (!doubleSupport && depth == CV_64F) )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1586 1587 1588 1589
            return false;

        char cvt[2][40];
        String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D sqddepth=%d"
1590
                             " -D sqdstT=%s -D sqdstT1=%s -D convertToSDT=%s -D cn=%d%s%s"
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1591 1592 1593 1594 1595 1596
                             " -D convertToDT=%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
                             ocl::typeToStr(type), ocl::typeToStr(depth),
                             ocl::typeToStr(dtype), ocl::typeToStr(ddepth), sqddepth,
                             ocl::typeToStr(sqdtype), ocl::typeToStr(sqddepth),
                             ocl::convertTypeStr(depth, sqddepth, cn, cvt[0]),
                             cn, isContinuous ? " -D HAVE_SRC_CONT" : "",
1597
                             isMaskContinuous ? " -D HAVE_MASK_CONT" : "",
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619
                             ocl::convertTypeStr(depth, ddepth, cn, cvt[1]),
                             (int)wgs, wgs2_aligned, haveMask ? " -D HAVE_MASK" : "",
                             doubleSupport ? " -D DOUBLE_SUPPORT" : "");

        ocl::Kernel k("meanStdDev", ocl::core::meanstddev_oclsrc, opts);
        if (k.empty())
            return false;

        int dbsize = groups * ((haveMask ? CV_ELEM_SIZE1(CV_32S) : 0) +
                               CV_ELEM_SIZE(sqdtype) + CV_ELEM_SIZE(dtype));
        UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();

        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
                dbarg = ocl::KernelArg::PtrWriteOnly(db),
                maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);

        if (haveMask)
            k.args(srcarg, src.cols, (int)src.total(), groups, dbarg, maskarg);
        else
            k.args(srcarg, src.cols, (int)src.total(), groups, dbarg);

        size_t globalsize = groups * wgs;
1620

1621
        if(!k.run(1, &globalsize, &wgs, false))
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1622 1623 1624 1625 1626 1627
            return false;

        typedef Scalar (* part_sum)(Mat m);
        part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> };
        Mat dbm = db.getMat(ACCESS_READ);

1628 1629
        mean = funcs[ddepth - CV_32S](Mat(1, groups, dtype, dbm.ptr()));
        stddev = funcs[sqddepth - CV_32S](Mat(1, groups, sqdtype, dbm.ptr() + groups * CV_ELEM_SIZE(dtype)));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1630 1631

        if (haveMask)
1632
            nz = saturate_cast<int>(funcs[0](Mat(1, groups, CV_32SC1, dbm.ptr() +
Ilya Lavrenov's avatar
Ilya Lavrenov committed
1633 1634 1635 1636
                                                 groups * (CV_ELEM_SIZE(dtype) +
                                                           CV_ELEM_SIZE(sqdtype))))[0]);
    }

1637
    double total = nz != 0 ? 1.0 / nz : 0;
1638
    int k, j;
1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668
    for (int i = 0; i < cn; ++i)
    {
        mean[i] *= total;
        stddev[i] = std::sqrt(std::max(stddev[i] * total - mean[i] * mean[i] , 0.));
    }

    for( j = 0; j < 2; j++ )
    {
        const double * const sptr = j == 0 ? &mean[0] : &stddev[0];
        _OutputArray _dst = j == 0 ? _mean : _sdv;
        if( !_dst.needed() )
            continue;

        if( !_dst.fixedSize() )
            _dst.create(cn, 1, CV_64F, -1, true);
        Mat dst = _dst.getMat();
        int dcn = (int)dst.total();
        CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
                   (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
        double* dptr = dst.ptr<double>();
        for( k = 0; k < cn; k++ )
            dptr[k] = sptr[k];
        for( ; k < dcn; k++ )
            dptr[k] = 0;
    }

    return true;
}

}
1669

1670 1671
#endif

1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685
#ifdef HAVE_OPENVX
namespace cv
{
    static bool openvx_meanStdDev(Mat& src, OutputArray _mean, OutputArray _sdv, Mat& mask)
    {
        size_t total_size = src.total();
        int rows = src.size[0], cols = rows ? (int)(total_size / rows) : 0;
        if (src.type() != CV_8UC1|| !mask.empty() ||
               (src.dims != 2 && !(src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size))
           )
        return false;

        try
        {
1686
            ivx::Context ctx = ovx::getOpenVXContext();
1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726
#ifndef VX_VERSION_1_1
            if (ctx.vendorID() == VX_ID_KHRONOS)
                return false; // Do not use OpenVX meanStdDev estimation for sample 1.0.1 implementation due to lack of accuracy
#endif

            ivx::Image
                ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
                    ivx::Image::createAddressing(cols, rows, 1, (vx_int32)(src.step[0])), src.ptr());

            vx_float32 mean_temp, stddev_temp;
            ivx::IVX_CHECK_STATUS(vxuMeanStdDev(ctx, ia, &mean_temp, &stddev_temp));

            if (_mean.needed())
            {
                if (!_mean.fixedSize())
                    _mean.create(1, 1, CV_64F, -1, true);
                Mat mean = _mean.getMat();
                CV_Assert(mean.type() == CV_64F && mean.isContinuous() &&
                    (mean.cols == 1 || mean.rows == 1) && mean.total() >= 1);
                double *pmean = mean.ptr<double>();
                pmean[0] = mean_temp;
                for (int c = 1; c < (int)mean.total(); c++)
                    pmean[c] = 0;
            }

            if (_sdv.needed())
            {
                if (!_sdv.fixedSize())
                    _sdv.create(1, 1, CV_64F, -1, true);
                Mat stddev = _sdv.getMat();
                CV_Assert(stddev.type() == CV_64F && stddev.isContinuous() &&
                    (stddev.cols == 1 || stddev.rows == 1) && stddev.total() >= 1);
                double *pstddev = stddev.ptr<double>();
                pstddev[0] = stddev_temp;
                for (int c = 1; c < (int)stddev.total(); c++)
                    pstddev[c] = 0;
            }
        }
        catch (ivx::RuntimeError & e)
        {
1727
            VX_DbgThrow(e.what());
1728 1729 1730
        }
        catch (ivx::WrapperError & e)
        {
1731
            VX_DbgThrow(e.what());
1732
        }
1733 1734

        return true;
1735 1736 1737 1738
    }
}
#endif

1739
#ifdef HAVE_IPP
1740
namespace cv
1741
{
1742 1743
static bool ipp_meanStdDev(Mat& src, OutputArray _mean, OutputArray _sdv, Mat& mask)
{
1744 1745
    CV_INSTRUMENT_REGION_IPP()

1746
#if IPP_VERSION_X100 >= 700
1747
    int cn = src.channels();
1748 1749 1750 1751 1752 1753 1754

#if IPP_VERSION_X100 < 201801
    // IPP_DISABLE: C3C functions can read outside of allocated memory
    if (cn > 1)
        return false;
#endif

1755
    size_t total_size = src.total();
1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790
    int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
    if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
    {
        Ipp64f mean_temp[3];
        Ipp64f stddev_temp[3];
        Ipp64f *pmean = &mean_temp[0];
        Ipp64f *pstddev = &stddev_temp[0];
        Mat mean, stddev;
        int dcn_mean = -1;
        if( _mean.needed() )
        {
            if( !_mean.fixedSize() )
                _mean.create(cn, 1, CV_64F, -1, true);
            mean = _mean.getMat();
            dcn_mean = (int)mean.total();
            pmean = mean.ptr<Ipp64f>();
        }
        int dcn_stddev = -1;
        if( _sdv.needed() )
        {
            if( !_sdv.fixedSize() )
                _sdv.create(cn, 1, CV_64F, -1, true);
            stddev = _sdv.getMat();
            dcn_stddev = (int)stddev.total();
            pstddev = stddev.ptr<Ipp64f>();
        }
        for( int c = cn; c < dcn_mean; c++ )
            pmean[c] = 0;
        for( int c = cn; c < dcn_stddev; c++ )
            pstddev[c] = 0;
        IppiSize sz = { cols, rows };
        int type = src.type();
        if( !mask.empty() )
        {
            typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *, Ipp64f *);
1791
            ippiMaskMeanStdDevFuncC1 ippiMean_StdDev_C1MR =
1792
            type == CV_8UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_8u_C1MR :
1793 1794 1795
            type == CV_16UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_16u_C1MR :
            type == CV_32FC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_32f_C1MR :
            0;
1796
            if( ippiMean_StdDev_C1MR )
1797
            {
1798
                if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C1MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, pmean, pstddev) >= 0 )
1799 1800 1801
                {
                    return true;
                }
1802
            }
1803
            typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
1804
            ippiMaskMeanStdDevFuncC3 ippiMean_StdDev_C3CMR =
1805
            type == CV_8UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CMR :
1806 1807 1808
            type == CV_16UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CMR :
            type == CV_32FC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CMR :
            0;
1809
            if( ippiMean_StdDev_C3CMR )
1810
            {
1811 1812 1813
                if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CMR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CMR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CMR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
1814 1815 1816
                {
                    return true;
                }
1817
            }
1818
        }
1819 1820 1821
        else
        {
            typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC1)(const void *, int, IppiSize, Ipp64f *, Ipp64f *);
1822
            ippiMeanStdDevFuncC1 ippiMean_StdDev_C1R =
1823
            type == CV_8UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_8u_C1R :
1824
            type == CV_16UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_16u_C1R :
1825
#if (IPP_VERSION_X100 >= 810)
1826 1827 1828
            type == CV_32FC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_32f_C1R ://Aug 2013: bug in IPP 7.1, 8.0
#endif
            0;
1829
            if( ippiMean_StdDev_C1R )
1830
            {
1831
                if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C1R, src.ptr(), (int)src.step[0], sz, pmean, pstddev) >= 0 )
1832 1833 1834
                {
                    return true;
                }
1835
            }
1836
            typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC3)(const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
1837
            ippiMeanStdDevFuncC3 ippiMean_StdDev_C3CR =
1838
            type == CV_8UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CR :
1839 1840 1841
            type == CV_16UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CR :
            type == CV_32FC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CR :
            0;
1842
            if( ippiMean_StdDev_C3CR )
1843
            {
1844 1845 1846
                if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CR, src.ptr(), (int)src.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CR, src.ptr(), (int)src.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CR, src.ptr(), (int)src.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
1847 1848 1849
                {
                    return true;
                }
1850 1851 1852
            }
        }
    }
1853 1854
#else
    CV_UNUSED(src); CV_UNUSED(_mean); CV_UNUSED(_sdv); CV_UNUSED(mask);
1855
#endif
1856 1857 1858 1859 1860 1861 1862
    return false;
}
}
#endif

void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
{
1863 1864
    CV_INSTRUMENT_REGION()

1865 1866 1867 1868 1869 1870
    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
               ocl_meanStdDev(_src, _mean, _sdv, _mask))

    Mat src = _src.getMat(), mask = _mask.getMat();
    CV_Assert( mask.empty() || mask.type() == CV_8UC1 );

1871
    CV_OVX_RUN(!ovx::skipSmallImages<VX_KERNEL_MEAN_STDDEV>(src.cols, src.rows),
1872
               openvx_meanStdDev(src, _mean, _sdv, mask))
1873

1874
    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_meanStdDev(src, _mean, _sdv, mask));
1875 1876 1877

    int k, cn = src.channels(), depth = src.depth();

1878
    SumSqrFunc func = getSumSqrTab(depth);
1879

1880
    CV_Assert( func != 0 );
1881

1882 1883 1884
    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
1885
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
1886 1887 1888 1889 1890 1891
    int j, count = 0, nz0 = 0;
    AutoBuffer<double> _buf(cn*4);
    double *s = (double*)_buf, *sq = s + cn;
    int *sbuf = (int*)s, *sqbuf = (int*)sq;
    bool blockSum = depth <= CV_16S, blockSqSum = depth <= CV_8S;
    size_t esz = 0;
1892

1893 1894
    for( k = 0; k < cn; k++ )
        s[k] = sq[k] = 0;
1895

1896
    if( blockSum )
1897
    {
1898 1899 1900 1901 1902 1903 1904 1905 1906
        intSumBlockSize = 1 << 15;
        blockSize = std::min(blockSize, intSumBlockSize);
        sbuf = (int*)(sq + cn);
        if( blockSqSum )
            sqbuf = sbuf + cn;
        for( k = 0; k < cn; k++ )
            sbuf[k] = sqbuf[k] = 0;
        esz = src.elemSize();
    }
1907

1908
    for( size_t i = 0; i < it.nplanes; i++, ++it )
1909
    {
1910
        for( j = 0; j < total; j += blockSize )
1911
        {
1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935
            int bsz = std::min(total - j, blockSize);
            int nz = func( ptrs[0], ptrs[1], (uchar*)sbuf, (uchar*)sqbuf, bsz, cn );
            count += nz;
            nz0 += nz;
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                for( k = 0; k < cn; k++ )
                {
                    s[k] += sbuf[k];
                    sbuf[k] = 0;
                }
                if( blockSqSum )
                {
                    for( k = 0; k < cn; k++ )
                    {
                        sq[k] += sqbuf[k];
                        sqbuf[k] = 0;
                    }
                }
                count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
                ptrs[1] += bsz;
1936
        }
1937
    }
1938

1939
    double scale = nz0 ? 1./nz0 : 0.;
1940
    for( k = 0; k < cn; k++ )
1941
    {
1942 1943
        s[k] *= scale;
        sq[k] = std::sqrt(std::max(sq[k]*scale - s[k]*s[k], 0.));
1944
    }
1945

1946
    for( j = 0; j < 2; j++ )
1947
    {
1948
        const double* sptr = j == 0 ? s : sq;
1949
        _OutputArray _dst = j == 0 ? _mean : _sdv;
1950 1951 1952 1953 1954 1955 1956 1957
        if( !_dst.needed() )
            continue;

        if( !_dst.fixedSize() )
            _dst.create(cn, 1, CV_64F, -1, true);
        Mat dst = _dst.getMat();
        int dcn = (int)dst.total();
        CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
1958
                   (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
1959 1960 1961 1962 1963
        double* dptr = dst.ptr<double>();
        for( k = 0; k < cn; k++ )
            dptr[k] = sptr[k];
        for( ; k < dcn; k++ )
            dptr[k] = 0;
1964 1965 1966 1967 1968 1969 1970
    }
}

/****************************************************************************************\
*                                       minMaxLoc                                        *
\****************************************************************************************/

1971 1972 1973
namespace cv
{

1974
template<typename T, typename WT> static void
1975 1976
minMaxIdx_( const T* src, const uchar* mask, WT* _minVal, WT* _maxVal,
            size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx )
1977
{
1978
    WT minVal = *_minVal, maxVal = *_maxVal;
1979
    size_t minIdx = *_minIdx, maxIdx = *_maxIdx;
1980

1981
    if( !mask )
1982
    {
1983
        for( int i = 0; i < len; i++ )
1984
        {
1985
            T val = src[i];
1986
            if( val < minVal )
1987
            {
1988
                minVal = val;
1989
                minIdx = startIdx + i;
1990
            }
1991
            if( val > maxVal )
1992
            {
1993
                maxVal = val;
1994
                maxIdx = startIdx + i;
1995 1996 1997
            }
        }
    }
1998
    else
1999
    {
2000
        for( int i = 0; i < len; i++ )
2001
        {
2002 2003
            T val = src[i];
            if( mask[i] && val < minVal )
2004
            {
2005
                minVal = val;
2006
                minIdx = startIdx + i;
2007
            }
2008
            if( mask[i] && val > maxVal )
2009
            {
2010
                maxVal = val;
2011
                maxIdx = startIdx + i;
2012 2013 2014 2015
            }
        }
    }

2016 2017 2018 2019
    *_minIdx = minIdx;
    *_maxIdx = maxIdx;
    *_minVal = minVal;
    *_maxVal = maxVal;
2020 2021
}

2022 2023 2024
static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* maxval,
                         size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
2025

2026 2027 2028
static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* maxval,
                         size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
2029

2030 2031 2032
static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
2033

2034 2035 2036
static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
2037

2038 2039 2040
static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
2041

2042 2043 2044
static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, float* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
2045

2046 2047
static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, double* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
2048
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
2049

2050
typedef void (*MinMaxIdxFunc)(const uchar*, const uchar*, int*, int*, size_t*, size_t*, int, size_t);
2051

2052
static MinMaxIdxFunc getMinmaxTab(int depth)
2053
{
2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064
    static MinMaxIdxFunc minmaxTab[] =
    {
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8s),
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16s),
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32s),
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32f), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_64f),
        0
    };

    return minmaxTab[depth];
}
2065

2066 2067 2068
static void ofs2idx(const Mat& a, size_t ofs, int* idx)
{
    int i, d = a.dims;
2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079
    if( ofs > 0 )
    {
        ofs--;
        for( i = d-1; i >= 0; i-- )
        {
            int sz = a.size[i];
            idx[i] = (int)(ofs % sz);
            ofs /= sz;
        }
    }
    else
2080
    {
2081 2082
        for( i = d-1; i >= 0; i-- )
            idx[i] = -1;
2083 2084
    }
}
2085

2086
#ifdef HAVE_OPENCL
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2087

2088 2089
#define MINMAX_STRUCT_ALIGNMENT 8 // sizeof double

Konstantin Matskevich's avatar
Konstantin Matskevich committed
2090
template <typename T>
2091
void getMinMaxRes(const Mat & db, double * minVal, double * maxVal,
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2092
                  int* minLoc, int* maxLoc,
2093
                  int groupnum, int cols, double * maxVal2)
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2094
{
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2095 2096
    uint index_max = std::numeric_limits<uint>::max();
    T minval = std::numeric_limits<T>::max();
2097
    T maxval = std::numeric_limits<T>::min() > 0 ? -std::numeric_limits<T>::max() : std::numeric_limits<T>::min(), maxval2 = maxval;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2098 2099
    uint minloc = index_max, maxloc = index_max;

2100
    size_t index = 0;
2101
    const T * minptr = NULL, * maxptr = NULL, * maxptr2 = NULL;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2102 2103 2104
    const uint * minlocptr = NULL, * maxlocptr = NULL;
    if (minVal || minLoc)
    {
2105
        minptr = db.ptr<T>();
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2106
        index += sizeof(T) * groupnum;
2107
        index = alignSize(index, MINMAX_STRUCT_ALIGNMENT);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2108 2109 2110
    }
    if (maxVal || maxLoc)
    {
2111
        maxptr = (const T *)(db.ptr() + index);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2112
        index += sizeof(T) * groupnum;
2113
        index = alignSize(index, MINMAX_STRUCT_ALIGNMENT);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2114 2115 2116
    }
    if (minLoc)
    {
2117
        minlocptr = (const uint *)(db.ptr() + index);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2118
        index += sizeof(uint) * groupnum;
2119
        index = alignSize(index, MINMAX_STRUCT_ALIGNMENT);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2120 2121
    }
    if (maxLoc)
2122
    {
2123
        maxlocptr = (const uint *)(db.ptr() + index);
2124
        index += sizeof(uint) * groupnum;
2125
        index = alignSize(index, MINMAX_STRUCT_ALIGNMENT);
2126 2127
    }
    if (maxVal2)
2128
        maxptr2 = (const T *)(db.ptr() + index);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2129

2130
    for (int i = 0; i < groupnum; i++)
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2131
    {
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2132
        if (minptr && minptr[i] <= minval)
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2133
        {
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158
            if (minptr[i] == minval)
            {
                if (minlocptr)
                    minloc = std::min(minlocptr[i], minloc);
            }
            else
            {
                if (minlocptr)
                    minloc = minlocptr[i];
                minval = minptr[i];
            }
        }
        if (maxptr && maxptr[i] >= maxval)
        {
            if (maxptr[i] == maxval)
            {
                if (maxlocptr)
                    maxloc = std::min(maxlocptr[i], maxloc);
            }
            else
            {
                if (maxlocptr)
                    maxloc = maxlocptr[i];
                maxval = maxptr[i];
            }
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2159
        }
2160 2161
        if (maxptr2 && maxptr2[i] > maxval2)
            maxval2 = maxptr2[i];
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2162
    }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2163 2164 2165
    bool zero_mask = (minLoc && minloc == index_max) ||
            (maxLoc && maxloc == index_max);

2166
    if (minVal)
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2167
        *minVal = zero_mask ? 0 : (double)minval;
2168
    if (maxVal)
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2169
        *maxVal = zero_mask ? 0 : (double)maxval;
2170 2171
    if (maxVal2)
        *maxVal2 = zero_mask ? 0 : (double)maxval2;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2172

2173
    if (minLoc)
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2174
    {
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2175 2176
        minLoc[0] = zero_mask ? -1 : minloc / cols;
        minLoc[1] = zero_mask ? -1 : minloc % cols;
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2177
    }
2178
    if (maxLoc)
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2179
    {
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2180 2181
        maxLoc[0] = zero_mask ? -1 : maxloc / cols;
        maxLoc[1] = zero_mask ? -1 : maxloc % cols;
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2182 2183 2184
    }
}

2185 2186
typedef void (*getMinMaxResFunc)(const Mat & db, double * minVal, double * maxVal,
                                 int * minLoc, int *maxLoc, int gropunum, int cols, double * maxVal2);
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2187

2188
static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask,
2189
                           int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), double * maxVal2 = NULL)
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2190
{
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2191
    const ocl::Device & dev = ocl::Device::getDefault();
2192

2193
#ifdef __ANDROID__
2194 2195 2196 2197
    if (dev.isNVidia())
        return false;
#endif

2198 2199
    bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
        haveSrc2 = _src2.kind() != _InputArray::NONE;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2200
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
2201
            kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src, _src2));
2202

2203 2204
    // disabled following modes since it occasionally fails on AMD devices (e.g. A10-6800K, sep. 2014)
    if ((haveMask || type == CV_32FC1) && dev.isAMD())
2205
        return false;
2206

Ilya Lavrenov's avatar
Ilya Lavrenov committed
2207
    CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) ||
2208
              (cn >= 1 && !minLoc && !maxLoc) );
2209

2210 2211
    if (ddepth < 0)
        ddepth = depth;
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2212

2213 2214
    CV_Assert(!haveSrc2 || _src2.type() == type);

2215
    if (depth == CV_32S)
2216 2217
        return false;

2218
    if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2219 2220
        return false;

Ilya Lavrenov's avatar
Ilya Lavrenov committed
2221 2222
    int groupnum = dev.maxComputeUnits();
    size_t wgs = dev.maxWorkGroupSize();
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2223 2224 2225 2226 2227 2228

    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

Ilya Lavrenov's avatar
Ilya Lavrenov committed
2229 2230 2231 2232 2233 2234
    bool needMinVal = minVal || minLoc, needMinLoc = minLoc != NULL,
            needMaxVal = maxVal || maxLoc, needMaxLoc = maxLoc != NULL;

    // in case of mask we must know whether mask is filled with zeros or not
    // so let's calculate min or max location, if it's undefined, so mask is zeros
    if (!(needMaxLoc || needMinLoc) && haveMask)
2235
    {
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2236 2237 2238
        if (needMinVal)
            needMinLoc = true;
        else
2239
            needMaxLoc = true;
2240
    }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2241

2242
    char cvt[2][40];
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2243
    String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
2244
                         " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s"
2245 2246
                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s -D wdepth=%d -D convertFromU=%s"
                         " -D MINMAX_STRUCT_ALIGNMENT=%d",
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2247 2248 2249
                         depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
2250
                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2251 2252
                         _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
                         needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "",
2253 2254
                         needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "",
                         ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
2255 2256
                         ocl::convertTypeStr(depth, ddepth, kercn, cvt[0]),
                         absValues ? " -D OP_ABS" : "",
2257
                         haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
2258
                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", ddepth,
2259 2260
                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, kercn, cvt[1]) : "noconvert",
                         MINMAX_STRUCT_ALIGNMENT);
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2261

Ilya Lavrenov's avatar
Ilya Lavrenov committed
2262
    ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2263 2264 2265
    if (k.empty())
        return false;

2266
    int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S),
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2267
            dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) +
2268
                                 (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) +
2269 2270
                                 (maxVal2 ? esz : 0))
                     + 5 * MINMAX_STRUCT_ALIGNMENT;
2271
    UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2272

2273
    if (cn > 1 && !haveMask)
2274
    {
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2275
        src = src.reshape(1);
2276 2277
        src2 = src2.reshape(1);
    }
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2278

2279 2280 2281 2282 2283 2284 2285 2286 2287 2288
    if (haveSrc2)
    {
        if (!haveMask)
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(src2));
        else
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask),
                   ocl::KernelArg::ReadOnlyNoSize(src2));
    }
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2289
    else
2290 2291 2292 2293 2294 2295 2296 2297
    {
        if (!haveMask)
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db));
        else
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
    }
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2298 2299

    size_t globalsize = groupnum * wgs;
2300
    if (!k.run(1, &globalsize, &wgs, true))
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2301 2302
        return false;

Ilya Lavrenov's avatar
Ilya Lavrenov committed
2303
    static const getMinMaxResFunc functab[7] =
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2304 2305 2306 2307 2308 2309 2310 2311 2312 2313
    {
        getMinMaxRes<uchar>,
        getMinMaxRes<char>,
        getMinMaxRes<ushort>,
        getMinMaxRes<short>,
        getMinMaxRes<int>,
        getMinMaxRes<float>,
        getMinMaxRes<double>
    };

2314
    getMinMaxResFunc func = functab[ddepth];
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2315

Ilya Lavrenov's avatar
Ilya Lavrenov committed
2316 2317 2318
    int locTemp[2];
    func(db.getMat(ACCESS_READ), minVal, maxVal,
         needMinLoc ? minLoc ? minLoc : locTemp : minLoc,
2319 2320
         needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc,
         groupnum, src.cols, maxVal2);
Konstantin Matskevich's avatar
Konstantin Matskevich committed
2321 2322 2323

    return true;
}
2324 2325 2326

#endif

2327
#ifdef HAVE_OPENVX
2328 2329 2330
namespace ovx {
    template <> inline bool skipSmallImages<VX_KERNEL_MINMAXLOC>(int w, int h) { return w*h < 3840 * 2160; }
}
2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342
static bool openvx_minMaxIdx(Mat &src, double* minVal, double* maxVal, int* minIdx, int* maxIdx, Mat &mask)
{
    int stype = src.type();
    size_t total_size = src.total();
    int rows = src.size[0], cols = rows ? (int)(total_size / rows) : 0;
    if ((stype != CV_8UC1 && stype != CV_16SC1) || !mask.empty() ||
        (src.dims != 2 && !(src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size))
        )
        return false;

    try
    {
2343
        ivx::Context ctx = ovx::getOpenVXContext();
2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391
        ivx::Image
            ia = ivx::Image::createFromHandle(ctx, stype == CV_8UC1 ? VX_DF_IMAGE_U8 : VX_DF_IMAGE_S16,
                ivx::Image::createAddressing(cols, rows, stype == CV_8UC1 ? 1 : 2, (vx_int32)(src.step[0])), src.ptr());

        ivx::Scalar vxMinVal = ivx::Scalar::create(ctx, stype == CV_8UC1 ? VX_TYPE_UINT8 : VX_TYPE_INT16, 0);
        ivx::Scalar vxMaxVal = ivx::Scalar::create(ctx, stype == CV_8UC1 ? VX_TYPE_UINT8 : VX_TYPE_INT16, 0);
        ivx::Array vxMinInd, vxMaxInd;
        ivx::Scalar vxMinCount, vxMaxCount;
        if (minIdx)
        {
            vxMinInd = ivx::Array::create(ctx, VX_TYPE_COORDINATES2D, 1);
            vxMinCount = ivx::Scalar::create(ctx, VX_TYPE_UINT32, 0);
        }
        if (maxIdx)
        {
            vxMaxInd = ivx::Array::create(ctx, VX_TYPE_COORDINATES2D, 1);
            vxMaxCount = ivx::Scalar::create(ctx, VX_TYPE_UINT32, 0);
        }

        ivx::IVX_CHECK_STATUS(vxuMinMaxLoc(ctx, ia, vxMinVal, vxMaxVal, vxMinInd, vxMaxInd, vxMinCount, vxMaxCount));

        if (minVal)
        {
            *minVal = stype == CV_8UC1 ? vxMinVal.getValue<vx_uint8>() : vxMinVal.getValue<vx_int16>();
        }
        if (maxVal)
        {
            *maxVal = stype == CV_8UC1 ? vxMaxVal.getValue<vx_uint8>() : vxMaxVal.getValue<vx_int16>();
        }
        if (minIdx)
        {
            if(vxMinCount.getValue<vx_uint32>()<1) throw ivx::RuntimeError(VX_ERROR_INVALID_VALUE, std::string(__func__) + "(): minimum value location not found");
            vx_coordinates2d_t loc;
            vxMinInd.copyRangeTo(0, 1, &loc);
            size_t minidx = loc.y * cols + loc.x + 1;
            ofs2idx(src, minidx, minIdx);
        }
        if (maxIdx)
        {
            if (vxMaxCount.getValue<vx_uint32>()<1) throw ivx::RuntimeError(VX_ERROR_INVALID_VALUE, std::string(__func__) + "(): maximum value location not found");
            vx_coordinates2d_t loc;
            vxMaxInd.copyRangeTo(0, 1, &loc);
            size_t maxidx = loc.y * cols + loc.x + 1;
            ofs2idx(src, maxidx, maxIdx);
        }
    }
    catch (ivx::RuntimeError & e)
    {
2392
        VX_DbgThrow(e.what());
2393 2394 2395
    }
    catch (ivx::WrapperError & e)
    {
2396
        VX_DbgThrow(e.what());
2397
    }
2398 2399

    return true;
2400 2401 2402
}
#endif

2403
#ifdef HAVE_IPP
2404 2405
static IppStatus ipp_minMaxIndex_wrap(const void* pSrc, int srcStep, IppiSize size, IppDataType dataType,
    float* pMinVal, float* pMaxVal, IppiPoint* pMinIndex, IppiPoint* pMaxIndex, const Ipp8u*, int)
2406
{
2407 2408 2409 2410 2411 2412 2413 2414
    switch(dataType)
    {
    case ipp8u:  return CV_INSTRUMENT_FUN_IPP(ippiMinMaxIndx_8u_C1R, (const Ipp8u*)pSrc, srcStep, size, pMinVal, pMaxVal, pMinIndex, pMaxIndex);
    case ipp16u: return CV_INSTRUMENT_FUN_IPP(ippiMinMaxIndx_16u_C1R, (const Ipp16u*)pSrc, srcStep, size, pMinVal, pMaxVal, pMinIndex, pMaxIndex);
    case ipp32f: return CV_INSTRUMENT_FUN_IPP(ippiMinMaxIndx_32f_C1R, (const Ipp32f*)pSrc, srcStep, size, pMinVal, pMaxVal, pMinIndex, pMaxIndex);
    default:     return ippStsDataTypeErr;
    }
}
2415

2416 2417 2418 2419
static IppStatus ipp_minMaxIndexMask_wrap(const void* pSrc, int srcStep, IppiSize size, IppDataType dataType,
    float* pMinVal, float* pMaxVal, IppiPoint* pMinIndex, IppiPoint* pMaxIndex, const Ipp8u* pMask, int maskStep)
{
    switch(dataType)
2420
    {
2421 2422 2423 2424 2425 2426
    case ipp8u:  return CV_INSTRUMENT_FUN_IPP(ippiMinMaxIndx_8u_C1MR, (const Ipp8u*)pSrc, srcStep, pMask, maskStep, size, pMinVal, pMaxVal, pMinIndex, pMaxIndex);
    case ipp16u: return CV_INSTRUMENT_FUN_IPP(ippiMinMaxIndx_16u_C1MR, (const Ipp16u*)pSrc, srcStep, pMask, maskStep, size, pMinVal, pMaxVal, pMinIndex, pMaxIndex);
    case ipp32f: return CV_INSTRUMENT_FUN_IPP(ippiMinMaxIndx_32f_C1MR, (const Ipp32f*)pSrc, srcStep, pMask, maskStep, size, pMinVal, pMaxVal, pMinIndex, pMaxIndex);
    default:     return ippStsDataTypeErr;
    }
}
2427

2428 2429 2430 2431
static IppStatus ipp_minMax_wrap(const void* pSrc, int srcStep, IppiSize size, IppDataType dataType,
    float* pMinVal, float* pMaxVal, IppiPoint*, IppiPoint*, const Ipp8u*, int)
{
    IppStatus status;
2432

2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443
    switch(dataType)
    {
#if IPP_VERSION_X100 > 201701 // wrong min values
    case ipp8u:
    {
        Ipp8u val[2];
        status = CV_INSTRUMENT_FUN_IPP(ippiMinMax_8u_C1R, (const Ipp8u*)pSrc, srcStep, size, &val[0], &val[1]);
        *pMinVal = val[0];
        *pMaxVal = val[1];
        return status;
    }
2444
#endif
2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464
    case ipp16u:
    {
        Ipp16u val[2];
        status = CV_INSTRUMENT_FUN_IPP(ippiMinMax_16u_C1R, (const Ipp16u*)pSrc, srcStep, size, &val[0], &val[1]);
        *pMinVal = val[0];
        *pMaxVal = val[1];
        return status;
    }
    case ipp16s:
    {
        Ipp16s val[2];
        status = CV_INSTRUMENT_FUN_IPP(ippiMinMax_16s_C1R, (const Ipp16s*)pSrc, srcStep, size, &val[0], &val[1]);
        *pMinVal = val[0];
        *pMaxVal = val[1];
        return status;
    }
    case ipp32f: return CV_INSTRUMENT_FUN_IPP(ippiMinMax_32f_C1R, (const Ipp32f*)pSrc, srcStep, size, pMinVal, pMaxVal);
    default:     return ipp_minMaxIndex_wrap(pSrc, srcStep, size, dataType, pMinVal, pMaxVal, NULL, NULL, NULL, 0);
    }
}
2465

2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539
static IppStatus ipp_minIdx_wrap(const void* pSrc, int srcStep, IppiSize size, IppDataType dataType,
    float* pMinVal, float*, IppiPoint* pMinIndex, IppiPoint*, const Ipp8u*, int)
{
    IppStatus status;

    switch(dataType)
    {
    case ipp8u:
    {
        Ipp8u val;
        status = CV_INSTRUMENT_FUN_IPP(ippiMinIndx_8u_C1R, (const Ipp8u*)pSrc, srcStep, size, &val, &pMinIndex->x, &pMinIndex->y);
        *pMinVal = val;
        return status;
    }
    case ipp16u:
    {
        Ipp16u val;
        status = CV_INSTRUMENT_FUN_IPP(ippiMinIndx_16u_C1R, (const Ipp16u*)pSrc, srcStep, size, &val, &pMinIndex->x, &pMinIndex->y);
        *pMinVal = val;
        return status;
    }
    case ipp16s:
    {
        Ipp16s val;
        status = CV_INSTRUMENT_FUN_IPP(ippiMinIndx_16s_C1R, (const Ipp16s*)pSrc, srcStep, size, &val, &pMinIndex->x, &pMinIndex->y);
        *pMinVal = val;
        return status;
    }
    case ipp32f: return CV_INSTRUMENT_FUN_IPP(ippiMinIndx_32f_C1R, (const Ipp32f*)pSrc, srcStep, size, pMinVal, &pMinIndex->x, &pMinIndex->y);
    default:     return ipp_minMaxIndex_wrap(pSrc, srcStep, size, dataType, pMinVal, NULL, pMinIndex, NULL, NULL, 0);
    }
}

static IppStatus ipp_maxIdx_wrap(const void* pSrc, int srcStep, IppiSize size, IppDataType dataType,
    float*, float* pMaxVal, IppiPoint*, IppiPoint* pMaxIndex, const Ipp8u*, int)
{
    IppStatus status;

    switch(dataType)
    {
    case ipp8u:
    {
        Ipp8u val;
        status = CV_INSTRUMENT_FUN_IPP(ippiMaxIndx_8u_C1R, (const Ipp8u*)pSrc, srcStep, size, &val, &pMaxIndex->x, &pMaxIndex->y);
        *pMaxVal = val;
        return status;
    }
    case ipp16u:
    {
        Ipp16u val;
        status = CV_INSTRUMENT_FUN_IPP(ippiMaxIndx_16u_C1R, (const Ipp16u*)pSrc, srcStep, size, &val, &pMaxIndex->x, &pMaxIndex->y);
        *pMaxVal = val;
        return status;
    }
    case ipp16s:
    {
        Ipp16s val;
        status = CV_INSTRUMENT_FUN_IPP(ippiMaxIndx_16s_C1R, (const Ipp16s*)pSrc, srcStep, size, &val, &pMaxIndex->x, &pMaxIndex->y);
        *pMaxVal = val;
        return status;
    }
    case ipp32f: return CV_INSTRUMENT_FUN_IPP(ippiMaxIndx_32f_C1R, (const Ipp32f*)pSrc, srcStep, size, pMaxVal, &pMaxIndex->x, &pMaxIndex->y);
    default:     return ipp_minMaxIndex_wrap(pSrc, srcStep, size, dataType, NULL, pMaxVal, NULL, pMaxIndex, NULL, 0);
    }
}

typedef IppStatus (*IppMinMaxSelector)(const void* pSrc, int srcStep, IppiSize size, IppDataType dataType,
    float* pMinVal, float* pMaxVal, IppiPoint* pMinIndex, IppiPoint* pMaxIndex, const Ipp8u* pMask, int maskStep);

static bool ipp_minMaxIdx(Mat &src, double* _minVal, double* _maxVal, int* _minIdx, int* _maxIdx, Mat &mask)
{
#if IPP_VERSION_X100 >= 700
    CV_INSTRUMENT_REGION_IPP()

2540 2541
#if IPP_VERSION_X100 < 201800
    // cv::minMaxIdx problem with NaN input
2542
    // Disable 32F processing only
2543
    if(src.depth() == CV_32F && cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)
2544 2545 2546
        return false;
#endif

2547
#if IPP_VERSION_X100 < 201801
2548
    // cv::minMaxIdx problem with index positions on AVX
2549
    if(!mask.empty() && _maxIdx && cv::ipp::getIppTopFeatures() != ippCPUID_SSE42)
2550 2551 2552
        return false;
#endif

2553 2554 2555 2556 2557 2558 2559
    IppStatus   status;
    IppDataType dataType = ippiGetDataType(src.depth());
    float       minVal = 0;
    float       maxVal = 0;
    IppiPoint   minIdx = {-1, -1};
    IppiPoint   maxIdx = {-1, -1};

2560 2561
    float       *pMinVal = (_minVal || _minIdx)?&minVal:NULL;
    float       *pMaxVal = (_maxVal || _maxIdx)?&maxVal:NULL;
2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573
    IppiPoint   *pMinIdx = (_minIdx)?&minIdx:NULL;
    IppiPoint   *pMaxIdx = (_maxIdx)?&maxIdx:NULL;

    IppMinMaxSelector ippMinMaxFun = ipp_minMaxIndexMask_wrap;
    if(mask.empty())
    {
        if(_maxVal && _maxIdx && !_minVal && !_minIdx)
            ippMinMaxFun = ipp_maxIdx_wrap;
        else if(!_maxVal && !_maxIdx && _minVal && _minIdx)
            ippMinMaxFun = ipp_minIdx_wrap;
        else if(_maxVal && !_maxIdx && _minVal && !_minIdx)
            ippMinMaxFun = ipp_minMax_wrap;
2574 2575
        else if(!_maxVal && !_maxIdx && !_minVal && !_minIdx)
            return false;
2576 2577 2578 2579 2580 2581 2582 2583 2584 2585
        else
            ippMinMaxFun = ipp_minMaxIndex_wrap;
    }

    if(src.dims <= 2)
    {
        IppiSize size = ippiSize(src.size());
        size.width *= src.channels();

        status = ippMinMaxFun(src.ptr(), (int)src.step, size, dataType, pMinVal, pMaxVal, pMinIdx, pMaxIdx, (Ipp8u*)mask.ptr(), (int)mask.step);
2586
        if(status < 0)
2587 2588 2589 2590 2591 2592 2593
            return false;
        if(_minVal)
            *_minVal = minVal;
        if(_maxVal)
            *_maxVal = maxVal;
        if(_minIdx)
        {
2594
#if IPP_VERSION_X100 < 201801
2595 2596
            // Should be just ippStsNoOperation check, but there is a bug in the function so we need additional checks
            if(status == ippStsNoOperation && !mask.empty() && !pMinIdx->x && !pMinIdx->y)
2597 2598 2599
#else
            if(status == ippStsNoOperation)
#endif
2600
            {
2601 2602 2603 2604 2605 2606 2607
                _minIdx[0] = -1;
                _minIdx[1] = -1;
            }
            else
            {
                _minIdx[0] = minIdx.y;
                _minIdx[1] = minIdx.x;
2608
            }
2609
        }
2610
        if(_maxIdx)
2611
        {
2612
#if IPP_VERSION_X100 < 201801
2613 2614
            // Should be just ippStsNoOperation check, but there is a bug in the function so we need additional checks
            if(status == ippStsNoOperation && !mask.empty() && !pMaxIdx->x && !pMaxIdx->y)
2615 2616 2617
#else
            if(status == ippStsNoOperation)
#endif
2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659
            {
                _maxIdx[0] = -1;
                _maxIdx[1] = -1;
            }
            else
            {
                _maxIdx[0] = maxIdx.y;
                _maxIdx[1] = maxIdx.x;
            }
        }
    }
    else
    {
        const Mat *arrays[] = {&src, mask.empty()?NULL:&mask, NULL};
        uchar     *ptrs[3]  = {NULL};
        NAryMatIterator it(arrays, ptrs);
        IppiSize size = ippiSize(it.size*src.channels(), 1);
        int srcStep      = (int)(size.width*src.elemSize1());
        int maskStep     = size.width;
        size_t idxPos    = 1;
        size_t minIdxAll = 0;
        size_t maxIdxAll = 0;
        float  minValAll = IPP_MAXABS_32F;
        float  maxValAll = -IPP_MAXABS_32F;

        for(size_t i = 0; i < it.nplanes; i++, ++it, idxPos += size.width)
        {
            status = ippMinMaxFun(ptrs[0], srcStep, size, dataType, pMinVal, pMaxVal, pMinIdx, pMaxIdx, ptrs[1], maskStep);
            if(status < 0)
                return false;
#if IPP_VERSION_X100 > 201701
            // Zero-mask check, function should return ippStsNoOperation warning
            if(status == ippStsNoOperation)
                    continue;
#else
            // Crude zero-mask check, waiting for fix in IPP function
            if(ptrs[1])
            {
                Mat localMask(Size(size.width, 1), CV_8U, ptrs[1], maskStep);
                if(!cv::countNonZero(localMask))
                    continue;
            }
2660 2661
#endif

2662
            if(_minVal && minVal < minValAll)
2663
            {
2664 2665 2666 2667 2668 2669 2670
                minValAll = minVal;
                minIdxAll = idxPos+minIdx.x;
            }
            if(_maxVal && maxVal > maxValAll)
            {
                maxValAll = maxVal;
                maxIdxAll = idxPos+maxIdx.x;
2671 2672
            }
        }
2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688
        if(!src.empty() && mask.empty())
        {
            if(minIdxAll == 0)
                minIdxAll = 1;
            if(maxValAll == 0)
                maxValAll = 1;
        }

        if(_minVal)
            *_minVal = minValAll;
        if(_maxVal)
            *_maxVal = maxValAll;
        if(_minIdx)
            ofs2idx(src, minIdxAll, _minIdx);
        if(_maxIdx)
            ofs2idx(src, maxIdxAll, _maxIdx);
2689
    }
2690 2691

    return true;
2692 2693
#else
    CV_UNUSED(src); CV_UNUSED(minVal); CV_UNUSED(maxVal); CV_UNUSED(minIdx); CV_UNUSED(maxIdx); CV_UNUSED(mask);
2694
    return false;
2695
#endif
2696 2697 2698 2699 2700 2701 2702 2703 2704
}
#endif

}

void cv::minMaxIdx(InputArray _src, double* minVal,
                   double* maxVal, int* minIdx, int* maxIdx,
                   InputArray _mask)
{
2705 2706
    CV_INSTRUMENT_REGION()

2707 2708 2709 2710 2711 2712 2713 2714
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
        (cn > 1 && _mask.empty() && !minIdx && !maxIdx) );

    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2  && (_mask.empty() || _src.size() == _mask.size()),
               ocl_minMaxIdx(_src, minVal, maxVal, minIdx, maxIdx, _mask))

    Mat src = _src.getMat(), mask = _mask.getMat();
2715

elenagvo's avatar
elenagvo committed
2716 2717 2718 2719
    if (src.dims <= 2)
        CALL_HAL(minMaxIdx, cv_hal_minMaxIdx, src.data, src.step, src.cols, src.rows, src.depth(), minVal, maxVal,
                 minIdx, maxIdx, mask.data);

2720
    CV_OVX_RUN(!ovx::skipSmallImages<VX_KERNEL_MINMAXLOC>(src.cols, src.rows),
2721
               openvx_minMaxIdx(src, minVal, maxVal, minIdx, maxIdx, mask))
2722

2723
    CV_IPP_RUN_FAST(ipp_minMaxIdx(src, minVal, maxVal, minIdx, maxIdx, mask))
2724

2725
    MinMaxIdxFunc func = getMinmaxTab(depth);
2726
    CV_Assert( func != 0 );
2727

2728 2729 2730
    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
2731

2732
    size_t minidx = 0, maxidx = 0;
2733
    int iminval = INT_MAX, imaxval = INT_MIN;
2734 2735
    float  fminval = std::numeric_limits<float>::infinity(),  fmaxval = -fminval;
    double dminval = std::numeric_limits<double>::infinity(), dmaxval = -dminval;
2736 2737
    size_t startidx = 1;
    int *minval = &iminval, *maxval = &imaxval;
2738
    int planeSize = (int)it.size*cn;
2739

2740 2741 2742 2743
    if( depth == CV_32F )
        minval = (int*)&fminval, maxval = (int*)&fmaxval;
    else if( depth == CV_64F )
        minval = (int*)&dminval, maxval = (int*)&dmaxval;
2744

2745 2746
    for( size_t i = 0; i < it.nplanes; i++, ++it, startidx += planeSize )
        func( ptrs[0], ptrs[1], minval, maxval, &minidx, &maxidx, planeSize, startidx );
2747

2748 2749 2750 2751 2752 2753 2754 2755
    if (!src.empty() && mask.empty())
    {
        if( minidx == 0 )
             minidx = 1;
         if( maxidx == 0 )
             maxidx = 1;
    }

2756
    if( minidx == 0 )
2757 2758 2759 2760 2761
        dminval = dmaxval = 0;
    else if( depth == CV_32F )
        dminval = fminval, dmaxval = fmaxval;
    else if( depth <= CV_32S )
        dminval = iminval, dmaxval = imaxval;
2762

2763
    if( minVal )
2764
        *minVal = dminval;
2765
    if( maxVal )
2766
        *maxVal = dmaxval;
2767

2768
    if( minIdx )
2769
        ofs2idx(src, minidx, minIdx);
2770
    if( maxIdx )
2771
        ofs2idx(src, maxidx, maxIdx);
2772
}
2773

2774
void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
2775
                    Point* minLoc, Point* maxLoc, InputArray mask )
2776
{
2777 2778
    CV_INSTRUMENT_REGION()

Konstantin Matskevich's avatar
Konstantin Matskevich committed
2779
    CV_Assert(_img.dims() <= 2);
2780

2781 2782 2783 2784 2785 2786
    minMaxIdx(_img, minVal, maxVal, (int*)minLoc, (int*)maxLoc, mask);
    if( minLoc )
        std::swap(minLoc->x, minLoc->y);
    if( maxLoc )
        std::swap(maxLoc->x, maxLoc->y);
}
2787

2788 2789 2790 2791
/****************************************************************************************\
*                                         norm                                           *
\****************************************************************************************/

2792
namespace cv
2793 2794
{

2795 2796
template<typename T, typename ST> int
normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
2797
{
2798 2799
    ST result = *_result;
    if( !mask )
2800
    {
2801
        result = std::max(result, normInf<T, ST>(src, len*cn));
2802 2803 2804 2805 2806
    }
    else
    {
        for( int i = 0; i < len; i++, src += cn )
            if( mask[i] )
2807
            {
2808
                for( int k = 0; k < cn; k++ )
2809
                    result = std::max(result, ST(cv_abs(src[k])));
2810
            }
2811 2812 2813 2814
    }
    *_result = result;
    return 0;
}
2815

2816 2817 2818 2819 2820 2821
template<typename T, typename ST> int
normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
{
    ST result = *_result;
    if( !mask )
    {
2822
        result += normL1<T, ST>(src, len*cn);
2823 2824 2825 2826 2827
    }
    else
    {
        for( int i = 0; i < len; i++, src += cn )
            if( mask[i] )
2828
            {
2829
                for( int k = 0; k < cn; k++ )
2830
                    result += cv_abs(src[k]);
2831 2832
            }
    }
2833 2834
    *_result = result;
    return 0;
2835 2836
}

2837 2838
template<typename T, typename ST> int
normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
2839
{
2840 2841
    ST result = *_result;
    if( !mask )
2842
    {
2843
        result += normL2Sqr<T, ST>(src, len*cn);
2844
    }
2845
    else
2846
    {
2847 2848
        for( int i = 0; i < len; i++, src += cn )
            if( mask[i] )
2849
            {
2850
                for( int k = 0; k < cn; k++ )
2851
                {
2852 2853
                    T v = src[k];
                    result += (ST)v*v;
2854
                }
2855 2856
            }
    }
2857 2858
    *_result = result;
    return 0;
2859
}
2860

2861 2862 2863 2864 2865
template<typename T, typename ST> int
normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
{
    ST result = *_result;
    if( !mask )
2866
    {
2867
        result = std::max(result, normInf<T, ST>(src1, src2, len*cn));
2868 2869 2870 2871 2872
    }
    else
    {
        for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
            if( mask[i] )
2873
            {
2874
                for( int k = 0; k < cn; k++ )
2875
                    result = std::max(result, (ST)std::abs(src1[k] - src2[k]));
2876
            }
2877
    }
2878 2879
    *_result = result;
    return 0;
2880 2881
}

2882 2883
template<typename T, typename ST> int
normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
2884
{
2885 2886
    ST result = *_result;
    if( !mask )
2887
    {
2888
        result += normL1<T, ST>(src1, src2, len*cn);
2889 2890 2891 2892 2893
    }
    else
    {
        for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
            if( mask[i] )
2894
            {
2895 2896
                for( int k = 0; k < cn; k++ )
                    result += std::abs(src1[k] - src2[k]);
2897 2898
            }
    }
2899 2900
    *_result = result;
    return 0;
2901 2902
}

2903 2904
template<typename T, typename ST> int
normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
2905
{
2906 2907
    ST result = *_result;
    if( !mask )
2908
    {
2909
        result += normL2Sqr<T, ST>(src1, src2, len*cn);
2910
    }
2911
    else
2912
    {
2913 2914
        for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
            if( mask[i] )
2915
            {
2916
                for( int k = 0; k < cn; k++ )
2917
                {
2918 2919
                    ST v = src1[k] - src2[k];
                    result += v*v;
2920
                }
2921 2922
            }
    }
2923 2924
    *_result = result;
    return 0;
2925
}
2926

Maksim Shabunin's avatar
Maksim Shabunin committed
2927 2928
Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
{
2929
    return cv::hal::normHamming(a, b, size);
Maksim Shabunin's avatar
Maksim Shabunin committed
2930
}
2931 2932

#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
2933
    static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
2934
{ return norm##L##_(src, mask, r, len, cn); } \
2935 2936
    static int normDiff##L##_##suffix(const type* src1, const type* src2, \
    const uchar* mask, ntype* r, int len, int cn) \
2937
{ return normDiff##L##_(src1, src2, mask, r, (int)len, cn); }
2938

2939
#define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
2940 2941 2942
    CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
    CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \
    CV_DEF_NORM_FUNC(L2, suffix, type, l2type)
2943 2944 2945 2946 2947 2948 2949 2950

CV_DEF_NORM_ALL(8u, uchar, int, int, int)
CV_DEF_NORM_ALL(8s, schar, int, int, int)
CV_DEF_NORM_ALL(16u, ushort, int, int, double)
CV_DEF_NORM_ALL(16s, short, int, int, double)
CV_DEF_NORM_ALL(32s, int, int, double, double)
CV_DEF_NORM_ALL(32f, float, float, double, double)
CV_DEF_NORM_ALL(64f, double, double, double, double)
2951

2952

2953
typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int);
2954
typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
2955

2956
static NormFunc getNormFunc(int normType, int depth)
2957
{
2958
    static NormFunc normTab[3][8] =
2959
    {
2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972
        {
            (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
            (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0
        },
        {
            (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s),
            (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0
        },
        {
            (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s),
            (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0
        }
    };
2973

2974 2975 2976 2977
    return normTab[normType][depth];
}

static NormDiffFunc getNormDiffFunc(int normType, int depth)
2978
{
2979
    static NormDiffFunc normDiffTab[3][8] =
2980
    {
2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002
        {
            (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,
            (NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s,
            (NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f),
            (NormDiffFunc)normDiffInf_64f, 0
        },
        {
            (NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s,
            (NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s,
            (NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f),
            (NormDiffFunc)normDiffL1_64f, 0
        },
        {
            (NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s,
            (NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s,
            (NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f),
            (NormDiffFunc)normDiffL2_64f, 0
        }
    };

    return normDiffTab[normType][depth];
}
3003

3004
#ifdef HAVE_OPENCL
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3005

3006
static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
3007
{
3008
    const ocl::Device & d = ocl::Device::getDefault();
3009

3010
#ifdef __ANDROID__
3011 3012 3013
    if (d.isNVidia())
        return false;
#endif
3014 3015 3016 3017
    const int cn = _src.channels();
    if (cn > 4)
        return false;
    int type = _src.type(), depth = CV_MAT_DEPTH(type);
3018
    bool doubleSupport = d.doubleFPConfig() > 0,
3019
            haveMask = _mask.kind() != _InputArray::NONE;
3020

3021
    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
3022
         (!doubleSupport && depth == CV_64F))
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3023 3024 3025 3026 3027 3028
        return false;

    UMat src = _src.getUMat();

    if (normType == NORM_INF)
    {
3029 3030 3031
        if (!ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask,
                           std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U))
            return false;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3032
    }
3033
    else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR)
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3034
    {
3035
        Scalar sc;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3036 3037
        bool unstype = depth == CV_8U || depth == CV_16U;

3038
        if ( !ocl_sum(haveMask ? src : src.reshape(1), sc, normType == NORM_L2 || normType == NORM_L2SQR ?
3039
                    OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS), _mask) )
3040
            return false;
3041 3042

        double s = 0.0;
3043
        for (int i = 0; i < (haveMask ? cn : 1); ++i)
3044 3045
            s += sc[i];

3046
        result = normType == NORM_L1 || normType == NORM_L2SQR ? s : std::sqrt(s);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3047 3048 3049 3050 3051
    }

    return true;
}

3052 3053
#endif

3054 3055
#ifdef HAVE_IPP
static bool ipp_norm(Mat &src, int normType, Mat &mask, double &result)
3056
{
3057 3058
    CV_INSTRUMENT_REGION_IPP()

3059
#if IPP_VERSION_X100 >= 700
3060
    size_t total_size = src.total();
3061 3062 3063
    int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;

    if( (src.dims == 2 || (src.isContinuous() && mask.isContinuous()))
3064
        && cols > 0 && (size_t)rows*cols == total_size )
3065
    {
3066 3067
        if( !mask.empty() )
        {
3068 3069 3070
            IppiSize sz = { cols, rows };
            int type = src.type();

3071
            typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
3072
            ippiMaskNormFuncC1 ippiNorm_C1MR =
3073 3074
                normType == NORM_INF ?
                (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8u_C1MR :
3075
                type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_16u_C1MR :
3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087
                type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_32f_C1MR :
                0) :
            normType == NORM_L1 ?
                (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8u_C1MR :
                type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_32f_C1MR :
                0) :
            normType == NORM_L2 || normType == NORM_L2SQR ?
                (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8u_C1MR :
                type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_32f_C1MR :
                0) : 0;
3088
            if( ippiNorm_C1MR )
3089
            {
3090
                Ipp64f norm;
3091
                if( CV_INSTRUMENT_FUN_IPP(ippiNorm_C1MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
3092
                {
3093 3094
                    result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
                    return true;
3095
                }
3096
            }
3097
            typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
3098
            ippiMaskNormFuncC3 ippiNorm_C3CMR =
3099 3100 3101 3102 3103
                normType == NORM_INF ?
                (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
                type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_16u_C3CMR :
                type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_32f_C3CMR :
                0) :
3104
            normType == NORM_L1 ?
3105 3106 3107 3108
                (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8u_C3CMR :
                type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_16u_C3CMR :
                type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_32f_C3CMR :
                0) :
3109
            normType == NORM_L2 || normType == NORM_L2SQR ?
3110 3111 3112 3113
                (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8u_C3CMR :
                type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_16u_C3CMR :
                type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_32f_C3CMR :
                0) : 0;
3114
            if( ippiNorm_C3CMR )
3115
            {
3116
                Ipp64f norm1, norm2, norm3;
3117 3118 3119
                if( CV_INSTRUMENT_FUN_IPP(ippiNorm_C3CMR, src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiNorm_C3CMR, src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiNorm_C3CMR, src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
3120
                {
3121 3122 3123 3124 3125 3126 3127
                    Ipp64f norm =
                        normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
                        normType == NORM_L1 ? norm1 + norm2 + norm3 :
                        normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
                        0;
                    result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
                    return true;
3128
                }
3129
            }
3130 3131 3132
        }
        else
        {
3133 3134 3135
            IppiSize sz = { cols*src.channels(), rows };
            int type = src.depth();

3136 3137
            typedef IppStatus (CV_STDCALL* ippiNormFuncHint)(const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
            typedef IppStatus (CV_STDCALL* ippiNormFuncNoHint)(const void *, int, IppiSize, Ipp64f *);
3138
            ippiNormFuncHint ippiNormHint =
3139 3140 3141 3142 3143 3144
                normType == NORM_L1 ?
                (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L1_32f_C1R :
                0) :
                normType == NORM_L2 || normType == NORM_L2SQR ?
                (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L2_32f_C1R :
                0) : 0;
3145
            ippiNormFuncNoHint ippiNorm =
3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161
                normType == NORM_INF ?
                (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C1R :
                type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C1R :
                type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C1R :
                type == CV_32FC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C1R :
                0) :
                normType == NORM_L1 ?
                (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C1R :
                type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C1R :
                type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C1R :
                0) :
                normType == NORM_L2 || normType == NORM_L2SQR ?
                (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C1R :
                type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C1R :
                type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C1R :
                0) : 0;
3162
            if( ippiNormHint || ippiNorm )
3163
            {
3164 3165 3166
                Ipp64f norm;
                IppStatus ret = ippiNormHint ? CV_INSTRUMENT_FUN_IPP(ippiNormHint, src.ptr(), (int)src.step[0], sz, &norm, ippAlgHintAccurate) :
                                CV_INSTRUMENT_FUN_IPP(ippiNorm, src.ptr(), (int)src.step[0], sz, &norm);
3167
                if( ret >= 0 )
3168
                {
3169
                    result = (normType == NORM_L2SQR) ? norm * norm : norm;
3170
                    return true;
3171 3172 3173 3174
                }
            }
        }
    }
3175 3176 3177 3178
#else
    CV_UNUSED(src); CV_UNUSED(normType); CV_UNUSED(mask); CV_UNUSED(result);
#endif
    return false;
3179 3180
}
#endif
3181
}
3182 3183 3184

double cv::norm( InputArray _src, int normType, InputArray _mask )
{
3185 3186
    CV_INSTRUMENT_REGION()

3187 3188 3189 3190 3191 3192 3193
    normType &= NORM_TYPE_MASK;
    CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
               normType == NORM_L2 || normType == NORM_L2SQR ||
               ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && _src.type() == CV_8U) );

#if defined HAVE_OPENCL || defined HAVE_IPP
    double _result = 0;
3194 3195
#endif

3196 3197 3198 3199 3200 3201 3202
#ifdef HAVE_OPENCL
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
                ocl_norm(_src, normType, _mask, _result),
                _result)
#endif

    Mat src = _src.getMat(), mask = _mask.getMat();
3203
    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(src, normType, mask, _result), _result);
3204

3205
    int depth = src.depth(), cn = src.channels();
Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3206
    if( src.isContinuous() && mask.empty() )
3207
    {
3208 3209
        size_t len = src.total()*cn;
        if( len == (size_t)(int)len )
3210
        {
Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3211
            if( depth == CV_32F )
3212
            {
Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3213
                const float* data = src.ptr<float>();
3214

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238
                if( normType == NORM_L2 )
                {
                    double result = 0;
                    GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
                    return std::sqrt(result);
                }
                if( normType == NORM_L2SQR )
                {
                    double result = 0;
                    GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
                    return result;
                }
                if( normType == NORM_L1 )
                {
                    double result = 0;
                    GET_OPTIMIZED(normL1_32f)(data, 0, &result, (int)len, 1);
                    return result;
                }
                if( normType == NORM_INF )
                {
                    float result = 0;
                    GET_OPTIMIZED(normInf_32f)(data, 0, &result, (int)len, 1);
                    return result;
                }
3239
            }
Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3240
            if( depth == CV_8U )
3241
            {
Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3242
                const uchar* data = src.ptr<uchar>();
3243

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3244
                if( normType == NORM_HAMMING )
Maksim Shabunin's avatar
Maksim Shabunin committed
3245
                {
3246
                    return hal::normHamming(data, (int)len);
Maksim Shabunin's avatar
Maksim Shabunin committed
3247
                }
3248

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3249
                if( normType == NORM_HAMMING2 )
Maksim Shabunin's avatar
Maksim Shabunin committed
3250
                {
3251
                    return hal::normHamming(data, (int)len, 2);
Maksim Shabunin's avatar
Maksim Shabunin committed
3252
                }
3253
            }
3254 3255
        }
    }
3256

3257
    CV_Assert( mask.empty() || mask.type() == CV_8U );
3258

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3259 3260 3261 3262 3263 3264 3265 3266 3267
    if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
    {
        if( !mask.empty() )
        {
            Mat temp;
            bitwise_and(src, mask, temp);
            return norm(temp, normType);
        }
        int cellSize = normType == NORM_HAMMING ? 1 : 2;
3268

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3269 3270 3271 3272 3273
        const Mat* arrays[] = {&src, 0};
        uchar* ptrs[1];
        NAryMatIterator it(arrays, ptrs);
        int total = (int)it.size;
        int result = 0;
3274

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3275
        for( size_t i = 0; i < it.nplanes; i++, ++it )
Maksim Shabunin's avatar
Maksim Shabunin committed
3276
        {
3277
            result += hal::normHamming(ptrs[0], total, cellSize);
Maksim Shabunin's avatar
Maksim Shabunin committed
3278
        }
3279

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3280 3281
        return result;
    }
3282

3283
    NormFunc func = getNormFunc(normType >> 1, depth);
3284
    CV_Assert( func != 0 );
3285

3286 3287
    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
3288 3289 3290 3291 3292 3293 3294 3295
    union
    {
        double d;
        int i;
        float f;
    }
    result;
    result.d = 0;
3296 3297 3298
    NAryMatIterator it(arrays, ptrs);
    int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
    bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3299
            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
3300
    int isum = 0;
3301
    int *ibuf = &result.i;
3302
    size_t esz = 0;
3303

3304
    if( blockSum )
3305
    {
3306 3307 3308 3309 3310
        intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
        blockSize = std::min(blockSize, intSumBlockSize);
        ibuf = &isum;
        esz = src.elemSize();
    }
3311

3312
    for( size_t i = 0; i < it.nplanes; i++, ++it )
3313
    {
3314
        for( j = 0; j < total; j += blockSize )
3315
        {
3316
            int bsz = std::min(total - j, blockSize);
3317 3318
            func( ptrs[0], ptrs[1], (uchar*)ibuf, bsz, cn );
            count += bsz;
3319 3320
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
3321
                result.d += isum;
3322 3323 3324 3325 3326 3327
                isum = 0;
                count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
                ptrs[1] += bsz;
3328 3329
        }
    }
3330

3331
    if( normType == NORM_INF )
3332
    {
3333 3334 3335
        if( depth == CV_64F )
            ;
        else if( depth == CV_32F )
3336
            result.d = result.f;
3337
        else
3338
            result.d = result.i;
3339
    }
3340
    else if( normType == NORM_L2 )
3341
        result.d = std::sqrt(result.d);
3342

3343
    return result.d;
3344 3345
}

3346 3347
#ifdef HAVE_OPENCL

Ilya Lavrenov's avatar
Ilya Lavrenov committed
3348 3349
namespace cv {

3350
static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3351
{
3352
#ifdef __ANDROID__
3353 3354 3355 3356
    if (ocl::Device::getDefault().isNVidia())
        return false;
#endif

3357
    Scalar sc1, sc2;
3358 3359 3360 3361
    int cn = _src1.channels();
    if (cn > 4)
        return false;
    int type = _src1.type(), depth = CV_MAT_DEPTH(type);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3362
    bool relative = (normType & NORM_RELATIVE) != 0;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3363
    normType &= ~NORM_RELATIVE;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3364
    bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3365

3366 3367 3368 3369 3370
#ifdef __APPLE__
    if(normType == NORM_L1 && type == CV_16UC3 && !_mask.empty())
        return false;
#endif

3371 3372 3373 3374 3375 3376 3377 3378
    if (normsum)
    {
        if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ?
                     OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2))
            return false;
    }
    else
    {
3379 3380
        if (!ocl_minMaxIdx(_src1, NULL, &sc1[0], NULL, NULL, _mask, std::max(CV_32S, depth),
                           false, _src2, relative ? &sc2[0] : NULL))
3381
            return false;
3382
        cn = 1;
3383
    }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3384

3385 3386 3387 3388 3389 3390 3391
    double s2 = 0;
    for (int i = 0; i < cn; ++i)
    {
        result += sc1[i];
        if (relative)
            s2 += sc2[i];
    }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3392

3393 3394 3395 3396 3397 3398
    if (normType == NORM_L2)
    {
        result = std::sqrt(result);
        if (relative)
            s2 = std::sqrt(s2);
    }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3399 3400

    if (relative)
3401
        result /= (s2 + DBL_EPSILON);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3402 3403 3404 3405 3406

    return true;
}

}
3407

3408 3409
#endif

3410
#ifdef HAVE_IPP
3411
namespace cv
3412
{
3413 3414
static bool ipp_norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask, double &result)
{
3415 3416
    CV_INSTRUMENT_REGION_IPP()

3417
#if IPP_VERSION_X100 >= 700
3418
    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
3419

3420 3421
    if( normType & CV_RELATIVE )
    {
3422
        normType &= NORM_TYPE_MASK;
3423

3424
        size_t total_size = src1.total();
3425 3426
        int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
        if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
3427
            && cols > 0 && (size_t)rows*cols == total_size )
3428
        {
3429
            if( !mask.empty() )
3430
            {
3431 3432 3433 3434 3435
                IppiSize sz = { cols, rows };
                int type = src1.type();

                typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
                ippiMaskNormDiffFuncC1 ippiNormRel_C1MR =
3436
                    normType == NORM_INF ?
3437 3438 3439
                    (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_Inf_8u_C1MR :
                    type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_Inf_16u_C1MR :
                    type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_Inf_32f_C1MR :
3440 3441
                    0) :
                    normType == NORM_L1 ?
3442 3443 3444
                    (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L1_8u_C1MR :
                    type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L1_16u_C1MR :
                    type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L1_32f_C1MR :
3445 3446
                    0) :
                    normType == NORM_L2 || normType == NORM_L2SQR ?
3447 3448 3449
                    (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L2_8u_C1MR :
                    type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L2_16u_C1MR :
                    type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormRel_L2_32f_C1MR :
3450
                    0) : 0;
3451
                if( ippiNormRel_C1MR )
3452 3453
                {
                    Ipp64f norm;
3454
                    if( CV_INSTRUMENT_FUN_IPP(ippiNormRel_C1MR, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
3455
                    {
3456
                        result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
3457
                        return true;
3458 3459
                    }
                }
3460 3461 3462
            }
            else
            {
3463 3464 3465
                IppiSize sz = { cols*src1.channels(), rows };
                int type = src1.depth();

3466
                typedef IppStatus (CV_STDCALL* ippiNormRelFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
3467 3468
                typedef IppStatus (CV_STDCALL* ippiNormRelFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
                ippiNormRelFuncHint ippiNormRelHint =
3469
                    normType == NORM_L1 ?
3470
                    (type == CV_32F ? (ippiNormRelFuncHint)ippiNormRel_L1_32f_C1R :
3471 3472
                    0) :
                    normType == NORM_L2 || normType == NORM_L2SQR ?
3473
                    (type == CV_32F ? (ippiNormRelFuncHint)ippiNormRel_L2_32f_C1R :
3474
                    0) : 0;
3475 3476 3477 3478 3479 3480 3481
                ippiNormRelFuncNoHint ippiNormRel =
                    normType == NORM_INF ?
                    (type == CV_8U ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_8u_C1R :
                    type == CV_16U ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16u_C1R :
                    type == CV_16S ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16s_C1R :
                    type == CV_32F ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_32f_C1R :
                    0) :
3482
                    normType == NORM_L1 ?
3483 3484 3485
                    (type == CV_8U ? (ippiNormRelFuncNoHint)ippiNormRel_L1_8u_C1R :
                    type == CV_16U ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16u_C1R :
                    type == CV_16S ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16s_C1R :
3486 3487
                    0) :
                    normType == NORM_L2 || normType == NORM_L2SQR ?
3488 3489 3490
                    (type == CV_8U ? (ippiNormRelFuncNoHint)ippiNormRel_L2_8u_C1R :
                    type == CV_16U ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16u_C1R :
                    type == CV_16S ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16s_C1R :
3491
                    0) : 0;
3492
                if( ippiNormRelHint || ippiNormRel )
3493 3494
                {
                    Ipp64f norm;
3495 3496 3497
                    IppStatus ret = ippiNormRelHint ? CV_INSTRUMENT_FUN_IPP(ippiNormRelHint, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm, ippAlgHintAccurate) :
                                    CV_INSTRUMENT_FUN_IPP(ippiNormRel, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm);
                    if( ret >= 0 )
3498
                    {
3499
                        result = (normType == NORM_L2SQR) ? norm * norm : norm;
3500
                        return true;
3501 3502 3503 3504
                    }
                }
            }
        }
3505
        return false;
3506
    }
3507

3508
    normType &= NORM_TYPE_MASK;
3509 3510 3511 3512

    size_t total_size = src1.total();
    int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
    if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
3513
        && cols > 0 && (size_t)rows*cols == total_size )
3514
    {
3515
        if( !mask.empty() )
3516
        {
3517 3518 3519
            IppiSize sz = { cols, rows };
            int type = src1.type();

3520
            typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
3521
            ippiMaskNormDiffFuncC1 ippiNormDiff_C1MR =
3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536
                normType == NORM_INF ?
                (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8u_C1MR :
                type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_32f_C1MR :
                0) :
                normType == NORM_L1 ?
                (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
                type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
                0) :
                normType == NORM_L2 || normType == NORM_L2SQR ?
                (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8u_C1MR :
                type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_32f_C1MR :
                0) : 0;
3537
            if( ippiNormDiff_C1MR )
3538 3539
            {
                Ipp64f norm;
3540
                if( CV_INSTRUMENT_FUN_IPP(ippiNormDiff_C1MR, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
3541
                {
3542 3543
                    result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
                    return true;
3544
                }
3545 3546
            }
            typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
3547
            ippiMaskNormDiffFuncC3 ippiNormDiff_C3CMR =
3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562
                normType == NORM_INF ?
                (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
                type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_16u_C3CMR :
                type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_32f_C3CMR :
                0) :
                normType == NORM_L1 ?
                (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8u_C3CMR :
                type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_16u_C3CMR :
                type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_32f_C3CMR :
                0) :
                normType == NORM_L2 || normType == NORM_L2SQR ?
                (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8u_C3CMR :
                type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_16u_C3CMR :
                type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_32f_C3CMR :
                0) : 0;
3563
            if( ippiNormDiff_C3CMR )
3564 3565
            {
                Ipp64f norm1, norm2, norm3;
3566 3567 3568
                if( CV_INSTRUMENT_FUN_IPP(ippiNormDiff_C3CMR, src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiNormDiff_C3CMR, src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiNormDiff_C3CMR, src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
3569
                {
3570 3571 3572 3573 3574 3575 3576
                    Ipp64f norm =
                        normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
                        normType == NORM_L1 ? norm1 + norm2 + norm3 :
                        normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
                        0;
                    result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
                    return true;
3577 3578
                }
            }
3579 3580 3581
        }
        else
        {
3582 3583 3584
            IppiSize sz = { cols*src1.channels(), rows };
            int type = src1.depth();

3585 3586
            typedef IppStatus (CV_STDCALL* ippiNormDiffFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
            typedef IppStatus (CV_STDCALL* ippiNormDiffFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
3587
            ippiNormDiffFuncHint ippiNormDiffHint =
3588
                normType == NORM_L1 ?
3589
                (type == CV_32F ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C1R :
3590 3591
                0) :
                normType == NORM_L2 || normType == NORM_L2SQR ?
3592
                (type == CV_32F ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C1R :
3593
                0) : 0;
3594
            ippiNormDiffFuncNoHint ippiNormDiff =
3595
                normType == NORM_INF ?
3596 3597 3598 3599
                (type == CV_8U ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C1R :
                type == CV_16U ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C1R :
                type == CV_16S ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C1R :
                type == CV_32F ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C1R :
3600 3601
                0) :
                normType == NORM_L1 ?
3602 3603 3604
                (type == CV_8U ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C1R :
                type == CV_16U ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C1R :
                type == CV_16S ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C1R :
3605 3606
                0) :
                normType == NORM_L2 || normType == NORM_L2SQR ?
3607 3608 3609
                (type == CV_8U ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C1R :
                type == CV_16U ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C1R :
                type == CV_16S ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C1R :
3610
                0) : 0;
3611
            if( ippiNormDiffHint || ippiNormDiff )
3612
            {
3613 3614 3615
                Ipp64f norm;
                IppStatus ret = ippiNormDiffHint ? CV_INSTRUMENT_FUN_IPP(ippiNormDiffHint, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm, ippAlgHintAccurate) :
                                CV_INSTRUMENT_FUN_IPP(ippiNormDiff, src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm);
3616
                if( ret >= 0 )
3617
                {
3618
                    result = (normType == NORM_L2SQR) ? norm * norm : norm;
3619
                    return true;
3620 3621 3622 3623
                }
            }
        }
    }
3624 3625
#else
    CV_UNUSED(_src1); CV_UNUSED(_src2); CV_UNUSED(normType); CV_UNUSED(_mask); CV_UNUSED(result);
3626
#endif
3627 3628
    return false;
}
3629
}
3630
#endif
3631 3632 3633 3634


double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
{
3635 3636
    CV_INSTRUMENT_REGION()

3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648
    CV_Assert( _src1.sameSize(_src2) && _src1.type() == _src2.type() );

#if defined HAVE_OPENCL || defined HAVE_IPP
    double _result = 0;
#endif

#ifdef HAVE_OPENCL
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src1.isUMat()),
                ocl_norm(_src1, _src2, normType, _mask, _result),
                _result)
#endif

3649
    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(_src1, _src2, normType, _mask, _result), _result);
3650

3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662
    if( normType & CV_RELATIVE )
    {
        return norm(_src1, _src2, normType & ~CV_RELATIVE, _mask)/(norm(_src2, normType, _mask) + DBL_EPSILON);
    }

    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
    int depth = src1.depth(), cn = src1.channels();

    normType &= 7;
    CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
               normType == NORM_L2 || normType == NORM_L2SQR ||
              ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
3663

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3664
    if( src1.isContinuous() && src2.isContinuous() && mask.empty() )
3665
    {
3666 3667
        size_t len = src1.total()*src1.channels();
        if( len == (size_t)(int)len )
3668
        {
Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3669
            if( src1.depth() == CV_32F )
3670
            {
Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3671 3672
                const float* data1 = src1.ptr<float>();
                const float* data2 = src2.ptr<float>();
3673

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697
                if( normType == NORM_L2 )
                {
                    double result = 0;
                    GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
                    return std::sqrt(result);
                }
                if( normType == NORM_L2SQR )
                {
                    double result = 0;
                    GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
                    return result;
                }
                if( normType == NORM_L1 )
                {
                    double result = 0;
                    GET_OPTIMIZED(normDiffL1_32f)(data1, data2, 0, &result, (int)len, 1);
                    return result;
                }
                if( normType == NORM_INF )
                {
                    float result = 0;
                    GET_OPTIMIZED(normDiffInf_32f)(data1, data2, 0, &result, (int)len, 1);
                    return result;
                }
3698
            }
3699 3700
        }
    }
3701

3702
    CV_Assert( mask.empty() || mask.type() == CV_8U );
3703

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3704 3705 3706 3707 3708 3709 3710 3711 3712 3713
    if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
    {
        if( !mask.empty() )
        {
            Mat temp;
            bitwise_xor(src1, src2, temp);
            bitwise_and(temp, mask, temp);
            return norm(temp, normType);
        }
        int cellSize = normType == NORM_HAMMING ? 1 : 2;
3714

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3715 3716 3717 3718 3719
        const Mat* arrays[] = {&src1, &src2, 0};
        uchar* ptrs[2];
        NAryMatIterator it(arrays, ptrs);
        int total = (int)it.size;
        int result = 0;
3720

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3721
        for( size_t i = 0; i < it.nplanes; i++, ++it )
Maksim Shabunin's avatar
Maksim Shabunin committed
3722
        {
3723
            result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize);
Maksim Shabunin's avatar
Maksim Shabunin committed
3724
        }
3725

Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3726 3727
        return result;
    }
3728

3729
    NormDiffFunc func = getNormDiffFunc(normType >> 1, depth);
3730
    CV_Assert( func != 0 );
3731

3732 3733
    const Mat* arrays[] = {&src1, &src2, &mask, 0};
    uchar* ptrs[3];
3734 3735 3736 3737 3738 3739 3740 3741 3742
    union
    {
        double d;
        float f;
        int i;
        unsigned u;
    }
    result;
    result.d = 0;
3743 3744 3745
    NAryMatIterator it(arrays, ptrs);
    int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
    bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
Vadim Pisarevsky's avatar
Vadim Pisarevsky committed
3746
            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
3747 3748
    unsigned isum = 0;
    unsigned *ibuf = &result.u;
3749
    size_t esz = 0;
3750

3751
    if( blockSum )
3752
    {
3753 3754 3755 3756
        intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
        blockSize = std::min(blockSize, intSumBlockSize);
        ibuf = &isum;
        esz = src1.elemSize();
3757
    }
3758

3759
    for( size_t i = 0; i < it.nplanes; i++, ++it )
3760
    {
3761
        for( j = 0; j < total; j += blockSize )
3762
        {
3763 3764 3765 3766 3767
            int bsz = std::min(total - j, blockSize);
            func( ptrs[0], ptrs[1], ptrs[2], (uchar*)ibuf, bsz, cn );
            count += bsz;
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
3768
                result.d += isum;
3769 3770 3771 3772 3773 3774 3775
                isum = 0;
                count = 0;
            }
            ptrs[0] += bsz*esz;
            ptrs[1] += bsz*esz;
            if( ptrs[2] )
                ptrs[2] += bsz;
3776 3777
        }
    }
3778

3779
    if( normType == NORM_INF )
3780
    {
3781 3782 3783
        if( depth == CV_64F )
            ;
        else if( depth == CV_32F )
3784
            result.d = result.f;
3785
        else
3786
            result.d = result.u;
3787
    }
3788
    else if( normType == NORM_L2 )
3789
        result.d = std::sqrt(result.d);
3790

3791
    return result.d;
3792 3793 3794
}


3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860
///////////////////////////////////// batch distance ///////////////////////////////////////

namespace cv
{

template<typename _Tp, typename _Rt>
void batchDistL1_(const _Tp* src1, const _Tp* src2, size_t step2,
                  int nvecs, int len, _Rt* dist, const uchar* mask)
{
    step2 /= sizeof(src2[0]);
    if( !mask )
    {
        for( int i = 0; i < nvecs; i++ )
            dist[i] = normL1<_Tp, _Rt>(src1, src2 + step2*i, len);
    }
    else
    {
        _Rt val0 = std::numeric_limits<_Rt>::max();
        for( int i = 0; i < nvecs; i++ )
            dist[i] = mask[i] ? normL1<_Tp, _Rt>(src1, src2 + step2*i, len) : val0;
    }
}

template<typename _Tp, typename _Rt>
void batchDistL2Sqr_(const _Tp* src1, const _Tp* src2, size_t step2,
                     int nvecs, int len, _Rt* dist, const uchar* mask)
{
    step2 /= sizeof(src2[0]);
    if( !mask )
    {
        for( int i = 0; i < nvecs; i++ )
            dist[i] = normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len);
    }
    else
    {
        _Rt val0 = std::numeric_limits<_Rt>::max();
        for( int i = 0; i < nvecs; i++ )
            dist[i] = mask[i] ? normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len) : val0;
    }
}

template<typename _Tp, typename _Rt>
void batchDistL2_(const _Tp* src1, const _Tp* src2, size_t step2,
                  int nvecs, int len, _Rt* dist, const uchar* mask)
{
    step2 /= sizeof(src2[0]);
    if( !mask )
    {
        for( int i = 0; i < nvecs; i++ )
            dist[i] = std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len));
    }
    else
    {
        _Rt val0 = std::numeric_limits<_Rt>::max();
        for( int i = 0; i < nvecs; i++ )
            dist[i] = mask[i] ? std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len)) : val0;
    }
}

static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
                             int nvecs, int len, int* dist, const uchar* mask)
{
    step2 /= sizeof(src2[0]);
    if( !mask )
    {
        for( int i = 0; i < nvecs; i++ )
3861
             dist[i] = hal::normHamming(src1, src2 + step2*i, len);
3862 3863 3864 3865 3866
    }
    else
    {
        int val0 = INT_MAX;
        for( int i = 0; i < nvecs; i++ )
Maksim Shabunin's avatar
Maksim Shabunin committed
3867 3868
        {
            if (mask[i])
3869
                dist[i] = hal::normHamming(src1, src2 + step2*i, len);
Maksim Shabunin's avatar
Maksim Shabunin committed
3870 3871 3872
            else
                dist[i] = val0;
        }
3873 3874 3875 3876 3877 3878 3879 3880 3881 3882
    }
}

static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2,
                              int nvecs, int len, int* dist, const uchar* mask)
{
    step2 /= sizeof(src2[0]);
    if( !mask )
    {
        for( int i = 0; i < nvecs; i++ )
3883
            dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
3884 3885 3886 3887 3888
    }
    else
    {
        int val0 = INT_MAX;
        for( int i = 0; i < nvecs; i++ )
Maksim Shabunin's avatar
Maksim Shabunin committed
3889 3890
        {
            if (mask[i])
3891
                dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
Maksim Shabunin's avatar
Maksim Shabunin committed
3892 3893 3894
            else
                dist[i] = val0;
        }
3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948
    }
}

static void batchDistL1_8u32s(const uchar* src1, const uchar* src2, size_t step2,
                               int nvecs, int len, int* dist, const uchar* mask)
{
    batchDistL1_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
}

static void batchDistL1_8u32f(const uchar* src1, const uchar* src2, size_t step2,
                               int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL1_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
}

static void batchDistL2Sqr_8u32s(const uchar* src1, const uchar* src2, size_t step2,
                                  int nvecs, int len, int* dist, const uchar* mask)
{
    batchDistL2Sqr_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
}

static void batchDistL2Sqr_8u32f(const uchar* src1, const uchar* src2, size_t step2,
                                  int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL2Sqr_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
}

static void batchDistL2_8u32f(const uchar* src1, const uchar* src2, size_t step2,
                               int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL2_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
}

static void batchDistL1_32f(const float* src1, const float* src2, size_t step2,
                             int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL1_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
}

static void batchDistL2Sqr_32f(const float* src1, const float* src2, size_t step2,
                                int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL2Sqr_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
}

static void batchDistL2_32f(const float* src1, const float* src2, size_t step2,
                             int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL2_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
}

typedef void (*BatchDistFunc)(const uchar* src1, const uchar* src2, size_t step2,
                              int nvecs, int len, uchar* dist, const uchar* mask);

3949

3950
struct BatchDistInvoker : public ParallelLoopBody
3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965
{
    BatchDistInvoker( const Mat& _src1, const Mat& _src2,
                      Mat& _dist, Mat& _nidx, int _K,
                      const Mat& _mask, int _update,
                      BatchDistFunc _func)
    {
        src1 = &_src1;
        src2 = &_src2;
        dist = &_dist;
        nidx = &_nidx;
        K = _K;
        mask = &_mask;
        update = _update;
        func = _func;
    }
3966

3967
    void operator()(const Range& range) const
3968 3969 3970
    {
        AutoBuffer<int> buf(src2->rows);
        int* bufptr = buf;
3971

3972
        for( int i = range.start; i < range.end; i++ )
3973 3974 3975
        {
            func(src1->ptr(i), src2->ptr(), src2->step, src2->rows, src2->cols,
                 K > 0 ? (uchar*)bufptr : dist->ptr(i), mask->data ? mask->ptr(i) : 0);
3976

3977 3978 3979 3980 3981 3982
            if( K > 0 )
            {
                int* nidxptr = nidx->ptr<int>(i);
                // since positive float's can be compared just like int's,
                // we handle both CV_32S and CV_32F cases with a single branch
                int* distptr = (int*)dist->ptr(i);
3983

3984
                int j, k;
3985

3986 3987 3988
                for( j = 0; j < src2->rows; j++ )
                {
                    int d = bufptr[j];
3989
                    if( d < distptr[K-1] )
3990
                    {
3991
                        for( k = K-2; k >= 0 && distptr[k] > d; k-- )
3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002
                        {
                            nidxptr[k+1] = nidxptr[k];
                            distptr[k+1] = distptr[k];
                        }
                        nidxptr[k+1] = j + update;
                        distptr[k+1] = d;
                    }
                }
            }
        }
    }
4003

4004 4005 4006 4007 4008 4009 4010 4011 4012
    const Mat *src1;
    const Mat *src2;
    Mat *dist;
    Mat *nidx;
    const Mat *mask;
    int K;
    int update;
    BatchDistFunc func;
};
4013

4014
}
4015

4016 4017 4018 4019 4020
void cv::batchDistance( InputArray _src1, InputArray _src2,
                        OutputArray _dist, int dtype, OutputArray _nidx,
                        int normType, int K, InputArray _mask,
                        int update, bool crosscheck )
{
4021 4022
    CV_INSTRUMENT_REGION()

4023 4024 4025 4026 4027
    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
    int type = src1.type();
    CV_Assert( type == src2.type() && src1.cols == src2.cols &&
               (type == CV_32F || type == CV_8U));
    CV_Assert( _nidx.needed() == (K > 0) );
4028

4029 4030 4031 4032 4033 4034 4035
    if( dtype == -1 )
    {
        dtype = normType == NORM_HAMMING || normType == NORM_HAMMING2 ? CV_32S : CV_32F;
    }
    CV_Assert( (type == CV_8U && dtype == CV_32S) || dtype == CV_32F);

    K = std::min(K, src2.rows);
4036

4037 4038 4039 4040 4041 4042 4043
    _dist.create(src1.rows, (K > 0 ? K : src2.rows), dtype);
    Mat dist = _dist.getMat(), nidx;
    if( _nidx.needed() )
    {
        _nidx.create(dist.size(), CV_32S);
        nidx = _nidx.getMat();
    }
4044

4045 4046 4047 4048 4049
    if( update == 0 && K > 0 )
    {
        dist = Scalar::all(dtype == CV_32S ? (double)INT_MAX : (double)FLT_MAX);
        nidx = Scalar::all(-1);
    }
4050

4051 4052 4053 4054 4055
    if( crosscheck )
    {
        CV_Assert( K == 1 && update == 0 && mask.empty() );
        Mat tdist, tidx;
        batchDistance(src2, src1, tdist, dtype, tidx, normType, K, mask, 0, false);
4056

4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070
        // if an idx-th element from src1 appeared to be the nearest to i-th element of src2,
        // we update the minimum mutual distance between idx-th element of src1 and the whole src2 set.
        // As a result, if nidx[idx] = i*, it means that idx-th element of src1 is the nearest
        // to i*-th element of src2 and i*-th element of src2 is the closest to idx-th element of src1.
        // If nidx[idx] = -1, it means that there is no such ideal couple for it in src2.
        // This O(N) procedure is called cross-check and it helps to eliminate some false matches.
        if( dtype == CV_32S )
        {
            for( int i = 0; i < tdist.rows; i++ )
            {
                int idx = tidx.at<int>(i);
                int d = tdist.at<int>(i), d0 = dist.at<int>(idx);
                if( d < d0 )
                {
4071
                    dist.at<int>(idx) = d;
4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083
                    nidx.at<int>(idx) = i + update;
                }
            }
        }
        else
        {
            for( int i = 0; i < tdist.rows; i++ )
            {
                int idx = tidx.at<int>(i);
                float d = tdist.at<float>(i), d0 = dist.at<float>(idx);
                if( d < d0 )
                {
4084
                    dist.at<float>(idx) = d;
4085 4086 4087 4088 4089 4090
                    nidx.at<int>(idx) = i + update;
                }
            }
        }
        return;
    }
4091

4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118
    BatchDistFunc func = 0;
    if( type == CV_8U )
    {
        if( normType == NORM_L1 && dtype == CV_32S )
            func = (BatchDistFunc)batchDistL1_8u32s;
        else if( normType == NORM_L1 && dtype == CV_32F )
            func = (BatchDistFunc)batchDistL1_8u32f;
        else if( normType == NORM_L2SQR && dtype == CV_32S )
            func = (BatchDistFunc)batchDistL2Sqr_8u32s;
        else if( normType == NORM_L2SQR && dtype == CV_32F )
            func = (BatchDistFunc)batchDistL2Sqr_8u32f;
        else if( normType == NORM_L2 && dtype == CV_32F )
            func = (BatchDistFunc)batchDistL2_8u32f;
        else if( normType == NORM_HAMMING && dtype == CV_32S )
            func = (BatchDistFunc)batchDistHamming;
        else if( normType == NORM_HAMMING2 && dtype == CV_32S )
            func = (BatchDistFunc)batchDistHamming2;
    }
    else if( type == CV_32F && dtype == CV_32F )
    {
        if( normType == NORM_L1 )
            func = (BatchDistFunc)batchDistL1_32f;
        else if( normType == NORM_L2SQR )
            func = (BatchDistFunc)batchDistL2Sqr_32f;
        else if( normType == NORM_L2 )
            func = (BatchDistFunc)batchDistL2_32f;
    }
4119

4120 4121 4122 4123
    if( func == 0 )
        CV_Error_(CV_StsUnsupportedFormat,
                  ("The combination of type=%d, dtype=%d and normType=%d is not supported",
                   type, dtype, normType));
4124

4125 4126
    parallel_for_(Range(0, src1.rows),
                  BatchDistInvoker(src1, src2, dist, nidx, K, mask, update, func));
4127 4128 4129
}


4130 4131
void cv::findNonZero( InputArray _src, OutputArray _idx )
{
4132 4133
    CV_INSTRUMENT_REGION()

4134 4135 4136
    Mat src = _src.getMat();
    CV_Assert( src.type() == CV_8UC1 );
    int n = countNonZero(src);
4137 4138 4139 4140 4141
    if( n == 0 )
    {
        _idx.release();
        return;
    }
4142 4143 4144 4145 4146
    if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() )
        _idx.release();
    _idx.create(n, 1, CV_32SC2);
    Mat idx = _idx.getMat();
    CV_Assert(idx.isContinuous());
4147
    Point* idx_ptr = idx.ptr<Point>();
4148

4149 4150 4151 4152 4153 4154 4155 4156 4157
    for( int i = 0; i < src.rows; i++ )
    {
        const uchar* bin_ptr = src.ptr(i);
        for( int j = 0; j < src.cols; j++ )
            if( bin_ptr[j] )
                *idx_ptr++ = Point(j, i);
    }
}

4158 4159
double cv::PSNR(InputArray _src1, InputArray _src2)
{
4160 4161
    CV_INSTRUMENT_REGION()

4162
    //Input arrays must have depth CV_8U
4163 4164
    CV_Assert( _src1.depth() == CV_8U && _src2.depth() == CV_8U );

4165
    double diff = std::sqrt(norm(_src1, _src2, NORM_L2SQR)/(_src1.total()*_src1.channels()));
4166 4167 4168
    return 20*log10(255./(diff+DBL_EPSILON));
}

4169

4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251
CV_IMPL CvScalar cvSum( const CvArr* srcarr )
{
    cv::Scalar sum = cv::sum(cv::cvarrToMat(srcarr, false, true, 1));
    if( CV_IS_IMAGE(srcarr) )
    {
        int coi = cvGetImageCOI((IplImage*)srcarr);
        if( coi )
        {
            CV_Assert( 0 < coi && coi <= 4 );
            sum = cv::Scalar(sum[coi-1]);
        }
    }
    return sum;
}

CV_IMPL int cvCountNonZero( const CvArr* imgarr )
{
    cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
    if( img.channels() > 1 )
        cv::extractImageCOI(imgarr, img);
    return countNonZero(img);
}


CV_IMPL  CvScalar
cvAvg( const void* imgarr, const void* maskarr )
{
    cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
    cv::Scalar mean = !maskarr ? cv::mean(img) : cv::mean(img, cv::cvarrToMat(maskarr));
    if( CV_IS_IMAGE(imgarr) )
    {
        int coi = cvGetImageCOI((IplImage*)imgarr);
        if( coi )
        {
            CV_Assert( 0 < coi && coi <= 4 );
            mean = cv::Scalar(mean[coi-1]);
        }
    }
    return mean;
}


CV_IMPL  void
cvAvgSdv( const CvArr* imgarr, CvScalar* _mean, CvScalar* _sdv, const void* maskarr )
{
    cv::Scalar mean, sdv;

    cv::Mat mask;
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);

    cv::meanStdDev(cv::cvarrToMat(imgarr, false, true, 1), mean, sdv, mask );

    if( CV_IS_IMAGE(imgarr) )
    {
        int coi = cvGetImageCOI((IplImage*)imgarr);
        if( coi )
        {
            CV_Assert( 0 < coi && coi <= 4 );
            mean = cv::Scalar(mean[coi-1]);
            sdv = cv::Scalar(sdv[coi-1]);
        }
    }

    if( _mean )
        *(cv::Scalar*)_mean = mean;
    if( _sdv )
        *(cv::Scalar*)_sdv = sdv;
}


CV_IMPL void
cvMinMaxLoc( const void* imgarr, double* _minVal, double* _maxVal,
             CvPoint* _minLoc, CvPoint* _maxLoc, const void* maskarr )
{
    cv::Mat mask, img = cv::cvarrToMat(imgarr, false, true, 1);
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    if( img.channels() > 1 )
        cv::extractImageCOI(imgarr, img);

    cv::minMaxLoc( img, _minVal, _maxVal,
4252
                   (cv::Point*)_minLoc, (cv::Point*)_maxLoc, mask );
4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280
}


CV_IMPL  double
cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
{
    cv::Mat a, mask;
    if( !imgA )
    {
        imgA = imgB;
        imgB = 0;
    }

    a = cv::cvarrToMat(imgA, false, true, 1);
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);

    if( a.channels() > 1 && CV_IS_IMAGE(imgA) && cvGetImageCOI((const IplImage*)imgA) > 0 )
        cv::extractImageCOI(imgA, a);

    if( !imgB )
        return !maskarr ? cv::norm(a, normType) : cv::norm(a, normType, mask);

    cv::Mat b = cv::cvarrToMat(imgB, false, true, 1);
    if( b.channels() > 1 && CV_IS_IMAGE(imgB) && cvGetImageCOI((const IplImage*)imgB) > 0 )
        cv::extractImageCOI(imgB, b);

    return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
4281
}
4282 4283 4284

namespace cv { namespace hal {

4285
extern const uchar popCountTable[256] =
4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

static const uchar popCountTable2[] =
{
    0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
};

static const uchar popCountTable4[] =
{
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};


int normHamming(const uchar* a, int n, int cellSize)
{
    if( cellSize == 1 )
        return normHamming(a, n);
    const uchar* tab = 0;
    if( cellSize == 2 )
        tab = popCountTable2;
    else if( cellSize == 4 )
        tab = popCountTable4;
    else
        return -1;
    int i = 0;
    int result = 0;
#if CV_ENABLE_UNROLLED
    for( ; i <= n - 4; i += 4 )
        result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
#endif
    for( ; i < n; i++ )
        result += tab[a[i]];
    return result;
}

int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
{
    if( cellSize == 1 )
        return normHamming(a, b, n);
    const uchar* tab = 0;
    if( cellSize == 2 )
        tab = popCountTable2;
    else if( cellSize == 4 )
        tab = popCountTable4;
    else
        return -1;
    int i = 0;
    int result = 0;
4357
#if CV_ENABLE_UNROLLED
4358 4359 4360
    for( ; i <= n - 4; i += 4 )
        result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
                tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
4361
#endif
4362 4363 4364 4365 4366 4367 4368 4369
    for( ; i < n; i++ )
        result += tab[a[i] ^ b[i]];
    return result;
}

float normL2Sqr_(const float* a, const float* b, int n)
{
    int j = 0; float d = 0.f;
4370 4371 4372 4373 4374 4375 4376
#if CV_AVX2
    float CV_DECL_ALIGNED(32) buf[8];
    __m256 d0 = _mm256_setzero_ps();

    for( ; j <= n - 8; j += 8 )
    {
        __m256 t0 = _mm256_sub_ps(_mm256_loadu_ps(a + j), _mm256_loadu_ps(b + j));
4377
#if CV_FMA3
4378 4379 4380 4381 4382 4383 4384 4385
        d0 = _mm256_fmadd_ps(t0, t0, d0);
#else
        d0 = _mm256_add_ps(d0, _mm256_mul_ps(t0, t0));
#endif
    }
    _mm256_store_ps(buf, d0);
    d = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
#elif CV_SSE
4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504
    float CV_DECL_ALIGNED(16) buf[4];
    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();

    for( ; j <= n - 8; j += 8 )
    {
        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
        d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
        d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
    }
    _mm_store_ps(buf, _mm_add_ps(d0, d1));
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
        }
    }

    for( ; j < n; j++ )
    {
        float t = a[j] - b[j];
        d += t*t;
    }
    return d;
}


float normL1_(const float* a, const float* b, int n)
{
    int j = 0; float d = 0.f;
#if CV_SSE
    float CV_DECL_ALIGNED(16) buf[4];
    static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
    __m128 absmask = _mm_load_ps((const float*)absbuf);

    for( ; j <= n - 8; j += 8 )
    {
        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
        d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
        d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
    }
    _mm_store_ps(buf, _mm_add_ps(d0, d1));
    d = buf[0] + buf[1] + buf[2] + buf[3];
#elif CV_NEON
    float32x4_t v_sum = vdupq_n_f32(0.0f);
    for ( ; j <= n - 4; j += 4)
        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));

    float CV_DECL_ALIGNED(16) buf[4];
    vst1q_f32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }

    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}

int normL1_(const uchar* a, const uchar* b, int n)
{
    int j = 0, d = 0;
#if CV_SSE
    __m128i d0 = _mm_setzero_si128();

    for( ; j <= n - 16; j += 16 )
    {
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }

    for( ; j <= n - 4; j += 4 )
    {
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0.0f);
    for ( ; j <= n - 16; j += 16)
    {
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}

}} //cv::hal