matmul.cpp 121 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
Ilya Lavrenov's avatar
Ilya Lavrenov committed
15
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

44
#include <sstream>
45
#include "precomp.hpp"
Alexander Alekhin's avatar
Alexander Alekhin committed
46
#include "opencl_kernels_core.hpp"
Ilya Lavrenov's avatar
Ilya Lavrenov committed
47
#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
48 49
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
#include "intel_gpu_gemm.inl.hpp"
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67

namespace cv
{

/****************************************************************************************\
*                                         GEMM                                           *
\****************************************************************************************/

static void
GEMM_CopyBlock( const uchar* src, size_t src_step,
                uchar* dst, size_t dst_step,
                Size size, size_t pix_size )
{
    int j;
    size.width *= (int)(pix_size / sizeof(int));

    for( ; size.height--; src += src_step, dst += dst_step )
    {
Andrey Kamaev's avatar
Andrey Kamaev committed
68
        j=0;
Victoria Zhislina's avatar
Victoria Zhislina committed
69 70
         #if CV_ENABLE_UNROLLED
        for( ; j <= size.width - 4; j += 4 )
71 72 73 74 75 76 77 78 79 80
        {
            int t0 = ((const int*)src)[j];
            int t1 = ((const int*)src)[j+1];
            ((int*)dst)[j] = t0;
            ((int*)dst)[j+1] = t1;
            t0 = ((const int*)src)[j+2];
            t1 = ((const int*)src)[j+3];
            ((int*)dst)[j+2] = t0;
            ((int*)dst)[j+3] = t1;
        }
Victoria Zhislina's avatar
Victoria Zhislina committed
81
        #endif
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
        for( ; j < size.width; j++ )
            ((int*)dst)[j] = ((const int*)src)[j];
    }
}


static void
GEMM_TransposeBlock( const uchar* src, size_t src_step,
                     uchar* dst, size_t dst_step,
                     Size size, size_t pix_size )
{
    int i, j;
    for( i = 0; i < size.width; i++, dst += dst_step, src += pix_size )
    {
        const uchar* _src = src;
        switch( pix_size )
        {
        case sizeof(int):
            for( j = 0; j < size.height; j++, _src += src_step )
                ((int*)dst)[j] = ((int*)_src)[0];
            break;
        case sizeof(int)*2:
            for( j = 0; j < size.height*2; j += 2, _src += src_step )
            {
                int t0 = ((int*)_src)[0];
                int t1 = ((int*)_src)[1];
                ((int*)dst)[j] = t0;
                ((int*)dst)[j+1] = t1;
            }
            break;
        case sizeof(int)*4:
            for( j = 0; j < size.height*4; j += 4, _src += src_step )
            {
                int t0 = ((int*)_src)[0];
                int t1 = ((int*)_src)[1];
                ((int*)dst)[j] = t0;
                ((int*)dst)[j+1] = t1;
                t0 = ((int*)_src)[2];
                t1 = ((int*)_src)[3];
                ((int*)dst)[j+2] = t0;
                ((int*)dst)[j+3] = t1;
            }
            break;
        default:
            assert(0);
            return;
        }
    }
}


template<typename T, typename WT> static void
GEMMSingleMul( const T* a_data, size_t a_step,
               const T* b_data, size_t b_step,
               const T* c_data, size_t c_step,
               T* d_data, size_t d_step,
               Size a_size, Size d_size,
               double alpha, double beta, int flags )
{
    int i, j, k, n = a_size.width, m = d_size.width, drows = d_size.height;
    const T *_a_data = a_data, *_b_data = b_data, *_c_data = c_data;
143
    cv::AutoBuffer<T> _a_buf;
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
    T* a_buf = 0;
    size_t a_step0, a_step1, c_step0, c_step1, t_step;

    a_step /= sizeof(a_data[0]);
    b_step /= sizeof(b_data[0]);
    c_step /= sizeof(c_data[0]);
    d_step /= sizeof(d_data[0]);
    a_step0 = a_step;
    a_step1 = 1;

    if( !c_data )
        c_step0 = c_step1 = 0;
    else if( !(flags & GEMM_3_T) )
        c_step0 = c_step, c_step1 = 1;
    else
        c_step0 = 1, c_step1 = c_step;

    if( flags & GEMM_1_T )
    {
        CV_SWAP( a_step0, a_step1, t_step );
        n = a_size.height;
        if( a_step > 1 && n > 1 )
166 167
        {
            _a_buf.allocate(n);
168
            a_buf = _a_buf.data();
169
        }
170 171 172 173
    }

    if( n == 1 ) /* external product */
    {
174
        cv::AutoBuffer<T> _b_buf;
175 176 177 178
        T* b_buf = 0;

        if( a_step > 1 && a_size.height > 1 )
        {
179
            _a_buf.allocate(drows);
180
            a_buf = _a_buf.data();
181 182 183 184 185 186 187
            for( k = 0; k < drows; k++ )
                a_buf[k] = a_data[a_step*k];
            a_data = a_buf;
        }

        if( b_step > 1 )
        {
188
            _b_buf.allocate(d_size.width);
189
            b_buf = _b_buf.data();
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
            for( j = 0; j < d_size.width; j++ )
                b_buf[j] = b_data[j*b_step];
            b_data = b_buf;
        }

        for( i = 0; i < drows; i++, _c_data += c_step0, d_data += d_step )
        {
            WT al = WT(a_data[i])*alpha;
            c_data = _c_data;
            for( j = 0; j <= d_size.width - 2; j += 2, c_data += 2*c_step1 )
            {
                WT s0 = al*WT(b_data[j]);
                WT s1 = al*WT(b_data[j+1]);
                if( !c_data )
                {
                    d_data[j] = T(s0);
                    d_data[j+1] = T(s1);
                }
                else
                {
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
                    d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta);
                }
            }

            for( ; j < d_size.width; j++, c_data += c_step1 )
            {
                WT s0 = al*WT(b_data[j]);
                if( !c_data )
                    d_data[j] = T(s0);
                else
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
            }
        }
    }
    else if( flags & GEMM_2_T ) /* A * Bt */
    {
        for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step )
        {
            a_data = _a_data;
            b_data = _b_data;
            c_data = _c_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j < d_size.width; j++, b_data += b_step,
                                               c_data += c_step1 )
            {
                WT s0(0), s1(0), s2(0), s3(0);
Victoria Zhislina's avatar
Victoria Zhislina committed
244 245 246
                k = 0;
                 #if CV_ENABLE_UNROLLED
                for( ; k <= n - 4; k += 4 )
247 248 249 250 251 252
                {
                    s0 += WT(a_data[k])*WT(b_data[k]);
                    s1 += WT(a_data[k+1])*WT(b_data[k+1]);
                    s2 += WT(a_data[k+2])*WT(b_data[k+2]);
                    s3 += WT(a_data[k+3])*WT(b_data[k+3]);
                }
Victoria Zhislina's avatar
Victoria Zhislina committed
253
                #endif
254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327
                for( ; k < n; k++ )
                    s0 += WT(a_data[k])*WT(b_data[k]);
                s0 = (s0+s1+s2+s3)*alpha;

                if( !c_data )
                    d_data[j] = T(s0);
                else
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
            }
        }
    }
    else if( d_size.width*sizeof(d_data[0]) <= 1600 )
    {
        for( i = 0; i < drows; i++, _a_data += a_step0,
                                    _c_data += c_step0,
                                    d_data += d_step )
        {
            a_data = _a_data, c_data = _c_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j <= m - 4; j += 4, c_data += 4*c_step1 )
            {
                const T* b = _b_data + j;
                WT s0(0), s1(0), s2(0), s3(0);

                for( k = 0; k < n; k++, b += b_step )
                {
                    WT a(a_data[k]);
                    s0 += a * WT(b[0]); s1 += a * WT(b[1]);
                    s2 += a * WT(b[2]); s3 += a * WT(b[3]);
                }

                if( !c_data )
                {
                    d_data[j] = T(s0*alpha);
                    d_data[j+1] = T(s1*alpha);
                    d_data[j+2] = T(s2*alpha);
                    d_data[j+3] = T(s3*alpha);
                }
                else
                {
                    s0 = s0*alpha; s1 = s1*alpha;
                    s2 = s2*alpha; s3 = s3*alpha;
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
                    d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta);
                    d_data[j+2] = T(s2 + WT(c_data[c_step1*2])*beta);
                    d_data[j+3] = T(s3 + WT(c_data[c_step1*3])*beta);
                }
            }

            for( ; j < m; j++, c_data += c_step1 )
            {
                const T* b = _b_data + j;
                WT s0(0);

                for( k = 0; k < n; k++, b += b_step )
                    s0 += WT(a_data[k]) * WT(b[0]);

                s0 = s0*alpha;
                if( !c_data )
                    d_data[j] = T(s0);
                else
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
            }
        }
    }
    else
    {
328
        cv::AutoBuffer<WT> _d_buf(m);
329
        WT* d_buf = _d_buf.data();
330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349

        for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step )
        {
            a_data = _a_data;
            b_data = _b_data;
            c_data = _c_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = _a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j < m; j++ )
                d_buf[j] = WT(0);

            for( k = 0; k < n; k++, b_data += b_step )
            {
                WT al(a_data[k]);
Andrey Kamaev's avatar
Andrey Kamaev committed
350
                j=0;
Victoria Zhislina's avatar
Victoria Zhislina committed
351 352
                 #if CV_ENABLE_UNROLLED
                for(; j <= m - 4; j += 4 )
353 354 355 356 357 358 359 360 361 362
                {
                    WT t0 = d_buf[j] + WT(b_data[j])*al;
                    WT t1 = d_buf[j+1] + WT(b_data[j+1])*al;
                    d_buf[j] = t0;
                    d_buf[j+1] = t1;
                    t0 = d_buf[j+2] + WT(b_data[j+2])*al;
                    t1 = d_buf[j+3] + WT(b_data[j+3])*al;
                    d_buf[j+2] = t0;
                    d_buf[j+3] = t1;
                }
Victoria Zhislina's avatar
Victoria Zhislina committed
363
                #endif
364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389
                for( ; j < m; j++ )
                    d_buf[j] += WT(b_data[j])*al;
            }

            if( !c_data )
                for( j = 0; j < m; j++ )
                    d_data[j] = T(d_buf[j]*alpha);
            else
                for( j = 0; j < m; j++, c_data += c_step1 )
                {
                    WT t = d_buf[j]*alpha;
                    d_data[j] = T(t + WT(c_data[0])*beta);
                }
        }
    }
}


template<typename T, typename WT> static void
GEMMBlockMul( const T* a_data, size_t a_step,
              const T* b_data, size_t b_step,
              WT* d_data, size_t d_step,
              Size a_size, Size d_size, int flags )
{
    int i, j, k, n = a_size.width, m = d_size.width;
    const T *_a_data = a_data, *_b_data = b_data;
390
    cv::AutoBuffer<T> _a_buf;
391 392 393 394 395 396 397 398 399 400 401 402 403 404 405
    T* a_buf = 0;
    size_t a_step0, a_step1, t_step;
    int do_acc = flags & 16;

    a_step /= sizeof(a_data[0]);
    b_step /= sizeof(b_data[0]);
    d_step /= sizeof(d_data[0]);

    a_step0 = a_step;
    a_step1 = 1;

    if( flags & GEMM_1_T )
    {
        CV_SWAP( a_step0, a_step1, t_step );
        n = a_size.height;
406
        _a_buf.allocate(n);
407
        a_buf = _a_buf.data();
408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517
    }

    if( flags & GEMM_2_T )
    {
        /* second operand is transposed */
        for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step )
        {
            a_data = _a_data; b_data = _b_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j < d_size.width; j++, b_data += b_step )
            {
                WT s0 = do_acc ? d_data[j]:WT(0), s1(0);
                for( k = 0; k <= n - 2; k += 2 )
                {
                    s0 += WT(a_data[k])*WT(b_data[k]);
                    s1 += WT(a_data[k+1])*WT(b_data[k+1]);
                }

                for( ; k < n; k++ )
                    s0 += WT(a_data[k])*WT(b_data[k]);

                d_data[j] = s0 + s1;
            }
        }
    }
    else
    {
        for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step )
        {
            a_data = _a_data, b_data = _b_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j <= m - 4; j += 4 )
            {
                WT s0, s1, s2, s3;
                const T* b = b_data + j;

                if( do_acc )
                {
                    s0 = d_data[j]; s1 = d_data[j+1];
                    s2 = d_data[j+2]; s3 = d_data[j+3];
                }
                else
                    s0 = s1 = s2 = s3 = WT(0);

                for( k = 0; k < n; k++, b += b_step )
                {
                    WT a(a_data[k]);
                    s0 += a * WT(b[0]); s1 += a * WT(b[1]);
                    s2 += a * WT(b[2]); s3 += a * WT(b[3]);
                }

                d_data[j] = s0; d_data[j+1] = s1;
                d_data[j+2] = s2; d_data[j+3] = s3;
            }

            for( ; j < m; j++ )
            {
                const T* b = b_data + j;
                WT s0 = do_acc ? d_data[j] : WT(0);

                for( k = 0; k < n; k++, b += b_step )
                    s0 += WT(a_data[k]) * WT(b[0]);

                d_data[j] = s0;
            }
        }
    }
}


template<typename T, typename WT> static void
GEMMStore( const T* c_data, size_t c_step,
           const WT* d_buf, size_t d_buf_step,
           T* d_data, size_t d_step, Size d_size,
           double alpha, double beta, int flags )
{
    const T* _c_data = c_data;
    int j;
    size_t c_step0, c_step1;

    c_step /= sizeof(c_data[0]);
    d_buf_step /= sizeof(d_buf[0]);
    d_step /= sizeof(d_data[0]);

    if( !c_data )
        c_step0 = c_step1 = 0;
    else if( !(flags & GEMM_3_T) )
        c_step0 = c_step, c_step1 = 1;
    else
        c_step0 = 1, c_step1 = c_step;

    for( ; d_size.height--; _c_data += c_step0, d_buf += d_buf_step, d_data += d_step )
    {
        if( _c_data )
        {
            c_data = _c_data;
Andrey Kamaev's avatar
Andrey Kamaev committed
518 519
            j=0;
             #if CV_ENABLE_UNROLLED
Victoria Zhislina's avatar
Victoria Zhislina committed
520
            for(; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )
521 522 523 524 525 526 527 528 529 530 531 532 533 534
            {
                WT t0 = alpha*d_buf[j];
                WT t1 = alpha*d_buf[j+1];
                t0 += beta*WT(c_data[0]);
                t1 += beta*WT(c_data[c_step1]);
                d_data[j] = T(t0);
                d_data[j+1] = T(t1);
                t0 = alpha*d_buf[j+2];
                t1 = alpha*d_buf[j+3];
                t0 += beta*WT(c_data[c_step1*2]);
                t1 += beta*WT(c_data[c_step1*3]);
                d_data[j+2] = T(t0);
                d_data[j+3] = T(t1);
            }
Victoria Zhislina's avatar
Victoria Zhislina committed
535
            #endif
536 537 538 539 540 541 542 543
            for( ; j < d_size.width; j++, c_data += c_step1 )
            {
                WT t0 = alpha*d_buf[j];
                d_data[j] = T(t0 + WT(c_data[0])*beta);
            }
        }
        else
        {
Andrey Kamaev's avatar
Andrey Kamaev committed
544 545
            j = 0;
             #if CV_ENABLE_UNROLLED
Victoria Zhislina's avatar
Victoria Zhislina committed
546
            for( ; j <= d_size.width - 4; j += 4 )
547 548 549 550 551 552 553 554 555 556
            {
                WT t0 = alpha*d_buf[j];
                WT t1 = alpha*d_buf[j+1];
                d_data[j] = T(t0);
                d_data[j+1] = T(t1);
                t0 = alpha*d_buf[j+2];
                t1 = alpha*d_buf[j+3];
                d_data[j+2] = T(t0);
                d_data[j+3] = T(t1);
            }
Andrey Kamaev's avatar
Andrey Kamaev committed
557
            #endif
558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601
            for( ; j < d_size.width; j++ )
                d_data[j] = T(alpha*d_buf[j]);
        }
    }
}


typedef void (*GEMMSingleMulFunc)( const void* src1, size_t step1,
                   const void* src2, size_t step2, const void* src3, size_t step3,
                   void* dst, size_t dststep, Size srcsize, Size dstsize,
                   double alpha, double beta, int flags );

typedef void (*GEMMBlockMulFunc)( const void* src1, size_t step1,
                   const void* src2, size_t step2, void* dst, size_t dststep,
                   Size srcsize, Size dstsize, int flags );

typedef void (*GEMMStoreFunc)( const void* src1, size_t step1,
                   const void* src2, size_t step2, void* dst, size_t dststep,
                   Size dstsize, double alpha, double beta, int flags );

static void GEMMSingleMul_32f( const float* a_data, size_t a_step,
              const float* b_data, size_t b_step,
              const float* c_data, size_t c_step,
              float* d_data, size_t d_step,
              Size a_size, Size d_size,
              double alpha, double beta, int flags )
{
    GEMMSingleMul<float,double>(a_data, a_step, b_data, b_step, c_data,
                                c_step, d_data, d_step, a_size, d_size,
                                alpha, beta, flags);
}

static void GEMMSingleMul_64f( const double* a_data, size_t a_step,
                              const double* b_data, size_t b_step,
                              const double* c_data, size_t c_step,
                              double* d_data, size_t d_step,
                              Size a_size, Size d_size,
                              double alpha, double beta, int flags )
{
    GEMMSingleMul<double,double>(a_data, a_step, b_data, b_step, c_data,
                                c_step, d_data, d_step, a_size, d_size,
                                alpha, beta, flags);
}

Andrey Kamaev's avatar
Andrey Kamaev committed
602

603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624
static void GEMMSingleMul_32fc( const Complexf* a_data, size_t a_step,
                              const Complexf* b_data, size_t b_step,
                              const Complexf* c_data, size_t c_step,
                              Complexf* d_data, size_t d_step,
                              Size a_size, Size d_size,
                              double alpha, double beta, int flags )
{
    GEMMSingleMul<Complexf,Complexd>(a_data, a_step, b_data, b_step, c_data,
                                c_step, d_data, d_step, a_size, d_size,
                                alpha, beta, flags);
}

static void GEMMSingleMul_64fc( const Complexd* a_data, size_t a_step,
                              const Complexd* b_data, size_t b_step,
                              const Complexd* c_data, size_t c_step,
                              Complexd* d_data, size_t d_step,
                              Size a_size, Size d_size,
                              double alpha, double beta, int flags )
{
    GEMMSingleMul<Complexd,Complexd>(a_data, a_step, b_data, b_step, c_data,
                                 c_step, d_data, d_step, a_size, d_size,
                                 alpha, beta, flags);
Andrey Kamaev's avatar
Andrey Kamaev committed
625
}
626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660

static void GEMMBlockMul_32f( const float* a_data, size_t a_step,
             const float* b_data, size_t b_step,
             double* d_data, size_t d_step,
             Size a_size, Size d_size, int flags )
{
    GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}


static void GEMMBlockMul_64f( const double* a_data, size_t a_step,
                             const double* b_data, size_t b_step,
                             double* d_data, size_t d_step,
                             Size a_size, Size d_size, int flags )
{
    GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}


static void GEMMBlockMul_32fc( const Complexf* a_data, size_t a_step,
                             const Complexf* b_data, size_t b_step,
                             Complexd* d_data, size_t d_step,
                             Size a_size, Size d_size, int flags )
{
    GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}


static void GEMMBlockMul_64fc( const Complexd* a_data, size_t a_step,
                             const Complexd* b_data, size_t b_step,
                             Complexd* d_data, size_t d_step,
                             Size a_size, Size d_size, int flags )
{
    GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}
661 662


663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678
static void GEMMStore_32f( const float* c_data, size_t c_step,
          const double* d_buf, size_t d_buf_step,
          float* d_data, size_t d_step, Size d_size,
          double alpha, double beta, int flags )
{
    GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}


static void GEMMStore_64f( const double* c_data, size_t c_step,
                      const double* d_buf, size_t d_buf_step,
                      double* d_data, size_t d_step, Size d_size,
                      double alpha, double beta, int flags )
{
    GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}
679

680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697

static void GEMMStore_32fc( const Complexf* c_data, size_t c_step,
                          const Complexd* d_buf, size_t d_buf_step,
                          Complexf* d_data, size_t d_step, Size d_size,
                          double alpha, double beta, int flags )
{
    GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}


static void GEMMStore_64fc( const Complexd* c_data, size_t c_step,
                          const Complexd* d_buf, size_t d_buf_step,
                          Complexd* d_data, size_t d_step, Size d_size,
                          double alpha, double beta, int flags )
{
    GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}

Ilya Lavrenov's avatar
Ilya Lavrenov committed
698 699
#ifdef HAVE_CLAMDBLAS

Elena Gvozdeva's avatar
Elena Gvozdeva committed
700
static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
Ilya Lavrenov's avatar
Ilya Lavrenov committed
701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726
                      InputArray matC, double beta, OutputArray matD, int flags )
{
    int type = matA.type(), esz = CV_ELEM_SIZE(type);
    bool haveC = matC.kind() != cv::_InputArray::NONE;
    Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
    bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;

    if (atrans)
        sizeA = Size(sizeA.height, sizeA.width);
    if (btrans)
        sizeB = Size(sizeB.height, sizeB.width);
    if (haveC && ctrans)
        sizeC = Size(sizeC.height, sizeC.width);

    Size sizeD(sizeB.width, sizeA.height);

    CV_Assert( matB.type() == type && (!haveC || matC.type() == type) );
    CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );

    matD.create(sizeD, type);
    if ( matA.offset() % esz != 0 || matA.step() % esz != 0 ||
         matB.offset() % esz != 0 || matB.step() % esz != 0 ||
         (haveC && (matC.offset() % esz != 0 || matC.step() % esz != 0)) )
        return false;

    UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
727 728 729 730 731 732 733 734 735 736
    if (!ocl::internal::isCLBuffer(A) || !ocl::internal::isCLBuffer(B) || !ocl::internal::isCLBuffer(D))
    {
        return false;
    }
    if (haveC)
    {
        UMat C = matC.getUMat();
        if (!ocl::internal::isCLBuffer(C))
            return false;
    }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
737
    if (haveC)
738
        ctrans ? transpose(matC, D) : matC.copyTo(D);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791
    else
        D.setTo(Scalar::all(0));

    int M = sizeD.height, N = sizeD.width, K = sizeA.width;
    int lda = (int)A.step / esz, ldb = (int)B.step / esz, ldc = (int)D.step / esz;
    int offa = (int)A.offset / esz, offb = (int)B.offset / esz, offc = (int)D.offset / esz;

    cl_command_queue clq = (cl_command_queue)ocl::Queue::getDefault().ptr();
    clAmdBlasTranspose transA = atrans ? clAmdBlasTrans : clAmdBlasNoTrans;
    clAmdBlasTranspose transB = btrans ? clAmdBlasTrans : clAmdBlasNoTrans;
    clAmdBlasOrder order = clAmdBlasRowMajor;
    clAmdBlasStatus status = clAmdBlasSuccess;

    if (type == CV_32FC1)
        status = clAmdBlasSgemmEx(order, transA, transB, M, N, K,
                                  (cl_float)alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
                                  (const cl_mem)B.handle(ACCESS_READ), offb, ldb,
                                  (cl_float)beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
                                  1, &clq, 0, NULL, NULL);
    else if (type == CV_64FC1)
        status = clAmdBlasDgemmEx(order, transA, transB, M, N, K,
                                  alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
                                  (const cl_mem)B.handle(ACCESS_READ), offb, ldb,
                                  beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
                                  1, &clq, 0, NULL, NULL);
    else if (type == CV_32FC2)
    {
         cl_float2 alpha_2 = { { (cl_float)alpha, 0 } };
         cl_float2 beta_2  = { { (cl_float)beta, 0 } };
         status = clAmdBlasCgemmEx(order, transA, transB, M, N, K,
                                   alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
                                   (const cl_mem)B.handle(ACCESS_READ), offb, ldb,
                                   beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
                                   1, &clq, 0, NULL, NULL);
    }
    else if (type == CV_64FC2)
    {
        cl_double2 alpha_2 = { { alpha, 0 } };
        cl_double2 beta_2  = { { beta, 0 } };
        status = clAmdBlasZgemmEx(order, transA, transB, M, N, K,
                                  alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
                                  (const cl_mem)B.handle(ACCESS_READ), offb, ldb,
                                  beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
                                  1, &clq, 0, NULL, NULL);
    }
    else
        CV_Error(Error::StsUnsupportedFormat, "");

    return status == clAmdBlasSuccess;
}

#endif

Elena Gvozdeva's avatar
Elena Gvozdeva committed
792 793 794 795 796 797 798
#ifdef HAVE_OPENCL
static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
                      InputArray matC, double beta, OutputArray matD, int flags )
{
    int depth = matA.depth(), cn = matA.channels();
    int type = CV_MAKETYPE(depth, cn);

799
    CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
Elena Gvozdeva's avatar
Elena Gvozdeva committed
800 801 802 803

    const ocl::Device & dev = ocl::Device::getDefault();
    bool doubleSupport = dev.doubleFPConfig() > 0;

ElenaGvozdeva's avatar
ElenaGvozdeva committed
804
    if (!doubleSupport && depth == CV_64F)
Elena Gvozdeva's avatar
Elena Gvozdeva committed
805 806 807 808 809 810
        return false;

    bool haveC = matC.kind() != cv::_InputArray::NONE;
    Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
    bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;

ElenaGvozdeva's avatar
ElenaGvozdeva committed
811
    CV_Assert( !haveC || matC.type() == type );
Elena Gvozdeva's avatar
Elena Gvozdeva committed
812

813 814
    Size sizeD(((btrans)? sizeB.height : sizeB.width),
               ((atrans)? sizeA.width : sizeA.height));
Elena Gvozdeva's avatar
Elena Gvozdeva committed
815 816 817 818 819
    matD.create(sizeD, type);

    UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();


820 821 822
    if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1)
    {
        String opts;
Elena Gvozdeva's avatar
Elena Gvozdeva committed
823

824 825 826 827 828 829 830 831
        if (atrans)
            sizeA = Size(sizeA.height, sizeA.width);
        if (btrans)
            sizeB = Size(sizeB.height, sizeB.width);
        if (haveC && ctrans)
            sizeC = Size(sizeC.height, sizeC.width);

        CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
Elena Gvozdeva's avatar
Elena Gvozdeva committed
832

833 834
        int max_wg_size = (int)dev.maxWorkGroupSize();
        int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
Elena Gvozdeva's avatar
Elena Gvozdeva committed
835

836 837 838 839 840 841 842 843 844 845 846 847
        if (atrans)
            A = A.t();

        if (btrans)
            B = B.t();

        if (haveC)
            ctrans ? transpose(matC, D) : matC.copyTo(D);

        int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
        int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);

848
        opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
Elena Gvozdeva's avatar
Elena Gvozdeva committed
849 850
                          ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
                          cn, kercn, block_size,
851 852
                          (sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
                          haveC ? " -D HAVE_C" : "",
Elena Gvozdeva's avatar
Elena Gvozdeva committed
853 854
                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");

855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871
        ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
        if (k.empty())
            return false;

        if (depth == CV_64F)
            k.args(ocl::KernelArg::ReadOnlyNoSize(A),
                   ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
                   ocl::KernelArg::ReadWrite(D, cn, kercn),
                   sizeA.width, alpha, beta);
        else
            k.args(ocl::KernelArg::ReadOnlyNoSize(A),
                   ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
                   ocl::KernelArg::ReadWrite(D, cn, kercn),
                   sizeA.width, (float)alpha, (float)beta);

        size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
        size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
Elena Gvozdeva's avatar
Elena Gvozdeva committed
872

873 874
        return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
    }
Elena Gvozdeva's avatar
Elena Gvozdeva committed
875
    else
876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892
    {
        if (haveC && beta != 0.0)
        {
            ctrans ? transpose(matC, D) : matC.copyTo(D);
        }
        else
        {
            beta = 0.0;
        }

        return intel_gpu_gemm(A, sizeA,
                              B, sizeB,
                              D, sizeD,
                              alpha,
                              beta,
                              atrans, btrans);
    }
Elena Gvozdeva's avatar
Elena Gvozdeva committed
893 894
}
#endif
895

896
static void gemmImpl( Mat A, Mat B, double alpha,
897
           Mat C, double beta, Mat D, int flags )
898
{
899 900
    CV_INSTRUMENT_REGION()

901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933
    const int block_lin_size = 128;
    const int block_size = block_lin_size * block_lin_size;

    static double zero[] = {0,0,0,0};
    static float zerof[] = {0,0,0,0};

    Size a_size = A.size(), d_size;
    int i, len = 0, type = A.type();

    switch( flags & (GEMM_1_T|GEMM_2_T) )
    {
    case 0:
        d_size = Size( B.cols, a_size.height );
        len = B.rows;
        break;
    case 1:
        d_size = Size( B.cols, a_size.width );
        len = B.rows;
        break;
    case 2:
        d_size = Size( B.rows, a_size.height );
        len = B.cols;
        break;
    case 3:
        d_size = Size( B.rows, a_size.width );
        len = B.cols;
        break;
    }

    if( flags == 0 && 2 <= len && len <= 4 && (len == d_size.width || len == d_size.height) )
    {
        if( type == CV_32F )
        {
934 935 936
            float* d = D.ptr<float>();
            const float *a = A.ptr<float>(),
                        *b = B.ptr<float>(),
937
                        *c = (const float*)C.data;
938 939 940
            size_t d_step = D.step/sizeof(d[0]),
                a_step = A.step/sizeof(a[0]),
                b_step = B.step/sizeof(b[0]),
941
                c_step = C.data ? C.step/sizeof(c[0]) : 0;
942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061

            if( !c )
                c = zerof;

            switch( len )
            {
            case 2:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step];
                        float t1 = a[0]*b[1] + a[1]*b[b_step+1];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[1] = (float)(t1*alpha + c[1]*beta);
                    }
                }
                else if( a != d )
                {
                    int c_step0 = 1;
                    if( c == zerof )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step];
                        float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[d_step] = (float)(t1*alpha + c[c_step]*beta);
                    }
                }
                else
                    break;
                return;
            case 3:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
                        float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1];
                        float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[1] = (float)(t1*alpha + c[1]*beta);
                        d[2] = (float)(t2*alpha + c[2]*beta);
                    }
                }
                else if( a != d )
                {
                    int c_step0 = 1;
                    if( c == zerof )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
                        float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2];
                        float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2];

                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[d_step] = (float)(t1*alpha + c[c_step]*beta);
                        d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta);
                    }
                }
                else
                    break;
                return;
            case 4:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
                        float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1];
                        float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2];
                        float t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[1] = (float)(t1*alpha + c[1]*beta);
                        d[2] = (float)(t2*alpha + c[2]*beta);
                        d[3] = (float)(t3*alpha + c[3]*beta);
                    }
                }
                else if( len <= 16 && a != d )
                {
                    int c_step0 = 1;
                    if( c == zerof )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
                        float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] +
                                   a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3];
                        float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] +
                                   a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3];
                        float t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] +
                                   a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[d_step] = (float)(t1*alpha + c[c_step]*beta);
                        d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta);
                        d[d_step*3] = (float)(t3*alpha + c[c_step*3]*beta);
                    }
                }
                else
                    break;
                return;
            }
        }

        if( type == CV_64F )
        {
1062 1063 1064
            double* d = D.ptr<double>();
            const double *a = A.ptr<double>(),
                         *b = B.ptr<double>(),
1065
                         *c = (const double*)C.data;
1066 1067 1068
            size_t d_step = D.step/sizeof(d[0]),
                a_step = A.step/sizeof(a[0]),
                b_step = B.step/sizeof(b[0]),
1069
                c_step = C.data ? C.step/sizeof(c[0]) : 0;
1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192
            if( !c )
                c = zero;

            switch( len )
            {
            case 2:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step];
                        double t1 = a[0]*b[1] + a[1]*b[b_step+1];
                        d[0] = t0*alpha + c[0]*beta;
                        d[1] = t1*alpha + c[1]*beta;
                    }
                }
                else if( a != d )
                {
                    int c_step0 = 1;
                    if( c == zero )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step];
                        double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step];
                        d[0] = t0*alpha + c[0]*beta;
                        d[d_step] = t1*alpha + c[c_step]*beta;
                    }
                }
                else
                    break;
                return;
            case 3:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
                        double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1];
                        double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2];
                        d[0] = t0*alpha + c[0]*beta;
                        d[1] = t1*alpha + c[1]*beta;
                        d[2] = t2*alpha + c[2]*beta;
                    }
                }
                else if( a != d )
                {
                    int c_step0 = 1;
                    if( c == zero )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
                        double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2];
                        double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2];

                        d[0] = t0*alpha + c[0]*beta;
                        d[d_step] = t1*alpha + c[c_step]*beta;
                        d[d_step*2] = t2*alpha + c[c_step*2]*beta;
                    }
                }
                else
                    break;
                return;
            case 4:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
                        double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1];
                        double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2];
                        double t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3];
                        d[0] = t0*alpha + c[0]*beta;
                        d[1] = t1*alpha + c[1]*beta;
                        d[2] = t2*alpha + c[2]*beta;
                        d[3] = t3*alpha + c[3]*beta;
                    }
                }
                else if( d_size.width <= 16 && a != d )
                {
                    int c_step0 = 1;
                    if( c == zero )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
                        double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] +
                                    a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3];
                        double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] +
                                    a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3];
                        double t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] +
                                    a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3];
                        d[0] = t0*alpha + c[0]*beta;
                        d[d_step] = t1*alpha + c[c_step]*beta;
                        d[d_step*2] = t2*alpha + c[c_step*2]*beta;
                        d[d_step*3] = t3*alpha + c[c_step*3]*beta;
                    }
                }
                else
                    break;
                return;
            }
        }
    }

    {
    size_t b_step = B.step;
    GEMMSingleMulFunc singleMulFunc;
    GEMMBlockMulFunc blockMulFunc;
    GEMMStoreFunc storeFunc;
1193
    Mat *matD = &D;
1194 1195
    const uchar* Cdata = C.data;
    size_t Cstep = C.data ? (size_t)C.step : 0;
1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296
    AutoBuffer<uchar> buf;

    if( type == CV_32FC1 )
    {
        singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32f;
        blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32f;
        storeFunc = (GEMMStoreFunc)GEMMStore_32f;
    }
    else if( type == CV_64FC1 )
    {
        singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64f;
        blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64f;
        storeFunc = (GEMMStoreFunc)GEMMStore_64f;
    }
    else if( type == CV_32FC2 )
    {
        singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32fc;
        blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32fc;
        storeFunc = (GEMMStoreFunc)GEMMStore_32fc;
    }
    else
    {
        CV_Assert( type == CV_64FC2 );
        singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64fc;
        blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64fc;
        storeFunc = (GEMMStoreFunc)GEMMStore_64fc;
    }

    if( (d_size.width == 1 || len == 1) && !(flags & GEMM_2_T) && B.isContinuous() )
    {
        b_step = d_size.width == 1 ? 0 : CV_ELEM_SIZE(type);
        flags |= GEMM_2_T;
    }

    /*if( (d_size.width | d_size.height | len) >= 16 && icvBLAS_GEMM_32f_p != 0 )
    {
        blas_func = type == CV_32FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32f_p :
                    type == CV_64FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64f_p :
                    type == CV_32FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32fc_p :
                    type == CV_64FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64fc_p : 0;
    }

    if( blas_func )
    {
        const char* transa = flags & GEMM_1_T ? "t" : "n";
        const char* transb = flags & GEMM_2_T ? "t" : "n";
        int lda, ldb, ldd;

        if( C->data.ptr )
        {
            if( C->data.ptr != D->data.ptr )
            {
                if( !(flags & GEMM_3_T) )
                    cvCopy( C, D );
                else
                    cvTranspose( C, D );
            }
        }

        if( CV_MAT_DEPTH(type) == CV_32F )
        {
            Complex32f _alpha, _beta;

            lda = A->step/sizeof(float);
            ldb = b_step/sizeof(float);
            ldd = D->step/sizeof(float);
            _alpha.re = (float)alpha;
            _alpha.im = 0;
            _beta.re = C->data.ptr ? (float)beta : 0;
            _beta.im = 0;
            if( CV_MAT_CN(type) == 2 )
                lda /= 2, ldb /= 2, ldd /= 2;

            blas_func( transb, transa, &d_size.width, &d_size.height, &len,
                   &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
                   &_beta, D->data.ptr, &ldd );
        }
        else
        {
            CvComplex64f _alpha, _beta;

            lda = A->step/sizeof(double);
            ldb = b_step/sizeof(double);
            ldd = D->step/sizeof(double);
            _alpha.re = alpha;
            _alpha.im = 0;
            _beta.re = C->data.ptr ? beta : 0;
            _beta.im = 0;
            if( CV_MAT_CN(type) == 2 )
                lda /= 2, ldb /= 2, ldd /= 2;

            blas_func( transb, transa, &d_size.width, &d_size.height, &len,
                   &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
                   &_beta, D->data.ptr, &ldd );
        }
    }
    else*/ if( ((d_size.height <= block_lin_size/2 || d_size.width <= block_lin_size/2) &&
        len <= 10000) || len <= 10 ||
        (d_size.width <= block_lin_size &&
        d_size.height <= block_lin_size && len <= block_lin_size) )
    {
1297 1298
        singleMulFunc( A.ptr(), A.step, B.ptr(), b_step, Cdata, Cstep,
                       matD->ptr(), matD->step, a_size, d_size, alpha, beta, flags );
1299 1300 1301 1302 1303 1304 1305
    }
    else
    {
        int is_a_t = flags & GEMM_1_T;
        int is_b_t = flags & GEMM_2_T;
        int elem_size = CV_ELEM_SIZE(type);
        int dk0_1, dk0_2;
1306
        size_t a_buf_size = 0, b_buf_size, d_buf_size;
1307 1308 1309 1310 1311 1312 1313
        uchar* a_buf = 0;
        uchar* b_buf = 0;
        uchar* d_buf = 0;
        int j, k, di = 0, dj = 0, dk = 0;
        int dm0, dn0, dk0;
        size_t a_step0, a_step1, b_step0, b_step1, c_step0, c_step1;
        int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0);
1314

1315 1316 1317 1318 1319 1320 1321 1322 1323 1324
        if( !is_a_t )
            a_step0 = A.step, a_step1 = elem_size;
        else
            a_step0 = elem_size, a_step1 = A.step;

        if( !is_b_t )
            b_step0 = b_step, b_step1 = elem_size;
        else
            b_step0 = elem_size, b_step1 = b_step;

1325
        if( C.empty() )
1326 1327 1328 1329 1330
        {
            c_step0 = c_step1 = 0;
            flags &= ~GEMM_3_T;
        }
        else if( !(flags & GEMM_3_T) )
1331
            c_step0 = C.step, c_step1 = elem_size;
1332
        else
1333
            c_step0 = elem_size, c_step1 = C.step;
1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346

        dm0 = std::min( block_lin_size, d_size.height );
        dn0 = std::min( block_lin_size, d_size.width );
        dk0_1 = block_size / dm0;
        dk0_2 = block_size / dn0;
        dk0 = std::min( dk0_1, dk0_2 );
        dk0 = std::min( dk0, len );
        if( dk0*dm0 > block_size )
            dm0 = block_size / dk0;
        if( dk0*dn0 > block_size )
            dn0 = block_size / dk0;

        dk0_1 = (dn0+dn0/8+2) & -2;
1347 1348
        b_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*elem_size;
        d_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*work_elem_size;
1349 1350 1351

        if( is_a_t )
        {
1352
            a_buf_size = (size_t)(dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size;
1353 1354 1355
            flags &= ~GEMM_1_T;
        }

1356
        buf.allocate(d_buf_size + b_buf_size + a_buf_size);
1357
        d_buf = buf.data();
1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370
        b_buf = d_buf + d_buf_size;

        if( is_a_t )
            a_buf = b_buf + b_buf_size;

        for( i = 0; i < d_size.height; i += di )
        {
            di = dm0;
            if( i + di >= d_size.height || 8*(i + di) + di > 8*d_size.height )
                di = d_size.height - i;

            for( j = 0; j < d_size.width; j += dj )
            {
1371
                uchar* _d = matD->ptr() + i*matD->step + j*elem_size;
1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387
                const uchar* _c = Cdata + i*c_step0 + j*c_step1;
                size_t _d_step = matD->step;
                dj = dn0;

                if( j + dj >= d_size.width || 8*(j + dj) + dj > 8*d_size.width )
                    dj = d_size.width - j;

                flags &= 15;
                if( dk0 < len )
                {
                    _d = d_buf;
                    _d_step = dj*work_elem_size;
                }

                for( k = 0; k < len; k += dk )
                {
1388
                    const uchar* _a = A.ptr() + i*a_step0 + k*a_step1;
1389
                    size_t _a_step = A.step;
1390
                    const uchar* _b = B.ptr() + k*b_step0 + j*b_step1;
1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434
                    size_t _b_step = b_step;
                    Size a_bl_size;

                    dk = dk0;
                    if( k + dk >= len || 8*(k + dk) + dk > 8*len )
                        dk = len - k;

                    if( !is_a_t )
                        a_bl_size.width = dk, a_bl_size.height = di;
                    else
                        a_bl_size.width = di, a_bl_size.height = dk;

                    if( a_buf && is_a_t )
                    {
                        _a_step = dk*elem_size;
                        GEMM_TransposeBlock( _a, A.step, a_buf, _a_step, a_bl_size, elem_size );
                        std::swap( a_bl_size.width, a_bl_size.height );
                        _a = a_buf;
                    }

                    if( dj < d_size.width )
                    {
                        Size b_size;
                        if( !is_b_t )
                            b_size.width = dj, b_size.height = dk;
                        else
                            b_size.width = dk, b_size.height = dj;

                        _b_step = b_size.width*elem_size;
                        GEMM_CopyBlock( _b, b_step, b_buf, _b_step, b_size, elem_size );
                        _b = b_buf;
                    }

                    if( dk0 < len )
                        blockMulFunc( _a, _a_step, _b, _b_step, _d, _d_step,
                                      a_bl_size, Size(dj,di), flags );
                    else
                        singleMulFunc( _a, _a_step, _b, _b_step, _c, Cstep,
                                       _d, _d_step, a_bl_size, Size(dj,di), alpha, beta, flags );
                    flags |= 16;
                }

                if( dk0 < len )
                    storeFunc( _c, Cstep, _d, _d_step,
1435
                               matD->ptr(i) + j*elem_size,
1436 1437 1438 1439
                               matD->step, Size(dj,di), alpha, beta, flags );
            }
        }
    }
1440 1441 1442 1443
    }
}

template <typename fptype>inline static void
1444
callGemmImpl(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha,
1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480
          const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int m_a, int n_a, int n_d, int flags, int type)
{
    CV_StaticAssert(GEMM_1_T == CV_HAL_GEMM_1_T, "Incompatible GEMM_1_T flag in HAL");
    CV_StaticAssert(GEMM_2_T == CV_HAL_GEMM_2_T, "Incompatible GEMM_2_T flag in HAL");
    CV_StaticAssert(GEMM_3_T == CV_HAL_GEMM_3_T, "Incompatible GEMM_3_T flag in HAL");

    int b_m, b_n, c_m, c_n, m_d;

    if(flags & GEMM_2_T)
    {
        b_m = n_d;
        if(flags & GEMM_1_T )
        {
            b_n = m_a;
            m_d = n_a;
        }
        else
        {
            b_n = n_a;
            m_d = m_a;
        }
    }
    else
    {
        b_n = n_d;
        if(flags & GEMM_1_T )
        {
            b_m = m_a;
            m_d = n_a;
        }
        else
        {
            m_d = m_a;
            b_m = n_a;
        }
    }
1481

1482 1483 1484 1485 1486 1487 1488 1489 1490
    if(flags & GEMM_3_T)
    {
        c_m = n_d;
        c_n = m_d;
    }
    else
    {
        c_m = m_d;
        c_n = n_d;
1491
    }
1492 1493 1494 1495 1496 1497 1498 1499 1500 1501

    Mat A, B, C;
    if(src1 != NULL)
        A = Mat(m_a, n_a, type, (void*)src1, src1_step);
    if(src2 != NULL)
        B = Mat(b_m, b_n, type, (void*)src2, src2_step);
    if(src3 != NULL && beta != 0.0)
        C = Mat(c_m, c_n, type, (void*)src3, src3_step);
    Mat D(m_d, n_d, type, (void*)dst, dst_step);

1502
    gemmImpl(A, B, alpha, C, beta, D, flags);
1503 1504 1505 1506 1507 1508 1509 1510 1511 1512
}

}

void cv::hal::gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
                        float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
                        int m_a, int n_a, int n_d, int flags)
{

    CALL_HAL(gemm32f, cv_hal_gemm32f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
1513
    callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32F);
1514 1515 1516 1517 1518 1519 1520
}

void cv::hal::gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
                        double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
                        int m_a, int n_a, int n_d, int flags)
{
    CALL_HAL(gemm64f, cv_hal_gemm64f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
1521
    callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64F);
1522 1523 1524 1525 1526 1527 1528
}

CV_EXPORTS void cv::hal::gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
                        float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
                        int m_a, int n_a, int n_d, int flags)
{
    CALL_HAL(gemm32fc, cv_hal_gemm32fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
1529
    callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32FC2);
1530 1531 1532 1533 1534 1535 1536
}

CV_EXPORTS void cv::hal::gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
                        double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
                        int m_a, int n_a, int n_d, int flags)
{
    CALL_HAL(gemm64fc, cv_hal_gemm64fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
1537
    callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64FC2);
1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557
}

void cv::gemm( InputArray matA, InputArray matB, double alpha,
           InputArray matC, double beta, OutputArray _matD, int flags )
{
#ifdef HAVE_CLAMDBLAS
    CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() &&
        matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrect for small sizes
        ocl_gemm_amdblas(matA, matB, alpha, matC, beta, _matD, flags))
#endif

#ifdef HAVE_OPENCL
    CV_OCL_RUN(_matD.isUMat() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2,
               ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags))
#endif

    Mat A = matA.getMat(), B = matB.getMat(), C = beta != 0.0 ? matC.getMat() : Mat();
    Size a_size = A.size(), d_size;
    int len = 0, type = A.type();

1558
    CV_Assert_N( type == B.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585

    switch( flags & (GEMM_1_T|GEMM_2_T) )
    {
    case 0:
        d_size = Size( B.cols, a_size.height );
        len = B.rows;
        CV_Assert( a_size.width == len );
        break;
    case 1:
        d_size = Size( B.cols, a_size.width );
        len = B.rows;
        CV_Assert( a_size.height == len );
        break;
    case 2:
        d_size = Size( B.rows, a_size.height );
        len = B.cols;
        CV_Assert( a_size.width == len );
        break;
    case 3:
        d_size = Size( B.rows, a_size.width );
        len = B.cols;
        CV_Assert( a_size.height == len );
        break;
    }

    if( !C.empty() )
    {
1586
        CV_Assert_N( C.type() == type,
1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631
            (((flags&GEMM_3_T) == 0 && C.rows == d_size.height && C.cols == d_size.width) ||
             ((flags&GEMM_3_T) != 0 && C.rows == d_size.width && C.cols == d_size.height)));
    }

    _matD.create( d_size.height, d_size.width, type );
    Mat D = _matD.getMat();
    if( (flags & GEMM_3_T) != 0 && C.data == D.data )
    {
        transpose( C, C );
        flags &= ~GEMM_3_T;
    }

    Mat *DProxyPtr = &D, DProxy;
    if( D.data == A.data || D.data == B.data )
    {
        DProxy = Mat(d_size.height, d_size.width, D.type());
        DProxyPtr = &DProxy;
    }

    if( type == CV_32FC1 )
        hal::gemm32f(A.ptr<float>(), A.step, B.ptr<float>(), B.step, static_cast<float>(alpha),
                     C.ptr<float>(), C.step, static_cast<float>(beta),
                     DProxyPtr->ptr<float>(), DProxyPtr->step,
                     a_size.height, a_size.width, DProxyPtr->cols, flags);
    else if( type == CV_64FC1 )
        hal::gemm64f(A.ptr<double>(), A.step, B.ptr<double>(), B.step, alpha,
                     C.ptr<double>(), C.step, beta,
                     DProxyPtr->ptr<double>(), DProxyPtr->step,
                     a_size.height, a_size.width, DProxyPtr->cols, flags);
    else if( type == CV_32FC2 )
        hal::gemm32fc(A.ptr<float>(), A.step, B.ptr<float>(), B.step, static_cast<float>(alpha),
                      C.ptr<float>(), C.step, static_cast<float>(beta),
                      DProxyPtr->ptr<float>(), DProxyPtr->step,
                      a_size.height, a_size.width, DProxyPtr->cols, flags);
    else
    {
        CV_Assert( type == CV_64FC2 );
        hal::gemm64fc(A.ptr<double>(), A.step, B.ptr<double>(), B.step, alpha,
                      C.ptr<double>(), C.step, beta,
                      D.ptr<double>(), D.step,
                      a_size.height, a_size.width, DProxyPtr->cols, flags);
    }

    if(DProxyPtr != &D)
        DProxyPtr->copyTo(D);
1632 1633 1634 1635 1636 1637
}

/****************************************************************************************\
*                                        Transform                                       *
\****************************************************************************************/

1638
namespace cv
1639 1640 1641
{

template<typename T, typename WT> static void
1642
transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
1643
{
1644
    int x;
1645

1646
    if( scn == 2 && dcn == 2 )
1647
    {
1648
        for( x = 0; x < len*2; x += 2 )
1649
        {
1650 1651 1652 1653
            WT v0 = src[x], v1 = src[x+1];
            T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]);
            T t1 = saturate_cast<T>(m[3]*v0 + m[4]*v1 + m[5]);
            dst[x] = t0; dst[x+1] = t1;
1654 1655
        }
    }
1656
    else if( scn == 3 && dcn == 3 )
1657
    {
1658
        for( x = 0; x < len*3; x += 3 )
1659
        {
1660 1661 1662 1663 1664
            WT v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
            T t1 = saturate_cast<T>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
            T t2 = saturate_cast<T>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
1665 1666
        }
    }
1667
    else if( scn == 3 && dcn == 1 )
1668
    {
1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687
        for( x = 0; x < len; x++, src += 3 )
            dst[x] = saturate_cast<T>(m[0]*src[0] + m[1]*src[1] + m[2]*src[2] + m[3]);
    }
    else if( scn == 4 && dcn == 4 )
    {
        for( x = 0; x < len*4; x += 4 )
        {
            WT v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3];
            T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]*v3 + m[4]);
            T t1 = saturate_cast<T>(m[5]*v0 + m[6]*v1 + m[7]*v2 + m[8]*v3 + m[9]);
            dst[x] = t0; dst[x+1] = t1;
            t0 = saturate_cast<T>(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]);
            t1 = saturate_cast<T>(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]);
            dst[x+2] = t0; dst[x+3] = t1;
        }
    }
    else
    {
        for( x = 0; x < len; x++, src += scn, dst += dcn )
1688 1689
        {
            const WT* _m = m;
1690 1691 1692 1693 1694 1695 1696 1697
            int j, k;
            for( j = 0; j < dcn; j++, _m += scn + 1 )
            {
                WT s = _m[scn];
                for( k = 0; k < scn; k++ )
                    s += _m[k]*src[k];
                dst[j] = saturate_cast<T>(s);
            }
1698 1699 1700 1701
        }
    }
}

1702
#if CV_SIMD128
1703
static inline void
1704
load3x3Matrix(const float* m, v_float32x4& m0, v_float32x4& m1, v_float32x4& m2, v_float32x4& m3)
1705
{
1706 1707 1708 1709
    m0 = v_float32x4(m[0], m[4], m[8], 0);
    m1 = v_float32x4(m[1], m[5], m[9], 0);
    m2 = v_float32x4(m[2], m[6], m[10], 0);
    m3 = v_float32x4(m[3], m[7], m[11], 0);
1710 1711
}

1712 1713
static inline v_int16x8
v_matmulvec(const v_int16x8 &v0, const v_int16x8 &m0, const v_int16x8 &m1, const v_int16x8 &m2, const v_int32x4 &m3, const int BITS)
1714
{
1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726
    // v0 : 0 b0 g0 r0 b1 g1 r1 ?
    v_int32x4 t0 = v_dotprod(v0, m0); // a0 b0 a1 b1
    v_int32x4 t1 = v_dotprod(v0, m1); // c0 d0 c1 d1
    v_int32x4 t2 = v_dotprod(v0, m2); // e0 f0 e1 f1
    v_int32x4 t3 = v_setzero_s32();
    v_int32x4 s0, s1, s2, s3;
    v_transpose4x4(t0, t1, t2, t3, s0, s1, s2, s3);
    s0 = s0 + s1 + m3; // B0 G0 R0 ?
    s2 = s2 + s3 + m3; // B1 G1 R1 ?

    s0 = s0 >> BITS;
    s2 = s2 >> BITS;
1727

1728 1729 1730 1731 1732
    v_int16x8 result = v_pack(s0, v_setzero_s32());                    // B0 G0 R0 0 0 0 0 0
    result = v_reinterpret_as_s16(v_reinterpret_as_s64(result) << 16); // 0 B0 G0 R0 0 0 0 0
    result = result | v_pack(v_setzero_s32(), s2);                     // 0 B0 G0 R0 B1 G1 R1 0
    return result;
}
1733
#endif
1734

1735 1736
static void
transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
1737
{
1738
#if CV_SIMD128
1739 1740 1741
    const int BITS = 10, SCALE = 1 << BITS;
    const float MAX_M = (float)(1 << (15 - BITS));

1742
    if( hasSIMD128() && scn == 3 && dcn == 3 &&
1743 1744 1745 1746
        std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[2]) < MAX_M && std::abs(m[3]) < MAX_M*256 &&
        std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[6]) < MAX_M && std::abs(m[7]) < MAX_M*256 &&
        std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M && std::abs(m[11]) < MAX_M*256 )
    {
1747 1748
        const int nChannels = 3;
        const int cWidth = v_int16x8::nlanes;
1749 1750 1751 1752 1753 1754 1755 1756 1757
        // faster fixed-point transformation
        short m00 = saturate_cast<short>(m[0]*SCALE), m01 = saturate_cast<short>(m[1]*SCALE),
            m02 = saturate_cast<short>(m[2]*SCALE), m10 = saturate_cast<short>(m[4]*SCALE),
            m11 = saturate_cast<short>(m[5]*SCALE), m12 = saturate_cast<short>(m[6]*SCALE),
            m20 = saturate_cast<short>(m[8]*SCALE), m21 = saturate_cast<short>(m[9]*SCALE),
            m22 = saturate_cast<short>(m[10]*SCALE);
        int m03 = saturate_cast<int>((m[3]+0.5f)*SCALE), m13 = saturate_cast<int>((m[7]+0.5f)*SCALE ),
            m23 = saturate_cast<int>((m[11]+0.5f)*SCALE);

1758 1759 1760 1761
        v_int16x8 m0 = v_int16x8(0, m00, m01, m02, m00, m01, m02, 0);
        v_int16x8 m1 = v_int16x8(0, m10, m11, m12, m10, m11, m12, 0);
        v_int16x8 m2 = v_int16x8(0, m20, m21, m22, m20, m21, m22, 0);
        v_int32x4 m3 = v_int32x4(m03, m13, m23, 0);
1762
        int x = 0;
1763

1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801
        for (; x <= (len - cWidth) * nChannels; x += cWidth * nChannels)
        {
            // load 8 pixels
            v_int16x8 v0 = v_reinterpret_as_s16(v_load_expand(src + x));
            v_int16x8 v1 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth));
            v_int16x8 v2 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth * 2));
            v_int16x8 v3;

            // rotate and pack
            v3 = v_rotate_right<1>(v2);     // 0 b6 g6 r6 b7 g7 r7 0
            v2 = v_rotate_left <5>(v2, v1); // 0 b4 g4 r4 b5 g5 r5 0
            v1 = v_rotate_left <3>(v1, v0); // 0 b2 g2 r2 b3 g3 r3 0
            v0 = v_rotate_left <1>(v0);     // 0 b0 g0 r0 b1 g1 r1 0

            // multiply with matrix and normalize
            v0 = v_matmulvec(v0, m0, m1, m2, m3, BITS); // 0 B0 G0 R0 B1 G1 R1 0
            v1 = v_matmulvec(v1, m0, m1, m2, m3, BITS); // 0 B2 G2 R2 B3 G3 R3 0
            v2 = v_matmulvec(v2, m0, m1, m2, m3, BITS); // 0 B4 G4 R4 B5 G5 R5 0
            v3 = v_matmulvec(v3, m0, m1, m2, m3, BITS); // 0 B6 G6 R6 B7 G7 R7 0

            // narrow down as uint8x16
            v_uint8x16 z0 = v_pack_u(v0, v_setzero_s16()); // 0 B0 G0 R0 B1 G1 R1 0 0 0 0 0 0 0 0 0
            v_uint8x16 z1 = v_pack_u(v1, v_setzero_s16()); // 0 B2 G2 R2 B3 G3 R3 0 0 0 0 0 0 0 0 0
            v_uint8x16 z2 = v_pack_u(v2, v_setzero_s16()); // 0 B4 G4 R4 B5 G5 R5 0 0 0 0 0 0 0 0 0
            v_uint8x16 z3 = v_pack_u(v3, v_setzero_s16()); // 0 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0 0

            // rotate and pack
            z0 = v_reinterpret_as_u8(v_reinterpret_as_u64(z0) >> 8) | v_reinterpret_as_u8(v_reinterpret_as_u64(z1) << 40);  // B0 G0 R0 B1 G1 R1 B2 G2 0 0 0 0 0 0 0 0
            z1 = v_reinterpret_as_u8(v_reinterpret_as_u64(z1) >> 24) | v_reinterpret_as_u8(v_reinterpret_as_u64(z2) << 24); // R2 B3 G3 R3 B4 G4 R4 B5 0 0 0 0 0 0 0 0
            z2 = v_reinterpret_as_u8(v_reinterpret_as_u64(z2) >> 40) | v_reinterpret_as_u8(v_reinterpret_as_u64(z3) << 8);  // G5 R6 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0

            // store on memory
            v_store_low(dst + x, z0);
            v_store_low(dst + x + cWidth, z1);
            v_store_low(dst + x + cWidth * 2, z2);
        }

        for( ; x < len * nChannels; x += nChannels )
1802 1803 1804 1805 1806 1807
        {
            int v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            uchar t0 = saturate_cast<uchar>((m00*v0 + m01*v1 + m02*v2 + m03)>>BITS);
            uchar t1 = saturate_cast<uchar>((m10*v0 + m11*v1 + m12*v2 + m13)>>BITS);
            uchar t2 = saturate_cast<uchar>((m20*v0 + m21*v1 + m22*v2 + m23)>>BITS);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
1808 1809 1810
        }
        return;
    }
1811
#endif
1812

1813
    transform_(src, dst, m, len, scn, dcn);
1814 1815
}

1816 1817
static void
transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
1818
{
1819 1820
#if CV_SIMD128 && !defined(__aarch64__)
    if( hasSIMD128() && scn == 3 && dcn == 3 )
1821
    {
1822 1823 1824 1825
        const int nChannels = 3;
        const int cWidth = v_float32x4::nlanes;
        v_int16x8 delta = v_int16x8(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
        v_float32x4 m0, m1, m2, m3;
1826
        load3x3Matrix(m, m0, m1, m2, m3);
1827
        m3 -= v_float32x4(32768.f, 32768.f, 32768.f, 0.f);
1828

1829
        int x = 0;
1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875
        for( ; x <= (len - cWidth) * nChannels; x += cWidth * nChannels )
        {
            // load 4 pixels
            v_uint16x8 v0_16 = v_load(src + x);              // b0 g0 r0 b1 g1 r1 b2 g2
            v_uint16x8 v2_16 = v_load_low(src + x + cWidth * 2); // r2 b3 g3 r3 ?  ?  ?  ?

            // expand to 4 vectors
            v_uint32x4 v0_32, v1_32, v2_32, v3_32, dummy_32;
            v_expand(v_rotate_right<3>(v0_16), v1_32, dummy_32);         // b1 g1 r1
            v_expand(v_rotate_right<1>(v2_16), v3_32, dummy_32);         // b3 g3 r3
            v_expand(v_rotate_right<6>(v0_16, v2_16), v2_32, dummy_32); // b2 g2 r2
            v_expand(v0_16, v0_32, dummy_32);                            // b0 g0 r0

            // convert to float32x4
            v_float32x4 x0 = v_cvt_f32(v_reinterpret_as_s32(v0_32)); // b0 g0 r0
            v_float32x4 x1 = v_cvt_f32(v_reinterpret_as_s32(v1_32)); // b1 g1 r1
            v_float32x4 x2 = v_cvt_f32(v_reinterpret_as_s32(v2_32)); // b2 g2 r2
            v_float32x4 x3 = v_cvt_f32(v_reinterpret_as_s32(v3_32)); // b3 g3 r3

            // multiply and convert back to int32x4
            v_int32x4 y0, y1, y2, y3;
            y0 = v_round(v_matmuladd(x0, m0, m1, m2, m3)); // B0 G0 R0
            y1 = v_round(v_matmuladd(x1, m0, m1, m2, m3)); // B1 G1 R1
            y2 = v_round(v_matmuladd(x2, m0, m1, m2, m3)); // B2 G2 R2
            y3 = v_round(v_matmuladd(x3, m0, m1, m2, m3)); // B3 G3 R3

            // narrow down to int16x8
            v_int16x8 v0 = v_add_wrap(v_pack(v_rotate_left<1>(y0), y1), delta); // 0 B0 G0 R0 B1 G1 R1 0
            v_int16x8 v2 = v_add_wrap(v_pack(v_rotate_left<1>(y2), y3), delta); // 0 B2 G2 R2 B3 G3 R3 0

            // rotate and pack
            v0 = v_rotate_right<1>(v0) | v_rotate_left<5>(v2); // B0 G0 R0 B1 G1 R1 B2 G2
            v2 = v_rotate_right<3>(v2);                        // R2 B3 G3 R3 0  0  0  0

            // store 4 pixels
            v_store(dst + x, v_reinterpret_as_u16(v0));
            v_store_low(dst + x + cWidth * 2, v_reinterpret_as_u16(v2));
        }

        for( ; x < len * nChannels; x += nChannels )
        {
            float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
            ushort t0 = saturate_cast<ushort>(m[0] * v0 + m[1] * v1 + m[2] * v2 + m[3]);
            ushort t1 = saturate_cast<ushort>(m[4] * v0 + m[5] * v1 + m[6] * v2 + m[7]);
            ushort t2 = saturate_cast<ushort>(m[8] * v0 + m[9] * v1 + m[10] * v2 + m[11]);
            dst[x] = t0; dst[x + 1] = t1; dst[x + 2] = t2;
1876 1877 1878
        }
        return;
    }
1879
#endif
1880

1881
    transform_(src, dst, m, len, scn, dcn);
1882
}
1883

1884 1885
static void
transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
1886
{
1887 1888
#if CV_SIMD128 && !defined(__aarch64__)
    if( hasSIMD128() )
1889
    {
1890 1891
        int x = 0;
        if( scn == 3 && dcn == 3 )
1892
        {
1893 1894
            const int cWidth = 3;
            v_float32x4 m0, m1, m2, m3;
1895
            load3x3Matrix(m, m0, m1, m2, m3);
1896

1897
            for( ; x < (len - 1)*cWidth; x += cWidth )
1898
            {
1899 1900 1901 1902
                v_float32x4 x0 = v_load(src + x);
                v_float32x4 y0 = v_matmuladd(x0, m0, m1, m2, m3);
                v_store_low(dst + x, y0);
                dst[x + 2] = v_combine_high(y0, y0).get0();
1903 1904
            }

1905
            for( ; x < len*cWidth; x += cWidth )
1906
            {
1907 1908 1909 1910
                float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
                float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
                float t1 = saturate_cast<float>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
                float t2 = saturate_cast<float>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
1911 1912
                dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
            }
1913
            return;
1914
        }
1915

1916
        if( scn == 4 && dcn == 4 )
1917
        {
1918 1919 1920 1921 1922 1923
            const int cWidth = 4;
            v_float32x4 m0 = v_float32x4(m[0], m[5], m[10], m[15]);
            v_float32x4 m1 = v_float32x4(m[1], m[6], m[11], m[16]);
            v_float32x4 m2 = v_float32x4(m[2], m[7], m[12], m[17]);
            v_float32x4 m3 = v_float32x4(m[3], m[8], m[13], m[18]);
            v_float32x4 m4 = v_float32x4(m[4], m[9], m[14], m[19]);
1924

1925
            for( ; x < len*cWidth; x += cWidth )
1926
            {
1927 1928 1929
                v_float32x4 x0 = v_load(src + x);
                v_float32x4 y0 = v_matmul(x0, m0, m1, m2, m3) + m4;
                v_store(dst + x, y0);
1930
            }
1931
            return;
1932 1933
        }
    }
1934
#endif
1935

1936
    transform_(src, dst, m, len, scn, dcn);
1937 1938 1939
}


1940 1941 1942 1943 1944
static void
transform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
{
    transform_(src, dst, m, len, scn, dcn);
}
1945

1946 1947
static void
transform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
1948
{
1949 1950
    transform_(src, dst, m, len, scn, dcn);
}
1951

1952 1953 1954 1955 1956
static void
transform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn)
{
    transform_(src, dst, m, len, scn, dcn);
}
1957

1958 1959 1960 1961
static void
transform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
{
    transform_(src, dst, m, len, scn, dcn);
1962 1963
}

1964 1965 1966 1967
template<typename T, typename WT> static void
diagtransform_( const T* src, T* dst, const WT* m, int len, int cn, int )
{
    int x;
1968

1969
    if( cn == 2 )
1970
    {
1971
        for( x = 0; x < len*2; x += 2 )
1972 1973 1974 1975 1976 1977
        {
            T t0 = saturate_cast<T>(m[0]*src[x] + m[2]);
            T t1 = saturate_cast<T>(m[4]*src[x+1] + m[5]);
            dst[x] = t0; dst[x+1] = t1;
        }
    }
1978
    else if( cn == 3 )
1979
    {
1980
        for( x = 0; x < len*3; x += 3 )
1981 1982 1983 1984 1985 1986 1987
        {
            T t0 = saturate_cast<T>(m[0]*src[x] + m[3]);
            T t1 = saturate_cast<T>(m[5]*src[x+1] + m[7]);
            T t2 = saturate_cast<T>(m[10]*src[x+2] + m[11]);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
        }
    }
1988
    else if( cn == 4 )
1989
    {
1990
        for( x = 0; x < len*4; x += 4 )
1991 1992 1993 1994 1995 1996 1997 1998 1999
        {
            T t0 = saturate_cast<T>(m[0]*src[x] + m[4]);
            T t1 = saturate_cast<T>(m[6]*src[x+1] + m[9]);
            dst[x] = t0; dst[x+1] = t1;
            t0 = saturate_cast<T>(m[12]*src[x+2] + m[14]);
            t1 = saturate_cast<T>(m[18]*src[x+3] + m[19]);
            dst[x+2] = t0; dst[x+3] = t1;
        }
    }
2000 2001 2002 2003 2004 2005
    else
    {
        for( x = 0; x < len; x++, src += cn, dst += cn )
        {
            const WT* _m = m;
            for( int j = 0; j < cn; j++, _m += cn + 1 )
2006
                dst[j] = saturate_cast<T>(src[j]*_m[j] + _m[cn]);
2007 2008
        }
    }
2009 2010
}

2011 2012
static void
diagtransform_8u(const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn)
2013
{
2014
    diagtransform_(src, dst, m, len, scn, dcn);
2015 2016
}

2017 2018 2019 2020 2021
static void
diagtransform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
}
2022

2023 2024 2025 2026
static void
diagtransform_16u(const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
2027 2028
}

2029 2030 2031 2032 2033
static void
diagtransform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
}
2034

2035 2036 2037 2038 2039
static void
diagtransform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
}
2040

2041 2042 2043 2044
static void
diagtransform_32f(const float* src, float* dst, const float* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
2045 2046
}

2047 2048 2049 2050
static void
diagtransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
{
    diagtransform_(src, dst, m, len, scn, dcn);
2051 2052 2053
}


2054
typedef void (*TransformFunc)( const uchar* src, uchar* dst, const uchar* m, int, int, int );
2055

2056
static TransformFunc getTransformFunc(int depth)
2057
{
2058 2059 2060 2061 2062 2063 2064 2065 2066
    static TransformFunc transformTab[] =
    {
        (TransformFunc)transform_8u, (TransformFunc)transform_8s, (TransformFunc)transform_16u,
        (TransformFunc)transform_16s, (TransformFunc)transform_32s, (TransformFunc)transform_32f,
        (TransformFunc)transform_64f, 0
    };

    return transformTab[depth];
}
2067

2068
static TransformFunc getDiagTransformFunc(int depth)
2069
{
2070 2071 2072 2073 2074 2075 2076 2077 2078
    static TransformFunc diagTransformTab[] =
    {
        (TransformFunc)diagtransform_8u, (TransformFunc)diagtransform_8s, (TransformFunc)diagtransform_16u,
        (TransformFunc)diagtransform_16s, (TransformFunc)diagtransform_32s, (TransformFunc)diagtransform_32f,
        (TransformFunc)diagtransform_64f, 0
    };

    return diagTransformTab[depth];
}
2079

2080
}
2081

2082
void cv::transform( InputArray _src, OutputArray _dst, InputArray _mtx )
2083
{
2084 2085
    CV_INSTRUMENT_REGION()

2086 2087 2088 2089
    Mat src = _src.getMat(), m = _mtx.getMat();
    int depth = src.depth(), scn = src.channels(), dcn = m.rows;
    CV_Assert( scn == m.cols || scn + 1 == m.cols );
    bool isDiag = false;
2090

2091 2092
    _dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
    Mat dst = _dst.getMat();
2093 2094

    int mtype = depth == CV_32S || depth == CV_64F ? CV_64F : CV_32F;
2095
    AutoBuffer<double> _mbuf;
2096
    double* mbuf;
2097

2098
    if( !m.isContinuous() || m.type() != mtype || m.cols != scn + 1 )
2099
    {
2100
        _mbuf.allocate(dcn*(scn+1));
2101
        mbuf = _mbuf.data();
2102
        Mat tmp(dcn, scn+1, mtype, mbuf);
2103
        memset(tmp.ptr(), 0, tmp.total()*tmp.elemSize());
2104 2105 2106 2107 2108 2109 2110 2111
        if( m.cols == scn+1 )
            m.convertTo(tmp, mtype);
        else
        {
            Mat tmppart = tmp.colRange(0, m.cols);
            m.convertTo(tmppart, mtype);
        }
        m = tmp;
2112
    }
2113
    else
2114
        mbuf = m.ptr<double>();
2115 2116 2117 2118 2119 2120 2121 2122 2123 2124

    if( scn == dcn )
    {
        int i, j;
        double eps = mtype == CV_32F ? FLT_EPSILON : DBL_EPSILON;

        if( scn == 1 )
        {
            double alpha, beta;
            if( mtype == CV_32F )
2125
                alpha = m.at<float>(0), beta = m.at<float>(1);
2126
            else
2127 2128
                alpha = m.at<double>(0), beta = m.at<double>(1);
            src.convertTo(dst, dst.type(), alpha, beta);
2129 2130 2131 2132 2133 2134 2135
            return;
        }

        for( i = 0, isDiag = true; isDiag && i < scn; i++ )
        {
            for( j = 0; isDiag && j < scn; j++ )
            {
2136
                double v = mtype == CV_32F ? m.at<float>(i, j) : m.at<double>(i, j);
2137 2138 2139 2140 2141 2142
                if( i != j && fabs(v) > eps )
                    isDiag = false;
            }
        }
    }

2143
    TransformFunc func = isDiag ? getDiagTransformFunc(depth): getTransformFunc(depth);
2144
    CV_Assert( func != 0 );
2145

2146
    const Mat* arrays[] = {&src, &dst, 0};
2147
    uchar* ptrs[2]{};
2148 2149
    NAryMatIterator it(arrays, ptrs);
    size_t i, total = it.size;
2150

2151 2152
    for( i = 0; i < it.nplanes; i++, ++it )
        func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
2153 2154 2155 2156 2157 2158
}

/****************************************************************************************\
*                                  Perspective Transform                                 *
\****************************************************************************************/

2159
namespace cv
2160 2161
{

2162 2163 2164 2165 2166
template<typename T> static void
perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, int dcn )
{
    const double eps = FLT_EPSILON;
    int i;
2167

2168
    if( scn == 2 && dcn == 2 )
2169
    {
2170
        for( i = 0; i < len*2; i += 2 )
2171
        {
2172 2173
            T x = src[i], y = src[i + 1];
            double w = x*m[6] + y*m[7] + m[8];
2174

2175
            if( fabs(w) > eps )
2176 2177
            {
                w = 1./w;
2178 2179
                dst[i] = (T)((x*m[0] + y*m[1] + m[2])*w);
                dst[i+1] = (T)((x*m[3] + y*m[4] + m[5])*w);
2180 2181
            }
            else
2182
                dst[i] = dst[i+1] = (T)0;
2183 2184
        }
    }
2185
    else if( scn == 3 && dcn == 3 )
2186
    {
2187
        for( i = 0; i < len*3; i += 3 )
2188
        {
2189 2190
            T x = src[i], y = src[i + 1], z = src[i + 2];
            double w = x*m[12] + y*m[13] + z*m[14] + m[15];
2191

2192
            if( fabs(w) > eps )
2193 2194
            {
                w = 1./w;
2195 2196 2197
                dst[i] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3]) * w);
                dst[i+1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7]) * w);
                dst[i+2] = (T)((x*m[8] + y*m[9] + z*m[10] + m[11]) * w);
2198 2199
            }
            else
2200
                dst[i] = dst[i+1] = dst[i+2] = (T)0;
2201 2202
        }
    }
2203
    else if( scn == 3 && dcn == 2 )
2204
    {
2205
        for( i = 0; i < len; i++, src += 3, dst += 2 )
2206 2207
        {
            T x = src[0], y = src[1], z = src[2];
2208
            double w = x*m[8] + y*m[9] + z*m[10] + m[11];
2209

2210
            if( fabs(w) > eps )
2211 2212
            {
                w = 1./w;
2213 2214
                dst[0] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3])*w);
                dst[1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7])*w);
2215 2216 2217 2218 2219
            }
            else
                dst[0] = dst[1] = (T)0;
        }
    }
2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244
    else
    {
        for( i = 0; i < len; i++, src += scn, dst += dcn )
        {
            const double* _m = m + dcn*(scn + 1);
            double w = _m[scn];
            int j, k;
            for( k = 0; k < scn; k++ )
                w += _m[k]*src[k];
            if( fabs(w) > eps )
            {
                _m = m;
                for( j = 0; j < dcn; j++, _m += scn + 1 )
                {
                    double s = _m[scn];
                    for( k = 0; k < scn; k++ )
                        s += _m[k]*src[k];
                    dst[j] = (T)(s*w);
                }
            }
            else
                for( j = 0; j < dcn; j++ )
                    dst[j] = 0;
        }
    }
2245 2246
}

2247

2248 2249
static void
perspectiveTransform_32f(const float* src, float* dst, const double* m, int len, int scn, int dcn)
2250
{
2251 2252
    perspectiveTransform_(src, dst, m, len, scn, dcn);
}
2253

2254 2255 2256 2257 2258
static void
perspectiveTransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
{
    perspectiveTransform_(src, dst, m, len, scn, dcn);
}
2259

2260
}
2261

2262
void cv::perspectiveTransform( InputArray _src, OutputArray _dst, InputArray _mtx )
2263
{
2264 2265
    CV_INSTRUMENT_REGION()

2266 2267
    Mat src = _src.getMat(), m = _mtx.getMat();
    int depth = src.depth(), scn = src.channels(), dcn = m.rows-1;
2268 2269
    CV_Assert( scn + 1 == m.cols );
    CV_Assert( depth == CV_32F || depth == CV_64F );
2270

2271 2272
    _dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
    Mat dst = _dst.getMat();
2273

2274 2275
    const int mtype = CV_64F;
    AutoBuffer<double> _mbuf;
2276
    double* mbuf = m.ptr<double>();
2277

2278
    if( !m.isContinuous() || m.type() != mtype )
2279
    {
2280
        _mbuf.allocate((dcn+1)*(scn+1));
2281 2282
        mbuf = _mbuf.data();
        Mat tmp(dcn+1, scn+1, mtype, mbuf);
2283 2284
        m.convertTo(tmp, mtype);
        m = tmp;
2285
    }
2286

2287 2288 2289 2290
    TransformFunc func = depth == CV_32F ?
        (TransformFunc)perspectiveTransform_32f :
        (TransformFunc)perspectiveTransform_64f;
    CV_Assert( func != 0 );
2291

2292
    const Mat* arrays[] = {&src, &dst, 0};
2293
    uchar* ptrs[2]{};
2294 2295
    NAryMatIterator it(arrays, ptrs);
    size_t i, total = it.size;
2296

2297 2298
    for( i = 0; i < it.nplanes; i++, ++it )
        func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
2299
}
2300 2301 2302 2303 2304

/****************************************************************************************\
*                                       ScaleAdd                                         *
\****************************************************************************************/

2305
namespace cv
2306 2307
{

2308 2309 2310 2311 2312
static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
                         int len, float* _alpha)
{
    float alpha = *_alpha;
    int i = 0;
2313 2314
#if CV_SIMD128
    if (hasSIMD128())
2315
    {
2316 2317 2318
        v_float32x4 v_alpha = v_setall_f32(alpha);
        const int cWidth = v_float32x4::nlanes;
        for (; i <= len - cWidth; i += cWidth)
2319
        {
2320 2321 2322
            v_float32x4 v_src1 = v_load(src1 + i);
            v_float32x4 v_src2 = v_load(src2 + i);
            v_store(dst + i, (v_src1 * v_alpha) + v_src2);
2323 2324
        }
    }
2325
#endif
2326 2327
    for (; i < len; i++)
        dst[i] = src1[i] * alpha + src2[i];
2328
}
2329

2330

2331 2332 2333 2334 2335
static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
                         int len, double* _alpha)
{
    double alpha = *_alpha;
    int i = 0;
2336 2337
#if CV_SIMD128_64F
    if (hasSIMD128())
2338
    {
2339 2340 2341
        v_float64x2 a2 = v_setall_f64(alpha);
        const int cWidth = v_float64x2::nlanes;
        for (; i <= len - cWidth * 2; i += cWidth * 2)
2342
        {
2343 2344 2345 2346 2347 2348 2349
            v_float64x2 x0, x1, y0, y1, t0, t1;
            x0 = v_load(src1 + i); x1 = v_load(src1 + i + cWidth);
            y0 = v_load(src2 + i); y1 = v_load(src2 + i + cWidth);
            t0 = x0 * a2 + y0;
            t1 = x1 * a2 + y1;
            v_store(dst + i, t0);
            v_store(dst + i + cWidth, t1);
2350
        }
2351 2352
    }
#endif
2353 2354
    for (; i < len; i++)
        dst[i] = src1[i] * alpha + src2[i];
2355
}
2356

2357
typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha);
2358

2359 2360
#ifdef HAVE_OPENCL

2361 2362
static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type )
{
2363
    const ocl::Device & d = ocl::Device::getDefault();
2364

2365
    bool doubleSupport = d.doubleFPConfig() > 0;
2366
    Size size = _src1.size();
2367
    int depth = CV_MAT_DEPTH(type);
2368 2369 2370
    if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() )
        return false;

2371 2372 2373 2374 2375
    _dst.create(size, type);
    int cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F);
    int kercn = ocl::predictOptimalVectorWidthMax(_src1, _src2, _dst),
        rowsPerWI = d.isIntel() ? 4 : 1;

2376 2377
    char cvt[2][50];
    ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2378
                  format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s"
2379 2380
                         " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s"
                         " -D wdepth=%d%s -D rowsPerWI=%d",
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2381 2382 2383 2384 2385
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
                         ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
                         ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
                         ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]),
                         ocl::typeToStr(wdepth), wdepth,
2386
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI));
2387 2388 2389
    if (k.empty())
        return false;

2390
    UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat();
2391 2392 2393

    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
Ilya Lavrenov's avatar
Ilya Lavrenov committed
2394
            dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
2395 2396 2397 2398 2399 2400

    if (wdepth == CV_32F)
        k.args(src1arg, src2arg, dstarg, (float)alpha);
    else
        k.args(src1arg, src2arg, dstarg, alpha);

2401
    size_t globalsize[2] = { (size_t)dst.cols * cn / kercn, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
2402 2403 2404
    return k.run(2, globalsize, NULL, false);
}

2405 2406
#endif

2407
}
2408

2409
void cv::scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst )
2410
{
2411 2412
    CV_INSTRUMENT_REGION()

2413 2414 2415
    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    CV_Assert( type == _src2.type() );

2416
    CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat(),
2417
            ocl_scaleAdd(_src1, alpha, _src2, _dst, type))
2418

2419 2420 2421 2422
    if( depth < CV_32F )
    {
        addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth);
        return;
2423
    }
2424

2425 2426 2427
    Mat src1 = _src1.getMat(), src2 = _src2.getMat();
    CV_Assert(src1.size == src2.size);

Ilya Lavrenov's avatar
Ilya Lavrenov committed
2428
    _dst.create(src1.dims, src1.size, type);
2429
    Mat dst = _dst.getMat();
2430

2431 2432
    float falpha = (float)alpha;
    void* palpha = depth == CV_32F ? (void*)&falpha : (void*)&alpha;
2433

Andrey Kamaev's avatar
Andrey Kamaev committed
2434
    ScaleAddFunc func = depth == CV_32F ? (ScaleAddFunc)scaleAdd_32f : (ScaleAddFunc)scaleAdd_64f;
2435

Ilya Lavrenov's avatar
Ilya Lavrenov committed
2436
    if (src1.isContinuous() && src2.isContinuous() && dst.isContinuous())
2437 2438
    {
        size_t len = src1.total()*cn;
2439
        func(src1.ptr(), src2.ptr(), dst.ptr(), (int)len, palpha);
2440 2441
        return;
    }
2442

2443
    const Mat* arrays[] = {&src1, &src2, &dst, 0};
2444
    uchar* ptrs[3]{};
2445 2446
    NAryMatIterator it(arrays, ptrs);
    size_t i, len = it.size*cn;
2447

2448 2449
    for( i = 0; i < it.nplanes; i++, ++it )
        func( ptrs[0], ptrs[1], ptrs[2], (int)len, palpha );
2450 2451 2452 2453 2454 2455
}

/****************************************************************************************\
*                                 Covariation Matrix                                     *
\****************************************************************************************/

2456
void cv::calcCovarMatrix( const Mat* data, int nsamples, Mat& covar, Mat& _mean, int flags, int ctype )
2457
{
2458 2459
    CV_INSTRUMENT_REGION()

2460
    CV_Assert_N( data, nsamples > 0 );
2461
    Size size = data[0].size();
2462
    int sz = size.width * size.height, esz = (int)data[0].elemSize();
2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479
    int type = data[0].type();
    Mat mean;
    ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F);

    if( (flags & CV_COVAR_USE_AVG) != 0 )
    {
        CV_Assert( _mean.size() == size );
        if( _mean.isContinuous() && _mean.type() == ctype )
            mean = _mean.reshape(1, 1);
        else
        {
            _mean.convertTo(mean, ctype);
            mean = mean.reshape(1, 1);
        }
    }

    Mat _data(nsamples, sz, type);
2480

2481 2482
    for( int i = 0; i < nsamples; i++ )
    {
2483
        CV_Assert_N( data[i].size() == size, data[i].type() == type );
2484
        if( data[i].isContinuous() )
2485
            memcpy( _data.ptr(i), data[i].ptr(), sz*esz );
2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497
        else
        {
            Mat dataRow(size.height, size.width, type, _data.ptr(i));
            data[i].copyTo(dataRow);
        }
    }

    calcCovarMatrix( _data, covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype );
    if( (flags & CV_COVAR_USE_AVG) == 0 )
        _mean = mean.reshape(1, size.height);
}

Andrey Kamaev's avatar
Andrey Kamaev committed
2498
void cv::calcCovarMatrix( InputArray _src, OutputArray _covar, InputOutputArray _mean, int flags, int ctype )
2499
{
2500 2501
    CV_INSTRUMENT_REGION()

2502
    if(_src.kind() == _InputArray::STD_VECTOR_MAT || _src.kind() == _InputArray::STD_ARRAY_MAT)
2503 2504
    {
        std::vector<cv::Mat> src;
Andrey Kamaev's avatar
Andrey Kamaev committed
2505
        _src.getMatVector(src);
2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516

        CV_Assert( src.size() > 0 );

        Size size = src[0].size();
        int type = src[0].type();

        ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F);

        Mat _data(static_cast<int>(src.size()), size.area(), type);

        int i = 0;
2517
        for(std::vector<cv::Mat>::iterator each = src.begin(); each != src.end(); ++each, ++i )
2518
        {
2519
            CV_Assert_N( (*each).size() == size, (*each).type() == type );
2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550
            Mat dataRow(size.height, size.width, type, _data.ptr(i));
            (*each).copyTo(dataRow);
        }

        Mat mean;
        if( (flags & CV_COVAR_USE_AVG) != 0 )
        {
            CV_Assert( _mean.size() == size );

            if( mean.type() != ctype )
            {
                mean = _mean.getMat();
                _mean.create(mean.size(), ctype);
                Mat tmp = _mean.getMat();
                mean.convertTo(tmp, ctype);
                mean = tmp;
            }

            mean = _mean.getMat().reshape(1, 1);
        }

        calcCovarMatrix( _data, _covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype );

        if( (flags & CV_COVAR_USE_AVG) == 0 )
        {
            mean = mean.reshape(1, size.height);
            mean.copyTo(_mean);
        }
        return;
    }

Andrey Kamaev's avatar
Andrey Kamaev committed
2551
    Mat data = _src.getMat(), mean;
2552 2553 2554 2555 2556 2557 2558 2559 2560
    CV_Assert( ((flags & CV_COVAR_ROWS) != 0) ^ ((flags & CV_COVAR_COLS) != 0) );
    bool takeRows = (flags & CV_COVAR_ROWS) != 0;
    int type = data.type();
    int nsamples = takeRows ? data.rows : data.cols;
    CV_Assert( nsamples > 0 );
    Size size = takeRows ? Size(data.cols, 1) : Size(1, data.rows);

    if( (flags & CV_COVAR_USE_AVG) != 0 )
    {
2561 2562
        mean = _mean.getMat();
        ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), mean.depth()), CV_32F);
2563 2564
        CV_Assert( mean.size() == size );
        if( mean.type() != ctype )
2565 2566 2567 2568 2569 2570
        {
            _mean.create(mean.size(), ctype);
            Mat tmp = _mean.getMat();
            mean.convertTo(tmp, ctype);
            mean = tmp;
        }
2571 2572 2573
    }
    else
    {
2574
        ctype = std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), CV_32F);
Andrey Kamaev's avatar
Andrey Kamaev committed
2575
        reduce( _src, _mean, takeRows ? 0 : 1, CV_REDUCE_AVG, ctype );
2576
        mean = _mean.getMat();
2577 2578
    }

2579
    mulTransposed( data, _covar, ((flags & CV_COVAR_NORMAL) == 0) ^ takeRows,
2580 2581 2582 2583 2584 2585 2586
        mean, (flags & CV_COVAR_SCALE) != 0 ? 1./nsamples : 1, ctype );
}

/****************************************************************************************\
*                                        Mahalanobis                                     *
\****************************************************************************************/

2587
double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar )
2588
{
2589 2590
    CV_INSTRUMENT_REGION()

Andrey Kamaev's avatar
Andrey Kamaev committed
2591
    Mat v1 = _v1.getMat(), v2 = _v2.getMat(), icovar = _icovar.getMat();
2592 2593 2594
    int type = v1.type(), depth = v1.depth();
    Size sz = v1.size();
    int i, j, len = sz.width*sz.height*v1.channels();
2595
    AutoBuffer<double> buf(len);
2596 2597
    double result = 0;

2598
    CV_Assert_N( type == v2.type(), type == icovar.type(),
2599
        sz == v2.size(), len == icovar.rows && len == icovar.cols );
2600

2601
    sz.width *= v1.channels();
2602 2603 2604 2605 2606 2607 2608 2609
    if( v1.isContinuous() && v2.isContinuous() )
    {
        sz.width *= sz.height;
        sz.height = 1;
    }

    if( depth == CV_32F )
    {
2610 2611
        const float* src1 = v1.ptr<float>();
        const float* src2 = v2.ptr<float>();
2612 2613
        size_t step1 = v1.step/sizeof(src1[0]);
        size_t step2 = v2.step/sizeof(src2[0]);
2614
        double* diff = buf.data();
2615
        const float* mat = icovar.ptr<float>();
2616 2617 2618 2619 2620 2621 2622 2623
        size_t matstep = icovar.step/sizeof(mat[0]);

        for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width )
        {
            for( i = 0; i < sz.width; i++ )
                diff[i] = src1[i] - src2[i];
        }

2624
        diff = buf.data();
2625 2626 2627
        for( i = 0; i < len; i++, mat += matstep )
        {
            double row_sum = 0;
Victoria Zhislina's avatar
Victoria Zhislina committed
2628
            j = 0;
Andrey Kamaev's avatar
Andrey Kamaev committed
2629
             #if CV_ENABLE_UNROLLED
Victoria Zhislina's avatar
Victoria Zhislina committed
2630
            for(; j <= len - 4; j += 4 )
2631 2632
                row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                           diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
Victoria Zhislina's avatar
Victoria Zhislina committed
2633
            #endif
2634 2635 2636 2637 2638 2639 2640
            for( ; j < len; j++ )
                row_sum += diff[j]*mat[j];
            result += row_sum * diff[i];
        }
    }
    else if( depth == CV_64F )
    {
2641 2642
        const double* src1 = v1.ptr<double>();
        const double* src2 = v2.ptr<double>();
2643 2644
        size_t step1 = v1.step/sizeof(src1[0]);
        size_t step2 = v2.step/sizeof(src2[0]);
2645
        double* diff = buf.data();
2646
        const double* mat = icovar.ptr<double>();
2647 2648 2649 2650 2651 2652 2653 2654
        size_t matstep = icovar.step/sizeof(mat[0]);

        for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width )
        {
            for( i = 0; i < sz.width; i++ )
                diff[i] = src1[i] - src2[i];
        }

2655
        diff = buf.data();
2656 2657 2658
        for( i = 0; i < len; i++, mat += matstep )
        {
            double row_sum = 0;
Victoria Zhislina's avatar
Victoria Zhislina committed
2659
            j = 0;
Andrey Kamaev's avatar
Andrey Kamaev committed
2660
             #if CV_ENABLE_UNROLLED
Victoria Zhislina's avatar
Victoria Zhislina committed
2661
            for(; j <= len - 4; j += 4 )
2662 2663
                row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                           diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
Victoria Zhislina's avatar
Victoria Zhislina committed
2664
            #endif
2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676
            for( ; j < len; j++ )
                row_sum += diff[j]*mat[j];
            result += row_sum * diff[i];
        }
    }
    else
        CV_Error( CV_StsUnsupportedFormat, "" );

    return std::sqrt(result);
}

/****************************************************************************************\
2677
*                                        MulTransposed                                   *
2678 2679
\****************************************************************************************/

2680 2681 2682
namespace cv
{

2683 2684 2685 2686
template<typename sT, typename dT> static void
MulTransposedR( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale )
{
    int i, j, k;
2687 2688 2689
    const sT* src = srcmat.ptr<sT>();
    dT* dst = dstmat.ptr<dT>();
    const dT* delta = deltamat.ptr<dT>();
2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706
    size_t srcstep = srcmat.step/sizeof(src[0]);
    size_t dststep = dstmat.step/sizeof(dst[0]);
    size_t deltastep = deltamat.rows > 1 ? deltamat.step/sizeof(delta[0]) : 0;
    int delta_cols = deltamat.cols;
    Size size = srcmat.size();
    dT* tdst = dst;
    dT* col_buf = 0;
    dT* delta_buf = 0;
    int buf_size = size.height*sizeof(dT);
    AutoBuffer<uchar> buf;

    if( delta && delta_cols < size.width )
    {
        assert( delta_cols == 1 );
        buf_size *= 5;
    }
    buf.allocate(buf_size);
2707
    col_buf = (dT*)buf.data();
2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750

    if( delta && delta_cols < size.width )
    {
        delta_buf = col_buf + size.height;
        for( i = 0; i < size.height; i++ )
            delta_buf[i*4] = delta_buf[i*4+1] =
                delta_buf[i*4+2] = delta_buf[i*4+3] = delta[i*deltastep];
        delta = delta_buf;
        deltastep = deltastep ? 4 : 0;
    }

    if( !delta )
        for( i = 0; i < size.width; i++, tdst += dststep )
        {
            for( k = 0; k < size.height; k++ )
                col_buf[k] = src[k*srcstep+i];

            for( j = i; j <= size.width - 4; j += 4 )
            {
                double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
                const sT *tsrc = src + j;

                for( k = 0; k < size.height; k++, tsrc += srcstep )
                {
                    double a = col_buf[k];
                    s0 += a * tsrc[0];
                    s1 += a * tsrc[1];
                    s2 += a * tsrc[2];
                    s3 += a * tsrc[3];
                }

                tdst[j] = (dT)(s0*scale);
                tdst[j+1] = (dT)(s1*scale);
                tdst[j+2] = (dT)(s2*scale);
                tdst[j+3] = (dT)(s3*scale);
            }

            for( ; j < size.width; j++ )
            {
                double s0 = 0;
                const sT *tsrc = src + j;

                for( k = 0; k < size.height; k++, tsrc += srcstep )
2751
                    s0 += (double)col_buf[k] * tsrc[0];
2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793

                tdst[j] = (dT)(s0*scale);
            }
        }
    else
        for( i = 0; i < size.width; i++, tdst += dststep )
        {
            if( !delta_buf )
                for( k = 0; k < size.height; k++ )
                    col_buf[k] = src[k*srcstep+i] - delta[k*deltastep+i];
            else
                for( k = 0; k < size.height; k++ )
                    col_buf[k] = src[k*srcstep+i] - delta_buf[k*deltastep];

            for( j = i; j <= size.width - 4; j += 4 )
            {
                double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
                const sT *tsrc = src + j;
                const dT *d = delta_buf ? delta_buf : delta + j;

                for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep )
                {
                    double a = col_buf[k];
                    s0 += a * (tsrc[0] - d[0]);
                    s1 += a * (tsrc[1] - d[1]);
                    s2 += a * (tsrc[2] - d[2]);
                    s3 += a * (tsrc[3] - d[3]);
                }

                tdst[j] = (dT)(s0*scale);
                tdst[j+1] = (dT)(s1*scale);
                tdst[j+2] = (dT)(s2*scale);
                tdst[j+3] = (dT)(s3*scale);
            }

            for( ; j < size.width; j++ )
            {
                double s0 = 0;
                const sT *tsrc = src + j;
                const dT *d = delta_buf ? delta_buf : delta + j;

                for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep )
2794
                    s0 += (double)col_buf[k] * (tsrc[0] - d[0]);
2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805

                tdst[j] = (dT)(s0*scale);
            }
        }
}


template<typename sT, typename dT> static void
MulTransposedL( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale )
{
    int i, j, k;
2806 2807 2808
    const sT* src = srcmat.ptr<sT>();
    dT* dst = dstmat.ptr<dT>();
    const dT* delta = deltamat.ptr<dT>();
2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824
    size_t srcstep = srcmat.step/sizeof(src[0]);
    size_t dststep = dstmat.step/sizeof(dst[0]);
    size_t deltastep = deltamat.rows > 1 ? deltamat.step/sizeof(delta[0]) : 0;
    int delta_cols = deltamat.cols;
    Size size = srcmat.size();
    dT* tdst = dst;

    if( !delta )
        for( i = 0; i < size.height; i++, tdst += dststep )
            for( j = i; j < size.height; j++ )
            {
                double s = 0;
                const sT *tsrc1 = src + i*srcstep;
                const sT *tsrc2 = src + j*srcstep;

                for( k = 0; k <= size.width - 4; k += 4 )
2825 2826
                    s += (double)tsrc1[k]*tsrc2[k] + (double)tsrc1[k+1]*tsrc2[k+1] +
                         (double)tsrc1[k+2]*tsrc2[k+2] + (double)tsrc1[k+3]*tsrc2[k+3];
2827
                for( ; k < size.width; k++ )
2828
                    s += (double)tsrc1[k] * tsrc2[k];
2829 2830 2831 2832 2833 2834 2835
                tdst[j] = (dT)(s*scale);
            }
    else
    {
        dT delta_buf[4];
        int delta_shift = delta_cols == size.width ? 4 : 0;
        AutoBuffer<uchar> buf(size.width*sizeof(dT));
2836
        dT* row_buf = (dT*)buf.data();
2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861

        for( i = 0; i < size.height; i++, tdst += dststep )
        {
            const sT *tsrc1 = src + i*srcstep;
            const dT *tdelta1 = delta + i*deltastep;

            if( delta_cols < size.width )
                for( k = 0; k < size.width; k++ )
                    row_buf[k] = tsrc1[k] - tdelta1[0];
            else
                for( k = 0; k < size.width; k++ )
                    row_buf[k] = tsrc1[k] - tdelta1[k];

            for( j = i; j < size.height; j++ )
            {
                double s = 0;
                const sT *tsrc2 = src + j*srcstep;
                const dT *tdelta2 = delta + j*deltastep;
                if( delta_cols < size.width )
                {
                    delta_buf[0] = delta_buf[1] =
                        delta_buf[2] = delta_buf[3] = tdelta2[0];
                    tdelta2 = delta_buf;
                }
                for( k = 0; k <= size.width-4; k += 4, tdelta2 += delta_shift )
2862 2863 2864 2865
                    s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]) +
                         (double)row_buf[k+1]*(tsrc2[k+1] - tdelta2[1]) +
                         (double)row_buf[k+2]*(tsrc2[k+2] - tdelta2[2]) +
                         (double)row_buf[k+3]*(tsrc2[k+3] - tdelta2[3]);
2866
                for( ; k < size.width; k++, tdelta2++ )
2867
                    s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]);
2868 2869 2870 2871 2872 2873 2874 2875
                tdst[j] = (dT)(s*scale);
            }
        }
    }
}

typedef void (*MulTransposedFunc)(const Mat& src, Mat& dst, const Mat& delta, double scale);

2876
}
2877

2878 2879
void cv::mulTransposed( InputArray _src, OutputArray _dst, bool ata,
                        InputArray _delta, double scale, int dtype )
2880
{
2881 2882
    CV_INSTRUMENT_REGION()

2883
    Mat src = _src.getMat(), delta = _delta.getMat();
2884 2885 2886 2887 2888
    const int gemm_level = 100; // boundary above which GEMM is faster.
    int stype = src.type();
    dtype = std::max(std::max(CV_MAT_DEPTH(dtype >= 0 ? dtype : stype), delta.depth()), CV_32F);
    CV_Assert( src.channels() == 1 );

2889
    if( !delta.empty() )
2890
    {
2891
        CV_Assert_N( delta.channels() == 1,
2892
            (delta.rows == src.rows || delta.rows == 1),
2893 2894
            (delta.cols == src.cols || delta.cols == 1));
        if( delta.type() != dtype )
2895
            delta.convertTo(delta, dtype);
2896 2897 2898
    }

    int dsize = ata ? src.cols : src.rows;
2899 2900
    _dst.create( dsize, dsize, dtype );
    Mat dst = _dst.getMat();
2901 2902 2903 2904 2905 2906 2907

    if( src.data == dst.data || (stype == dtype &&
        (dst.cols >= gemm_level && dst.rows >= gemm_level &&
         src.cols >= gemm_level && src.rows >= gemm_level)))
    {
        Mat src2;
        const Mat* tsrc = &src;
2908
        if( !delta.empty() )
2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998
        {
            if( delta.size() == src.size() )
                subtract( src, delta, src2 );
            else
            {
                repeat(delta, src.rows/delta.rows, src.cols/delta.cols, src2);
                subtract( src, src2, src2 );
            }
            tsrc = &src2;
        }
        gemm( *tsrc, *tsrc, scale, Mat(), 0, dst, ata ? GEMM_1_T : GEMM_2_T );
    }
    else
    {
        MulTransposedFunc func = 0;
        if(stype == CV_8U && dtype == CV_32F)
        {
            if(ata)
                func = MulTransposedR<uchar,float>;
            else
                func = MulTransposedL<uchar,float>;
        }
        else if(stype == CV_8U && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<uchar,double>;
            else
                func = MulTransposedL<uchar,double>;
        }
        else if(stype == CV_16U && dtype == CV_32F)
        {
            if(ata)
                func = MulTransposedR<ushort,float>;
            else
                func = MulTransposedL<ushort,float>;
        }
        else if(stype == CV_16U && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<ushort,double>;
            else
                func = MulTransposedL<ushort,double>;
        }
        else if(stype == CV_16S && dtype == CV_32F)
        {
            if(ata)
                func = MulTransposedR<short,float>;
            else
                func = MulTransposedL<short,float>;
        }
        else if(stype == CV_16S && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<short,double>;
            else
                func = MulTransposedL<short,double>;
        }
        else if(stype == CV_32F && dtype == CV_32F)
        {
            if(ata)
                func = MulTransposedR<float,float>;
            else
                func = MulTransposedL<float,float>;
        }
        else if(stype == CV_32F && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<float,double>;
            else
                func = MulTransposedL<float,double>;
        }
        else if(stype == CV_64F && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<double,double>;
            else
                func = MulTransposedL<double,double>;
        }
        if( !func )
            CV_Error( CV_StsUnsupportedFormat, "" );

        func( src, dst, delta, scale );
        completeSymm( dst, false );
    }
}

/****************************************************************************************\
*                                      Dot Product                                       *
\****************************************************************************************/

2999
namespace cv
3000 3001
{

3002 3003
template<typename T> double
dotProd_(const T* src1, const T* src2, int len)
3004
{
3005 3006
    int i = 0;
    double result = 0;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3007 3008

    #if CV_ENABLE_UNROLLED
3009 3010 3011
    for( ; i <= len - 4; i += 4 )
        result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] +
            (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3];
Victoria Zhislina's avatar
Victoria Zhislina committed
3012
    #endif
3013 3014
    for( ; i < len; i++ )
        result += (double)src1[i]*src2[i];
3015

3016 3017
    return result;
}
3018 3019


3020
static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
3021
{
3022
    double r = 0;
3023
#if ARITHM_USE_IPP
3024
    CV_IPP_RUN(IPP_VERSION_X100 > 201800 || cv::ipp::getIppTopFeatures() != ippCPUID_SSE42, CV_INSTRUMENT_FUN_IPP(ippiDotProd_8u64f_C1R, src1, len*sizeof(uchar), src2, len*sizeof(uchar), ippiSize(len, 1), &r) >= 0, r);
3025
#endif
3026
    int i = 0;
3027

3028 3029
#if CV_SIMD128
    if (hasSIMD128())
3030
    {
3031
        int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3032

3033
        while (i < len0)
3034
        {
3035
            blockSize = std::min(len0 - i, blockSize0);
3036 3037 3038 3039 3040
            v_int32x4 v_sum = v_setzero_s32();
            const int cWidth = v_uint16x8::nlanes;

            int j = 0;
            for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
3041
            {
3042 3043 3044 3045 3046 3047
                v_uint16x8 v_src10, v_src20, v_src11, v_src21;
                v_expand(v_load(src1 + j), v_src10, v_src11);
                v_expand(v_load(src2 + j), v_src20, v_src21);

                v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
                v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
3048
            }
3049

3050
            for (; j <= blockSize - cWidth; j += cWidth)
3051
            {
3052 3053
                v_int16x8 v_src10 = v_reinterpret_as_s16(v_load_expand(src1 + j));
                v_int16x8 v_src20 = v_reinterpret_as_s16(v_load_expand(src2 + j));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3054

3055 3056 3057
                v_sum += v_dotprod(v_src10, v_src20);
            }
            r += (double)v_reduce_sum(v_sum);
3058

3059 3060 3061 3062
            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
        }
3063
    }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3064
#elif CV_NEON
3065
    if( cv::checkHardwareSupport(CV_CPU_NEON) )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3066
    {
3067 3068 3069
        int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
        uint32x4_t v_zero = vdupq_n_u32(0u);
        CV_DECL_ALIGNED(16) uint buf[4];
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3070

3071
        while( i < len0 )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3072
        {
3073 3074
            blockSize = std::min(len0 - i, blockSize0);
            uint32x4_t v_sum = v_zero;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3075

3076 3077 3078 3079
            int j = 0;
            for( ; j <= blockSize - 16; j += 16 )
            {
                uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3080

3081 3082 3083
                uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2));
                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3084

3085 3086 3087 3088 3089
                v_src10 = vmovl_u8(vget_high_u8(v_src1));
                v_src20 = vmovl_u8(vget_high_u8(v_src2));
                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
            }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3090

3091 3092 3093 3094 3095 3096
            for( ; j <= blockSize - 8; j += 8 )
            {
                uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j));
                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2));
                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2));
            }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3097

3098 3099 3100 3101 3102 3103 3104
            vst1q_u32(buf, v_sum);
            r += buf[0] + buf[1] + buf[2] + buf[3];

            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
        }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3105
    }
3106 3107 3108
#endif
    return r + dotProd_(src1, src2, len - i);
}
3109 3110


3111
static double dotProd_8s(const schar* src1, const schar* src2, int len)
3112
{
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3113
    double r = 0.0;
3114
    int i = 0;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3115

3116 3117
#if CV_SIMD128
    if (hasSIMD128())
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3118
    {
3119
        int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3120

3121
        while (i < len0)
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3122 3123
        {
            blockSize = std::min(len0 - i, blockSize0);
3124 3125 3126 3127 3128
            v_int32x4 v_sum = v_setzero_s32();
            const int cWidth = v_int16x8::nlanes;

            int j = 0;
            for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3129
            {
3130 3131 3132 3133 3134 3135
                v_int16x8 v_src10, v_src20, v_src11, v_src21;
                v_expand(v_load(src1 + j), v_src10, v_src11);
                v_expand(v_load(src2 + j), v_src20, v_src21);

                v_sum += v_dotprod(v_src10, v_src20);
                v_sum += v_dotprod(v_src11, v_src21);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3136 3137
            }

3138
            for (; j <= blockSize - cWidth; j += cWidth)
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3139
            {
3140 3141
                v_int16x8 v_src10 = v_load_expand(src1 + j);
                v_int16x8 v_src20 = v_load_expand(src2 + j);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3142

3143 3144 3145
                v_sum += v_dotprod(v_src10, v_src20);
            }
            r += (double)v_reduce_sum(v_sum);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3146 3147 3148 3149 3150 3151 3152

            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
        }
    }
#elif CV_NEON
3153
    if( cv::checkHardwareSupport(CV_CPU_NEON) )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3154
    {
3155 3156 3157
        int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
        int32x4_t v_zero = vdupq_n_s32(0);
        CV_DECL_ALIGNED(16) int buf[4];
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3158

3159
        while( i < len0 )
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3160
        {
3161 3162
            blockSize = std::min(len0 - i, blockSize0);
            int32x4_t v_sum = v_zero;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3163

3164 3165 3166 3167
            int j = 0;
            for( ; j <= blockSize - 16; j += 16 )
            {
                int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3168

3169 3170 3171
                int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2));
                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3172

3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184
                v_src10 = vmovl_s8(vget_high_s8(v_src1));
                v_src20 = vmovl_s8(vget_high_s8(v_src2));
                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
            }

            for( ; j <= blockSize - 8; j += 8 )
            {
                int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j));
                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2));
                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2));
            }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3185

3186 3187
            vst1q_s32(buf, v_sum);
            r += buf[0] + buf[1] + buf[2] + buf[3];
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3188

3189 3190 3191 3192
            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
        }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3193 3194 3195 3196
    }
#endif

    return r + dotProd_(src1, src2, len - i);
3197
}
3198

3199
static double dotProd_16u(const ushort* src1, const ushort* src2, int len)
3200
{
3201 3202 3203
#if ARITHM_USE_IPP
    double r = 0;
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_16u64f_C1R, src1, len*sizeof(ushort), src2, len*sizeof(ushort), ippiSize(len, 1), &r) >= 0, r);
3204 3205
#endif
    return dotProd_(src1, src2, len);
3206
}
3207

3208
static double dotProd_16s(const short* src1, const short* src2, int len)
3209
{
3210 3211 3212
#if ARITHM_USE_IPP && (IPP_VERSION_X100 != 900) // bug in IPP 9.0.0
    double r = 0;
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_16s64f_C1R, src1, len*sizeof(short), src2, len*sizeof(short), ippiSize(len, 1), &r) >= 0, r);
3213 3214
#endif
    return dotProd_(src1, src2, len);
3215
}
3216

3217 3218
static double dotProd_32s(const int* src1, const int* src2, int len)
{
3219 3220 3221
#if ARITHM_USE_IPP
    double r = 0;
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_32s64f_C1R, src1, len*sizeof(int), src2, len*sizeof(int), ippiSize(len, 1), &r) >= 0, r);
3222 3223
#endif
    return dotProd_(src1, src2, len);
3224
}
3225

3226 3227
static double dotProd_32f(const float* src1, const float* src2, int len)
{
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3228
    double r = 0.0;
3229 3230 3231 3232

#if ARITHM_USE_IPP
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_32f64f_C1R, src1, len*sizeof(float), src2, len*sizeof(float), ippiSize(len, 1), &r, ippAlgHintFast) >= 0, r);
#endif
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3233 3234
    int i = 0;

3235 3236
#if CV_SIMD128
    if (hasSIMD128())
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3237
    {
3238
        int len0 = len & -4, blockSize0 = (1 << 13), blockSize;
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3239

3240 3241 3242 3243 3244 3245 3246 3247 3248
        while (i < len0)
        {
            blockSize = std::min(len0 - i, blockSize0);
            v_float32x4 v_sum = v_setzero_f32();

            int j = 0;
            int cWidth = v_float32x4::nlanes;
            for (; j <= blockSize - cWidth; j += cWidth)
                v_sum = v_muladd(v_load(src1 + j), v_load(src2 + j), v_sum);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3249

3250
            r += v_reduce_sum(v_sum);
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3251

3252 3253 3254 3255
            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
        }
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3256
    }
3257
#endif
Ilya Lavrenov's avatar
Ilya Lavrenov committed
3258
    return r + dotProd_(src1, src2, len - i);
3259
}
3260

3261 3262
static double dotProd_64f(const double* src1, const double* src2, int len)
{
3263 3264 3265
#if ARITHM_USE_IPP
    double r = 0;
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsDotProd_64f, src1, src2, len, &r) >= 0, r);
3266
#endif
3267

3268
    return dotProd_(src1, src2, len);
3269 3270 3271
}


3272
typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len);
3273

3274
static DotProdFunc getDotProdFunc(int depth)
3275
{
3276 3277 3278 3279 3280 3281 3282 3283 3284 3285
    static DotProdFunc dotProdTab[] =
    {
        (DotProdFunc)GET_OPTIMIZED(dotProd_8u), (DotProdFunc)GET_OPTIMIZED(dotProd_8s),
        (DotProdFunc)dotProd_16u, (DotProdFunc)dotProd_16s,
        (DotProdFunc)dotProd_32s, (DotProdFunc)GET_OPTIMIZED(dotProd_32f),
        (DotProdFunc)dotProd_64f, 0
    };

    return dotProdTab[depth];
}
3286

3287
double Mat::dot(InputArray _mat) const
3288
{
3289 3290
    CV_INSTRUMENT_REGION()

3291 3292
    Mat mat = _mat.getMat();
    int cn = channels();
3293
    DotProdFunc func = getDotProdFunc(depth());
3294
    CV_Assert_N( mat.type() == type(), mat.size == size, func != 0 );
3295

3296
    if( isContinuous() && mat.isContinuous() )
3297
    {
3298 3299
        size_t len = total()*cn;
        if( len == (size_t)(int)len )
3300
            return func(data, mat.data, (int)len);
3301
    }
3302

3303
    const Mat* arrays[] = {this, &mat, 0};
3304
    uchar* ptrs[2]{};
3305 3306 3307
    NAryMatIterator it(arrays, ptrs);
    int len = (int)(it.size*cn);
    double r = 0;
3308

3309 3310
    for( size_t i = 0; i < it.nplanes; i++, ++it )
        r += func( ptrs[0], ptrs[1], len );
3311

3312
    return r;
3313 3314
}

3315 3316
}

3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329
/****************************************************************************************\
*                                    Earlier API                                         *
\****************************************************************************************/

CV_IMPL void cvGEMM( const CvArr* Aarr, const CvArr* Barr, double alpha,
                     const CvArr* Carr, double beta, CvArr* Darr, int flags )
{
    cv::Mat A = cv::cvarrToMat(Aarr), B = cv::cvarrToMat(Barr);
    cv::Mat C, D = cv::cvarrToMat(Darr);

    if( Carr )
        C = cv::cvarrToMat(Carr);

3330
    CV_Assert_N( (D.rows == ((flags & CV_GEMM_A_T) == 0 ? A.rows : A.cols)),
3331
               (D.cols == ((flags & CV_GEMM_B_T) == 0 ? B.cols : B.rows)),
3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352
               D.type() == A.type() );

    gemm( A, B, alpha, C, beta, D, flags );
}


CV_IMPL void
cvTransform( const CvArr* srcarr, CvArr* dstarr,
             const CvMat* transmat, const CvMat* shiftvec )
{
    cv::Mat m = cv::cvarrToMat(transmat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    if( shiftvec )
    {
        cv::Mat v = cv::cvarrToMat(shiftvec).reshape(1,m.rows),
            _m(m.rows, m.cols + 1, m.type()), m1 = _m.colRange(0,m.cols), v1 = _m.col(m.cols);
        m.convertTo(m1, m1.type());
        v.convertTo(v1, v1.type());
        m = _m;
    }

3353
    CV_Assert_N( dst.depth() == src.depth(), dst.channels() == m.rows );
3354 3355 3356 3357 3358 3359 3360 3361 3362
    cv::transform( src, dst, m );
}


CV_IMPL void
cvPerspectiveTransform( const CvArr* srcarr, CvArr* dstarr, const CvMat* mat )
{
    cv::Mat m = cv::cvarrToMat(mat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

3363
    CV_Assert_N( dst.type() == src.type(), dst.channels() == m.rows-1 );
3364 3365 3366 3367 3368 3369 3370 3371 3372
    cv::perspectiveTransform( src, dst, m );
}


CV_IMPL void cvScaleAdd( const CvArr* srcarr1, CvScalar scale,
                         const CvArr* srcarr2, CvArr* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);

3373
    CV_Assert_N( src1.size == dst.size, src1.type() == dst.type() );
3374 3375 3376 3377 3378 3379 3380 3381 3382
    cv::scaleAdd( src1, scale.val[0], cv::cvarrToMat(srcarr2), dst );
}


CV_IMPL void
cvCalcCovarMatrix( const CvArr** vecarr, int count,
                   CvArr* covarr, CvArr* avgarr, int flags )
{
    cv::Mat cov0 = cv::cvarrToMat(covarr), cov = cov0, mean0, mean;
3383
    CV_Assert_N( vecarr != 0, count >= 1 );
3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447

    if( avgarr )
        mean = mean0 = cv::cvarrToMat(avgarr);

    if( (flags & CV_COVAR_COLS) != 0 || (flags & CV_COVAR_ROWS) != 0 )
    {

        cv::Mat data = cv::cvarrToMat(vecarr[0]);
        cv::calcCovarMatrix( data, cov, mean, flags, cov.type() );
    }
    else
    {
        std::vector<cv::Mat> data(count);
        for( int i = 0; i < count; i++ )
            data[i] = cv::cvarrToMat(vecarr[i]);
        cv::calcCovarMatrix( &data[0], count, cov, mean, flags, cov.type() );
    }

    if( mean.data != mean0.data && mean0.data )
        mean.convertTo(mean0, mean0.type());

    if( cov.data != cov0.data )
        cov.convertTo(cov0, cov0.type());
}


CV_IMPL double
cvMahalanobis( const CvArr* srcAarr, const CvArr* srcBarr, const CvArr* matarr )
{
    return cv::Mahalanobis(cv::cvarrToMat(srcAarr),
        cv::cvarrToMat(srcBarr), cv::cvarrToMat(matarr));
}

CV_IMPL void
cvMulTransposed( const CvArr* srcarr, CvArr* dstarr,
                 int order, const CvArr* deltaarr, double scale )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0, delta;
    if( deltaarr )
        delta = cv::cvarrToMat(deltaarr);
    cv::mulTransposed( src, dst, order != 0, delta, scale, dst.type());
    if( dst.data != dst0.data )
        dst.convertTo(dst0, dst0.type());
}

CV_IMPL double cvDotProduct( const CvArr* srcAarr, const CvArr* srcBarr )
{
    return cv::cvarrToMat(srcAarr).dot(cv::cvarrToMat(srcBarr));
}


CV_IMPL void
cvCalcPCA( const CvArr* data_arr, CvArr* avg_arr, CvArr* eigenvals, CvArr* eigenvects, int flags )
{
    cv::Mat data = cv::cvarrToMat(data_arr), mean0 = cv::cvarrToMat(avg_arr);
    cv::Mat evals0 = cv::cvarrToMat(eigenvals), evects0 = cv::cvarrToMat(eigenvects);
    cv::Mat mean = mean0, evals = evals0, evects = evects0;

    cv::PCA pca;
    pca.mean = mean;
    pca.eigenvalues = evals;
    pca.eigenvectors = evects;

    pca(data, (flags & CV_PCA_USE_AVG) ? mean : cv::Mat(),
3448
        flags, !evals.empty() ? evals.rows + evals.cols - 1 : 0);
3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461

    if( pca.mean.size() == mean.size() )
        pca.mean.convertTo( mean, mean.type() );
    else
    {
        cv::Mat temp; pca.mean.convertTo( temp, mean.type() );
        transpose( temp, mean );
    }

    evals = pca.eigenvalues;
    evects = pca.eigenvectors;
    int ecount0 = evals0.cols + evals0.rows - 1;
    int ecount = evals.cols + evals.rows - 1;
3462

3463
    CV_Assert_N( (evals0.cols == 1 || evals0.rows == 1),
3464 3465
                ecount0 <= ecount,
                evects0.cols == evects.cols,
3466
                evects0.rows == ecount0 );
3467

3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493
    cv::Mat temp = evals0;
    if( evals.rows == 1 )
        evals.colRange(0, ecount0).convertTo(temp, evals0.type());
    else
        evals.rowRange(0, ecount0).convertTo(temp, evals0.type());
    if( temp.data != evals0.data )
        transpose(temp, evals0);
    evects.rowRange(0, ecount0).convertTo( evects0, evects0.type() );

    // otherwise some datatype's or size's were incorrect, so the output arrays have been reallocated
    CV_Assert( mean0.data == mean.data );
}


CV_IMPL void
cvProjectPCA( const CvArr* data_arr, const CvArr* avg_arr,
              const CvArr* eigenvects, CvArr* result_arr )
{
    cv::Mat data = cv::cvarrToMat(data_arr), mean = cv::cvarrToMat(avg_arr);
    cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0;

    cv::PCA pca;
    pca.mean = mean;
    int n;
    if( mean.rows == 1 )
    {
3494
        CV_Assert_N(dst.cols <= evects.rows, dst.rows == data.rows);
3495 3496 3497 3498
        n = dst.cols;
    }
    else
    {
3499
        CV_Assert_N(dst.rows <= evects.rows, dst.cols == data.cols);
3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524
        n = dst.rows;
    }
    pca.eigenvectors = evects.rowRange(0, n);

    cv::Mat result = pca.project(data);
    if( result.cols != dst.cols )
        result = result.reshape(1, 1);
    result.convertTo(dst, dst.type());

    CV_Assert(dst0.data == dst.data);
}


CV_IMPL void
cvBackProjectPCA( const CvArr* proj_arr, const CvArr* avg_arr,
                  const CvArr* eigenvects, CvArr* result_arr )
{
    cv::Mat data = cv::cvarrToMat(proj_arr), mean = cv::cvarrToMat(avg_arr);
    cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0;

    cv::PCA pca;
    pca.mean = mean;
    int n;
    if( mean.rows == 1 )
    {
3525
        CV_Assert_N(data.cols <= evects.rows, dst.rows == data.rows);
3526 3527 3528 3529
        n = data.cols;
    }
    else
    {
3530
        CV_Assert_N(data.rows <= evects.rows, dst.cols == data.cols);
3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541
        n = data.rows;
    }
    pca.eigenvectors = evects.rowRange(0, n);

    cv::Mat result = pca.backProject(data);
    result.convertTo(dst, dst.type());

    CV_Assert(dst0.data == dst.data);
}

/* End of file. */