match_template.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

using namespace cv;
using namespace cv::cuda;

#if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_CUDAARITHM) || defined (CUDA_DISABLER)

Ptr<cuda::TemplateMatching> cv::cuda::createTemplateMatching(int, int, Size) { throw_no_cuda(); return Ptr<cuda::TemplateMatching>(); }

#else

namespace cv { namespace cuda { namespace device
{
    namespace match_template
    {
        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);

        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);

        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
            int cn, cudaStream_t stream);

        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
            int cn, cudaStream_t stream);

        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_8UC2(
            int w, int h,
            const PtrStepSz<unsigned int> image_sum_r,
            const PtrStepSz<unsigned int> image_sum_g,
            unsigned int templ_sum_r,
            unsigned int templ_sum_g,
            PtrStepSzf result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_8UC3(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                const PtrStepSz<unsigned int> image_sum_b,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
                PtrStepSzf result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_8UC4(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                const PtrStepSz<unsigned int> image_sum_b,
                const PtrStepSz<unsigned int> image_sum_a,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
                unsigned int templ_sum_a,
                PtrStepSzf result, cudaStream_t stream);


        void matchTemplatePrepared_CCOFF_NORMED_8U(
                int w, int h, const PtrStepSz<unsigned int> image_sum,
                const PtrStepSz<unsigned long long> image_sqsum,
                unsigned int templ_sum, unsigned long long templ_sqsum,
                PtrStepSzf result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                PtrStepSzf result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                PtrStepSzf result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
                const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
                PtrStepSzf result, cudaStream_t stream);

        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream);

        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream);
    }
}}}

namespace
{
    // Evaluates the optimal template-area threshold. If the template's area is
    // less than the threshold, the naive matching kernel is used; otherwise the
    // FFT-based version (if available).
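    // For example, with the values below an 8-bit 15x15 template (area 225 < 300)
    // takes the naive kernel, while a 20x20 template (area 400 >= 300) goes
    // through the FFT-based convolution path.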
    int getTemplateThreshold(int method, int depth)
    {
        switch (method)
        {
        case TM_CCORR:
            if (depth == CV_32F) return 250;
            if (depth == CV_8U) return 300;
            break;

        case TM_SQDIFF:
            if (depth == CV_8U) return 300;
            break;
        }

        CV_Error(Error::StsBadArg, "unsupported match template mode");
        return 0;
    }

    ///////////////////////////////////////////////////////////////
    // CCORR_32F

    class Match_CCORR_32F : public TemplateMatching
    {
    public:
        explicit Match_CCORR_32F(Size user_block_size);

        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());

    private:
        Ptr<cuda::Convolution> conv_;
        GpuMat result_;
    };

    Match_CCORR_32F::Match_CCORR_32F(Size user_block_size)
    {
        conv_ = cuda::createConvolution(user_block_size);
    }

    void Match_CCORR_32F::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& _stream)
    {
        using namespace cv::cuda::device::match_template;

        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();

        CV_Assert( image.depth() == CV_32F );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );

        cudaStream_t stream = StreamAccessor::getStream(_stream);

        _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
        GpuMat result = _result.getGpuMat();

        if (templ.size().area() < getTemplateThreshold(TM_CCORR, CV_32F))
        {
            matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), stream);
            return;
        }

        // For multi-channel input the image and template are reshaped to one channel,
        // so the FFT-based convolution produces channel-interleaved responses;
        // extractFirstChannel_32F then keeps the channel-aligned columns, which gives
        // the per-pixel sum over channels that CCORR requires.
        if (image.channels() == 1)
        {
            conv_->convolve(image.reshape(1), templ.reshape(1), result, true, _stream);
        }
        else
        {
            conv_->convolve(image.reshape(1), templ.reshape(1), result_, true, _stream);
            extractFirstChannel_32F(result_, result, image.channels(), stream);
        }
    }

    ///////////////////////////////////////////////////////////////
    // CCORR_8U

    class Match_CCORR_8U : public TemplateMatching
    {
    public:
        explicit Match_CCORR_8U(Size user_block_size) : match32F_(user_block_size)
        {
        }

        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());

    private:
        GpuMat imagef_, templf_;
        Match_CCORR_32F match32F_;
    };

    void Match_CCORR_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::cuda::device::match_template;

        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();

        CV_Assert( image.depth() == CV_8U );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );

        if (templ.size().area() < getTemplateThreshold(TM_CCORR, CV_8U))
        {
            _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
            GpuMat result = _result.getGpuMat();

            matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
            return;
        }

        // Larger templates: convert to CV_32F and reuse the FFT-based CCORR matcher.
        image.convertTo(imagef_, CV_32F, stream);
        templ.convertTo(templf_, CV_32F, stream);

        match32F_.match(imagef_, templf_, _result, stream);
    }

    ///////////////////////////////////////////////////////////////
    // CCORR_NORMED_8U

    class Match_CCORR_NORMED_8U : public TemplateMatching
    {
    public:
        explicit Match_CCORR_NORMED_8U(Size user_block_size) : match_CCORR_(user_block_size)
        {
        }

        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());

    private:
        Match_CCORR_8U match_CCORR_;
        GpuMat image_sqsums_;
        GpuMat intBuffer_;
    };

    void Match_CCORR_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::cuda::device::match_template;

        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();

        CV_Assert( image.depth() == CV_8U );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );

        match_CCORR_.match(image, templ, _result, stream);
        GpuMat result = _result.getGpuMat();

        cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);

        unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0];

        // Normalize the plain CCORR response: each value is divided by
        // sqrt(templ_sqsum * window_sqsum), with window_sqsum read from the
        // squared integral image of the source.
        normalize_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
    }

    ///////////////////////////////////////////////////////////////
    // SQDIFF_32F

    class Match_SQDIFF_32F : public TemplateMatching
    {
    public:
        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
    };

    void Match_SQDIFF_32F::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::cuda::device::match_template;

        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();

        CV_Assert( image.depth() == CV_32F );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );

        _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
        GpuMat result = _result.getGpuMat();

        // The 32F SQDIFF path always uses the naive kernel; there is no prepared/FFT variant here.
        matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
    }

    ///////////////////////////////////////////////////////////////
    // SQDIFF_8U

    class Match_SQDIFF_8U : public TemplateMatching
    {
    public:
        explicit Match_SQDIFF_8U(Size user_block_size) : match_CCORR_(user_block_size)
        {
        }

        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());

    private:
        GpuMat image_sqsums_;
        GpuMat intBuffer_;
        Match_CCORR_8U match_CCORR_;
    };

    void Match_SQDIFF_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::cuda::device::match_template;

        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();

        CV_Assert( image.depth() == CV_8U );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );

        if (templ.size().area() < getTemplateThreshold(TM_SQDIFF, CV_8U))
        {
            _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
            GpuMat result = _result.getGpuMat();

            matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
            return;
        }

        // Larger templates: compute CCORR via the FFT path, then turn it into SQDIFF
        // using sum (I - T)^2 = sum I^2 - 2 * sum I*T + sum T^2, with sum I^2 taken
        // from the squared integral image and sum T^2 reduced once with cuda::sqrSum.
        cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);

        unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0];

        match_CCORR_.match(image, templ, _result, stream);
        GpuMat result = _result.getGpuMat();

        matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
    }

    ///////////////////////////////////////////////////////////////
    // SQDIFF_NORMED_8U

    class Match_SQDIFF_NORMED_8U : public TemplateMatching
    {
    public:
        explicit Match_SQDIFF_NORMED_8U(Size user_block_size) : match_CCORR_(user_block_size)
        {
        }

        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());

    private:
        GpuMat image_sqsums_;
        GpuMat intBuffer_;
        Match_CCORR_8U match_CCORR_;
    };

    void Match_SQDIFF_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::cuda::device::match_template;

        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();

        CV_Assert( image.depth() == CV_8U );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );

        // As in SQDIFF_8U, the CCORR response is converted to SQDIFF via the expansion
        // of sum (I - T)^2, and the prepared kernel additionally divides by
        // sqrt(templ_sqsum * window_sqsum).
        cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);

        unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0];

        match_CCORR_.match(image, templ, _result, stream);
        GpuMat result = _result.getGpuMat();

        matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
    }

    ///////////////////////////////////////////////////////////////
    // CCOFF_8U

    class Match_CCOEFF_8U : public TemplateMatching
    {
    public:
        explicit Match_CCOEFF_8U(Size user_block_size) : match_CCORR_(user_block_size)
        {
        }

        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());

    private:
        GpuMat intBuffer_;
        std::vector<GpuMat> images_;
        std::vector<GpuMat> image_sums_;
        Match_CCORR_8U match_CCORR_;
    };

    void Match_CCOEFF_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::cuda::device::match_template;

        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();

        CV_Assert( image.depth() == CV_8U );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );

        match_CCORR_.match(image, templ, _result, stream);
        GpuMat result = _result.getGpuMat();

        // CCOEFF is CCORR computed on mean-subtracted data; the prepared kernels apply
        // the mean correction using integral-image window sums and the template sums.
        if (image.channels() == 1)
        {
            image_sums_.resize(1);
            cuda::integral(image, image_sums_[0], intBuffer_, stream);

            unsigned int templ_sum = (unsigned int) cuda::sum(templ)[0];

            matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sums_[0], templ_sum, result, StreamAccessor::getStream(stream));
        }
        else
        {
            cuda::split(image, images_);

            image_sums_.resize(images_.size());
            for (int i = 0; i < image.channels(); ++i)
                cuda::integral(images_[i], image_sums_[i], intBuffer_, stream);

            Scalar templ_sum = cuda::sum(templ);

            switch (image.channels())
            {
            case 2:
                matchTemplatePrepared_CCOFF_8UC2(
                        templ.cols, templ.rows, image_sums_[0], image_sums_[1],
                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1],
                        result, StreamAccessor::getStream(stream));
                break;
            case 3:
                matchTemplatePrepared_CCOFF_8UC3(
                        templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2],
                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2],
                        result, StreamAccessor::getStream(stream));
                break;
            case 4:
                matchTemplatePrepared_CCOFF_8UC4(
                        templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2], image_sums_[3],
                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2], (unsigned int) templ_sum[3],
                        result, StreamAccessor::getStream(stream));
                break;
            default:
                CV_Error(Error::StsBadArg, "unsupported number of channels");
            }
        }
    }

    ///////////////////////////////////////////////////////////////
    // CCOFF_NORMED_8U

    class Match_CCOEFF_NORMED_8U : public TemplateMatching
    {
    public:
        explicit Match_CCOEFF_NORMED_8U(Size user_block_size) : match_CCORR_32F_(user_block_size)
        {
        }

        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());

    private:
        GpuMat imagef_, templf_;
        Match_CCORR_32F match_CCORR_32F_;
        GpuMat intBuffer_;
        std::vector<GpuMat> images_;
        std::vector<GpuMat> image_sums_;
        std::vector<GpuMat> image_sqsums_;
    };

    void Match_CCOEFF_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::cuda::device::match_template;

        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();

        CV_Assert( image.depth() == CV_8U );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );

        image.convertTo(imagef_, CV_32F, stream);
        templ.convertTo(templf_, CV_32F, stream);

        match_CCORR_32F_.match(imagef_, templf_, _result, stream);
        GpuMat result = _result.getGpuMat();

        // CCOEFF_NORMED: the CCORR response is mean-corrected and then divided by
        // sqrt(sum (I - meanI)^2 * sum (T - meanT)^2), both terms derived from the
        // integral and squared-integral images and the template sums.
        if (image.channels() == 1)
        {
            image_sums_.resize(1);
            cuda::integral(image, image_sums_[0], intBuffer_, stream);

            image_sqsums_.resize(1);
            cuda::sqrIntegral(image, image_sqsums_[0], intBuffer_, stream);

            unsigned int templ_sum = (unsigned int) cuda::sum(templ)[0];
            unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ)[0];

            matchTemplatePrepared_CCOFF_NORMED_8U(
                    templ.cols, templ.rows, image_sums_[0], image_sqsums_[0],
                    templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));
        }
        else
        {
            cuda::split(image, images_);

            image_sums_.resize(images_.size());
            image_sqsums_.resize(images_.size());
            for (int i = 0; i < image.channels(); ++i)
            {
                cuda::integral(images_[i], image_sums_[i], intBuffer_, stream);
                cuda::sqrIntegral(images_[i], image_sqsums_[i], intBuffer_, stream);
            }

            Scalar templ_sum = cuda::sum(templ);
            Scalar templ_sqsum = cuda::sqrSum(templ);

            switch (image.channels())
            {
            case 2:
                matchTemplatePrepared_CCOFF_NORMED_8UC2(
                        templ.cols, templ.rows,
                        image_sums_[0], image_sqsums_[0],
                        image_sums_[1], image_sqsums_[1],
                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                        result, StreamAccessor::getStream(stream));
                break;
            case 3:
                matchTemplatePrepared_CCOFF_NORMED_8UC3(
                        templ.cols, templ.rows,
                        image_sums_[0], image_sqsums_[0],
                        image_sums_[1], image_sqsums_[1],
                        image_sums_[2], image_sqsums_[2],
                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
                        result, StreamAccessor::getStream(stream));
                break;
            case 4:
                matchTemplatePrepared_CCOFF_NORMED_8UC4(
                        templ.cols, templ.rows,
                        image_sums_[0], image_sqsums_[0],
                        image_sums_[1], image_sqsums_[1],
                        image_sums_[2], image_sqsums_[2],
                        image_sums_[3], image_sqsums_[3],
                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
                        (unsigned int)templ_sum[3], (unsigned long long)templ_sqsum[3],
                        result, StreamAccessor::getStream(stream));
                break;
            default:
                CV_Error(Error::StsBadArg, "unsupported number of channels");
            }
        }
    }
}

Ptr<cuda::TemplateMatching> cv::cuda::createTemplateMatching(int srcType, int method, Size user_block_size)
{
    const int sdepth = CV_MAT_DEPTH(srcType);

    CV_Assert( sdepth == CV_8U || sdepth == CV_32F );

    if (sdepth == CV_32F)
    {
        switch (method)
        {
        case TM_SQDIFF:
            return makePtr<Match_SQDIFF_32F>();

        case TM_CCORR:
            return makePtr<Match_CCORR_32F>(user_block_size);

        default:
            CV_Error( Error::StsBadFlag, "Unsupported method" );
            return Ptr<cuda::TemplateMatching>();
        }
    }
    else
    {
        switch (method)
        {
        case TM_SQDIFF:
            return makePtr<Match_SQDIFF_8U>(user_block_size);

        case TM_SQDIFF_NORMED:
            return makePtr<Match_SQDIFF_NORMED_8U>(user_block_size);

        case TM_CCORR:
            return makePtr<Match_CCORR_8U>(user_block_size);

        case TM_CCORR_NORMED:
            return makePtr<Match_CCORR_NORMED_8U>(user_block_size);

        case TM_CCOEFF:
            return makePtr<Match_CCOEFF_8U>(user_block_size);

        case TM_CCOEFF_NORMED:
            return makePtr<Match_CCOEFF_NORMED_8U>(user_block_size);

        default:
            CV_Error( Error::StsBadFlag, "Unsupported method" );
            return Ptr<cuda::TemplateMatching>();
        }
    }
}

#endif
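
/* Usage sketch (editorial addition, not part of the library source): shows how a
   client would obtain a matcher from cv::cuda::createTemplateMatching() and locate
   the best match with cv::cuda::minMaxLoc() from the cudaarithm module. The
   function and variable names below are illustrative only.

    #include <opencv2/cudaimgproc.hpp>
    #include <opencv2/cudaarithm.hpp>

    void locateTemplate(const cv::cuda::GpuMat& d_image, const cv::cuda::GpuMat& d_templ)
    {
        // 8-bit source, normalized cross-correlation.
        cv::Ptr<cv::cuda::TemplateMatching> alg =
            cv::cuda::createTemplateMatching(CV_8U, cv::TM_CCORR_NORMED);

        cv::cuda::GpuMat d_result;
        alg->match(d_image, d_templ, d_result);

        // The best match is the global maximum of the response map.
        double minVal, maxVal;
        cv::Point minLoc, maxLoc;
        cv::cuda::minMaxLoc(d_result, &minVal, &maxVal, &minLoc, &maxLoc);
        // maxLoc is the top-left corner of the best CCORR_NORMED match.
    }
*/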