/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

using namespace cv;
using namespace cv::gpu;

#if !defined (HAVE_CUDA)

void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); }
void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); }
void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_nogpu(); }
void cv::gpu::buildWarpPlaneMaps(Size, Rect, const Mat&, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::buildWarpCylindricalMaps(Size, Rect, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::buildWarpSphericalMaps(Size, Rect, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int, Stream&) { throw_nogpu(); }
void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::columnSum(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&, Stream&) { throw_nogpu(); }
void cv::gpu::evenLevels(GpuMat&, int, int, int) { throw_nogpu(); }
void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::histEven(const GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*, Stream&) { throw_nogpu(); }
void cv::gpu::histEven(const GpuMat&, GpuMat*, GpuMat&, int*, int*, int*, Stream&) { throw_nogpu(); }
void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, Stream&) { throw_nogpu(); }
void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::calcHist(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::calcHist(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); }
void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); }
void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_nogpu(); }
void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }
void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }
void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_nogpu(); }
void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_nogpu(); }
void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_nogpu(); }
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, double, bool) { throw_nogpu(); }
cv::gpu::CannyBuf::CannyBuf(const GpuMat&, const GpuMat&) { throw_nogpu(); }
void cv::gpu::CannyBuf::create(const Size&, int) { throw_nogpu(); }
void cv::gpu::CannyBuf::release() { throw_nogpu(); }

#else /* !defined (HAVE_CUDA) */

////////////////////////////////////////////////////////////////////////
// meanShiftFiltering_GPU

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
    }
}}}

void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream)
{
    using namespace ::cv::gpu::device::imgproc;

    if( src.empty() )
        CV_Error( CV_StsBadArg, "The input image is empty" );

    if( src.depth() != CV_8U || src.channels() != 4 )
        CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );

    dst.create( src.size(), CV_8UC4 );

    if( !(criteria.type & TermCriteria::MAX_ITER) )
        criteria.maxCount = 5;

    int maxIter = std::min(std::max(criteria.maxCount, 1), 100);

    float eps;
    if( !(criteria.type & TermCriteria::EPS) )
        eps = 1.f;
    else
        eps = (float)std::max(criteria.epsilon, 0.0);

    meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
}

////////////////////////////////////////////////////////////////////////
// meanShiftProc_GPU

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
    }
}}}

void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream)
{
    using namespace ::cv::gpu::device::imgproc;

    if( src.empty() )
        CV_Error( CV_StsBadArg, "The input image is empty" );

    if( src.depth() != CV_8U || src.channels() != 4 )
        CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );

    dstr.create( src.size(), CV_8UC4 );
    dstsp.create( src.size(), CV_16SC2 );

    if( !(criteria.type & TermCriteria::MAX_ITER) )
        criteria.maxCount = 5;

    int maxIter = std::min(std::max(criteria.maxCount, 1), 100);

    float eps;
    if( !(criteria.type & TermCriteria::EPS) )
        eps = 1.f;
    else
        eps = (float)std::max(criteria.epsilon, 0.0);

    meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
}

////////////////////////////////////////////////////////////////////////
// drawColorDisp

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
        void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
    }
}}}

namespace
{
    template <typename T>
    void drawColorDisp_caller(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream)
    {
        using namespace ::cv::gpu::device::imgproc;

        dst.create(src.size(), CV_8UC4);

        drawColorDisp_gpu((DevMem2D_<T>)src, dst, ndisp, stream);
    }

    typedef void (*drawColorDisp_caller_t)(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream);

    const drawColorDisp_caller_t drawColorDisp_callers[] = {drawColorDisp_caller<unsigned char>, 0, 0, drawColorDisp_caller<short>, 0, 0, 0, 0};
}

void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& stream)
{
    CV_Assert(src.type() == CV_8U || src.type() == CV_16S);
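    // src.type() doubles as the dispatch index: CV_8UC1 (== 0) and CV_16SC1 (== 3) are the only populated slots.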

    drawColorDisp_callers[src.type()](src, dst, ndisp, StreamAccessor::getStream(stream));
}

////////////////////////////////////////////////////////////////////////
// reprojectImageTo3D

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        template <typename T, typename D>
        void reprojectImageTo3D_gpu(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
    }
}}}

void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q, int dst_cn, Stream& stream)
{
    using namespace cv::gpu::device::imgproc;

    typedef void (*func_t)(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
    static const func_t funcs[2][4] = 
    {
        {reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>},
        {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>}
    };

    CV_Assert(disp.type() == CV_8U || disp.type() == CV_16S);
    CV_Assert(Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4 && Q.isContinuous());
    CV_Assert(dst_cn == 3 || dst_cn == 4);

    xyz.create(disp.size(), CV_MAKE_TYPE(CV_32F, dst_cn));

    funcs[dst_cn == 4][disp.type()](disp, xyz, Q.ptr<float>(), StreamAccessor::getStream(stream));
}

////////////////////////////////////////////////////////////////////////
// copyMakeBorder

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
    }
}}}

namespace
{
    template <typename T, int cn> void copyMakeBorder_caller(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
    {
        using namespace ::cv::gpu::device::imgproc;

        Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));

        copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);
    }
}

void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value, Stream& s)
{
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);

    dst.create(src.rows + top + bottom, src.cols + left + right, src.type());

    cudaStream_t stream = StreamAccessor::getStream(s);

    if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1))
    {
        NppiSize srcsz;
        srcsz.width  = src.cols;
        srcsz.height = src.rows;

        NppiSize dstsz;
        dstsz.width  = dst.cols;
        dstsz.height = dst.rows;

        NppStreamHandler h(stream);

        switch (src.type())
        {
        case CV_8UC1:
            {
                Npp8u nVal = saturate_cast<Npp8u>(value[0]);
                nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
            }
        case CV_8UC4:
            {
                Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])};
                nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
            }
        case CV_32SC1:
            {
                Npp32s nVal = saturate_cast<Npp32s>(value[0]);
                nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
            }
        case CV_32FC1:
            {
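                // Copy the constant border bit-for-bit: reinterpret the float value as a 32-bit
                // integer so the 32s NPP routine writes the same bit pattern into the border.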
                Npp32f val = saturate_cast<Npp32f>(value[0]);
                Npp32s nVal = *(reinterpret_cast<Npp32s*>(&val));
                nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
            }
        }

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    else
    {
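        // Fall back to the templated device kernel for type/border combinations NPP does not handle above.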
        typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
        static const caller_t callers[6][4] =
        {
            {   copyMakeBorder_caller<uchar, 1>  , 0/*copyMakeBorder_caller<uchar, 2>*/ ,    copyMakeBorder_caller<uchar, 3>  ,    copyMakeBorder_caller<uchar, 4>},
            {0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/},
            {   copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/,    copyMakeBorder_caller<ushort, 3> ,    copyMakeBorder_caller<ushort, 4>},
            {   copyMakeBorder_caller<short, 1>  , 0/*copyMakeBorder_caller<short, 2>*/ ,    copyMakeBorder_caller<short, 3>  ,    copyMakeBorder_caller<short, 4>},
            {0/*copyMakeBorder_caller<int, 1>*/  , 0/*copyMakeBorder_caller<int, 2>*/   , 0/*copyMakeBorder_caller<int, 3>*/  , 0/*copyMakeBorder_caller<int, 4>*/},
            {   copyMakeBorder_caller<float, 1>  , 0/*copyMakeBorder_caller<float, 2>*/ ,    copyMakeBorder_caller<float, 3>  ,    copyMakeBorder_caller<float ,4>}
        };

        caller_t func = callers[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

        int gpuBorderType;
        CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));

        func(src, dst, top, left, gpuBorderType, value, stream);
    }
}

//////////////////////////////////////////////////////////////////////////////
// buildWarpPlaneMaps

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
                                const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
                                cudaStream_t stream);
    }
}}}

void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T,
                                 float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream)
{
    (void)src_size;
    using namespace ::cv::gpu::device::imgproc;

    CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
    CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
    CV_Assert((T.size() == Size(3,1) || T.size() == Size(1,3)) && T.type() == CV_32F && T.isContinuous());

    Mat K_Rinv = K * R.t();
    Mat R_Kinv = R * K.inv();
    CV_Assert(K_Rinv.isContinuous());
    CV_Assert(R_Kinv.isContinuous());

    map_x.create(dst_roi.size(), CV_32F);
    map_y.create(dst_roi.size(), CV_32F);
    device::imgproc::buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),
                       T.ptr<float>(), scale, StreamAccessor::getStream(stream));
}

//////////////////////////////////////////////////////////////////////////////
// buildWarpCylindricalMaps

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
                                      const float k_rinv[9], const float r_kinv[9], float scale,
                                      cudaStream_t stream);
    }
}}}

void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
                                       GpuMat& map_x, GpuMat& map_y, Stream& stream)
{
    (void)src_size;
    using namespace ::cv::gpu::device::imgproc;

    CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
    CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);

    Mat K_Rinv = K * R.t();
    Mat R_Kinv = R * K.inv();
    CV_Assert(K_Rinv.isContinuous());
    CV_Assert(R_Kinv.isContinuous());

    map_x.create(dst_roi.size(), CV_32F);
    map_y.create(dst_roi.size(), CV_32F);
    device::imgproc::buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
}


//////////////////////////////////////////////////////////////////////////////
// buildWarpSphericalMaps

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
                                    const float k_rinv[9], const float r_kinv[9], float scale,
                                    cudaStream_t stream);
    }
}}}

void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
                                     GpuMat& map_x, GpuMat& map_y, Stream& stream)
{
    (void)src_size;
    using namespace ::cv::gpu::device::imgproc;

    CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
    CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);

    Mat K_Rinv = K * R.t();
    Mat R_Kinv = R * K.inv();
    CV_Assert(K_Rinv.isContinuous());
    CV_Assert(R_Kinv.isContinuous());

    map_x.create(dst_roi.size(), CV_32F);
    map_y.create(dst_roi.size(), CV_32F);
    device::imgproc::buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
}

////////////////////////////////////////////////////////////////////////
// rotate

namespace
{
    template<int DEPTH> struct NppTypeTraits;
    template<> struct NppTypeTraits<CV_8U>  { typedef Npp8u npp_t; };
    template<> struct NppTypeTraits<CV_8S>  { typedef Npp8s npp_t; };
    template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
    template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
    template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
    template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
    template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };

    template <int DEPTH> struct NppRotateFunc
    {
        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;

        typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI,
                                    npp_t* pDst, int nDstStep, NppiRect oDstROI,
                                    double nAngle, double nShiftX, double nShiftY, int eInterpolation);
    };

    template <int DEPTH, typename NppRotateFunc<DEPTH>::func_t func> struct NppRotate
    {
        typedef typename NppRotateFunc<DEPTH>::npp_t npp_t;

        static void call(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream)
        {
            (void)dsize;
            static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};

            NppStreamHandler h(stream);

            NppiSize srcsz;
            srcsz.height = src.rows;
            srcsz.width = src.cols;
            NppiRect srcroi;
            srcroi.x = srcroi.y = 0;
            srcroi.height = src.rows;
            srcroi.width = src.cols;
            NppiRect dstroi;
            dstroi.x = dstroi.y = 0;
            dstroi.height = dst.rows;
            dstroi.width = dst.cols;

            nppSafeCall( func(src.ptr<npp_t>(), srcsz, static_cast<int>(src.step), srcroi,
                dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
}

void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& stream)
{
    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream);

    static const func_t funcs[6][4] =
    {
        {NppRotate<CV_8U, nppiRotate_8u_C1R>::call, 0, NppRotate<CV_8U, nppiRotate_8u_C3R>::call, NppRotate<CV_8U, nppiRotate_8u_C4R>::call},
        {0,0,0,0},
        {NppRotate<CV_16U, nppiRotate_16u_C1R>::call, 0, NppRotate<CV_16U, nppiRotate_16u_C3R>::call, NppRotate<CV_16U, nppiRotate_16u_C4R>::call},
        {0,0,0,0},
        {0,0,0,0},
        {NppRotate<CV_32F, nppiRotate_32f_C1R>::call, 0, NppRotate<CV_32F, nppiRotate_32f_C3R>::call, NppRotate<CV_32F, nppiRotate_32f_C4R>::call}
    };

    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
    CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);

    dst.create(dsize, src.type());

    funcs[src.depth()][src.channels() - 1](src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream));
}

////////////////////////////////////////////////////////////////////////
// integral

void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
{
    GpuMat buffer;
    integralBuffered(src, sum, buffer, s);
}

void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
{
    CV_Assert(src.type() == CV_8UC1);

    sum.create(src.rows + 1, src.cols + 1, CV_32S);
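    // The integral table is one row and one column larger than the source, as is conventional for integral images.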

    NcvSize32u roiSize;
    roiSize.width = src.cols;
    roiSize.height = src.rows;

    cudaDeviceProp prop;
    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );

    Ncv32u bufSize;
    ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
    ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);

    cudaStream_t stream = StreamAccessor::getStream(s);

    NppStStreamHandler h(stream);

    ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
        sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

//////////////////////////////////////////////////////////////////////////////
// sqrIntegral

void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
{
    CV_Assert(src.type() == CV_8U);

    NcvSize32u roiSize;
    roiSize.width = src.cols;
    roiSize.height = src.rows;

    cudaDeviceProp prop;
    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );

    Ncv32u bufSize;
    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
    GpuMat buf(1, bufSize, CV_8U);

    cudaStream_t stream = StreamAccessor::getStream(s);

    NppStStreamHandler h(stream);

    sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
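    // Squared sums can overflow 32 bits, so the result is stored as 64-bit elements.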
    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
            sqsum.ptr<Ncv64u>(0), static_cast<int>(sqsum.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

//////////////////////////////////////////////////////////////////////////////
// columnSum

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        void columnSum_32F(const DevMem2Db src, const DevMem2Db dst);
    }
}}}

void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst)
{
    using namespace ::cv::gpu::device::imgproc;

    CV_Assert(src.type() == CV_32F);

    dst.create(src.size(), CV_32F);

    device::imgproc::columnSum_32F(src, dst);
}

void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
{
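    // src and sqr are the integral and squared-integral images; NPP derives the per-pixel
    // standard deviation over the window described by rect from them.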
    CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_64FC1);

    dst.create(src.size(), CV_32FC1);

    NppiSize sz;
    sz.width = src.cols;
    sz.height = src.rows;

    NppiRect nppRect;
    nppRect.height = rect.height;
    nppRect.width = rect.width;
    nppRect.x = rect.x;
    nppRect.y = rect.y;

    cudaStream_t stream = StreamAccessor::getStream(s);

    NppStreamHandler h(stream);

    nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp64f>(), static_cast<int>(sqr.step),
                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, nppRect) );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}


////////////////////////////////////////////////////////////////////////
// Histogram

namespace
{
    typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize);
    typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize);

    template<int SDEPTH> struct NppHistogramEvenFuncC1
    {
        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;

        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist,
            int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u * pBuffer);
    };
    template<int SDEPTH> struct NppHistogramEvenFuncC4
    {
        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;

        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI,
            Npp32s * pHist[4], int nLevels[4], Npp32s nLowerLevel[4], Npp32s nUpperLevel[4], Npp8u * pBuffer);
    };

    template<int SDEPTH, typename NppHistogramEvenFuncC1<SDEPTH>::func_ptr func, get_buf_size_c1_t get_buf_size>
    struct NppHistogramEvenC1
    {
        typedef typename NppHistogramEvenFuncC1<SDEPTH>::src_t src_t;

        static void hist(const GpuMat& src, GpuMat& hist, GpuMat& buffer, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
        {
            int levels = histSize + 1;
            hist.create(1, histSize, CV_32S);

            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            int buf_size;
            get_buf_size(sz, levels, &buf_size);

            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);

            NppStreamHandler h(stream);

            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels,
                lowerLevel, upperLevel, buffer.ptr<Npp8u>()) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template<int SDEPTH, typename NppHistogramEvenFuncC4<SDEPTH>::func_ptr func, get_buf_size_c4_t get_buf_size>
    struct NppHistogramEvenC4
    {
        typedef typename NppHistogramEvenFuncC4<SDEPTH>::src_t src_t;

        static void hist(const GpuMat& src, GpuMat hist[4], GpuMat& buffer, int histSize[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream)
        {
            int levels[] = {histSize[0] + 1, histSize[1] + 1, histSize[2] + 1, histSize[3] + 1};
            hist[0].create(1, histSize[0], CV_32S);
            hist[1].create(1, histSize[1], CV_32S);
            hist[2].create(1, histSize[2], CV_32S);
            hist[3].create(1, histSize[3], CV_32S);

            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Npp32s* pHist[] = {hist[0].ptr<Npp32s>(), hist[1].ptr<Npp32s>(), hist[2].ptr<Npp32s>(), hist[3].ptr<Npp32s>()};

            int buf_size;
            get_buf_size(sz, levels, &buf_size);

            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);

            NppStreamHandler h(stream);

            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, levels, lowerLevel, upperLevel, buffer.ptr<Npp8u>()) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    template<int SDEPTH> struct NppHistogramRangeFuncC1
    {
        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
        typedef Npp32s level_t;
        enum {LEVEL_TYPE_CODE=CV_32SC1};

        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
            const Npp32s* pLevels, int nLevels, Npp8u* pBuffer);
    };
    template<> struct NppHistogramRangeFuncC1<CV_32F>
    {
        typedef Npp32f src_t;
        typedef Npp32f level_t;
        enum {LEVEL_TYPE_CODE=CV_32FC1};

        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
            const Npp32f* pLevels, int nLevels, Npp8u* pBuffer);
    };
    template<int SDEPTH> struct NppHistogramRangeFuncC4
    {
        typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
        typedef Npp32s level_t;
        enum {LEVEL_TYPE_CODE=CV_32SC1};

        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4],
            const Npp32s* pLevels[4], int nLevels[4], Npp8u* pBuffer);
    };
    template<> struct NppHistogramRangeFuncC4<CV_32F>
    {
        typedef Npp32f src_t;
        typedef Npp32f level_t;
        enum {LEVEL_TYPE_CODE=CV_32FC1};

        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4],
            const Npp32f* pLevels[4], int nLevels[4], Npp8u* pBuffer);
    };

    template<int SDEPTH, typename NppHistogramRangeFuncC1<SDEPTH>::func_ptr func, get_buf_size_c1_t get_buf_size>
    struct NppHistogramRangeC1
    {
        typedef typename NppHistogramRangeFuncC1<SDEPTH>::src_t src_t;
        typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
        enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};

        static void hist(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buffer, cudaStream_t stream)
        {
            CV_Assert(levels.type() == LEVEL_TYPE_CODE && levels.rows == 1);

            hist.create(1, levels.cols - 1, CV_32S);

            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            int buf_size;
            get_buf_size(sz, levels.cols, &buf_size);

            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);

            NppStreamHandler h(stream);

            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels.ptr<level_t>(), levels.cols, buffer.ptr<Npp8u>()) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template<int SDEPTH, typename NppHistogramRangeFuncC4<SDEPTH>::func_ptr func, get_buf_size_c4_t get_buf_size>
    struct NppHistogramRangeC4
    {
        typedef typename NppHistogramRangeFuncC4<SDEPTH>::src_t src_t;
        typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
        enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};

        static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buffer, cudaStream_t stream)
        {
            CV_Assert(levels[0].type() == LEVEL_TYPE_CODE && levels[0].rows == 1);
            CV_Assert(levels[1].type() == LEVEL_TYPE_CODE && levels[1].rows == 1);
            CV_Assert(levels[2].type() == LEVEL_TYPE_CODE && levels[2].rows == 1);
            CV_Assert(levels[3].type() == LEVEL_TYPE_CODE && levels[3].rows == 1);

            hist[0].create(1, levels[0].cols - 1, CV_32S);
            hist[1].create(1, levels[1].cols - 1, CV_32S);
            hist[2].create(1, levels[2].cols - 1, CV_32S);
            hist[3].create(1, levels[3].cols - 1, CV_32S);

            Npp32s* pHist[] = {hist[0].ptr<Npp32s>(), hist[1].ptr<Npp32s>(), hist[2].ptr<Npp32s>(), hist[3].ptr<Npp32s>()};
            int nLevels[] = {levels[0].cols, levels[1].cols, levels[2].cols, levels[3].cols};
            const level_t* pLevels[] = {levels[0].ptr<level_t>(), levels[1].ptr<level_t>(), levels[2].ptr<level_t>(), levels[3].ptr<level_t>()};

            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            int buf_size;
            get_buf_size(sz, nLevels, &buf_size);

            ensureSizeIsEnough(1, buf_size, CV_8U, buffer);

            NppStreamHandler h(stream);

            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, pLevels, nLevels, buffer.ptr<Npp8u>()) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
}

void cv::gpu::evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperLevel)
{
    Mat host_levels(1, nLevels, CV_32SC1);
    nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr<Npp32s>(), nLevels, lowerLevel, upperLevel) );
    levels.upload(host_levels);
}

void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream)
{
    GpuMat buf;
    histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
}

void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream)
{
    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 );

    typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, GpuMat& buf, int levels, int lowerLevel, int upperLevel, cudaStream_t stream);
    static const hist_t hist_callers[] =
    {
        NppHistogramEvenC1<CV_8U , nppiHistogramEven_8u_C1R , nppiHistogramEvenGetBufferSize_8u_C1R >::hist,
        0,
        NppHistogramEvenC1<CV_16U, nppiHistogramEven_16u_C1R, nppiHistogramEvenGetBufferSize_16u_C1R>::hist,
        NppHistogramEvenC1<CV_16S, nppiHistogramEven_16s_C1R, nppiHistogramEvenGetBufferSize_16s_C1R>::hist
    };

    hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
}

void cv::gpu::histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
{
    GpuMat buf;
    histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
}

void cv::gpu::histEven(const GpuMat& src, GpuMat hist[4], GpuMat& buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
{
    CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 );

    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], GpuMat& buf, int levels[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream);
    static const hist_t hist_callers[] =
    {
        NppHistogramEvenC4<CV_8U , nppiHistogramEven_8u_C4R , nppiHistogramEvenGetBufferSize_8u_C4R >::hist,
        0,
        NppHistogramEvenC4<CV_16U, nppiHistogramEven_16u_C4R, nppiHistogramEvenGetBufferSize_16u_C4R>::hist,
        NppHistogramEvenC4<CV_16S, nppiHistogramEven_16s_C4R, nppiHistogramEvenGetBufferSize_16s_C4R>::hist
    };

    hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
}

void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream)
{
    GpuMat buf;
    histRange(src, hist, levels, buf, stream);
}

void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream)
{
    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1);

    typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, cudaStream_t stream);
    static const hist_t hist_callers[] =
    {
        NppHistogramRangeC1<CV_8U , nppiHistogramRange_8u_C1R , nppiHistogramRangeGetBufferSize_8u_C1R >::hist,
        0,
        NppHistogramRangeC1<CV_16U, nppiHistogramRange_16u_C1R, nppiHistogramRangeGetBufferSize_16u_C1R>::hist,
        NppHistogramRangeC1<CV_16S, nppiHistogramRange_16s_C1R, nppiHistogramRangeGetBufferSize_16s_C1R>::hist,
        0,
        NppHistogramRangeC1<CV_32F, nppiHistogramRange_32f_C1R, nppiHistogramRangeGetBufferSize_32f_C1R>::hist
    };

    hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
}

void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream)
{
    GpuMat buf;
    histRange(src, hist, levels, buf, stream);
}

void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buf, Stream& stream)
{
    CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 || src.type() == CV_32FC4);

    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buf, cudaStream_t stream);
    static const hist_t hist_callers[] =
    {
        NppHistogramRangeC4<CV_8U , nppiHistogramRange_8u_C4R , nppiHistogramRangeGetBufferSize_8u_C4R >::hist,
        0,
        NppHistogramRangeC4<CV_16U, nppiHistogramRange_16u_C4R, nppiHistogramRangeGetBufferSize_16u_C4R>::hist,
        NppHistogramRangeC4<CV_16S, nppiHistogramRange_16s_C4R, nppiHistogramRangeGetBufferSize_16s_C4R>::hist,
        0,
        NppHistogramRangeC4<CV_32F, nppiHistogramRange_32f_C4R, nppiHistogramRangeGetBufferSize_32f_C4R>::hist
    };

    hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
}

namespace cv { namespace gpu { namespace device
{
    namespace hist
    {
        void histogram256_gpu(DevMem2Db src, int* hist, unsigned int* buf, cudaStream_t stream);

        const int PARTIAL_HISTOGRAM256_COUNT = 240;
        const int HISTOGRAM256_BIN_COUNT     = 256;

        void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream);
    }
}}}

void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
{
    GpuMat buf;
    calcHist(src, hist, buf, stream);
}

void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream)
{
    using namespace ::cv::gpu::device::hist;

    CV_Assert(src.type() == CV_8UC1);

    hist.create(1, 256, CV_32SC1);

    ensureSizeIsEnough(1, PARTIAL_HISTOGRAM256_COUNT * HISTOGRAM256_BIN_COUNT, CV_32SC1, buf);

    histogram256_gpu(src, hist.ptr<int>(), buf.ptr<unsigned int>(), StreamAccessor::getStream(stream));
}

void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream)
{
    GpuMat hist;
    GpuMat buf;
    equalizeHist(src, dst, hist, buf, stream);
}

void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream)
{
    GpuMat buf;
    equalizeHist(src, dst, hist, buf, stream);
}

void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s)
{
    using namespace ::cv::gpu::device::hist;

    CV_Assert(src.type() == CV_8UC1);

    dst.create(src.size(), src.type());

    int intBufSize;
    nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) );

    int bufSize = static_cast<int>(std::max(256 * 240 * sizeof(int), intBufSize + 256 * sizeof(int)));
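    // A single shared buffer serves both phases: the 240 * 256 partial-histogram scratch used by
    // calcHist, and the nppsIntegral scratch followed by the 256-entry LUT, hence the max() above.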

    ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);

    GpuMat histBuf(1, 256 * 240, CV_32SC1, buf.ptr());
    GpuMat intBuf(1, intBufSize, CV_8UC1, buf.ptr());
    GpuMat lut(1, 256, CV_32S, buf.ptr() + intBufSize);

    calcHist(src, hist, histBuf, s);
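    // The running sum of the histogram (computed below with nppsIntegral) serves as the
    // equalization LUT consumed by the device-side equalizeHist kernel.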

    cudaStream_t stream = StreamAccessor::getStream(s);

    NppStreamHandler h(stream);

    nppSafeCall( nppsIntegral_32s(hist.ptr<Npp32s>(), lut.ptr<Npp32s>(), 256, intBuf.ptr<Npp8u>()) );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );

    equalizeHist_gpu(src, dst, lut.ptr<int>(), stream);
}

////////////////////////////////////////////////////////////////////////
// cornerHarris & minEigenVal

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        void cornerHarris_gpu(int block_size, float k, DevMem2Df Dx, DevMem2Df Dy, DevMem2Df dst, int border_type, cudaStream_t stream);
        void cornerMinEigenVal_gpu(int block_size, DevMem2Df Dx, DevMem2Df Dy, DevMem2Df dst, int border_type, cudaStream_t stream);
    }
}}}

namespace
{
    void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
    {
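        // Normalization factor for the derivatives: it accounts for the Sobel/Scharr kernel gain
        // (doubled for the Scharr case), the averaging window size, and the 8-bit value range.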
        double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;

        if (ksize < 0)
            scale *= 2.;

        if (src.depth() == CV_8U)
            scale *= 255.;

        scale = 1./scale;

        Dx.create(src.size(), CV_32F);
        Dy.create(src.size(), CV_32F);

        if (ksize > 0)
        {
            Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream);
            Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream);
        }
        else
        {
            Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream);
            Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream);
        }
    }
}

bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType)
{
    switch (cpuBorderType)
    {
    case cv::BORDER_REFLECT101:
        gpuBorderType = cv::gpu::BORDER_REFLECT101_GPU;
        return true;
    case cv::BORDER_REPLICATE:
        gpuBorderType = cv::gpu::BORDER_REPLICATE_GPU;
        return true;
    case cv::BORDER_CONSTANT:
        gpuBorderType = cv::gpu::BORDER_CONSTANT_GPU;
        return true;
    case cv::BORDER_REFLECT:
        gpuBorderType = cv::gpu::BORDER_REFLECT_GPU;
        return true;
    case cv::BORDER_WRAP:
        gpuBorderType = cv::gpu::BORDER_WRAP_GPU;
        return true;
    default:
        return false;
    }
    return false;
}

void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType)
{
    GpuMat Dx, Dy;
    cornerHarris(src, dst, Dx, Dy, blockSize, ksize, k, borderType);
}

void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType)
{
    GpuMat buf;
    cornerHarris(src, dst, Dx, Dy, buf, blockSize, ksize, k, borderType);
}

void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream)
{
    using namespace cv::gpu::device::imgproc;

    CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);

    int gpuBorderType;
    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));

    extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);

    dst.create(src.size(), CV_32F);

    cornerHarris_gpu(blockSize, static_cast<float>(k), Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
}

void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType)
{
    GpuMat Dx, Dy;
    cornerMinEigenVal(src, dst, Dx, Dy, blockSize, ksize, borderType);
}

void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType)
{
    GpuMat buf;
    cornerMinEigenVal(src, dst, Dx, Dy, buf, blockSize, ksize, borderType);
}

void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
{
    using namespace ::cv::gpu::device::imgproc;

    CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);

    int gpuBorderType;
    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));

    extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);

    dst.create(src.size(), CV_32F);

    cornerMinEigenVal_gpu(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
}

//////////////////////////////////////////////////////////////////////////////
// mulSpectrums

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);

        void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);
    }
}}}

void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream)
{
    (void)flags;
    using namespace ::cv::gpu::device::imgproc;

    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, DevMem2D_<cufftComplex>, cudaStream_t stream);

    static Caller callers[] = { device::imgproc::mulSpectrums, device::imgproc::mulSpectrums_CONJ };

    CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
    CV_Assert(a.size() == b.size());

    c.create(a.size(), CV_32FC2);

    Caller caller = callers[(int)conjB];
    caller(a, b, c, StreamAccessor::getStream(stream));
}

//////////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums

namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
        void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);

        void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);
    }
}}}

void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream)
{
    (void)flags;
    using namespace ::cv::gpu::device::imgproc;

    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, DevMem2D_<cufftComplex>, cudaStream_t stream);
    static Caller callers[] = { device::imgproc::mulAndScaleSpectrums, device::imgproc::mulAndScaleSpectrums_CONJ };

    CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
    CV_Assert(a.size() == b.size());

    c.create(a.size(), CV_32FC2);

    Caller caller = callers[(int)conjB];
    caller(a, b, scale, c, StreamAccessor::getStream(stream));
}

//////////////////////////////////////////////////////////////////////////////
// dft

void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stream& stream)
{
#ifndef HAVE_CUFFT

    OPENCV_GPU_UNUSED(src);
    OPENCV_GPU_UNUSED(dst);
    OPENCV_GPU_UNUSED(dft_size);
    OPENCV_GPU_UNUSED(flags);
    OPENCV_GPU_UNUSED(stream);

    throw_nogpu();

#else

    CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2);

    // We don't support unpacked output (in the case of real input)
    CV_Assert(!(flags & DFT_COMPLEX_OUTPUT));

    bool is_1d_input = (dft_size.height == 1) || (dft_size.width == 1);
    int is_row_dft = flags & DFT_ROWS;
    int is_scaled_dft = flags & DFT_SCALE;
    int is_inverse = flags & DFT_INVERSE;
    bool is_complex_input = src.channels() == 2;
    bool is_complex_output = !(flags & DFT_REAL_OUTPUT);

    // We don't support real-to-real transform
    CV_Assert(is_complex_input || is_complex_output);

    GpuMat src_data;

    // Make sure we work with a continuous input, as CUFFT can't handle gaps
    src_data = src;
    createContinuous(src.rows, src.cols, src.type(), src_data);
    if (src_data.data != src.data)
        src.copyTo(src_data);

    Size dft_size_opt = dft_size;
    if (is_1d_input && !is_row_dft)
    {
        // If the source matrix is a single column, handle it as a single row
        dft_size_opt.width = std::max(dft_size.width, dft_size.height);
        dft_size_opt.height = std::min(dft_size.width, dft_size.height);
    }

    cufftType dft_type = CUFFT_R2C;
    if (is_complex_input)
        dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;

    CV_Assert(dft_size_opt.width > 1);

    cufftHandle plan;
    if (is_1d_input || is_row_dft)
        cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height);
    else
        cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type);

    cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );

    if (is_complex_input)
    {
        if (is_complex_output)
        {
            createContinuous(dft_size, CV_32FC2, dst);
            cufftSafeCall(cufftExecC2C(
                    plan, src_data.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
                    is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
        }
        else
        {
            createContinuous(dft_size, CV_32F, dst);
            cufftSafeCall(cufftExecC2R(
                    plan, src_data.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
1269 1270 1271 1272
        }
    }
    else
    {
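        // CUFFT R2C output is packed: only width/2 + 1 complex values per
        // transformed row are stored; the rest follow from Hermitian symmetry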
        // The DFT size may have been transposed above (dft_size_opt) for a
        // single-column input; the packed output size must reflect the
        // original orientation
        if (dft_size == dft_size_opt)
            createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, dst);
        else
            createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, dst);

        cufftSafeCall(cufftExecR2C(
                plan, src_data.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
    }

    cufftSafeCall(cufftDestroy(plan));

    if (is_scaled_dft)
        multiply(dst, Scalar::all(1. / dft_size.area()), dst, 1, -1, stream);

#endif
}

//////////////////////////////////////////////////////////////////////////////
// convolve

void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size)
{
    result_size = Size(image_size.width - templ_size.width + 1,
                       image_size.height - templ_size.height + 1);

    block_size = user_block_size;
    if (user_block_size.width == 0 || user_block_size.height == 0)
        block_size = estimateBlockSize(result_size, templ_size);

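    // Round the per-block DFT size up to the next power of two
    // (CUFFT is fastest for power-of-2 sizes)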
    dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.)));
    dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.)));

    // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
    // see CUDA Toolkit 4.1 CUFFT Library Programming Guide
    if (dft_size.width > 8192)
        dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1);
    if (dft_size.height > 8192)
        dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);

    // To avoid wasting time doing small DFTs
    dft_size.width = std::max(dft_size.width, 512);
    dft_size.height = std::max(dft_size.height, 512);

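    // Per-block working buffers: spatial-domain blocks plus their packed
    // spectra (dft_size.width / 2 + 1 complex values per row)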
    createContinuous(dft_size, CV_32F, image_block);
    createContinuous(dft_size, CV_32F, templ_block);
    createContinuous(dft_size, CV_32F, result_data);

    spect_len = dft_size.height * (dft_size.width / 2 + 1);
    createContinuous(1, spect_len, CV_32FC2, image_spect);
    createContinuous(1, spect_len, CV_32FC2, templ_spect);
    createContinuous(1, spect_len, CV_32FC2, result_spect);

    // Use maximum result matrix block size for the estimated DFT block size
    block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
    block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);
}


Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/)
{
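    // Heuristic: split the result into roughly 3x3 blocks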
    int width = (result_size.width + 2) / 3;
    int height = (result_size.height + 2) / 3;
    width = std::min(width, result_size.width);
    height = std::min(height, result_size.height);
    return Size(width, height);
}


void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr)
{
    ConvolveBuf buf;
    convolve(image, templ, result, ccorr, buf);
}

void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
{
    using namespace ::cv::gpu::device::imgproc;

#ifndef HAVE_CUFFT
    throw_nogpu();
#else
    StaticAssert<sizeof(float) == sizeof(cufftReal)>::check();
    StaticAssert<sizeof(float) * 2 == sizeof(cufftComplex)>::check();

    CV_Assert(image.type() == CV_32F);
    CV_Assert(templ.type() == CV_32F);

    buf.create(image.size(), templ.size());
    result.create(buf.result_size, CV_32F);

    Size& block_size = buf.block_size;
    Size& dft_size = buf.dft_size;

    GpuMat& image_block = buf.image_block;
    GpuMat& templ_block = buf.templ_block;
    GpuMat& result_data = buf.result_data;

    GpuMat& image_spect = buf.image_spect;
    GpuMat& templ_spect = buf.templ_spect;
    GpuMat& result_spect = buf.result_spect;

    cufftHandle planR2C, planC2R;
    cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));

    cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
    cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );

    GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
    copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
                   templ_block.cols - templ_roi.cols, 0, Scalar(), stream);

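    // The template spectrum is computed once here and reused for every block
    // of the image below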
    cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(),
                               templ_spect.ptr<cufftComplex>()));

    // Process all blocks of the result matrix
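    // Each image block spans up to dft_size pixels (block_size plus the
    // template overlap); whatever is missing at the borders is zero-padded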
    for (int y = 0; y < result.rows; y += block_size.height)
    {
        for (int x = 0; x < result.cols; x += block_size.width)
        {
            Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
                                std::min(y + dft_size.height, image.rows) - y);
            GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
                             image.step);
            copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
                           0, image_block.cols - image_roi.cols, 0, Scalar(), stream);

            cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
                                       image_spect.ptr<cufftComplex>()));
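            // CUFFT transforms are unnormalized, so the spectrum product is
            // scaled by 1 / dft_size.area() to make the forward/inverse pair exact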
            mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
                                 1.f / dft_size.area(), ccorr, stream);
            cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
                                       result_data.ptr<cufftReal>()));

            Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
                                 std::min(y + block_size.height, result.rows) - y);
            GpuMat result_roi(result_roi_size, result.type(),
                              (void*)(result.ptr<float>(y) + x), result.step);
            GpuMat result_block(result_roi_size, result_data.type(),
                                result_data.ptr(), result_data.step);

            if (stream)
                stream.enqueueCopy(result_block, result_roi);
            else
                result_block.copyTo(result_roi);
        }
    }

    cufftSafeCall(cufftDestroy(planR2C));
    cufftSafeCall(cufftDestroy(planC2R));
#endif
}


//////////////////////////////////////////////////////////////////////////////
// Canny

cv::gpu::CannyBuf::CannyBuf(const GpuMat& dx_, const GpuMat& dy_) : dx(dx_), dy(dy_)
{
    CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size());

    create(dx_.size(), -1);
}

void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
{
    ensureSizeIsEnough(image_size, CV_32SC1, dx);
    ensureSizeIsEnough(image_size, CV_32SC1, dy);

    if (apperture_size == 3)
    {
        ensureSizeIsEnough(image_size, CV_32SC1, dx_buf);
        ensureSizeIsEnough(image_size, CV_32SC1, dy_buf);
    }
    else if(apperture_size > 0)
    {
        if (!filterDX)
            filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
        if (!filterDY)
            filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
    }

    ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, edgeBuf);

    ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1);
    ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2);
}

void cv::gpu::CannyBuf::release()
{
    dx.release();
    dy.release();
    dx_buf.release();
    dy_buf.release();
    edgeBuf.release();
    trackBuf1.release();
    trackBuf2.release();
}

namespace cv { namespace gpu { namespace device
{
    namespace canny
    {
        void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);

        void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
        void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);

        void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);

        void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);

        void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);

        void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);
    }
}}}

namespace
{
    void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
    {
        using namespace ::cv::gpu::device::canny;
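
        // Pipeline: classify pixels against the two thresholds (calcMap), run
        // hysteresis locally within blocks, then globally across blocks, and
        // finally write out the binary edge map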

        calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);

        edgesHysteresisLocal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), dst.rows, dst.cols);

        edgesHysteresisGlobal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), buf.trackBuf2.ptr<ushort2>(), dst.rows, dst.cols);

        getEdges_gpu(buf.edgeBuf, dst, dst.rows, dst.cols);
    }
}

void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
    CannyBuf buf(src.size(), apperture_size);
    Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
}

void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
    using namespace ::cv::gpu::device::canny;

    CV_Assert(src.type() == CV_8UC1);

    if (!TargetArchs::builtWith(SHARED_ATOMICS) || !DeviceInfo().supports(SHARED_ATOMICS))
        CV_Error(CV_StsNotImplemented, "The device doesn't support shared atomics");

    if( low_thresh > high_thresh )
        std::swap( low_thresh, high_thresh);

    dst.create(src.size(), CV_8U);
    dst.setTo(Scalar::all(0));

    buf.create(src.size(), apperture_size);
    buf.edgeBuf.setTo(Scalar::all(0));

    if (apperture_size == 3)
    {
        calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols);

        calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
    }
    else
    {
        buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
        buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));

        calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
    }

    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
}

void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
{
    CannyBuf buf(dx, dy);
    Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
}

void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
{
    using namespace ::cv::gpu::device::canny;

    CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
    CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());

    if( low_thresh > high_thresh )
        std::swap( low_thresh, high_thresh);

    dst.create(dx.size(), CV_8U);
    dst.setTo(Scalar::all(0));

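    // Reuse the caller-provided gradients; only the gradient magnitude still
    // has to be computed before edge tracking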
    buf.dx = dx; buf.dy = dy;
    buf.create(dx.size(), -1);
    buf.edgeBuf.setTo(Scalar::all(0));

    calcMagnitude_gpu(dx, dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);

    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
}

#endif /* !defined (HAVE_CUDA) */
