matrix_operations.cpp 36.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Niko Li, newlife20080214@gmail.com
//    Yao Wang, bitwangyaoyao@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

50
// Alignment granularity used by GPU_MATRIX_MALLOC_STEP.
#define ALIGN 32
// Rounds `step` up to the next multiple of ALIGN.
// The whole expansion is parenthesized so the macro can be used safely as an operand of a
// larger expression (e.g. `x % GPU_MATRIX_MALLOC_STEP(s)`).
#define GPU_MATRIX_MALLOC_STEP(step) ((((step) + ALIGN - 1) / ALIGN) * ALIGN)

using namespace cv;
using namespace cv::ocl;

////////////////////////////////////////////////////////////////////////
//////////////////////////////// oclMat ////////////////////////////////
////////////////////////////////////////////////////////////////////////

//helper routines
namespace cv
{
    namespace ocl
    {
        ///////////////////////////OpenCL kernel strings///////////////////////////
        extern const char *operator_copyToM;
        extern const char *operator_convertTo;
        extern const char *operator_setTo;
        extern const char *operator_setToM;
70
        extern const char *convertC3C4;
71 72
        extern DevMemType gDeviceMemType;
        extern DevMemRW gDeviceMemRW;
73 74 75
    }
}

76

77 78
////////////////////////////////////////////////////////////////////////
// convert_C3C4
79
static void convert_C3C4(const cl_mem &src, oclMat &dst)
80
{
81 82
    int dstStep_in_pixel = dst.step1() / dst.oclchannels();
    int pixel_end = dst.wholecols * dst.wholerows - 1;
83
    Context *clCxt = dst.clCxt;
84
    String kernelName = "convertC3C4";
85
    char compile_option[32];
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
    switch(dst.depth())
    {
    case 0:
        sprintf(compile_option, "-D GENTYPE4=uchar4");
        break;
    case 1:
        sprintf(compile_option, "-D GENTYPE4=char4");
        break;
    case 2:
        sprintf(compile_option, "-D GENTYPE4=ushort4");
        break;
    case 3:
        sprintf(compile_option, "-D GENTYPE4=short4");
        break;
    case 4:
        sprintf(compile_option, "-D GENTYPE4=int4");
        break;
    case 5:
        sprintf(compile_option, "-D GENTYPE4=float4");
        break;
    case 6:
        sprintf(compile_option, "-D GENTYPE4=double4");
        break;
109
    default:
110
        CV_Error(Error::StsUnsupportedFormat, "unknown depth");
111
    }
112 113 114 115 116 117 118
    std::vector< std::pair<size_t, const void *> > args;
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src));
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.wholecols));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.wholerows));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&pixel_end));
119

120
    size_t globalThreads[3] = {((dst.wholecols * dst.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1};
121 122
    size_t localThreads[3] = {256, 1, 1};

123
    openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
124 125 126
}
////////////////////////////////////////////////////////////////////////
// convert_C4C3
127
static void convert_C4C3(const oclMat &src, cl_mem &dst)
128
{
129 130
    int srcStep_in_pixel = src.step1() / src.oclchannels();
    int pixel_end = src.wholecols * src.wholerows - 1;
131
    Context *clCxt = src.clCxt;
132
    String kernelName = "convertC4C3";
133
    char compile_option[32];
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
    switch(src.depth())
    {
    case 0:
        sprintf(compile_option, "-D GENTYPE4=uchar4");
        break;
    case 1:
        sprintf(compile_option, "-D GENTYPE4=char4");
        break;
    case 2:
        sprintf(compile_option, "-D GENTYPE4=ushort4");
        break;
    case 3:
        sprintf(compile_option, "-D GENTYPE4=short4");
        break;
    case 4:
        sprintf(compile_option, "-D GENTYPE4=int4");
        break;
    case 5:
        sprintf(compile_option, "-D GENTYPE4=float4");
        break;
    case 6:
        sprintf(compile_option, "-D GENTYPE4=double4");
        break;
157
    default:
158
        CV_Error(Error::StsUnsupportedFormat, "unknown depth");
159
    }
160

161 162 163 164 165 166 167
    std::vector< std::pair<size_t, const void *> > args;
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.wholecols));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.wholerows));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&pixel_end));
168

169
    size_t globalThreads[3] = {((src.wholecols * src.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1};
170 171
    size_t localThreads[3] = {256, 1, 1};

172
    openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
173 174 175 176 177 178 179 180
}

void cv::ocl::oclMat::upload(const Mat &m)
{
    CV_DbgAssert(!m.empty());
    Size wholeSize;
    Point ofs;
    m.locateROI(wholeSize, ofs);
181 182
    if(m.channels() == 3)
    {
183
        create(wholeSize, m.type());
184 185 186
        int pitch = wholeSize.width * 3 * m.elemSize1();
        int tail_padding = m.elemSize1() * 3072;
        int err;
187 188 189 190 191
        cl_mem temp;
        if(gDeviceMemType!=DEVICE_MEM_UHP && gDeviceMemType!=DEVICE_MEM_CHP){
            temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
                                  (pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
            openCLVerifyCall(err);
192
            openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step,
193 194 195 196 197 198 199
                           wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
        }
        else{
            temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR,
                                  (pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, m.datastart, &err);
            openCLVerifyCall(err);
        }
200

201

Niko's avatar
Niko committed
202
        convert_C3C4(temp, *this);
203
        openCLSafeCall(clReleaseMemObject(temp));
204 205
    }
    else
206
    {
207 208 209
        // try to use host ptr
        createEx(wholeSize, m.type(), gDeviceMemRW, gDeviceMemType, m.datastart);
        if(gDeviceMemType!=DEVICE_MEM_UHP && gDeviceMemType!=DEVICE_MEM_CHP)
210
            openCLMemcpy2D(clCxt, data, step, m.datastart, m.step,
211
                           wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
212
    }
213 214 215 216 217 218

    rows = m.rows;
    cols = m.cols;
    offset = ofs.y * step + ofs.x * elemSize();
}

219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
// Wraps this oclMat in an _InputArray: flags are set to OCL_MAT and obj points at this object.
cv::ocl::oclMat::operator cv::_InputArray()
{
    _InputArray wrapper;
    wrapper.flags = cv::_InputArray::OCL_MAT;
    wrapper.obj   = reinterpret_cast<void *>(this);
    return wrapper;
}

// Wraps this oclMat in an _OutputArray: flags are set to OCL_MAT and obj points at this object.
cv::ocl::oclMat::operator cv::_OutputArray()
{
    _OutputArray wrapper;
    wrapper.flags = cv::_InputArray::OCL_MAT;
    wrapper.obj   = reinterpret_cast<void *>(this);
    return wrapper;
}

235
// Returns the oclMat that `src` wraps. Asserts that the OCL_MAT flag is set on `src`.
cv::ocl::oclMat& cv::ocl::getOclMatRef(InputArray src)
{
    CV_Assert(src.flags & cv::_InputArray::OCL_MAT);
    oclMat *wrapped = reinterpret_cast<oclMat*>(src.obj);
    return *wrapped;
}

241
// Returns the oclMat that `src` wraps. Asserts that the OCL_MAT flag is set on `src`.
cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src)
{
    CV_Assert(src.flags & cv::_InputArray::OCL_MAT);
    oclMat *wrapped = reinterpret_cast<oclMat*>(src.obj);
    return *wrapped;
}

247 248 249
// Copies this matrix back to the host matrix `m`.
//
// `m` is (re)created with the whole-matrix size (wholerows x wholecols) and then narrowed with
// adjustROI() so that it covers the same ROI as this oclMat. For 3-channel matrices the data
// first goes through convert_C4C3() into a temporary buffer with a tightly packed 3-channel
// pitch, which is then read back; other channel counts are read back directly.
void cv::ocl::oclMat::download(cv::Mat &m) const
{
    CV_DbgAssert(!this->empty());

    m.create(wholerows, wholecols, type());

    if(m.channels() != 3)
    {
        openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost);
    }
    else
    {
        // Packed 3-channel row size, and the staging buffer size rounded up to a multiple of
        // tail_padding bytes.
        int pitch = wholecols * 3 * m.elemSize1();
        int tail_padding = m.elemSize1() * 3072;
        int stagingBytes = (pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding;

        int err;
        cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
                                     stagingBytes, 0, &err);
        openCLVerifyCall(err);

        convert_C4C3(*this, temp);
        openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
        openCLSafeCall(clReleaseMemObject(temp));
    }

    // Shrink the host matrix from the whole buffer down to this object's ROI.
    Size wholesize;
    Point ofs;
    locateROI(wholesize, ofs);
    m.adjustROI(-ofs.y, ofs.y + rows - wholerows, -ofs.x, ofs.x + cols - wholecols);
}

/////////////////////common//////////////////////////////////////
// Integer division of `total` by `grain`, rounded up for non-negative `total` and positive
// `grain`: the number of `grain`-sized chunks needed to cover `total` items.
inline int divUp(int total, int grain)
{
    const int biased = total + grain - 1;
    return biased / grain;
}
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////
306
static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask, String kernelName)
307 308
{
    CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols &&
niko's avatar
niko committed
309
                  src.rows == dst.rows && src.cols == dst.cols
310
                  && mask.type() == CV_8UC1);
311

312
    std::vector<std::pair<size_t , const void *> > args;
313

314
    String string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"},
niko's avatar
niko committed
315 316 317
        {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"},
        {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
        {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
318
    };
319 320
    char compile_option[32];
    sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
321 322 323
    size_t localThreads[3] = {16, 16, 1};
    size_t globalThreads[3];

niko's avatar
niko committed
324
    globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0];
325 326 327 328 329 330
    globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1];
    globalThreads[2] = 1;

    int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
    int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();

331 332 333 334 335 336 337 338 339 340 341
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset ));
342 343

    openCLExecuteKernel(dst.clCxt , &operator_copyToM, kernelName, globalThreads,
344
                        localThreads, args, -1, -1, compile_option);
345 346 347 348 349 350 351
}

// Copies this matrix's ROI into `m`, (re)creating `m` with the same size and type first.
void cv::ocl::oclMat::copyTo( oclMat &m ) const
{
    CV_DbgAssert(!this->empty());

    m.create(size(), type());

    // Destination (buffer, step, offset) first, then source buffer and step, the number of
    // bytes per row, the number of rows, and finally the source offset.
    openCLCopyBuffer2D(clCxt,
                       m.data, m.step, m.offset,
                       data, step,
                       cols * elemSize(), rows,
                       offset);
}

// Masked copy into `mat`. An empty mask degenerates to the plain copyTo(); otherwise `mat` is
// (re)created with this matrix's size and type and the "copy_to_with_mask" kernel is run.
void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
{
    if (mask.empty())
    {
        copyTo(mat);
        return;
    }

    mat.create(size(), type());
    copy_to_with_mask(*this, mat, mask, "copy_to_with_mask");
}

///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
371
static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
372
{
373
    String kernelName = "convert_to_S";
Andrey Kamaev's avatar
Andrey Kamaev committed
374
    std::stringstream idxStr;
375
    idxStr << src.depth();
Andrey Kamaev's avatar
Andrey Kamaev committed
376
    kernelName = kernelName + idxStr.str().c_str();
377 378
    float alpha_f = alpha, beta_f = beta;
    CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
379
    std::vector<std::pair<size_t , const void *> > args;
380 381 382 383 384 385 386 387 388 389 390
    size_t localThreads[3] = {16, 16, 1};
    size_t globalThreads[3];
    globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
    globalThreads[2] = 1;
    int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
    int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
    if(dst.type() == CV_8UC1)
    {
        globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
    }
391 392 393 394 395 396 397 398 399 400
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
    args.push_back( std::make_pair( sizeof(cl_float) , (void *)&alpha_f ));
    args.push_back( std::make_pair( sizeof(cl_float) , (void *)&beta_f ));
401
    openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
402
                        localThreads, args, dst.oclchannels(), dst.depth());
403 404 405 406 407 408 409 410 411 412 413
}
// Converts this matrix into `dst` with the depth taken from `rtype`, applying the
// `alpha` / `beta` parameters through convert_run().
//
// A negative `rtype` means "keep the current type". Otherwise only the depth of `rtype` is
// used; the channel count is always this matrix's own. When the depth is unchanged and
// alpha == 1, beta == 0 (within machine epsilon) the call reduces to copyTo().
void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
{
    const double eps = std::numeric_limits<double>::epsilon();
    const bool noScale = fabs(alpha - 1) < eps && fabs(beta) < eps;

    rtype = (rtype < 0) ? type() : CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());

    const int sdepth = depth();
    const int ddepth = CV_MAT_DEPTH(rtype);
    if( sdepth == ddepth && noScale )
    {
        copyTo(dst);
        return;
    }

    // In-place conversion to a different depth: read from a copy held in `temp`, because
    // dst.create() below is called on the very object we would otherwise be reading from.
    oclMat temp;
    const oclMat *psrc = this;
    if( sdepth != ddepth && psrc == &dst )
    {
        temp = *this;
        psrc = &temp;
    }

    dst.create( size(), rtype );
    convert_run(*psrc, dst, alpha, beta);
}

///////////////////////////////////////////////////////////////////////////
//////////////////////////////// setTo ////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// Assigning a Scalar forwards to setTo(s) and returns this matrix.
oclMat &cv::ocl::oclMat::operator = (const Scalar &s)
{
    setTo(s);
    return *this;
}
442
static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, String kernelName)
443
{
444
    std::vector<std::pair<size_t , const void *> > args;
445

446 447 448 449 450 451 452 453 454 455
    size_t localThreads[3] = {16, 16, 1};
    size_t globalThreads[3];
    globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
    globalThreads[2] = 1;
    int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
    if(dst.type() == CV_8UC1)
    {
        globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
    }
456 457 458 459 460 461 462 463 464 465 466
    char compile_option[32];
    union sc
    {
        cl_uchar4 uval;
        cl_char4  cval;
        cl_ushort4 usval;
        cl_short4 shval;
        cl_int4 ival;
        cl_float4 fval;
        cl_double4 dval;
    } val;
467 468
    switch(dst.depth())
    {
niko's avatar
niko committed
469
    case CV_8U:
470 471 472 473 474 475 476 477
        val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
        val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
        val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
        val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=uchar");
478
            args.push_back( std::make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
479 480 481
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=uchar4");
482
            args.push_back( std::make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
483 484
            break;
        default:
485
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
486
        }
487
        break;
niko's avatar
niko committed
488
    case CV_8S:
489 490 491 492 493 494 495 496
        val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
        val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
        val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
        val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=char");
497
            args.push_back( std::make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
498 499 500
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=char4");
501
            args.push_back( std::make_pair( sizeof(cl_char4) , (void *)&val.cval ));
502 503
            break;
        default:
504
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
505
        }
506
        break;
niko's avatar
niko committed
507
    case CV_16U:
508 509 510 511 512 513 514 515
        val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
        val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
        val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
        val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=ushort");
516
            args.push_back( std::make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
517 518 519
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=ushort4");
520
            args.push_back( std::make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
521 522
            break;
        default:
523
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
524
        }
525
        break;
niko's avatar
niko committed
526
    case CV_16S:
527 528 529 530 531 532 533 534
        val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
        val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
        val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
        val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=short");
535
            args.push_back( std::make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
536 537 538
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=short4");
539
            args.push_back( std::make_pair( sizeof(cl_short4) , (void *)&val.shval ));
540 541
            break;
        default:
542
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
543
        }
544
        break;
niko's avatar
niko committed
545
    case CV_32S:
546 547 548 549 550 551 552 553
        val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
        val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
        val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
        val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=int");
554
            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
555 556 557 558 559 560
            break;
        case 2:
            sprintf(compile_option, "-D GENTYPE=int2");
            cl_int2 i2val;
            i2val.s[0] = val.ival.s[0];
            i2val.s[1] = val.ival.s[1];
561
            args.push_back( std::make_pair( sizeof(cl_int2) , (void *)&i2val ));
562 563 564
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=int4");
565
            args.push_back( std::make_pair( sizeof(cl_int4) , (void *)&val.ival ));
566 567
            break;
        default:
568
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
569
        }
570
        break;
niko's avatar
niko committed
571
    case CV_32F:
572 573 574 575 576 577 578 579
        val.fval.s[0] = scalar.val[0];
        val.fval.s[1] = scalar.val[1];
        val.fval.s[2] = scalar.val[2];
        val.fval.s[3] = scalar.val[3];
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=float");
580
            args.push_back( std::make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
581 582 583
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=float4");
584
            args.push_back( std::make_pair( sizeof(cl_float4) , (void *)&val.fval ));
585 586
            break;
        default:
587
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
588
        }
589
        break;
niko's avatar
niko committed
590
    case CV_64F:
591 592 593 594 595 596 597 598
        val.dval.s[0] = scalar.val[0];
        val.dval.s[1] = scalar.val[1];
        val.dval.s[2] = scalar.val[2];
        val.dval.s[3] = scalar.val[3];
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=double");
599
            args.push_back( std::make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
600 601 602
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=double4");
603
            args.push_back( std::make_pair( sizeof(cl_double4) , (void *)&val.dval ));
604 605
            break;
        default:
606
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
607
        }
608
        break;
609
    default:
610
        CV_Error(Error::StsUnsupportedFormat, "unknown depth");
611
    }
612
#ifdef CL_VERSION_1_2
613 614 615 616
    //this enables backwards portability to
    //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
    if(Context::getContext()->supportsFeature(Context::CL_VER_1_2) &&
        dst.offset == 0 && dst.cols == dst.wholecols)
617
    {
618
        clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(),
619
            (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
620 621
    }
    else
622
#endif
623
    {
624 625 626 627 628
        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
629
        openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
630
            localThreads, args, -1, -1, compile_option);
631
    }
632 633
}

634
static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, String kernelName)
635 636
{
    CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols);
637
    std::vector<std::pair<size_t , const void *> > args;
638 639 640 641 642 643
    size_t localThreads[3] = {16, 16, 1};
    size_t globalThreads[3];
    globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
    globalThreads[2] = 1;
    int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
644 645 646 647 648 649 650 651 652 653 654
    char compile_option[32];
    union sc
    {
        cl_uchar4 uval;
        cl_char4  cval;
        cl_ushort4 usval;
        cl_short4 shval;
        cl_int4 ival;
        cl_float4 fval;
        cl_double4 dval;
    } val;
655 656
    switch(dst.depth())
    {
niko's avatar
niko committed
657
    case CV_8U:
658 659 660 661 662 663 664 665
        val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
        val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
        val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
        val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=uchar");
666
            args.push_back( std::make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
667 668 669
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=uchar4");
670
            args.push_back( std::make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
671 672
            break;
        default:
673
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
674
        }
675
        break;
niko's avatar
niko committed
676
    case CV_8S:
677 678 679 680 681 682 683 684
        val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
        val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
        val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
        val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=char");
685
            args.push_back( std::make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
686 687 688
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=char4");
689
            args.push_back( std::make_pair( sizeof(cl_char4) , (void *)&val.cval ));
690 691
            break;
        default:
692
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
693
        }
694
        break;
niko's avatar
niko committed
695
    case CV_16U:
696 697 698 699 700 701 702 703
        val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
        val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
        val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
        val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=ushort");
704
            args.push_back( std::make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
705 706 707
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=ushort4");
708
            args.push_back( std::make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
709 710
            break;
        default:
711
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
712
        }
713
        break;
niko's avatar
niko committed
714
    case CV_16S:
715 716 717 718 719 720 721 722
        val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
        val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
        val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
        val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=short");
723
            args.push_back( std::make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
724 725 726
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=short4");
727
            args.push_back( std::make_pair( sizeof(cl_short4) , (void *)&val.shval ));
728 729
            break;
        default:
730
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
731
        }
732
        break;
niko's avatar
niko committed
733
    case CV_32S:
734 735 736 737 738 739 740 741
        val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
        val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
        val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
        val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=int");
742
            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
743 744 745
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=int4");
746
            args.push_back( std::make_pair( sizeof(cl_int4) , (void *)&val.ival ));
747 748
            break;
        default:
749
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
750
        }
751
        break;
niko's avatar
niko committed
752
    case CV_32F:
753 754 755 756 757 758 759 760
        val.fval.s[0] = scalar.val[0];
        val.fval.s[1] = scalar.val[1];
        val.fval.s[2] = scalar.val[2];
        val.fval.s[3] = scalar.val[3];
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=float");
761
            args.push_back( std::make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
762 763 764
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=float4");
765
            args.push_back( std::make_pair( sizeof(cl_float4) , (void *)&val.fval ));
766 767
            break;
        default:
768
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
769
        }
770
        break;
niko's avatar
niko committed
771
    case CV_64F:
772 773 774 775 776 777 778 779
        val.dval.s[0] = scalar.val[0];
        val.dval.s[1] = scalar.val[1];
        val.dval.s[2] = scalar.val[2];
        val.dval.s[3] = scalar.val[3];
        switch(dst.oclchannels())
        {
        case 1:
            sprintf(compile_option, "-D GENTYPE=double");
780
            args.push_back( std::make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
781 782 783
            break;
        case 4:
            sprintf(compile_option, "-D GENTYPE=double4");
784
            args.push_back( std::make_pair( sizeof(cl_double4) , (void *)&val.dval ));
785 786
            break;
        default:
787
            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
788
        }
789
        break;
790
    default:
791
        CV_Error(Error::StsUnsupportedFormat, "unknown depth");
792
    }
793 794 795 796 797 798 799 800
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel ));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset ));
801
    openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads,
802
                        localThreads, args, -1, -1, compile_option);
803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820
}

oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask)
{
    // Assigns `scalar` to this matrix, restricted by `mask` when the mask is non-empty,
    // and returns *this. The mask type must be CV_8UC1 and this matrix's depth must lie
    // in the range [0, 6].
    CV_Assert(mask.type() == CV_8UC1);
    CV_Assert( this->depth() >= 0 && this->depth() <= 6 );
    CV_DbgAssert( !this->empty());

    if (!mask.empty())
    {
        set_to_withmask_run(*this, scalar, mask, "set_to_with_mask");
        return *this;
    }

    // Unmasked fill: CV_8UC1 matrices are dispatched under a dedicated kernel name.
    const char *kernel = (type() == CV_8UC1) ? "set_to_without_mask_C1_D0"
                                             : "set_to_without_mask";
    set_to_withoutmask_run(*this, scalar, kernel);

    return *this;
}

oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
{
    // Returns a header copy of this matrix reinterpreted with `new_cn` channels
    // (0 keeps the current oclchannels() count). An explicit `new_rows` other than 0 or the
    // current row count is rejected up front; the row count is only changed internally when
    // the requested channel count does not fit the current row width.
    const bool explicitRowChange = (new_rows != 0) && (new_rows != rows);
    if (explicitRowChange)
    {
        CV_Error( Error::StsBadFunc, "oclMat's number of rows can not be changed for current version" );
    }

    oclMat result = *this;

    const int oldCn = oclchannels();
    if (new_cn == 0)
        new_cn = oldCn;

    // Width of one row measured in single-channel elements.
    int rowWidth = cols * oldCn;

    const bool cnDoesNotFit = (new_cn > rowWidth) || (rowWidth % new_cn != 0);
    if (cnDoesNotFit && new_rows == 0)
        new_rows = rows * rowWidth / new_cn;

    if (new_rows != 0 && new_rows != rows)
    {
        const int elemCount = rowWidth * rows;

        if (!isContinuous())
            CV_Error(Error::BadStep, "The matrix is not continuous, thus its number of rows can not be changed");

        if ((unsigned)new_rows > (unsigned)elemCount)
            CV_Error(Error::StsOutOfRange, "Bad new number of rows");

        rowWidth = elemCount / new_rows;

        if (rowWidth * new_rows != elemCount)
            CV_Error(Error::StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");

        result.rows = new_rows;
        result.step = rowWidth * elemSize1();
    }

    const int newCols = rowWidth / new_cn;
    if (newCols * new_cn != rowWidth)
        CV_Error(Error::BadNumChannels, "The total width is not divisible by the new number of channels");

    result.cols = newCols;
    result.wholecols = newCols;
    // Replace the channel-count bits of the type flags with the new channel count.
    result.flags = (result.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);

    return result;
}

925
void cv::ocl::oclMat::createEx(Size size, int type,
926
                               DevMemRW rw_type, DevMemType mem_type, void* hptr)
927
{
928
    createEx(size.height, size.width, type, rw_type, mem_type, hptr);
929 930
}

931
void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
932 933 934 935
{
    createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType);
}

936
void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type,
937
                               DevMemRW rw_type, DevMemType mem_type, void* hptr)
938 939 940
{
    clCxt = Context::getContext();
    /* core logic */
Andrey Kamaev's avatar
Andrey Kamaev committed
941
    _type &= Mat::TYPE_MASK;
942 943 944 945 946 947 948 949 950 951 952 953 954 955 956
    if( rows == _rows && cols == _cols && type() == _type && data )
        return;
    if( data )
        release();
    CV_DbgAssert( _rows >= 0 && _cols >= 0 );
    if( _rows > 0 && _cols > 0 )
    {
        flags = Mat::MAGIC_VAL + _type;
        rows = _rows;
        cols = _cols;
        wholerows = _rows;
        wholecols = _cols;
        size_t esz = elemSize();

        void *dev_ptr;
957
        openCLMallocPitch(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols),
958
                            rows, rw_type, mem_type, hptr);
959

960
        if (esz * cols == step)
961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987
            flags |= Mat::CONTINUOUS_FLAG;

        int64 _nettosize = (int64)step * rows;
        size_t nettosize = (size_t)_nettosize;

        datastart = data = (uchar *)dev_ptr;
        dataend = data + nettosize;

        refcount = (int *)fastMalloc(sizeof(*refcount));
        *refcount = 1;
    }
}

void cv::ocl::oclMat::release()
{
    // Gives up this header's reference to the data and resets the header to an empty state.
    // The counter and the buffer at datastart are freed only when CV_XADD(refcount, -1)
    // yields 1 (presumably the counter value before the decrement, i.e. last owner).
    if (refcount)
    {
        const bool lastOwner = (CV_XADD(refcount, -1) == 1);
        if (lastOwner)
        {
            fastFree(refcount);
            openCLFree(datastart);
        }
    }

    data = 0;
    datastart = 0;
    dataend = 0;

    step = 0;
    rows = 0;
    cols = 0;

    offset = 0;
    wholerows = 0;
    wholecols = 0;

    refcount = 0;
}

yao's avatar
yao committed
988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010
oclMat& cv::ocl::oclMat::operator+=( const oclMat& m )
{
    // Compound addition: calls add() with this matrix as both first operand and destination.
    oclMat &self = *this;
    add(self, m, self);
    return self;
}

oclMat& cv::ocl::oclMat::operator-=( const oclMat& m )
{
    // Compound subtraction: calls subtract() with this matrix as both first operand and destination.
    oclMat &self = *this;
    subtract(self, m, self);
    return self;
}

oclMat& cv::ocl::oclMat::operator*=( const oclMat& m )
{
    // Compound multiplication: calls multiply() with this matrix as both first operand and destination.
    oclMat &self = *this;
    multiply(self, m, self);
    return self;
}

oclMat& cv::ocl::oclMat::operator/=( const oclMat& m )
{
    // Compound division: calls divide() with this matrix as both first operand and destination.
    oclMat &self = *this;
    divide(self, m, self);
    return self;
}