Commit b6b190df authored by Jin Ma's avatar Jin Ma

Rewrote moments of opencl version.

parent 957c85e9
...@@ -1518,7 +1518,12 @@ namespace cv ...@@ -1518,7 +1518,12 @@ namespace cv
float pos, oclMat &newFrame, oclMat &buf); float pos, oclMat &newFrame, oclMat &buf);
//! computes moments of the rasterized shape or a vector of points //! computes moments of the rasterized shape or a vector of points
CV_EXPORTS Moments ocl_moments(InputArray _array, bool binaryImage); //! _array should be a vector a points standing for the contour
CV_EXPORTS Moments ocl_moments(InputArray contour);
//! src should be a general image uploaded to the GPU.
//! the supported oclMat type are CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 and CV_64FC1
//! to use type of CV_64FC1, the GPU should support CV_64FC1
CV_EXPORTS Moments ocl_moments(oclMat& src, bool binary);
class CV_EXPORTS StereoBM_OCL class CV_EXPORTS StereoBM_OCL
{ {
......
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
// //
// * Redistribution's in binary form must reproduce the above copyright notice, // * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation // this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution. // and/or other oclMaterials provided with the distribution.
// //
// * The name of the copyright holders may not be used to endorse or promote products // * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission. // derived from this software without specific prior written permission.
...@@ -49,41 +49,42 @@ ...@@ -49,41 +49,42 @@
using namespace perf; using namespace perf;
using std::tr1::tuple; using std::tr1::tuple;
using std::tr1::get; using std::tr1::get;
using namespace cv;
using namespace cv::ocl;
using namespace cvtest;
using namespace testing;
using namespace std;
///////////// Moments ////////////////////////
typedef Size_MatType MomentsFixture; ///////////// Moments ////////////////////////
//*! performance of image
typedef tuple<Size, MatType, bool> MomentsParamType;
typedef TestBaseWithParam<MomentsParamType> MomentsFixture;
PERF_TEST_P(MomentsFixture, DISABLED_Moments, PERF_TEST_P(MomentsFixture, Moments,
::testing::Combine(OCL_TYPICAL_MAT_SIZES, ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
OCL_PERF_ENUM(CV_8UC1, CV_16SC1, CV_32FC1, CV_64FC1))) // TODO does not work properly (see below) OCL_PERF_ENUM(CV_8UC1, CV_16SC1, CV_16UC1, CV_32FC1, CV_64FC1), ::testing::Values(false, true)))
{ {
const Size_MatType_t params = GetParam(); const MomentsParamType params = GetParam();
const Size srcSize = get<0>(params); const Size srcSize = get<0>(params);
const int type = get<1>(params); const int type = get<1>(params);
const bool binaryImage = get<2>(params);
Mat src(srcSize, type), dst(7, 1, CV_64F); Mat src(srcSize, type), dst(7, 1, CV_64F);
const bool binaryImage = false; randu(src, 0, 255);
cv::Moments mom;
declare.in(src, WARMUP_RNG).out(dst);
oclMat src_d(src);
cv::Moments mom;
if (RUN_OCL_IMPL) if (RUN_OCL_IMPL)
{ {
ocl::oclMat oclSrc(src); OCL_TEST_CYCLE() mom = cv::ocl::ocl_moments(src_d, binaryImage);
OCL_TEST_CYCLE() mom = cv::ocl::ocl_moments(oclSrc, binaryImage); // TODO Use oclSrc
cv::HuMoments(mom, dst);
SANITY_CHECK(dst);
} }
else if (RUN_PLAIN_IMPL) else if (RUN_PLAIN_IMPL)
{ {
TEST_CYCLE() mom = cv::moments(src, binaryImage); TEST_CYCLE() mom = cv::moments(src, binaryImage);
cv::HuMoments(mom, dst);
SANITY_CHECK(dst);
} }
else else
OCL_PERF_ELSE OCL_PERF_ELSE
cv::HuMoments(mom, dst);
SANITY_CHECK(dst, 1e-3);
} }
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
// //
// * Redistribution's in binary form must reproduce the above copyright notice, // * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation // this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution. // and/or other oclMaterials provided with the distribution.
// //
// * The name of the copyright holders may not be used to endorse or promote products // * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission. // derived from this software without specific prior written permission.
...@@ -46,294 +46,342 @@ ...@@ -46,294 +46,342 @@
#include "precomp.hpp" #include "precomp.hpp"
#include "opencl_kernels.hpp" #include "opencl_kernels.hpp"
#if defined _MSC_VER
#define snprintf sprintf_s
#endif
namespace cv namespace cv
{ {
namespace ocl namespace ocl
{
// The function calculates center of gravity and the central second order moments
static void icvCompleteMomentState( CvMoments* moments )
{
double cx = 0, cy = 0;
double mu20, mu11, mu02;
assert( moments != 0 );
moments->inv_sqrt_m00 = 0;
if( fabs(moments->m00) > DBL_EPSILON )
{
double inv_m00 = 1. / moments->m00;
cx = moments->m10 * inv_m00;
cy = moments->m01 * inv_m00;
moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) );
}
// mu20 = m20 - m10*cx
mu20 = moments->m20 - moments->m10 * cx;
// mu11 = m11 - m10*cy
mu11 = moments->m11 - moments->m10 * cy;
// mu02 = m02 - m01*cy
mu02 = moments->m02 - moments->m01 * cy;
moments->mu20 = mu20;
moments->mu11 = mu11;
moments->mu02 = mu02;
// mu30 = m30 - cx*(3*mu20 + cx*m10)
moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10);
mu11 += mu11;
// mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20
moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20;
// mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02
moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02;
// mu03 = m03 - cy*(3*mu02 + cy*m01)
moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01);
}
static void icvContourMoments( CvSeq* contour, CvMoments* mom )
{
if( contour->total )
{ {
CvSeqReader reader; // The function calculates center of gravity and the central second order moments
int lpt = contour->total; static void icvCompleteMomentState( CvMoments* moments )
double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03; {
double cx = 0, cy = 0;
cvStartReadSeq( contour, &reader, 0 ); double mu20, mu11, mu02;
size_t reader_size = lpt << 1; assert( moments != 0 );
cv::Mat reader_mat(1,reader_size,CV_32FC1); moments->inv_sqrt_m00 = 0;
bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2; if( fabs(moments->m00) > DBL_EPSILON )
{
double inv_m00 = 1. / moments->m00;
cx = moments->m10 * inv_m00;
cy = moments->m01 * inv_m00;
moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) );
}
if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float) // mu20 = m20 - m10*cx
{ mu20 = moments->m20 - moments->m10 * cx;
CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); // mu11 = m11 - m10*cy
mu11 = moments->m11 - moments->m10 * cy;
// mu02 = m02 - m01*cy
mu02 = moments->m02 - moments->m01 * cy;
moments->mu20 = mu20;
moments->mu11 = mu11;
moments->mu02 = mu02;
// mu30 = m30 - cx*(3*mu20 + cx*m10)
moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10);
mu11 += mu11;
// mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20
moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20;
// mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02
moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02;
// mu03 = m03 - cy*(3*mu02 + cy*m01)
moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01);
} }
if( is_float )
static void icvContourMoments( CvSeq* contour, CvMoments* mom )
{ {
for(size_t i = 0; i < reader_size; ++i) if( contour->total )
{ {
reader_mat.at<float>(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x; CvSeqReader reader;
reader_mat.at<float>(0, i) = ((CvPoint2D32f*)(reader.ptr))->y; int lpt = contour->total;
CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03;
cvStartReadSeq( contour, &reader, 0 );
size_t reader_size = lpt << 1;
cv::Mat reader_mat(1,reader_size,CV_32FC1);
bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float)
{
CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
}
if( is_float )
{
for(size_t i = 0; i < reader_size; ++i)
{
reader_mat.at<float>(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x;
reader_mat.at<float>(0, i) = ((CvPoint2D32f*)(reader.ptr))->y;
CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
}
}
else
{
for(size_t i = 0; i < reader_size; ++i)
{
reader_mat.at<float>(0, i++) = ((CvPoint*)(reader.ptr))->x;
reader_mat.at<float>(0, i) = ((CvPoint*)(reader.ptr))->y;
CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
}
}
cv::ocl::oclMat dst_a(10, lpt, CV_64FC1);
cv::ocl::oclMat reader_oclmat(reader_mat);
int llength = std::min(lpt,128);
size_t localThreads[3] = { llength, 1, 1};
size_t globalThreads[3] = { lpt, 1, 1};
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_int) , (void *)&contour->total ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a.data ));
cl_int dst_step = (cl_int)dst_a.step;
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step ));
char builOption[128];
snprintf(builOption, 128, "-D CV_8UC1");
openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1, builOption);
cv::Mat dst(dst_a);
a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
{
for (int i = 0; i < contour->total; ++i)
{
a00 += dst.at<cl_long>(0, i);
a10 += dst.at<cl_long>(1, i);
a01 += dst.at<cl_long>(2, i);
a20 += dst.at<cl_long>(3, i);
a11 += dst.at<cl_long>(4, i);
a02 += dst.at<cl_long>(5, i);
a30 += dst.at<cl_long>(6, i);
a21 += dst.at<cl_long>(7, i);
a12 += dst.at<cl_long>(8, i);
a03 += dst.at<cl_long>(9, i);
}
}
else
{
a00 = cv::sum(dst.row(0))[0];
a10 = cv::sum(dst.row(1))[0];
a01 = cv::sum(dst.row(2))[0];
a20 = cv::sum(dst.row(3))[0];
a11 = cv::sum(dst.row(4))[0];
a02 = cv::sum(dst.row(5))[0];
a30 = cv::sum(dst.row(6))[0];
a21 = cv::sum(dst.row(7))[0];
a12 = cv::sum(dst.row(8))[0];
a03 = cv::sum(dst.row(9))[0];
}
double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60;
if( fabs(a00) > FLT_EPSILON )
{
if( a00 > 0 )
{
db1_2 = 0.5;
db1_6 = 0.16666666666666666666666666666667;
db1_12 = 0.083333333333333333333333333333333;
db1_24 = 0.041666666666666666666666666666667;
db1_20 = 0.05;
db1_60 = 0.016666666666666666666666666666667;
}
else
{
db1_2 = -0.5;
db1_6 = -0.16666666666666666666666666666667;
db1_12 = -0.083333333333333333333333333333333;
db1_24 = -0.041666666666666666666666666666667;
db1_20 = -0.05;
db1_60 = -0.016666666666666666666666666666667;
}
// spatial moments
mom->m00 = a00 * db1_2;
mom->m10 = a10 * db1_6;
mom->m01 = a01 * db1_6;
mom->m20 = a20 * db1_12;
mom->m11 = a11 * db1_24;
mom->m02 = a02 * db1_12;
mom->m30 = a30 * db1_20;
mom->m21 = a21 * db1_60;
mom->m12 = a12 * db1_60;
mom->m03 = a03 * db1_20;
icvCompleteMomentState( mom );
}
} }
} }
else
Moments ocl_moments(oclMat& src, bool binary) //for image
{ {
for(size_t i = 0; i < reader_size; ++i) CV_Assert(src.oclchannels() == 1);
if(src.type() == CV_64FC1 && Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
{ {
reader_mat.at<float>(0, i++) = ((CvPoint*)(reader.ptr))->x; CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
reader_mat.at<float>(0, i) = ((CvPoint*)(reader.ptr))->y;
CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
} }
}
cv::ocl::oclMat dst_a(10, lpt, CV_64FC1); if(binary)
cv::ocl::oclMat reader_oclmat(reader_mat);
int llength = std::min(lpt,128);
size_t localThreads[3] = { llength, 1, 1};
size_t globalThreads[3] = { lpt, 1, 1};
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_int) , (void *)&contour->total ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a.data ));
cl_int dst_step = (cl_int)dst_a.step;
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step ));
openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1);
cv::Mat dst(dst_a);
a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
{
for (int i = 0; i < contour->total; ++i)
{ {
a00 += dst.at<cl_long>(0, i); oclMat mask;
a10 += dst.at<cl_long>(1, i); if(src.type() != CV_8UC1)
a01 += dst.at<cl_long>(2, i); {
a20 += dst.at<cl_long>(3, i); src.convertTo(mask, CV_8UC1);
a11 += dst.at<cl_long>(4, i); }
a02 += dst.at<cl_long>(5, i); oclMat src8u(src.size(), CV_8UC1);
a30 += dst.at<cl_long>(6, i); src8u.setTo(Scalar(255), mask);
a21 += dst.at<cl_long>(7, i); src = src8u;
a12 += dst.at<cl_long>(8, i);
a03 += dst.at<cl_long>(9, i);
} }
} const int TILE_SIZE = 256;
else
{
a00 = cv::sum(dst.row(0))[0];
a10 = cv::sum(dst.row(1))[0];
a01 = cv::sum(dst.row(2))[0];
a20 = cv::sum(dst.row(3))[0];
a11 = cv::sum(dst.row(4))[0];
a02 = cv::sum(dst.row(5))[0];
a30 = cv::sum(dst.row(6))[0];
a21 = cv::sum(dst.row(7))[0];
a12 = cv::sum(dst.row(8))[0];
a03 = cv::sum(dst.row(9))[0];
}
double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60; CvMoments mom;
if( fabs(a00) > FLT_EPSILON ) memset(&mom, 0, sizeof(mom));
{
if( a00 > 0 ) cv::Size size = src.size();
int blockx, blocky;
blockx = (size.width + TILE_SIZE - 1)/TILE_SIZE;
blocky = (size.height + TILE_SIZE - 1)/TILE_SIZE;
oclMat dst_m;
int tile_height = TILE_SIZE;
size_t localThreads[3] = {1, tile_height, 1};
size_t globalThreads[3] = {blockx, size.height, 1};
if(Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
{
dst_m.create(blocky * 10, blockx, CV_64FC1);
}else
{ {
db1_2 = 0.5; dst_m.create(blocky * 10, blockx, CV_32FC1);
db1_6 = 0.16666666666666666666666666666667;
db1_12 = 0.083333333333333333333333333333333;
db1_24 = 0.041666666666666666666666666666667;
db1_20 = 0.05;
db1_60 = 0.016666666666666666666666666666667;
} }
int src_step = (int)(src.step/src.elemSize());
int dstm_step = (int)(dst_m.step/dst_m.elemSize());
vector<pair<size_t , const void *> > args,args_sum;
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dstm_step ));
int binary_;
if(binary)
binary_ = 1;
else else
binary_ = 0;
args.push_back( make_pair( sizeof(cl_int) , (void *)&binary_));
char builOption[128];
if(binary || src.type() == CV_8UC1)
{
snprintf(builOption, 128, "-D CV_8UC1");
}else if(src.type() == CV_16UC1)
{ {
db1_2 = -0.5; snprintf(builOption, 128, "-D CV_16UC1");
db1_6 = -0.16666666666666666666666666666667; }else if(src.type() == CV_16SC1)
db1_12 = -0.083333333333333333333333333333333; {
db1_24 = -0.041666666666666666666666666666667; snprintf(builOption, 128, "-D CV_16SC1");
db1_20 = -0.05; }else if(src.type() == CV_32FC1)
db1_60 = -0.016666666666666666666666666666667; {
snprintf(builOption, 128, "-D CV_32FC1");
}else if(src.type() == CV_64FC1)
{
snprintf(builOption, 128, "-D CV_64FC1");
}else
{
CV_Error( CV_StsUnsupportedFormat, "" );
}
openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, -1, builOption);
Mat tmp(dst_m);
tmp.convertTo(tmp, CV_64FC1);
double tmp_m[10] = {0};
for(int j = 0; j < tmp.rows; j += 10)
{
for(int i = 0; i < tmp.cols; i++)
{
tmp_m[0] += tmp.at<double>(j, i);
tmp_m[1] += tmp.at<double>(j + 1, i);
tmp_m[2] += tmp.at<double>(j + 2, i);
tmp_m[3] += tmp.at<double>(j + 3, i);
tmp_m[4] += tmp.at<double>(j + 4, i);
tmp_m[5] += tmp.at<double>(j + 5, i);
tmp_m[6] += tmp.at<double>(j + 6, i);
tmp_m[7] += tmp.at<double>(j + 7, i);
tmp_m[8] += tmp.at<double>(j + 8, i);
tmp_m[9] += tmp.at<double>(j + 9, i);
}
} }
// spatial moments mom.m00 = tmp_m[0];
mom->m00 = a00 * db1_2; mom.m10 = tmp_m[1];
mom->m10 = a10 * db1_6; mom.m01 = tmp_m[2];
mom->m01 = a01 * db1_6; mom.m20 = tmp_m[3];
mom->m20 = a20 * db1_12; mom.m11 = tmp_m[4];
mom->m11 = a11 * db1_24; mom.m02 = tmp_m[5];
mom->m02 = a02 * db1_12; mom.m30 = tmp_m[6];
mom->m30 = a30 * db1_20; mom.m21 = tmp_m[7];
mom->m21 = a21 * db1_60; mom.m12 = tmp_m[8];
mom->m12 = a12 * db1_60; mom.m03 = tmp_m[9];
mom->m03 = a03 * db1_20; icvCompleteMomentState( &mom );
return mom;
icvCompleteMomentState( mom );
} }
}
}
static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) Moments ocl_moments(InputArray _contour) //for contour
{ {
const int TILE_SIZE = 256; CvMoments mom;
int type, depth, cn, coi = 0; memset(&mom, 0, sizeof(mom));
CvMat stub, *mat = (CvMat*)array;
CvContour contourHeader;
CvSeq* contour = 0;
CvSeqBlock block;
if( CV_IS_SEQ( array ))
{
contour = (CvSeq*)array;
if( !CV_IS_SEQ_POINT_SET( contour ))
CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" );
}
if( !mom ) Mat arr = _contour.getMat();
CV_Error( CV_StsNullPtr, "" ); CvMat c_array = arr;
memset( mom, 0, sizeof(*mom)); const void* array = &c_array;
if( !contour ) CvSeq* contour = 0;
{ if( CV_IS_SEQ( array ))
{
contour = (CvSeq*)(array);
if( !CV_IS_SEQ_POINT_SET( contour ))
CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" );
}
mat = cvGetMat( mat, &stub, &coi ); int type, coi = 0;
type = CV_MAT_TYPE( mat->type );
if( type == CV_32SC2 || type == CV_32FC2 ) CvMat stub, *mat = (CvMat*)(array);
{ CvContour contourHeader;
contour = cvPointSeqFromMat( CvSeqBlock block;
CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED,
mat, &contourHeader, &block );
}
}
if( contour )
{
icvContourMoments( contour, mom );
return;
}
type = CV_MAT_TYPE( mat->type ); if( !contour )
depth = CV_MAT_DEPTH( type ); {
cn = CV_MAT_CN( type ); mat = cvGetMat( mat, &stub, &coi );
type = CV_MAT_TYPE( mat->type );
cv::Size size = cvGetMatSize( mat );
if( cn > 1 && coi == 0 ) if( type == CV_32SC2 || type == CV_32FC2 )
CV_Error( CV_StsBadArg, "Invalid image type" ); {
contour = cvPointSeqFromMat(
if( size.width <= 0 || size.height <= 0 ) CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED,
return; mat, &contourHeader, &block );
}
cv::Mat src0(mat); }
cv::ocl::oclMat src(src0);
cv::Size tileSize;
int blockx,blocky;
if(size.width%TILE_SIZE == 0)
blockx = size.width/TILE_SIZE;
else
blockx = size.width/TILE_SIZE + 1;
if(size.height%TILE_SIZE == 0)
blocky = size.height/TILE_SIZE;
else
blocky = size.height/TILE_SIZE + 1;
oclMat dst_m(blocky * 10, blockx, CV_64FC1);
oclMat sum(1, 10, CV_64FC1);
int tile_width = std::min(size.width,TILE_SIZE);
int tile_height = std::min(size.height,TILE_SIZE);
size_t localThreads[3] = { tile_height, 1, 1};
size_t globalThreads[3] = { size.height, blockx, 1};
vector<pair<size_t , const void *> > args,args_sum;
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&depth ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&cn ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&coi ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&binary ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
size_t localThreadss[3] = { 128, 1, 1};
size_t globalThreadss[3] = { 128, 1, 1};
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_height ));
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width ));
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
openCLExecuteKernel(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
Mat dstsum(sum);
mom->m00 = dstsum.at<double>(0, 0);
mom->m10 = dstsum.at<double>(0, 1);
mom->m01 = dstsum.at<double>(0, 2);
mom->m20 = dstsum.at<double>(0, 3);
mom->m11 = dstsum.at<double>(0, 4);
mom->m02 = dstsum.at<double>(0, 5);
mom->m30 = dstsum.at<double>(0, 6);
mom->m21 = dstsum.at<double>(0, 7);
mom->m12 = dstsum.at<double>(0, 8);
mom->m03 = dstsum.at<double>(0, 9);
icvCompleteMomentState( mom );
}
Moments ocl_moments( InputArray _array, bool binaryImage )
{
CvMoments om;
Mat arr = _array.getMat();
CvMat c_array = arr;
ocl_cvMoments(&c_array, &om, binaryImage);
return om;
}
} CV_Assert(contour);
} icvContourMoments(contour, &mom);
return mom;
}
}
}
\ No newline at end of file
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// @Authors // @Authors
// Jin Ma, jin@multicorewareinc.com
// Sen Liu, swjtuls1987@126.com // Sen Liu, swjtuls1987@126.com
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
...@@ -44,22 +45,14 @@ ...@@ -44,22 +45,14 @@
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64 #ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64) #elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable #pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif #endif
typedef double T; typedef double T;
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#else #else
typedef float F;
typedef float4 F4;
typedef long T; typedef long T;
#define convert_F4 convert_float4
#endif #endif
#define DST_ROW_00 0 #define DST_ROW_00 0
...@@ -99,7 +92,6 @@ __kernel void icvContourMoments(int contour_total, ...@@ -99,7 +92,6 @@ __kernel void icvContourMoments(int contour_total,
xi = (T)(*(reader_oclmat_data + (idx + 1) * 2)); xi = (T)(*(reader_oclmat_data + (idx + 1) * 2));
yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1)); yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1));
} }
xi2 = xi * xi; xi2 = xi * xi;
yi2 = yi * yi; yi2 = yi * yi;
dxy = xi_1 * yi - xi * yi_1; dxy = xi_1 * yi - xi * yi_1;
...@@ -117,864 +109,338 @@ __kernel void icvContourMoments(int contour_total, ...@@ -117,864 +109,338 @@ __kernel void icvContourMoments(int contour_total,
*( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2);
*( dst_a + DST_ROW_21 * dst_step + idx) = *( dst_a + DST_ROW_21 * dst_step + idx) =
dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 + dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 +
xi2 * (yi_1 + 3 * yi)); xi2 * (yi_1 + 3 * yi));
*( dst_a + DST_ROW_12 * dst_step + idx) = *( dst_a + DST_ROW_12 * dst_step + idx) =
dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 + dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 +
yi2 * (xi_1 + 3 * xi)); yi2 * (xi_1 + 3 * xi));
} }
__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, #if defined (DOUBLE_SUPPORT)
__global F* sum, __global F* dst_m, int dst_step) #define WT double
#define WT4 double4
#define convert_T4 convert_double4
#define convert_T convert_double
#else
#define WT float
#define WT4 float4
#define convert_T4 convert_float4
#define convert_T convert_float
#endif
#ifdef CV_8UC1
#define TT uchar
#elif defined CV_16UC1
#define TT ushort
#elif defined CV_16SC1
#define TT short
#elif defined CV_32FC1
#define TT float
#elif defined CV_64FC1
#ifdef DOUBLE_SUPPORT
#define TT double
#else
#define TT float
#endif
#endif
__kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int src_step,
__global WT* dst_m,
int dst_cols, int dst_step, int binary)
{ {
int gidy = get_global_id(0); int dy = get_global_id(1);
int gidx = get_global_id(1); int ly = get_local_id(1);
int block_y = src_rows/tile_height; int gidx = get_group_id(0);
int block_x = src_cols/tile_width; int gidy = get_group_id(1);
int block_num; int x_rest = src_cols % 256;
int y_rest = src_rows % 256;
if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0) __local int codxy[256];
block_y ++; codxy[ly] = ly;
if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0)
block_x ++;
block_num = block_y * block_x;
__local F dst_sum[10][128];
if(gidy<128-block_num)
for(int i=0; i<10; i++)
dst_sum[i][gidy+block_num]=0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
dst_step /= sizeof(F); WT4 x0 = (WT4)(0.f);
if(gidy<block_num) WT4 x1 = (WT4)(0.f);
{ WT4 x2 = (WT4)(0.f);
dst_sum[0][gidy] = *(dst_m + mad24(DST_ROW_00 * block_y, dst_step, gidy)); WT4 x3 = (WT4)(0.f);
dst_sum[1][gidy] = *(dst_m + mad24(DST_ROW_10 * block_y, dst_step, gidy));
dst_sum[2][gidy] = *(dst_m + mad24(DST_ROW_01 * block_y, dst_step, gidy));
dst_sum[3][gidy] = *(dst_m + mad24(DST_ROW_20 * block_y, dst_step, gidy));
dst_sum[4][gidy] = *(dst_m + mad24(DST_ROW_11 * block_y, dst_step, gidy));
dst_sum[5][gidy] = *(dst_m + mad24(DST_ROW_02 * block_y, dst_step, gidy));
dst_sum[6][gidy] = *(dst_m + mad24(DST_ROW_30 * block_y, dst_step, gidy));
dst_sum[7][gidy] = *(dst_m + mad24(DST_ROW_21 * block_y, dst_step, gidy));
dst_sum[8][gidy] = *(dst_m + mad24(DST_ROW_12 * block_y, dst_step, gidy));
dst_sum[9][gidy] = *(dst_m + mad24(DST_ROW_03 * block_y, dst_step, gidy));
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize=64; lsize>0; lsize>>=1)
{
if(gidy<lsize)
{
int lsize2 = gidy + lsize;
for(int i=0; i<10; i++)
dst_sum[i][gidy] += dst_sum[i][lsize2];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(gidy==0)
for(int i=0; i<10; i++)
sum[i] = dst_sum[i][0];
}
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, __global TT* row = src_data + gidy * src_step + ly * src_step + gidx * 256;
__global F* dst_m, bool switchFlag = false;
int dst_cols, int dst_step, int blocky,
int depth, int cn, int coi, int binary, int TILE_SIZE)
{
uchar tmp_coi[16]; // get the coi data
uchar16 tmp[16];
int VLEN_C = 16; // vector length of uchar
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int wgidy = get_group_id(0);
int wgidx = get_group_id(1);
int lidy = get_local_id(0);
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // vector length of uchar
int x = wgidx*TILE_SIZE; // vector length of uchar
int kcn = (cn==2)?2:4;
int rstep = min(src_step, TILE_SIZE);
int tileSize_height = min(TILE_SIZE, src_rows - y);
int tileSize_width = min(TILE_SIZE, src_cols - x);
if ( y+lidy < src_rows )
{
if( tileSize_width < TILE_SIZE )
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
*((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
if( coi > 0 ) //channel of interest WT4 p;
for(int i = 0; i < tileSize_width; i += VLEN_C) WT4 x;
{ WT4 xp;
for(int j=0; j<VLEN_C; j++) WT4 xxp;
tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
}
uchar16 zero = (uchar16)(0); WT py = 0.f, sy = 0.f;
uchar16 full = (uchar16)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
F mom[10]; if(dy < src_rows)
__local int m[10][128];
if(lidy < 128)
{ {
for(int i=0; i<10; i++) if((x_rest > 0) && (gidx == (get_num_groups(0) - 1)))
m[i][lidy]=0;
}
barrier(CLK_LOCAL_MEM_FENCE);
int lm[10] = {0};
int16 x0 = (int16)(0);
int16 x1 = (int16)(0);
int16 x2 = (int16)(0);
int16 x3 = (int16)(0);
for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_C) )
{
int16 v_xt = (int16)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7, xt+8, xt+9, xt+10, xt+11, xt+12, xt+13, xt+14, xt+15);
int16 p = convert_int16(tmp[xt/VLEN_C]);
int16 xp = v_xt * p, xxp = xp *v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += xxp * v_xt;
}
x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7 + x0.s8 + x0.s9 + x0.sa + x0.sb + x0.sc + x0.sd + x0.se + x0.sf;
x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7 + x1.s8 + x1.s9 + x1.sa + x1.sb + x1.sc + x1.sd + x1.se + x1.sf;
x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7 + x2.s8 + x2.s9 + x2.sa + x2.sb + x2.sc + x2.sd + x2.se + x2.sf;
x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7 + x3.s8 + x3.s9 + x3.sa + x3.sb + x3.sc + x3.sd + x3.se + x3.sf;
int py = lidy * ((int)x0.s0);
int sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((int)py) * sy; // m03
m[8][lidy-bheight] = ((int)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((int)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight)
{
lm[9] = ((int)py) * sy; // m03
lm[8] = ((int)x1.s0) * sy; // m12
lm[7] = ((int)x2.s0) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = bheight; j >= 1; j = j/2 )
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lidy == 0&&lidx == 0)
{
for( int mt = 0; mt < 10; mt++ )
mom[mt] = (F)lm[mt];
if(binary)
{ {
F s = 1./255; int i;
for( int mt = 0; mt < 10; mt++ ) for(i = 0; i < x_rest - 4; i += 4)
mom[mt] *= s; {
} p = convert_T4(vload4(0, row + i));
F xm = x * mom[0], ym = y * mom[0]; x = convert_T4(vload4(0, codxy + i));
xp = x * p;
// accumulate moments computed in each tile xxp = xp * x;
dst_step /= sizeof(F);
x0 += p;
// + m00 ( = m00' ) x1 += xp;
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; x2 += xxp;
x3 += convert_T4(xxp * x);
}
// + m10 ( = m10' + x*m00' ) x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3;
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
// + m01 ( = m01' + y*m00' ) x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3;
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3;
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3;
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) WT x0_ = 0;
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); WT x1_ = 0;
WT x2_ = 0;
WT x3_ = 0;
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) for(; i < x_rest; i++)
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); {
WT p_ = 0;
p_ = row[i];
WT x_ = convert_T(codxy[i]);
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') WT xp_ = x_ * p_;
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; WT xxp_ = xp_ * x_;
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) x0_ += p_;
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); x1_ += xp_;
} x2_ += xxp_;
} x3_ += xxp_ * x_;
}
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, x0.s0 += x0_;
__global F* dst_m, x1.s0 += x1_;
int dst_cols, int dst_step, int blocky, x2.s0 += x2_;
int depth, int cn, int coi, int binary, const int TILE_SIZE) x3.s0 += x3_;
{ }else
ushort tmp_coi[8]; // get the coi data {
ushort8 tmp[32]; for(int i = 0; i < 256; i += 4)
int VLEN_US = 8; // vector length of ushort
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int wgidy = get_group_id(0);
int wgidx = get_group_id(1);
int lidy = get_local_id(0);
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/2, TILE_SIZE);
int tileSize_height = min(TILE_SIZE, src_rows - y);
int tileSize_width = min(TILE_SIZE, src_cols -x);
if ( y+lidy < src_rows )
{
if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ )
*((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_US)
{ {
for(int j=0; j<VLEN_US; j++) p = convert_T4(vload4(0, row + i));
tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1); x = convert_T4(vload4(0, codxy + i));
tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]); xp = x * p;
xxp = xp * x;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += convert_T4(xxp * x);
} }
else
for(int i=0; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
}
ushort8 zero = (ushort8)(0); x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3;
ushort8 full = (ushort8)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
F mom[10];
__local long m[10][128];
if(lidy < 128)
for(int i=0; i<10; i++)
m[i][lidy]=0;
barrier(CLK_LOCAL_MEM_FENCE);
long lm[10] = {0};
int8 x0 = (int8)(0);
int8 x1 = (int8)(0);
int8 x2 = (int8)(0);
long8 x3 = (long8)(0);
for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_US) )
{
int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
int8 p = convert_int8(tmp[xt/VLEN_US]);
int8 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += convert_long8(xxp) *convert_long8(v_xt);
}
x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
int py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((long)py) * sy; // m03
m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight)
{
lm[9] = ((long)py) * sy; // m03
lm[8] = ((long)x1.s0) * sy; // m12
lm[7] = ((long)x2.s0) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3;
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy == 0&&lidx == 0) x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3;
{
for(int mt = 0; mt < 10; mt++ )
mom[mt] = (F)lm[mt];
if(binary) x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3;
{
F s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
} }
F xm = x *mom[0], ym = y * mom[0]; py = ly * x0.s0;
sy = ly * ly;
// accumulate moments computed in each tile
dst_step /= sizeof(F);
// + m00 ( = m00' )
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
// + m10 ( = m10' + x*m00' )
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
} }
} __local WT mom[10][256];
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, if((y_rest > 0) && (gidy == (get_num_groups(1) - 1)))
__global F* dst_m,
int dst_cols, int dst_step, int blocky,
int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
short tmp_coi[8]; // get the coi data
short8 tmp[32];
int VLEN_S =8; // vector length of short
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int wgidy = get_group_id(0);
int wgidx = get_group_id(1);
int lidy = get_local_id(0);
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/2, TILE_SIZE);
int tileSize_height = min(TILE_SIZE, src_rows - y);
int tileSize_width = min(TILE_SIZE, src_cols -x);
if ( y+lidy < src_rows )
{ {
if(tileSize_width < TILE_SIZE) if(ly < y_rest)
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) {
*((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0; mom[9][ly] = py * sy;
if( coi > 0 ) mom[8][ly] = x1.s0 * sy;
for(int i=0; i < tileSize_width; i+=VLEN_S) mom[7][ly] = x2.s0 * ly;
mom[6][ly] = x3.s0;
mom[5][ly] = x0.s0 * sy;
mom[4][ly] = x1.s0 * ly;
mom[3][ly] = x2.s0;
mom[2][ly] = py;
mom[1][ly] = x1.s0;
mom[0][ly] = x0.s0;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(ly < 10)
{
for(int i = 1; i < y_rest; i++)
{ {
for(int j=0; j<VLEN_S; j++) mom[ly][0] = mom[ly][i] + mom[ly][0];
tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
} }
else }
for(int i=0; i < tileSize_width; i+=VLEN_S) }else
tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S); {
} mom[9][ly] = py * sy;
mom[8][ly] = x1.s0 * sy;
mom[7][ly] = x2.s0 * ly;
mom[6][ly] = x3.s0;
mom[5][ly] = x0.s0 * sy;
mom[4][ly] = x1.s0 * ly;
mom[3][ly] = x2.s0;
mom[2][ly] = py;
mom[1][ly] = x1.s0;
mom[0][ly] = x0.s0;
short8 zero = (short8)(0);
short8 full = (short8)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=(VLEN_S))
tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero;
F mom[10];
__local long m[10][128];
if(lidy < 128)
for(int i=0; i<10; i++)
m[i][lidy]=0;
barrier(CLK_LOCAL_MEM_FENCE);
long lm[10] = {0};
int8 x0 = (int8)(0);
int8 x1 = (int8)(0);
int8 x2 = (int8)(0);
long8 x3 = (long8)(0);
for( int xt = 0 ; xt < tileSize_width; xt+= (VLEN_S))
{
int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
int8 p = convert_int8(tmp[xt/VLEN_S]);
int8 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += convert_long8(xxp) * convert_long8(v_xt);
}
x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
int py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((long)py) * sy; // m03
m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight)
{
lm[9] = ((long)py) * sy; // m03
lm[8] = ((long)(x1.s0)) * sy; // m12
lm[7] = ((long)(x2.s0)) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >=1; j = j/2 )
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
}
if(lidy ==0 &&lidx ==0)
{
for(int mt = 0; mt < 10; mt++ )
mom[mt] = (F)lm[mt];
if(binary) if(ly < 128)
{ {
F s = 1./255; mom[0][ly] = mom[0][ly] + mom[0][ly + 128];
for( int mt = 0; mt < 10; mt++ ) mom[1][ly] = mom[1][ly] + mom[1][ly + 128];
mom[mt] *= s; mom[2][ly] = mom[2][ly] + mom[2][ly + 128];
mom[3][ly] = mom[3][ly] + mom[3][ly + 128];
mom[4][ly] = mom[4][ly] + mom[4][ly + 128];
mom[5][ly] = mom[5][ly] + mom[5][ly + 128];
mom[6][ly] = mom[6][ly] + mom[6][ly + 128];
mom[7][ly] = mom[7][ly] + mom[7][ly + 128];
mom[8][ly] = mom[8][ly] + mom[8][ly + 128];
mom[9][ly] = mom[9][ly] + mom[9][ly + 128];
} }
barrier(CLK_LOCAL_MEM_FENCE);
F xm = x * mom[0], ym = y*mom[0]; if(ly < 64)
{
// accumulate moments computed in each tile mom[0][ly] = mom[0][ly] + mom[0][ly + 64];
dst_step /= sizeof(F); mom[1][ly] = mom[1][ly] + mom[1][ly + 64];
mom[2][ly] = mom[2][ly] + mom[2][ly + 64];
// + m00 ( = m00' ) mom[3][ly] = mom[3][ly] + mom[3][ly + 64];
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; mom[4][ly] = mom[4][ly] + mom[4][ly + 64];
mom[5][ly] = mom[5][ly] + mom[5][ly + 64];
// + m10 ( = m10' + x*m00' ) mom[6][ly] = mom[6][ly] + mom[6][ly + 64];
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; mom[7][ly] = mom[7][ly] + mom[7][ly + 64];
mom[8][ly] = mom[8][ly] + mom[8][ly + 64];
// + m01 ( = m01' + y*m00' ) mom[9][ly] = mom[9][ly] + mom[9][ly + 64];
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; }
barrier(CLK_LOCAL_MEM_FENCE);
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) if(ly < 32)
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); {
mom[0][ly] = mom[0][ly] + mom[0][ly + 32];
mom[1][ly] = mom[1][ly] + mom[1][ly + 32];
mom[2][ly] = mom[2][ly] + mom[2][ly + 32];
mom[3][ly] = mom[3][ly] + mom[3][ly + 32];
mom[4][ly] = mom[4][ly] + mom[4][ly + 32];
mom[5][ly] = mom[5][ly] + mom[5][ly + 32];
mom[6][ly] = mom[6][ly] + mom[6][ly + 32];
mom[7][ly] = mom[7][ly] + mom[7][ly + 32];
mom[8][ly] = mom[8][ly] + mom[8][ly + 32];
mom[9][ly] = mom[9][ly] + mom[9][ly + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) if(ly < 16)
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); {
mom[0][ly] = mom[0][ly] + mom[0][ly + 16];
mom[1][ly] = mom[1][ly] + mom[1][ly + 16];
mom[2][ly] = mom[2][ly] + mom[2][ly + 16];
mom[3][ly] = mom[3][ly] + mom[3][ly + 16];
mom[4][ly] = mom[4][ly] + mom[4][ly + 16];
mom[5][ly] = mom[5][ly] + mom[5][ly + 16];
mom[6][ly] = mom[6][ly] + mom[6][ly + 16];
mom[7][ly] = mom[7][ly] + mom[7][ly + 16];
mom[8][ly] = mom[8][ly] + mom[8][ly + 16];
mom[9][ly] = mom[9][ly] + mom[9][ly + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') if(ly < 8)
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; {
mom[0][ly] = mom[0][ly] + mom[0][ly + 8];
mom[1][ly] = mom[1][ly] + mom[1][ly + 8];
mom[2][ly] = mom[2][ly] + mom[2][ly + 8];
mom[3][ly] = mom[3][ly] + mom[3][ly + 8];
mom[4][ly] = mom[4][ly] + mom[4][ly + 8];
mom[5][ly] = mom[5][ly] + mom[5][ly + 8];
mom[6][ly] = mom[6][ly] + mom[6][ly + 8];
mom[7][ly] = mom[7][ly] + mom[7][ly + 8];
mom[8][ly] = mom[8][ly] + mom[8][ly + 8];
mom[9][ly] = mom[9][ly] + mom[9][ly + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') if(ly < 4)
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; {
mom[0][ly] = mom[0][ly] + mom[0][ly + 4];
mom[1][ly] = mom[1][ly] + mom[1][ly + 4];
mom[2][ly] = mom[2][ly] + mom[2][ly + 4];
mom[3][ly] = mom[3][ly] + mom[3][ly + 4];
mom[4][ly] = mom[4][ly] + mom[4][ly + 4];
mom[5][ly] = mom[5][ly] + mom[5][ly + 4];
mom[6][ly] = mom[6][ly] + mom[6][ly + 4];
mom[7][ly] = mom[7][ly] + mom[7][ly + 4];
mom[8][ly] = mom[8][ly] + mom[8][ly + 4];
mom[9][ly] = mom[9][ly] + mom[9][ly + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) if(ly < 2)
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); {
} mom[0][ly] = mom[0][ly] + mom[0][ly + 2];
} mom[1][ly] = mom[1][ly] + mom[1][ly + 2];
mom[2][ly] = mom[2][ly] + mom[2][ly + 2];
mom[3][ly] = mom[3][ly] + mom[3][ly + 2];
mom[4][ly] = mom[4][ly] + mom[4][ly + 2];
mom[5][ly] = mom[5][ly] + mom[5][ly + 2];
mom[6][ly] = mom[6][ly] + mom[6][ly + 2];
mom[7][ly] = mom[7][ly] + mom[7][ly + 2];
mom[8][ly] = mom[8][ly] + mom[8][ly + 2];
mom[9][ly] = mom[9][ly] + mom[9][ly + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, if(ly < 1)
__global F* dst_m, {
int dst_cols, int dst_step, int blocky, mom[0][ly] = mom[0][ly] + mom[0][ly + 1];
int depth, int cn, int coi, int binary, const int TILE_SIZE) mom[1][ly] = mom[1][ly] + mom[1][ly + 1];
{ mom[2][ly] = mom[2][ly] + mom[2][ly + 1];
float tmp_coi[4]; // get the coi data mom[3][ly] = mom[3][ly] + mom[3][ly + 1];
float4 tmp[64] ; mom[4][ly] = mom[4][ly] + mom[4][ly + 1];
int VLEN_F = 4; // vector length of float mom[5][ly] = mom[5][ly] + mom[5][ly + 1];
int gidy = get_global_id(0); mom[6][ly] = mom[6][ly] + mom[6][ly + 1];
int gidx = get_global_id(1); mom[7][ly] = mom[7][ly] + mom[7][ly + 1];
int wgidy = get_group_id(0); mom[8][ly] = mom[8][ly] + mom[8][ly + 1];
int wgidx = get_group_id(1); mom[9][ly] = mom[9][ly] + mom[9][ly + 1];
int lidy = get_local_id(0); }
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/4, TILE_SIZE);
int tileSize_height = min(TILE_SIZE, src_rows - y);
int tileSize_width = min(TILE_SIZE, src_cols -x);
int maxIdx = mul24(src_rows, src_cols);
int yOff = (y+lidy)*src_step;
int index;
if ( y+lidy < src_rows )
{
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
*((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_F)
{
for(int j=0; j<4; j++)
tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_F)
tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
} }
float4 zero = (float4)(0);
float4 full = (float4)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=4)
tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
F mom[10];
__local F m[10][128];
if(lidy < 128)
for(int i = 0; i < 10; i ++)
m[i][lidy] = 0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
F lm[10] = {0};
F4 x0 = (F4)(0);
F4 x1 = (F4)(0);
F4 x2 = (F4)(0);
F4 x3 = (F4)(0);
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F )
{
F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
F4 p = convert_F4(tmp[xt/VLEN_F]);
F4 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += xxp * v_xt;
}
x0.s0 += x0.s1 + x0.s2 + x0.s3;
x1.s0 += x1.s1 + x1.s2 + x1.s3;
x2.s0 += x2.s1 + x2.s2 + x2.s3;
x3.s0 += x3.s1 + x3.s2 + x3.s3;
F py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((F)py) * sy; // m03
m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight) if(binary)
{
lm[9] = ((F)py) * sy; // m03
lm[8] = ((F)x1.s0) * sy; // m12
lm[7] = ((F)x2.s0) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{ {
if(lidy < j) WT s = 1./255;
for( int i = 0; i < 10; i++ ) if(ly < 10)
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lidy == 0&&lidx == 0)
{
for( int mt = 0; mt < 10; mt++ )
mom[mt] = (F)lm[mt];
if(binary)
{ {
F s = 1./255; mom[ly][0] *= s;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
} }
barrier(CLK_LOCAL_MEM_FENCE);
F xm = x * mom[0], ym = y * mom[0];
// accumulate moments computed in each tile
dst_step /= sizeof(F);
// + m00 ( = m00' )
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
// + m10 ( = m10' + x*m00' )
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
} }
} WT xm = (gidx * 256) * mom[0][0];
WT ym = (gidy * 256) * mom[0][0];
__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, if(ly == 0)
__global F* dst_m,
int dst_cols, int dst_step, int blocky,
int depth, int cn, int coi, int binary, const int TILE_SIZE)
{
F tmp_coi[4]; // get the coi data
F4 tmp[64];
int VLEN_D = 4; // length of vetor
int gidy = get_global_id(0);
int gidx = get_global_id(1);
int wgidy = get_group_id(0);
int wgidx = get_group_id(1);
int lidy = get_local_id(0);
int lidx = get_local_id(1);
int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/8, TILE_SIZE);
int tileSize_height = min(TILE_SIZE, src_rows - y);
int tileSize_width = min(TILE_SIZE, src_cols - x);
if ( y+lidy < src_rows )
{ {
if(tileSize_width < TILE_SIZE) mom[0][1] = mom[0][0];
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) mom[1][1] = mom[1][0] + xm;
*((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; mom[2][1] = mom[2][0] + ym;
if( coi > 0 ) mom[3][1] = mom[3][0] + gidx * 256 * (mom[1][0] * 2 + xm);
for(int i=0; i < tileSize_width; i+=VLEN_D) mom[4][1] = mom[4][0] + gidx * 256 * (mom[2][0] + ym) + gidy * 256 * mom[1][0];
{ mom[5][1] = mom[5][0] + gidy * 256 * (mom[2][0] * 2 + ym);
for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++) mom[6][1] = mom[6][0] + gidx * 256 * (3 * mom[3][0] + 256 * gidx * (3 * mom[1][0] + xm));
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1); mom[7][1] = mom[7][0] + gidx * 256 * (2 * (mom[4][0] + 256 * gidy * mom[1][0]) + 256 * gidx * (mom[2][0] + ym)) + 256 * gidy * mom[3][0];
tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); mom[8][1] = mom[8][0] + gidy * 256 * (2 * (mom[4][0] + 256 * gidx * mom[2][0]) + 256 * gidy * (mom[1][0] + xm)) + 256 * gidx * mom[5][0];
} mom[9][1] = mom[9][0] + gidy * 256 * (3 * mom[5][0] + 256 * gidy * (3 * mom[2][0] + ym));
else
for(int i=0; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D)
tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
} }
F4 zero = (F4)(0);
F4 full = (F4)(255);
if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
F mom[10];
__local F m[10][128];
if(lidy < 128)
for(int i=0; i<10; i++)
m[i][lidy]=0;
barrier(CLK_LOCAL_MEM_FENCE);
F lm[10] = {0};
F4 x0 = (F4)(0);
F4 x1 = (F4)(0);
F4 x2 = (F4)(0);
F4 x3 = (F4)(0);
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
{
F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
F4 p = tmp[xt/VLEN_D];
F4 xp = v_xt * p, xxp = xp * v_xt;
x0 += p;
x1 += xp;
x2 += xxp;
x3 += xxp *v_xt;
}
x0.s0 += x0.s1 + x0.s2 + x0.s3;
x1.s0 += x1.s1 + x1.s2 + x1.s3;
x2.s0 += x2.s1 + x2.s2 + x2.s3;
x3.s0 += x3.s1 + x3.s2 + x3.s3;
F py = lidy * x0.s0, sy = lidy*lidy;
int bheight = min(tileSize_height, TILE_SIZE/2);
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
{
m[9][lidy-bheight] = ((F)py) * sy; // m03
m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
m[6][lidy-bheight] = x3.s0; // m30
m[5][lidy-bheight] = x0.s0 * sy; // m02
m[4][lidy-bheight] = x1.s0 * lidy; // m11
m[3][lidy-bheight] = x2.s0; // m20
m[2][lidy-bheight] = py; // m01
m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00
}
else if(lidy < bheight)
{
lm[9] = ((F)py) * sy; // m03
lm[8] = ((F)x1.s0) * sy; // m12
lm[7] = ((F)x2.s0) * lidy; // m21
lm[6] = x3.s0; // m30
lm[5] = x0.s0 * sy; // m02
lm[4] = x1.s0 * lidy; // m11
lm[3] = x2.s0; // m20
lm[2] = py; // m01
lm[1] = x1.s0; // m10
lm[0] = x0.s0; // m00
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) if(ly < 10)
{
if(lidy < j)
for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lidy == 0&&lidx == 0)
{ {
for( int mt = 0; mt < 10; mt++ ) dst_m[10 * gidy * dst_step + ly * dst_step + gidx] = mom[ly][1];
mom[mt] = (F)lm[mt];
if(binary)
{
F s = 1./255;
for( int mt = 0; mt < 10; mt++ )
mom[mt] *= s;
}
F xm = x * mom[0], ym = y * mom[0];
// accumulate moments computed in each tile
dst_step /= sizeof(F);
// + m00 ( = m00' )
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
// + m10 ( = m10' + x*m00' )
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
} }
} }
...@@ -10,18 +10,19 @@ using namespace cvtest; ...@@ -10,18 +10,19 @@ using namespace cvtest;
using namespace testing; using namespace testing;
using namespace std; using namespace std;
PARAM_TEST_CASE(MomentsTest, MatType, bool) PARAM_TEST_CASE(MomentsTest, MatType, bool, bool)
{ {
int type; int type;
cv::Mat mat1; cv::Mat mat;
bool test_contours; bool test_contours;
bool binaryImage;
virtual void SetUp() virtual void SetUp()
{ {
type = GET_PARAM(0); type = GET_PARAM(0);
test_contours = GET_PARAM(1); test_contours = GET_PARAM(1);
cv::Size size(10*MWIDTH, 10*MHEIGHT); cv::Size size(10 * MWIDTH, 10 * MHEIGHT);
mat1 = randomMat(size, type, 5, 16, false); mat = randomMat(size, type, 0, 256, false);
binaryImage = GET_PARAM(2);
} }
void Compare(Moments& cpu, Moments& gpu) void Compare(Moments& cpu, Moments& gpu)
...@@ -29,16 +30,13 @@ PARAM_TEST_CASE(MomentsTest, MatType, bool) ...@@ -29,16 +30,13 @@ PARAM_TEST_CASE(MomentsTest, MatType, bool)
Mat gpu_dst, cpu_dst; Mat gpu_dst, cpu_dst;
HuMoments(cpu, cpu_dst); HuMoments(cpu, cpu_dst);
HuMoments(gpu, gpu_dst); HuMoments(gpu, gpu_dst);
EXPECT_MAT_NEAR(gpu_dst,cpu_dst, .5); EXPECT_MAT_NEAR(gpu_dst,cpu_dst, 1e-3);
} }
}; };
OCL_TEST_P(MomentsTest, Mat) OCL_TEST_P(MomentsTest, Mat)
{ {
bool binaryImage = 0; oclMat src_d(mat);
for(int j = 0; j < LOOP_TIMES; j++) for(int j = 0; j < LOOP_TIMES; j++)
{ {
if(test_contours) if(test_contours)
...@@ -53,18 +51,16 @@ OCL_TEST_P(MomentsTest, Mat) ...@@ -53,18 +51,16 @@ OCL_TEST_P(MomentsTest, Mat)
for( size_t i = 0; i < contours.size(); i++ ) for( size_t i = 0; i < contours.size(); i++ )
{ {
Moments m = moments( contours[i], false ); Moments m = moments( contours[i], false );
Moments dm = ocl::ocl_moments( contours[i], false ); Moments dm = ocl::ocl_moments( contours[i]);
Compare(m, dm); Compare(m, dm);
} }
} }
cv::_InputArray _array(mat1); cv::Moments CvMom = cv::moments(mat, binaryImage);
cv::Moments CvMom = cv::moments(_array, binaryImage); cv::Moments oclMom = cv::ocl::ocl_moments(src_d, binaryImage);
cv::Moments oclMom = cv::ocl::ocl_moments(_array, binaryImage);
Compare(CvMom, oclMom); Compare(CvMom, oclMom);
} }
} }
INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MomentsTest, Combine( INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MomentsTest, Combine(
Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_64FC1), Values(true,false))); Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1, CV_64FC1), Values(false, true), Values(false, true)));
#endif // HAVE_OPENCL #endif // HAVE_OPENCL
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment