Commit 23244a35 authored by niko's avatar niko

rename all the perf test files

fix the channel 3 bug in matrix operation
perf and buf fix for LUT haardetect convertC3C4 resize warpaffine copytom settom
add convolve
remove stereo
parent e94cd1ec
......@@ -55,22 +55,22 @@ namespace cv
//////////////////////////////// oclMat ////////////////////////////////
////////////////////////////////////////////////////////////////////////
inline oclMat::oclMat() : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0) {}
inline oclMat::oclMat() : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0), download_channels(0) {}
inline oclMat::oclMat(int _rows, int _cols, int _type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
inline oclMat::oclMat(int _rows, int _cols, int _type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0), download_channels(0)
{
if( _rows > 0 && _cols > 0 )
create( _rows, _cols, _type );
}
inline oclMat::oclMat(Size _size, int _type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
inline oclMat::oclMat(Size _size, int _type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0), download_channels(0)
{
if( _size.height > 0 && _size.width > 0 )
create( _size.height, _size.width, _type );
}
inline oclMat::oclMat(int _rows, int _cols, int _type, const Scalar &_s)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0), download_channels(0)
{
if(_rows > 0 && _cols > 0)
{
......@@ -80,7 +80,7 @@ namespace cv
}
inline oclMat::oclMat(Size _size, int _type, const Scalar &_s)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0), download_channels(0)
{
if( _size.height > 0 && _size.width > 0 )
{
......@@ -91,49 +91,53 @@ namespace cv
inline oclMat::oclMat(const oclMat &m)
: flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data),
refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), clCxt(m.clCxt), offset(m.offset), wholerows(m.wholerows), wholecols(m.wholecols)
refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), clCxt(m.clCxt), offset(m.offset), wholerows(m.wholerows), wholecols(m.wholecols), download_channels(m.download_channels)
{
if( refcount )
CV_XADD(refcount, 1);
}
//Fixme, the data is not correct if _data point to the CPU memory
inline oclMat::oclMat(int _rows, int _cols, int _type, void *_data, size_t _step)
: flags(Mat::MAGIC_VAL + (_type &TYPE_MASK)), rows(_rows), cols(_cols), step(_step), data((uchar *)_data), refcount(0),
datastart((uchar *)_data), dataend((uchar *)_data), offset(0), wholerows(_rows), wholecols(_cols)
{
size_t minstep = cols * elemSize();
if( step == Mat::AUTO_STEP )
{
step = minstep;
flags |= Mat::CONTINUOUS_FLAG;
}
else
{
if( rows == 1 ) step = minstep;
CV_DbgAssert( step >= minstep );
flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
}
dataend += step * (rows - 1) + minstep;
}
//Fixme, the data is not correct if _data point to the CPU memory
datastart((uchar *)_data), dataend((uchar *)_data), offset(0), wholerows(_rows), wholecols(_cols), download_channels(CV_MAT_CN(_type))
{
cv::Mat m(_rows,_cols,_type,_data,_step);
upload(m);
//size_t minstep = cols * elemSize();
//if( step == Mat::AUTO_STEP )
//{
// step = minstep;
// flags |= Mat::CONTINUOUS_FLAG;
//}
//else
//{
// if( rows == 1 ) step = minstep;
// CV_DbgAssert( step >= minstep );
// flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
//}
//dataend += step * (rows - 1) + minstep;
}
inline oclMat::oclMat(Size _size, int _type, void *_data, size_t _step)
: flags(Mat::MAGIC_VAL + (_type &TYPE_MASK)), rows(_size.height), cols(_size.width),
step(_step), data((uchar *)_data), refcount(0),
datastart((uchar *)_data), dataend((uchar *)_data), offset(0), wholerows(_size.height), wholecols(_size.width)
{
size_t minstep = cols * elemSize();
if( step == Mat::AUTO_STEP )
{
step = minstep;
flags |= Mat::CONTINUOUS_FLAG;
}
else
{
if( rows == 1 ) step = minstep;
CV_DbgAssert( step >= minstep );
flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
}
dataend += step * (rows - 1) + minstep;
datastart((uchar *)_data), dataend((uchar *)_data), offset(0), wholerows(_size.height), wholecols(_size.width), download_channels(CV_MAT_CN(_type))
{
cv::Mat m(_size,_type,_data,_step);
upload(m);
//size_t minstep = cols * elemSize();
//if( step == Mat::AUTO_STEP )
//{
// step = minstep;
// flags |= Mat::CONTINUOUS_FLAG;
//}
//else
//{
// if( rows == 1 ) step = minstep;
// CV_DbgAssert( step >= minstep );
// flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
//}
//dataend += step * (rows - 1) + minstep;
}
......@@ -148,6 +152,7 @@ namespace cv
wholerows = m.wholerows;
wholecols = m.wholecols;
offset = m.offset;
download_channels = m.download_channels;
if( rowRange == Range::all() )
rows = m.rows;
else
......@@ -179,7 +184,7 @@ namespace cv
inline oclMat::oclMat(const oclMat &m, const Rect &roi)
: flags(m.flags), rows(roi.height), cols(roi.width),
step(m.step), data(m.data), refcount(m.refcount),
datastart(m.datastart), dataend(m.dataend), clCxt(m.clCxt), offset(m.offset), wholerows(m.wholerows), wholecols(m.wholecols)
datastart(m.datastart), dataend(m.dataend), clCxt(m.clCxt), offset(m.offset), wholerows(m.wholerows), wholecols(m.wholecols), download_channels(m.download_channels)
{
flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
offset += roi.y * step + roi.x * elemSize();
......@@ -192,7 +197,7 @@ namespace cv
}
inline oclMat::oclMat(const Mat &m)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) , offset(0), wholerows(0), wholecols(0)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) , offset(0), wholerows(0), wholecols(0), download_channels(0)
{
//clCxt = Context::getContext();
upload(m);
......@@ -222,6 +227,7 @@ namespace cv
wholerows = m.wholerows;
wholecols = m.wholecols;
refcount = m.refcount;
download_channels = m.download_channels;
}
return *this;
}
......@@ -323,6 +329,7 @@ namespace cv
std::swap( offset, b.offset );
std::swap( wholerows, b.wholerows );
std::swap( wholecols, b.wholecols );
std::swap( download_channels, b.download_channels);
}
inline void oclMat::locateROI( Size &wholeSize, Point &ofs ) const
......@@ -412,28 +419,32 @@ namespace cv
}
//fixme, the ROI operation is not correct.
// Row accessor (mutable). Debug-asserts that y is a valid row index, then
// reports CV_GpuNotSupported through CV_Error.
// NOTE(review): CV_Error is invoked unconditionally before the return; if it
// raises (presumably it does - confirm against the CV_Error definition), the
// pointer arithmetic below is never reached and only keeps the function
// well-formed for the compiler.
inline uchar *oclMat::ptr(int y)
{
CV_DbgAssert( (unsigned)y < (unsigned)rows );
CV_Error(CV_GpuNotSupported,"This function hasn't been supported yet.\n");
return data + step * y;
}
// Row accessor (const). Debug-asserts that y is a valid row index, then
// reports CV_GpuNotSupported through CV_Error.
// NOTE(review): CV_Error is invoked unconditionally before the return; if it
// raises (presumably it does - confirm), the return statement is unreachable.
inline const uchar *oclMat::ptr(int y) const
{
CV_DbgAssert( (unsigned)y < (unsigned)rows );
CV_Error(CV_GpuNotSupported,"This function hasn't been supported yet.\n");
return data + step * y;
}
// Typed row accessor (mutable). Debug-asserts that y is a valid row index,
// then reports CV_GpuNotSupported through CV_Error.
// NOTE(review): CV_Error is invoked unconditionally before the return; if it
// raises (presumably it does - confirm), the cast-and-return is unreachable.
template<typename _Tp> inline _Tp *oclMat::ptr(int y)
{
CV_DbgAssert( (unsigned)y < (unsigned)rows );
CV_Error(CV_GpuNotSupported,"This function hasn't been supported yet.\n");
return (_Tp *)(data + step * y);
}
// Typed row accessor (const). Debug-asserts that y is a valid row index,
// then reports CV_GpuNotSupported through CV_Error.
// NOTE(review): CV_Error is invoked unconditionally before the return; if it
// raises (presumably it does - confirm), the cast-and-return is unreachable.
template<typename _Tp> inline const _Tp *oclMat::ptr(int y) const
{
CV_DbgAssert( (unsigned)y < (unsigned)rows );
CV_Error(CV_GpuNotSupported,"This function hasn't been supported yet.\n");
return (const _Tp *)(data + step * y);
}
......
This diff is collapsed.
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include <iomanip>
#ifdef HAVE_OPENCL
using namespace cv;
using namespace cv::ocl;
using namespace cvtest;
using namespace testing;
using namespace std;
#define FILTER_IMAGE "../../../samples/gpu/road.png"
#ifndef MWC_TEST_UTILITY
#define MWC_TEST_UTILITY
// Param class
#ifndef IMPLEMENT_PARAM_CLASS
// IMPLEMENT_PARAM_CLASS(name, type)
// Generates a small wrapper class `name` around a single value of `type`:
//   - a constructor taking `type` (defaulting to a value-initialized `type()`),
//   - a conversion operator back to `type`,
// plus a PrintTo overload that writes "name(<value>)" to the stream, using
// testing::PrintToString for the wrapped value.
// No comments may be placed inside the macro body: each line ends in a
// continuation backslash.
#define IMPLEMENT_PARAM_CLASS(name, type) \
class name \
{ \
public: \
name ( type arg = type ()) : val_(arg) {} \
operator type () const {return val_;} \
private: \
type val_; \
}; \
inline void PrintTo( name param, std::ostream* os) \
{ \
*os << #name << "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
}
#endif // IMPLEMENT_PARAM_CLASS
#endif // MWC_TEST_UTILITY
// Named bool parameter: true selects the 48x96 people detector, false the
// 64x128 one (see SetUp below).
IMPLEMENT_PARAM_CLASS(WinSizw48, bool);

// Fixture for the HOG performance test. Caches the window-size flag and the
// matching SVM detector coefficients; only test parameter 0 is read here.
PARAM_TEST_CASE(HOG, WinSizw48, bool)
{
    bool is48;
    vector<float> detector;

    virtual void SetUp()
    {
        is48 = GET_PARAM(0);
        detector = is48 ? cv::ocl::HOGDescriptor::getPeopleDetector48x96()
                        : cv::ocl::HOGDescriptor::getPeopleDetector64x128();
    }
};
// Measures the average time of cv::ocl::HOGDescriptor::detectMultiScale on
// FILTER_IMAGE (loaded as grayscale). Two averages are printed: one that
// includes the host-to-device upload and one that covers only the detection
// call. Iteration 0 is executed but excluded from both averages.
TEST_P(HOG, Performance)
{
    cv::Mat img = readImage(FILTER_IMAGE,cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

    // define HOG related arguments
    float scale = 1.05;
    float gr_threshold = 8;
    float hit_threshold = 1.4;

    int win_width = is48? 48 : 64;
    int win_stride_width = 8;
    int win_stride_height = 8;

    bool gamma_corr = true;

    Size win_size(win_width, win_width * 2); //(64, 128) or (48, 96)
    Size win_stride(win_stride_width, win_stride_height);

    cv::ocl::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
                                   cv::ocl::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
                                   cv::ocl::HOGDescriptor::DEFAULT_NLEVELS);
    gpu_hog.setSVMDetector(detector);

    double totalgputick=0;        // ticks including upload
    double totalgputick_kernel=0; // ticks for detectMultiScale only
    double t1=0;
    double t2=0;

    for(int j = 0; j < LOOP_TIMES+1; j ++)
    {
        t1 = (double)cvGetTickCount();//gpu start1
        ocl::oclMat d_src(img);//upload

        t2=(double)cvGetTickCount();//kernel
        vector<Rect> found;
        gpu_hog.detectMultiScale(d_src, found, hit_threshold, win_stride,
                                 Size(0, 0), scale, gr_threshold);
        t2 = (double)cvGetTickCount() - t2;//kernel

        // no download time for HOG
        t1 = (double)cvGetTickCount() - t1;//gpu end1

        // The first pass is not counted (presumably a warm-up run - confirm).
        if(j == 0)
            continue;

        totalgputick += t1;
        totalgputick_kernel += t2;
    }

    cout << "average gpu runtime is " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
    cout << "average gpu runtime without data transfer is " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
}
// Runs the HOG performance test once per window-size flag (false and true).
// The second parameter is fixed to false; the fixture's SetUp only reads
// parameter 0.
INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, HOG, testing::Combine(testing::Values(WinSizw48(false), WinSizw48(true)), testing::Values(false)));
#endif //Have opencl
\ No newline at end of file
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include <iomanip>
#ifdef HAVE_OPENCL
using namespace cv;
using namespace cv::ocl;
using namespace cvtest;
using namespace testing;
using namespace std;
#define FILTER_IMAGE "../../../samples/gpu/road.png"
// Times ocl::SURF_OCL on FILTER_IMAGE (loaded as grayscale). Prints the
// average time including upload/download and the average time of the SURF
// call alone. Iteration 0 is executed but left out of both averages.
TEST(SURF, Performance)
{
    cv::Mat img = readImage(FILTER_IMAGE,cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

    ocl::SURF_OCL d_surf;
    ocl::oclMat d_keypoints;
    ocl::oclMat d_descriptors;

    double ticks_with_transfer = 0;
    double ticks_kernel_only = 0;

    for(int iter = 0; iter < LOOP_TIMES+1; iter ++)
    {
        const double whole_begin = (double)cvGetTickCount();
        ocl::oclMat d_src(img); // upload

        const double kernel_begin = (double)cvGetTickCount();
        d_surf(d_src, ocl::oclMat(), d_keypoints, d_descriptors);
        const double kernel_elapsed = (double)cvGetTickCount() - kernel_begin;

        cv::Mat cpu_kp, cpu_dp;
        d_keypoints.download (cpu_kp);   // download
        d_descriptors.download (cpu_dp); // download
        const double whole_elapsed = (double)cvGetTickCount() - whole_begin;

        // Only iterations after the first contribute to the averages.
        if(iter != 0)
        {
            ticks_with_transfer = whole_elapsed + ticks_with_transfer;
            ticks_kernel_only = kernel_elapsed + ticks_kernel_only;
        }
    }

    cout << "average gpu runtime is " << ticks_with_transfer/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
    cout << "average gpu runtime without data transfer is " << ticks_kernel_only/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
}
#endif //Have opencl
\ No newline at end of file
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// Intel License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Fangfang BAI, fangfang@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "opencv2/core/core.hpp"
#include <iomanip>
using namespace std;
#ifdef HAVE_OPENCL
// Fixture for the HOG tests in this file. Parameter 0 is the detection
// window size, parameter 1 the image type that the tests switch on
// (CV_8UC1 vs. everything else).
PARAM_TEST_CASE(HOG,cv::Size,int)
{
cv::Size winSize;
int type;
std::vector<cv::ocl::Info> oclinfo;
virtual void SetUp()
{
winSize = GET_PARAM(0);
type = GET_PARAM(1);
// Fill oclinfo via getDevice and require a positive device count.
int devnums = getDevice(oclinfo);
CV_Assert(devnums > 0);
}
};
// Times cv::ocl::HOGDescriptor::getDescriptors on the test image converted to
// the parameterized type. Prints the average time including upload/download
// and the average time of the getDescriptors call alone. Iteration 0 is
// executed but left out of both averages.
TEST_P(HOG, GetDescriptors)
{
    // Load image
    // NOTE(review): "D:road.png" looks like a machine-specific, drive-relative
    // Windows path - confirm where this image is expected to live.
    cv::Mat img_rgb = readImage("D:road.png");
    ASSERT_FALSE(img_rgb.empty());

    // Convert image: grayscale for CV_8UC1, BGRA for every other type
    cv::Mat img;
    if (type == CV_8UC1)
        cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
    else
        cv::cvtColor(img_rgb, img, CV_BGR2BGRA);

    // HOGs
    cv::ocl::HOGDescriptor ocl_hog;
    ocl_hog.gamma_correction = true;

    // Compute descriptor
    cv::ocl::oclMat d_descriptors;
    //down_descriptors = down_descriptors.reshape(0, down_descriptors.cols * down_descriptors.rows);

    double ticks_with_transfer = 0;
    double ticks_kernel_only = 0;

    for(int iter = 0; iter < LOOP_TIMES+1; iter ++)
    {
        const double whole_begin = (double)cvGetTickCount();
        cv::ocl::oclMat d_img=cv::ocl::oclMat(img); // upload

        const double kernel_begin = (double)cvGetTickCount();
        ocl_hog.getDescriptors(d_img, ocl_hog.win_size, d_descriptors, ocl_hog.DESCR_FORMAT_COL_BY_COL);
        const double kernel_elapsed = (double)cvGetTickCount() - kernel_begin;

        cv::Mat down_descriptors;
        d_descriptors.download(down_descriptors);
        const double whole_elapsed = (double)cvGetTickCount() - whole_begin;

        // Only iterations after the first contribute to the averages.
        if(iter != 0)
        {
            ticks_with_transfer = whole_elapsed + ticks_with_transfer;
            ticks_kernel_only = kernel_elapsed + ticks_kernel_only;
        }
    }

    cout << "average gpu runtime is " << ticks_with_transfer/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
    cout << "average gpu runtime without data transfer is " << ticks_kernel_only/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
}
// Times cv::ocl::HOGDescriptor::detect on the test image for the
// parameterized window size and image type. Prints the average time including
// the upload and the average time of the detect call alone. Iteration 0 is
// executed but left out of both averages.
//
// Only the OpenCL descriptor is set up: this test never runs or compares a
// CPU detection, so no cv::HOGDescriptor is constructed.
TEST_P(HOG, Detect)
{
    // Load image
    // NOTE(review): "D:road.png" looks like a machine-specific, drive-relative
    // Windows path - confirm where this image is expected to live.
    cv::Mat img_rgb = readImage("D:road.png");
    ASSERT_FALSE(img_rgb.empty());

    // Convert image
    cv::Mat img;
    switch (type)
    {
    case CV_8UC1:
        cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
        break;
    case CV_8UC4:
    default:
        cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
        break;
    }

    // HOGs: any window size other than 48x96 or 64x128 falls back to 64x128
    if ((winSize != cv::Size(48, 96)) && (winSize != cv::Size(64, 128)))
        winSize = cv::Size(64, 128);

    cv::ocl::HOGDescriptor ocl_hog(winSize);
    ocl_hog.gamma_correction = true;

    if (winSize.width == 48 && winSize.height == 96)
    {
        // daimler's base
        ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector48x96());
    }
    else
    {
        // winSize was normalized above, so this is always the 64x128 case
        ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector64x128());
    }

    // OpenCL detection
    std::vector<cv::Point> d_v_locations;

    double totalgputick=0;        // ticks including upload
    double totalgputick_kernel=0; // ticks for detect only
    double t1=0;
    double t2=0;

    for(int j = 0; j < LOOP_TIMES+1; j ++)
    {
        t1 = (double)cvGetTickCount();//gpu start1
        cv::ocl::oclMat d_img=cv::ocl::oclMat(img);//upload

        t2=(double)cvGetTickCount();//kernel
        ocl_hog.detect(d_img, d_v_locations, 0);
        t2 = (double)cvGetTickCount() - t2;//kernel

        t1 = (double)cvGetTickCount() - t1;//gpu end1

        // The first pass is not counted (presumably a warm-up run - confirm).
        if(j == 0)
            continue;

        totalgputick += t1;
        totalgputick_kernel += t2;
    }

    cout << "average gpu runtime is " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
    cout << "average gpu runtime without data transfer is " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
}
// Runs every HOG test in this file for each combination of window size
// (64x128, 48x96) and image type (CV_8UC1, CV_8UC4).
INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
testing::Values(cv::Size(64, 128), cv::Size(48, 96)),
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
#endif //HAVE_OPENCL
......@@ -1155,13 +1155,13 @@ void arithmetic_lut_run(const oclMat &src1, const oclMat &src2, oclMat &dst, str
int rows = src1.rows;
int cols = src1.cols;
//int step = src1.step;
int src_step = src1.step;
int dst_step = dst.step;
int src_step = src1.step/ src1.elemSize();
int dst_step = dst.step/ dst.elemSize();
int whole_rows = src1.wholerows;
int whole_cols = src1.wholecols;
int src_offset = src1.offset;
int dst_offset = dst.offset;
int lut_offset = src2.offset;
int src_offset = src1.offset/ src1.elemSize();
int dst_offset = dst.offset/ dst.elemSize();
int lut_offset = src2.offset/ src2.elemSize();
int left_col = 0, right_col = 0;
size_t localSize[] = {16, 16, 1};
//cl_kernel kernel = openCLGetKernelFromSource(clCxt,&arithm_LUT,kernelName);
......@@ -2381,4 +2381,5 @@ void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
arithmetic_pow_run(x, p, y, kernelName, &arithm_pow);
}
#endif /* !defined (HAVE_OPENCL) */
......@@ -171,10 +171,10 @@ void cv::ocl::Canny(const oclMat& src, CannyBuf& buf, oclMat& dst, double low_th
std::swap( low_thresh, high_thresh );
dst.create(src.size(), CV_8U);
//dst.setTo(Scalar::all(0));
dst.setTo(Scalar::all(0));
buf.create(src.size(), apperture_size);
//buf.edgeBuf.setTo(Scalar::all(0));
buf.edgeBuf.setTo(Scalar::all(0));
if (apperture_size == 3)
{
......@@ -207,11 +207,11 @@ void cv::ocl::Canny(const oclMat& dx, const oclMat& dy, CannyBuf& buf, oclMat& d
std::swap( low_thresh, high_thresh);
dst.create(dx.size(), CV_8U);
//dst.setTo(Scalar::all(0));
dst.setTo(Scalar::all(0));
buf.dx = dx; buf.dy = dy;
buf.create(dx.size(), -1);
//buf.edgeBuf.setTo(Scalar::all(0));
buf.edgeBuf.setTo(Scalar::all(0));
calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);
CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
......@@ -367,7 +367,6 @@ void canny::edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, voi
while(count > 0)
{
//counter.setTo(0);
args.clear();
size_t globalThreads[3] = {std::min(count, 65535u) * 128, DIVUP(count, 65535), 1};
args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
......
This diff is collapsed.
......@@ -538,11 +538,11 @@ namespace cv
if(NULL != build_options)
{
src_sign << (int64)source << clCxt->impl->clContext << "_" << build_options;
src_sign << (int64)(*source) << clCxt->impl->clContext << "_" << build_options;
}
else
{
src_sign << (int64)source << clCxt->impl->clContext;
src_sign << (int64)(*source) << clCxt->impl->clContext;
}
srcsign = src_sign.str();
......@@ -562,11 +562,11 @@ namespace cv
strcat(all_build_options, build_options);
if(all_build_options != NULL)
{
filename = clCxt->impl->Binpath + "\\" + kernelName + "_" + clCxt->impl->devName + all_build_options + ".clb";
filename = clCxt->impl->Binpath + kernelName + "_" + clCxt->impl->devName + all_build_options + ".clb";
}
else
{
filename = clCxt->impl->Binpath + "\\" + kernelName + "_" + clCxt->impl->devName + ".clb";
filename = clCxt->impl->Binpath + kernelName + "_" + clCxt->impl->devName + ".clb";
}
FILE *fp;
......
......@@ -125,38 +125,38 @@ __kernel
void LUT_C4_D0( __global uchar4 *dst,
__global uchar4 *src,
__constant uchar *table,
uint rows,
uint cols,
uint channels,
uint whole_rows,
uint whole_cols,
uint src_offset,
uint dst_offset,
uint lut_offset,
uint src_step,
uint dst_step)
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
{
uint gidx = get_global_id(0);
uint gidy = get_global_id(1);
int gidx = get_global_id(0);
int gidy = get_global_id(1);
uint lidx = get_local_id(0);
uint lidy = get_local_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int src_index = mad24(gidy,src_step,gidx+src_offset);
int dst_index = mad24(gidy,dst_step,gidx+dst_offset);
__local uchar l[256];
l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset];
mem_fence(CLK_LOCAL_MEM_FENCE);
//mem_fence(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
gidx = gidx >= cols?cols-1:gidx;
gidy = gidy >= rows?rows-1:gidy;
uint src_index = src_offset/4 + gidy * src_step/4 + gidx;
uint dst_index = dst_offset/4 + gidy * dst_step/4 + gidx;
uchar4 p = src[src_index];
dst[dst_index].x = l[p.x];
dst[dst_index].y = l[p.y];
dst[dst_index].z = l[p.z];
dst[dst_index].w = l[p.w];
if(gidx<cols && gidy<rows)
{
uchar4 p = src[src_index];
uchar4 q;
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
dst[dst_index] = q;
}
}
......@@ -33,13 +33,13 @@
//
//
//#pragma OPENCL EXTENSION cl_amd_printf : enable
#define WORKGROUPSIZE 256
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int dstStep_in_piexl,int pixel_end)
{
int id = get_global_id(0);
//read data from source
//int pixel_end = mul24(cols -1 , rows -1);
int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
pixelid = clamp(pixelid,0,pixel_end);
......@@ -54,36 +54,19 @@ __kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTY
outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
outpix3 = (GENTYPE4)(pixel2.y,pixel2.z,pixel2.w,0);
//permutate the data in LDS to avoid global memory conflict
__local GENTYPE4 rearrange[WORKGROUPSIZE*4];
int lid = get_local_id(0)<<2;
rearrange[lid++] = outpix0;
rearrange[lid++] = outpix1;
rearrange[lid++] = outpix2;
rearrange[lid] = outpix3;
lid = get_local_id(0);
barrier(CLK_LOCAL_MEM_FENCE);
outpix0 = rearrange[lid];
lid+=WORKGROUPSIZE;
outpix1 = rearrange[lid];
lid+=WORKGROUPSIZE;
outpix2 = rearrange[lid];
lid+=WORKGROUPSIZE;
outpix3 = rearrange[lid];
//calculate output index
int4 outx, outy;
int4 startid = mad24((int)get_group_id(0),WORKGROUPSIZE*4,(int)get_local_id(0));
startid.y+=WORKGROUPSIZE;
startid.z+=WORKGROUPSIZE*2;
startid.w+=WORKGROUPSIZE*3;
outx = startid%(int4)cols;
outy = startid/(int4)cols;
int4 addr = mad24(outy,dstStep_in_piexl,outx);
int4 outy = (id<<2)/cols;
int4 outx = (id<<2)%cols;
outx.y++;
outx.z+=2;
outx.w+=3;
outy = select(outy,outy+1,outx>=cols);
outx = select(outx,outx-cols,outx>=cols);
//outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows));
//outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows));
//outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows));
//outx = select(outx,(int4)outx.x,outy>=rows);
//outy = select(outy,(int4)outy.x,outy>=rows);
int4 addr = mad24(outy,(int4)dstStep_in_piexl,outx);
if(outx.w<cols && outy.w<rows)
{
dst[addr.x] = outpix0;
......@@ -119,10 +102,10 @@ __kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTY
int x = id % cols;
int4 x4 = (int4)(x,x+1,x+2,x+3);
int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
y4=clamp(y4,(int4)0,(int4)(rows-1));
x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4);
GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
//read data from src
pixel0 = src[addr.x];
pixel1 = src[addr.y];
pixel2 = src[addr.z];
......@@ -137,40 +120,23 @@ __kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTY
outpixel2.y = pixel3.x;
outpixel2.z = pixel3.y;
outpixel2.w = pixel3.z;
//permutate the data in LDS to avoid global memory conflict
__local GENTYPE4 rearrange[WORKGROUPSIZE*3];
int lid = mul24((int)get_local_id(0),3);
rearrange[lid++] = pixel0;
rearrange[lid++] = outpixel1;
rearrange[lid] = outpixel2;
barrier(CLK_LOCAL_MEM_FENCE);
lid = get_local_id(0);
pixel0 = rearrange[lid];
lid+=WORKGROUPSIZE;
outpixel1 = rearrange[lid];
lid+=WORKGROUPSIZE;
outpixel2 = rearrange[lid];
//calcultate output index
int3 startid = mad24((int)get_group_id(0),WORKGROUPSIZE*3,(int)get_local_id(0));
startid.y+=WORKGROUPSIZE;
startid.z+=WORKGROUPSIZE*2;
//id = mul24(id>>2 , 3);
if(startid.z <= pixel_end)
int4 outaddr = mul24(id>>2 , 3);
outaddr.y++;
outaddr.z+=2;
//printf("%d ",outaddr.z);
if(outaddr.z <= pixel_end)
{
dst[startid.x] = pixel0;
dst[startid.y] = outpixel1;
dst[startid.z] = outpixel2;
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
dst[outaddr.z] = outpixel2;
}
else if(startid.y <= pixel_end)
else if(outaddr.y <= pixel_end)
{
dst[startid.x] = pixel0;
dst[startid.y] = outpixel1;
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
}
else if(startid.x <= pixel_end)
else if(outaddr.x <= pixel_end)
{
dst[startid.x] = pixel0;
}
dst[outaddr.x] = pixel0;
}
}
......@@ -87,6 +87,7 @@ The length of the convovle kernel supported is only related to the MAX size of L
which is HW related.
Niko
6/29/2011
The info above may be obsolete.
***********************************************************************************/
......
......@@ -92,6 +92,7 @@ For channels = 2, the RADIUS is no more than LSIZE0
For channels = 4, arbitary RADIUS is supported unless the LDS is not enough
Niko
6/29/2011
The info above may be obsolete.
***********************************************************************************/
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0
......
......@@ -302,7 +302,9 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
nodecounter = splitnode;
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0;stageloop++)
{
lclcount[0]=0;
//barrier(CLK_LOCAL_MEM_FENCE);
//if(lcl_id == 0)
lclcount[0]=0;
barrier(CLK_LOCAL_MEM_FENCE);
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
......@@ -314,14 +316,17 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0;queueloop<queuecount_loop && lcl_compute_win_id < queuecount;queueloop++)
for(int queueloop=0;queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/;queueloop++)
{
float stage_sum = 0.f;
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
int tempnodecounter = lcl_compute_id;
//barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount) {
int tempnodecounter = lcl_compute_id;
float part_sum = 0.f;
for(int lcl_loop=0;lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;lcl_loop++)
{
......@@ -353,10 +358,12 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter+=lcl_compute_win;
tempnodecounter +=lcl_compute_win;
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount) {
for(int i=0;i<lcl_compute_win && (lcl_compute_id==0);i++)
{
stage_sum += partialsum[lcl_id+i];
......@@ -368,11 +375,14 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
lcl_compute_win_id +=(1<<perfscale);
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
barrier(CLK_LOCAL_MEM_FENCE);
queuecount = lclcount[0];
nodecounter += stageinfo.x;
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
//barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_id<queuecount)
{
int temp = lcloutindex[lcl_id<<1];
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
/************************************** convolve **************************************/
// Filters a single-channel float image with a kWidth x kHeight kernel held in temp1.
//
// Each work-item first stages four source samples into a 32x32 __local tile: the
// samples at (-8,-8), (-8,+8), (+8,-8) and (+8,+8) relative to its own global
// position, with the coordinates clamped to the image so border pixels are
// replicated. The tile indexing ([ly + 16][lx + 16] in a 32x32 array) requires
// local ids in [0, 16) on both axes.
// After the barrier, the work-item's own pixel sits at tile[ly + 8][lx + 8] and the
// kernel window is read around it; the coefficients are applied in storage order
// (no flipping).
//
// src_step, dst_step and k_step are shifted right by 2 before being used as row
// strides in float elements - presumably they are passed in bytes; confirm with caller.
// NOTE(review): the tile only provides an 8-sample apron, so the window reads stay
// inside it only while kWidth and kHeight are at most 17 - confirm the host enforces this.
__kernel void convolve_D5 (__global float *src, __global float *temp1, __global float *dst,
                           int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight)
{
    __local float tile[16 + 2 * 8][16 + 2 * 8];

    const int lx = get_local_id(0);
    const int ly = get_local_id(1);
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    // Clamped source offsets shared by the four staging loads.
    const int srcStride  = src_step >> 2;
    const int upperRow   = min(max(gy - 8, 0), rows - 1) * srcStride;
    const int lowerRow   = min(gy + 8, rows - 1) * srcStride;
    const int leftCol    = min(max(gx - 8, 0), cols - 1);
    const int rightCol   = min(gx + 8, cols - 1);

    // Top-left, top-right, bottom-left and bottom-right quadrants of the tile.
    tile[ly][lx]           = src[upperRow + leftCol];
    tile[ly][lx + 16]      = src[upperRow + rightCol];
    tile[ly + 16][lx]      = src[lowerRow + leftCol];
    tile[ly + 16][lx + 16] = src[lowerRow + rightCol];

    // Every work-item must reach this barrier, so the bounds test comes after it.
    barrier(CLK_LOCAL_MEM_FENCE);

    if (gx >= cols || gy >= rows)
        return;

    const int kStride = k_step >> 2;
    const int winTop  = ly + 8 - kHeight / 2;
    const int winLeft = lx + 8 - kWidth / 2;

    float acc = 0;
    for (int ky = 0; ky < kHeight; ++ky)
    {
        for (int kx = 0; kx < kWidth; ++kx)
        {
            acc += tile[winTop + ky][winLeft + kx] * temp1[ky * kStride + kx];
        }
    }
    dst[gy*(dst_step >> 2)+gx] = acc;
}
......@@ -8,6 +8,7 @@
// @Authors
// Niko Li, newlife20080214@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Xu Pang, pangxu010@163.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
......@@ -33,89 +34,127 @@
// the use of this software, even if advised of the possibility of such damage.
//
//
#define PARTITAL_HISTGRAM256_COUNT (256)
#define PARTIAL_HISTOGRAM256_COUNT (256)
#define HISTOGRAM256_BIN_COUNT (256)
#define HISTGRAM256_WORK_GROUP_SIZE (256)
#define HISTGRAM256_LOCAL_MEM_SIZE (HISTOGRAM256_BIN_COUNT)
__kernel __attribute__((reqd_work_group_size(256,1,1)))void calc_sub_hist_D0(__global const uchar4* src,
int src_step,
int src_offset,
__global int* buf,
int data_count,
int cols,
int inc_x,
int inc_y,
int dst_offset)
#define HISTOGRAM256_WORK_GROUP_SIZE (256)
#define HISTOGRAM256_LOCAL_MEM_SIZE (HISTOGRAM256_BIN_COUNT)
#define NBANKS (16)
#define NBANKS_BIT (4)
__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void calc_sub_hist_D0(
__global const uint4* src,
int src_step, int src_offset,
__global int* globalHist,
int dataCount, int cols,
int inc_x, int inc_y,
int hist_step)
{
int x = get_global_id(0);
int lx = get_local_id(0);
int gx = get_group_id(0);
int total_threads = get_global_size(0);
src += src_offset;
__local int s_hist[HISTGRAM256_LOCAL_MEM_SIZE];
s_hist[lx] = 0;
int pos_y = x / cols;
int pos_x = x - mul24(pos_y, cols);
barrier(CLK_LOCAL_MEM_FENCE);
for(int pos = x; pos < data_count; pos += total_threads)
{
int4 data = convert_int4(src[mad24(pos_y,src_step,pos_x)]);
atomic_inc(s_hist + data.x);
atomic_inc(s_hist + data.y);
atomic_inc(s_hist + data.z);
atomic_inc(s_hist + data.w);
pos_x +=inc_x;
int off = (pos_x >= cols ? -1 : 0);
pos_x = mad24(off,cols,pos_x);
pos_y += inc_y - off;
//pos_x = pos_x > cols ? pos_x - cols : pos_x;
//pos_y = pos_x > cols ? pos_y + 1 : pos_y;
}
barrier(CLK_LOCAL_MEM_FENCE);
buf[ mad24(gx, dst_offset, lx)] = s_hist[lx];
__local int subhist[(HISTOGRAM256_BIN_COUNT << NBANKS_BIT)]; // NBINS*NBANKS
int gid = get_global_id(0);
int lid = get_local_id(0);
int gx = get_group_id(0);
int gsize = get_global_size(0);
int lsize = get_local_size(0);
const int shift = 8;
const int mask = HISTOGRAM256_BIN_COUNT-1;
int offset = (lid & (NBANKS-1));// lid % NBANKS
uint4 data, temp1, temp2, temp3, temp4;
src += src_offset;
//clear LDS
for(int i=0, idx=lid; i<(NBANKS >> 2); i++, idx += lsize)
{
subhist[idx] = 0;
subhist[idx+=lsize] = 0;
subhist[idx+=lsize] = 0;
subhist[idx+=lsize] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
//read and scatter
int y = gid/cols;
int x = gid - mul24(y, cols);
for(int idx=gid; idx<dataCount; idx+=gsize)
{
data = src[mad24(y, src_step, x)];
temp1 = ((data & mask) << NBANKS_BIT) + offset;
data >>= shift;
temp2 = ((data & mask) << NBANKS_BIT) + offset;
data >>= shift;
temp3 = ((data & mask) << NBANKS_BIT) + offset;
data >>= shift;
temp4 = ((data & mask) << NBANKS_BIT) + offset;
atomic_inc(subhist + temp1.x);
atomic_inc(subhist + temp1.y);
atomic_inc(subhist + temp1.z);
atomic_inc(subhist + temp1.w);
atomic_inc(subhist + temp2.x);
atomic_inc(subhist + temp2.y);
atomic_inc(subhist + temp2.z);
atomic_inc(subhist + temp2.w);
atomic_inc(subhist + temp3.x);
atomic_inc(subhist + temp3.y);
atomic_inc(subhist + temp3.z);
atomic_inc(subhist + temp3.w);
atomic_inc(subhist + temp4.x);
atomic_inc(subhist + temp4.y);
atomic_inc(subhist + temp4.z);
atomic_inc(subhist + temp4.w);
x += inc_x;
int off = ((x>=cols) ? -1 : 0);
x = mad24(off, cols, x);
y += inc_y - off;
}
barrier(CLK_LOCAL_MEM_FENCE);
//reduce local banks to single histogram per workgroup
int bin1=0, bin2=0, bin3=0, bin4=0;
for(int i=0; i<NBANKS; i+=4)
{
bin1 += subhist[(lid << NBANKS_BIT) + i];
bin2 += subhist[(lid << NBANKS_BIT) + i+1];
bin3 += subhist[(lid << NBANKS_BIT) + i+2];
bin4 += subhist[(lid << NBANKS_BIT) + i+3];
}
globalHist[mad24(gx, hist_step, lid)] = bin1+bin2+bin3+bin4;
}
__kernel void __attribute__((reqd_work_group_size(1,256,1)))calc_sub_hist2_D0( __global const uchar* src,
int src_step,
int src_offset,
__global int* buf,
int left_col,
int cols,
int rows,
int dst_offset)
__kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))calc_sub_hist_border_D0(
__global const uchar* src,
int src_step, int src_offset,
__global int* globalHist,
int left_col, int cols,
int rows, int hist_step)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gx = get_group_id(0);
int gy = get_group_id(1);
int gnum = get_num_groups(0);
int output_row = mad24(gy,gnum,gx);
//int lidx = get_local_id(0);
int lidy = get_local_id(1);
__local int s_hist[HISTGRAM256_LOCAL_MEM_SIZE+1];
s_hist[lidy] = 0;
//mem_fence(CLK_LOCAL_MEM_FENCE);
int lidy = get_local_id(1);
int gx = get_group_id(0);
int gy = get_group_id(1);
int gn = get_num_groups(0);
int rowIndex = mad24(gy, gn, gx);
rowIndex &= (PARTIAL_HISTOGRAM256_COUNT - 1);
__local int subhist[HISTOGRAM256_BIN_COUNT + 1];
subhist[lidy] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
//clamp(gidx,mask,cols-1);
gidx = gidx >= left_col ? cols+gidx : gidx;
//gidy = gidy >= rows?rows-1:gidy;
gidx = ((gidx>left_col) ? (gidx+cols) : gidx);
int src_index = src_offset + mad24(gidy, src_step, gidx);
int p = (int)src[src_index];
atomic_inc(subhist + p);
barrier(CLK_LOCAL_MEM_FENCE);
int src_index = src_offset + mad24(gidy,src_step,gidx);
//int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
//uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
int p = (int)src[src_index];
p = gidy >= rows ? HISTGRAM256_LOCAL_MEM_SIZE : p;
atomic_inc(s_hist + p);
barrier(CLK_LOCAL_MEM_FENCE);
buf[ mad24(output_row, dst_offset, lidy)] += s_hist[lidy];
globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy];
}
__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf,
__global int* hist,
......@@ -126,13 +165,13 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global
int sum = 0;
for(int i = lx; i < PARTITAL_HISTGRAM256_COUNT; i += HISTGRAM256_WORK_GROUP_SIZE)
for(int i = lx; i < PARTIAL_HISTOGRAM256_COUNT; i += HISTOGRAM256_WORK_GROUP_SIZE)
sum += buf[ mad24(i, src_step, gx)];
__local int data[HISTGRAM256_WORK_GROUP_SIZE];
__local int data[HISTOGRAM256_WORK_GROUP_SIZE];
data[lx] = sum;
for(int stride = HISTGRAM256_WORK_GROUP_SIZE /2; stride > 0; stride >>= 1)
for(int stride = HISTOGRAM256_WORK_GROUP_SIZE /2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(lx < stride)
......
......@@ -109,10 +109,10 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri
int4 val1, val2, val;
int4 sdata1, sdata2, sdata3, sdata4;
int4 pos1 = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
int4 pos2 = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
int4 pos3 = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
int4 pos4 = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
int4 pos1 = mad24((int4)y, (int4)srcstep_in_pixel, x+(int4)srcoffset_in_pixel);
int4 pos2 = mad24((int4)y, (int4)srcstep_in_pixel, x_+(int4)srcoffset_in_pixel);
int4 pos3 = mad24((int4)y_, (int4)srcstep_in_pixel, x+(int4)srcoffset_in_pixel);
int4 pos4 = mad24((int4)y_, (int4)srcstep_in_pixel, x_+(int4)srcoffset_in_pixel);
sdata1.s0 = src[pos1.s0];
sdata1.s1 = src[pos1.s1];
......@@ -136,7 +136,7 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri
val1 = mul24(U1 , sdata1) + mul24(U , sdata2);
val2 = mul24(U1 , sdata3) + mul24(U , sdata4);
val = mul24(V1 , val1) + mul24(V , val2);
val = mul24((int4)V1 , val1) + mul24((int4)V , val2);
//__global uchar4* d = (__global uchar4*)(dst + dstoffset_in_pixel + dy * dststep_in_pixel + gx);
//uchar4 dVal = *d;
......@@ -205,8 +205,8 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int4 data1 = convert_int4(src[srcpos.y]);
int4 data2 = convert_int4(src[srcpos.z]);
int4 data3 = convert_int4(src[srcpos.w]);
int4 val = mul24(mul24(U1, V1) , data0) + mul24(mul24(U, V1) , data1)
+mul24(mul24(U1, V) , data2)+mul24(mul24(U, V) , data3);
int4 val = mul24((int4)mul24(U1, V1) , data0) + mul24((int4)mul24(U, V1) , data1)
+mul24((int4)mul24(U1, V) , data2)+mul24((int4)mul24(U, V) , data3);
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
uchar4 uval = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
......@@ -314,7 +314,7 @@ __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src,
sy = min((int)floor(s5), src_rows-1);
uchar4 val;
int4 pos = mad24(sy, srcstep_in_pixel, sx+srcoffset_in_pixel);
int4 pos = mad24((int4)sy, (int4)srcstep_in_pixel, sx+(int4)srcoffset_in_pixel);
val.s0 = src[pos.s0];
val.s1 = src[pos.s1];
val.s2 = src[pos.s2];
......
......@@ -91,8 +91,8 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
F4 DX = (F4)(dx, dx+1, dx+2, dx+3);
F4 X0 = M[0]*DX + M[1]*dy + M[2];
F4 Y0 = M[3]*DX + M[4]*dy + M[5];
F4 W = M[6]*DX + M[7]*dy + M[8];
W = (W!=0) ? 1./W : 0;
F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0;
W = (W!=zero) ? one/W : zero;
short4 X = convert_short4(rint(X0*W));
short4 Y = convert_short4(rint(Y0*W));
int4 sx = convert_int4(X);
......
......@@ -34,7 +34,8 @@
//
//
#define F float
#define F2 float2
#define F4 float4
__kernel void convert_to_S4_C1_D0(
__global const int* restrict srcMat,
__global uchar* dstMat,
......@@ -56,17 +57,41 @@ __kernel void convert_to_S4_C1_D0(
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if ( (x < cols + off_src) & (y < rows) )
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = *(__global uchar4*)(dstMat+dstidx);
//int trans_src[10] = {temp_src1.y,temp_src1.z,temp_src1.w,temp_src.x,temp_src.y,temp_src.z,temp_src.w,temp_src2.x,temp_src2.y,temp_src2.z};
temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end) ? convert_uchar_sat(temp_src.x*alpha+beta) : temp_dst.x;
temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end) ? convert_uchar_sat(temp_src.y*alpha+beta) : temp_dst.y;
temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end) ? convert_uchar_sat(temp_src.z*alpha+beta) : temp_dst.z;
temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end) ? convert_uchar_sat(temp_src.w*alpha+beta) : temp_dst.w;
*(__global uchar4*)(dstMat+dstidx) = temp_dst;
if(x+3<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = convert_float2(vload2(0,srcMat+srcidx));
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(convert_float(srcMat[srcidx])*alpha+beta);;
}
}
}
......@@ -114,17 +139,41 @@ __kernel void convert_to_S5_C1_D0(
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if ( (x < cols + off_src) & (y < rows) )
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = vload4(0,srcMat+srcidx);
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = *(__global uchar4*)(dstMat+dstidx);
//int trans_src[10] = {temp_src1.y,temp_src1.z,temp_src1.w,temp_src.x,temp_src.y,temp_src.z,temp_src.w,temp_src2.x,temp_src2.y,temp_src2.z};
temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end) ? convert_uchar_sat(temp_src.x*alpha+beta) : temp_dst.x;
temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end) ? convert_uchar_sat(temp_src.y*alpha+beta) : temp_dst.y;
temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end) ? convert_uchar_sat(temp_src.z*alpha+beta) : temp_dst.z;
temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end) ? convert_uchar_sat(temp_src.w*alpha+beta) : temp_dst.w;
*(__global uchar4*)(dstMat+dstidx) = temp_dst;
if(x+3<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = vload2(0,srcMat+srcidx);
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(srcMat[srcidx]*alpha+beta);;
}
}
}
__kernel void convert_to_S5_C4_D0(
......
......@@ -34,158 +34,9 @@
//
//
// Masked copy for single-channel 8-bit data, four consecutive bytes per work-item.
//
// Each work-item loads one uchar4 from dst at an index rounded down to a multiple
// of 4, replaces only the lanes that lie inside the current row and whose mask
// byte is non-zero, and stores the whole uchar4 back.
//
//   srcMat/dstMat/maskMat : source, destination and mask buffers
//   cols, rows            : size of the region to copy, in elements
//   *Step*, *offset*      : row stride and start offset used to index each buffer
//
// NOTE(review): the three vector loads happen before the per-lane range checks, so
// the loads themselves may touch bytes outside the row - confirm the buffers are
// padded accordingly.
__kernel void copy_to_with_mask_C1_D0(
__global const uchar* restrict srcMat,
__global uchar* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
// x advances in steps of four elements per work-item; y is the row.
int x=get_global_id(0)<<2;
int y=get_global_id(1);
// [dst_addr_start, dst_addr_end) is the index range of row y inside dstMat.
int dst_addr_start = mad24((uint)y, (uint)dstStep_in_pixel, (uint)dstoffset_in_pixel);
int dst_addr_end = mad24((uint)y, (uint)dstStep_in_pixel, (uint)cols+dstoffset_in_pixel);
// Destination index rounded down to a multiple of 4 for the uchar4 access below.
int dstidx = mad24((uint)y, (uint)dstStep_in_pixel, (uint)x+ dstoffset_in_pixel) & (int)0xfffffffc;
// Low two bits of the destination offset; subtracted from the src and mask
// indices below - presumably so that lane k of src/mask corresponds to lane k of
// the rounded-down dst word (TODO confirm; this relies on dstStep_in_pixel being
// a multiple of 4).
int vector_off = dstoffset_in_pixel & 3;
int srcidx = mad24((uint)y, (uint)srcStep_in_pixel, (uint)x + srcoffset_in_pixel - vector_off);
// [mask_addr_start, mask_addr_end) is the index range of row y inside maskMat.
int mask_addr_start = mad24((uint)y, (uint)maskStep, (uint)maskoffset);
int mask_addr_end = mad24((uint)y, (uint)maskStep, (uint)cols+maskoffset);
int maskidx = mad24((uint)y, (uint)maskStep, (uint)x + maskoffset - vector_off);
// The column bound is widened by dstoffset_in_pixel, which lets work-items past
// `cols` run and cover the trailing lanes of the row.
if ( (x < cols + dstoffset_in_pixel) & (y < rows) )
{
uchar4 src_data = vload4(0, srcMat + srcidx);
uchar4 mask_data = vload4(0, maskMat + maskidx);
uchar4 dst_data = *((__global uchar4 *)(dstMat + dstidx));
uchar4 tmp_data;
// Mask lanes that fall outside the mask row are forced to 0 (treated as "do not copy").
mask_data.x = ((maskidx + 0 >= mask_addr_start) && (maskidx + 0 < mask_addr_end)) ? mask_data.x : 0;
mask_data.y = ((maskidx + 1 >= mask_addr_start) && (maskidx + 1 < mask_addr_end)) ? mask_data.y : 0;
mask_data.z = ((maskidx + 2 >= mask_addr_start) && (maskidx + 2 < mask_addr_end)) ? mask_data.z : 0;
mask_data.w = ((maskidx + 3 >= mask_addr_start) && (maskidx + 3 < mask_addr_end)) ? mask_data.w : 0;
// Per lane: take the source byte only if the lane is inside the dst row and its
// mask is non-zero; otherwise keep the byte already in dst.
tmp_data.x = ((dstidx + 0 >= dst_addr_start) && (dstidx + 0 < dst_addr_end) && (mask_data.x))
? src_data.x : dst_data.x;
tmp_data.y = ((dstidx + 1 >= dst_addr_start) && (dstidx + 1 < dst_addr_end) && (mask_data.y))
? src_data.y : dst_data.y;
tmp_data.z = ((dstidx + 2 >= dst_addr_start) && (dstidx + 2 < dst_addr_end) && (mask_data.z))
? src_data.z : dst_data.z;
tmp_data.w = ((dstidx + 3 >= dst_addr_start) && (dstidx + 3 < dst_addr_end) && (mask_data.w))
? src_data.w : dst_data.w;
// Write all four lanes back, including the ones that were left unchanged.
(*(__global uchar4*)(dstMat+dstidx)) = tmp_data;
}
}
// Masked copy for 4-channel 8-bit images: one uchar4 pixel per work-item.
// dstMat[y][x] = srcMat[y][x] wherever the mask byte at (y, x) is non-zero.
//
//   srcMat, dstMat   : source / destination pixels
//   maskMat          : one mask byte per pixel; any non-zero value selects the pixel
//   cols, rows       : size of the region to copy, in pixels
//   *Step*, *offset* : row stride and start offset used to index each buffer
__kernel void copy_to_with_mask_C4_D0(
        __global const uchar4* restrict srcMat,
        __global uchar4* dstMat,
        __global const uchar* restrict maskMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        int maskStep,
        int maskoffset)
{
    int x=get_global_id(0);
    int y=get_global_id(1);

    // Work-items outside the image must not touch memory at all; previously the
    // mask byte was loaded before this test, reading past the mask for such items.
    if (x >= cols || y >= rows)
        return;

    int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
    int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
    int maskidx = mad24(y,maskStep,x+ maskoffset);

    // Test the mask for non-zero explicitly: bitwise-ANDing the raw byte with a
    // 0/1 comparison result rejected non-zero masks whose lowest bit is clear.
    if (maskMat[maskidx] != 0)
    {
        dstMat[dstidx] = srcMat[srcidx];
    }
}
// Masked copy for single-channel 32-bit integer images: one int per work-item.
// dstMat[y][x] = srcMat[y][x] wherever the mask byte at (y, x) is non-zero.
//
//   srcMat, dstMat   : source / destination elements
//   maskMat          : one mask byte per element; any non-zero value selects it
//   cols, rows       : size of the region to copy, in elements
//   *Step*, *offset* : row stride and start offset used to index each buffer
__kernel void copy_to_with_mask_C1_D4(
        __global const int* restrict srcMat,
        __global int* dstMat,
        __global const uchar* restrict maskMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        int maskStep,
        int maskoffset)
{
    int x=get_global_id(0);
    int y=get_global_id(1);

    // Work-items outside the image must not touch memory at all; previously the
    // mask byte was loaded before this test, reading past the mask for such items.
    if (x >= cols || y >= rows)
        return;

    int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
    int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
    int maskidx = mad24(y,maskStep,x+ maskoffset);

    // Test the mask for non-zero explicitly: bitwise-ANDing the raw byte with a
    // 0/1 comparison result rejected non-zero masks whose lowest bit is clear.
    if (maskMat[maskidx] != 0)
    {
        dstMat[dstidx] = srcMat[srcidx];
    }
}
// Masked copy for 4-channel 32-bit integer images: one int4 pixel per work-item.
// dstMat[y][x] = srcMat[y][x] wherever the mask byte at (y, x) is non-zero.
//
//   srcMat, dstMat   : source / destination pixels
//   maskMat          : one mask byte per pixel; any non-zero value selects the pixel
//   cols, rows       : size of the region to copy, in pixels
//   *Step*, *offset* : row stride and start offset used to index each buffer
__kernel void copy_to_with_mask_C4_D4(
        __global const int4* restrict srcMat,
        __global int4* dstMat,
        __global const uchar* restrict maskMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        int maskStep,
        int maskoffset)
{
    int x=get_global_id(0);
    int y=get_global_id(1);

    // Work-items outside the image must not touch memory at all; previously the
    // mask byte was loaded before this test, reading past the mask for such items.
    if (x >= cols || y >= rows)
        return;

    int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
    int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
    int maskidx = mad24(y,maskStep,x+ maskoffset);

    // Test the mask for non-zero explicitly: bitwise-ANDing the raw byte with a
    // 0/1 comparison result rejected non-zero masks whose lowest bit is clear.
    if (maskMat[maskidx] != 0)
    {
        dstMat[dstidx] = srcMat[srcidx];
    }
}
// Masked copy for single-channel float images: one float per work-item.
// dstMat[y][x] = srcMat[y][x] wherever the mask byte at (y, x) is non-zero.
//
//   srcMat, dstMat   : source / destination elements
//   maskMat          : one mask byte per element; any non-zero value selects it
//   cols, rows       : size of the region to copy, in elements
//   *Step*, *offset* : row stride and start offset used to index each buffer
__kernel void copy_to_with_mask_C1_D5(
        __global const float* restrict srcMat,
        __global float* dstMat,
        __global const uchar* restrict maskMat,
        int cols,
        int rows,
        int srcStep_in_pixel,
        int srcoffset_in_pixel,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        int maskStep,
        int maskoffset)
{
    int x=get_global_id(0);
    int y=get_global_id(1);

    // Work-items outside the image must not touch memory at all; previously the
    // mask byte was loaded before this test, reading past the mask for such items.
    if (x >= cols || y >= rows)
        return;

    int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
    int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
    int maskidx = mad24(y,maskStep,x+ maskoffset);

    // Test the mask for non-zero explicitly: bitwise-ANDing the raw byte with a
    // 0/1 comparison result rejected non-zero masks whose lowest bit is clear.
    if (maskMat[maskidx] != 0)
    {
        dstMat[dstidx] = srcMat[srcidx];
    }
}
__kernel void copy_to_with_mask_C4_D5(
__global const float4* restrict srcMat,
__global float4* dstMat,
__kernel void copy_to_with_mask(
__global const GENTYPE* restrict srcMat,
__global GENTYPE* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
......@@ -198,11 +49,13 @@ __kernel void copy_to_with_mask_C4_D5(
{
int x=get_global_id(0);
int y=get_global_id(1);
x = x< cols ? x: cols-1;
y = y< rows ? y: rows-1;
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
if (mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
......
......@@ -40,24 +40,40 @@ __kernel void set_to_without_mask_C1_D0(uchar scalar,__global uchar * dstMat,
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
int addr_start = mad24(y,dstStep_in_pixel,offset_in_pixel);
int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel);
int idx = mad24(y,dstStep_in_pixel,(int)(x+ offset_in_pixel & (int)0xfffffffc));
//int addr_start = mad24(y,dstStep_in_pixel,offset_in_pixel);
//int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel);
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
uchar4 out;
out.x = out.y = out.z = out.w = scalar;
if ( (idx>=addr_start)&(idx+3 < addr_end) & (y < rows))
if ( (x+3 < cols) && (y < rows)&& ((offset_in_pixel&3) == 0))
{
*(__global uchar4*)(dstMat+idx) = out;
}
else if(y < rows)
else
{
uchar4 temp = *(__global uchar4*)(dstMat+idx);
temp.x = (idx>=addr_start)&(idx < addr_end)? out.x : temp.x;
temp.y = (idx+1>=addr_start)&(idx+1 < addr_end)? out.y : temp.y;
temp.z = (idx+2>=addr_start)&(idx+2 < addr_end)? out.z : temp.z;
temp.w = (idx+3>=addr_start)&(idx+3 < addr_end)? out.w : temp.w;
*(__global uchar4*)(dstMat+idx) = temp;
if((x+3 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
dstMat[idx+2] = out.z;
dstMat[idx+3] = out.w;
}
if((x+2 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
dstMat[idx+2] = out.z;
}
else if((x+1 < cols) && (y < rows))
{
dstMat[idx] = out.x;
dstMat[idx+1] = out.y;
}
else if((x < cols) && (y < rows))
{
dstMat[idx] = out.x;
}
}
}
......
......@@ -33,81 +33,6 @@
// the use of this software, even if advised of the possibility of such damage.
//
//
/*
__kernel void set_to_with_mask_C1_D0(
float4 scalar,
__global uchar* dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = convert_uchar_sat(scalar.x);
}
}
*/
//#pragma OPENCL EXTENSION cl_amd_printf : enable
// Masked fill for single-channel 8-bit data, four consecutive bytes per work-item.
//
// Each work-item loads the 4-aligned uchar4 of dst that contains its first column,
// overwrites with `scalar` only the lanes that lie inside the current row and whose
// mask byte is non-zero, and stores the uchar4 back.
//
//   scalar                 : value written to selected elements
//   dstMat                 : destination buffer
//   cols, rows             : size of the region to fill, in elements
//   dstStep_in_pixel,
//   dstoffset_in_pixel     : row stride / start offset used to index dstMat
//   maskMat                : one mask byte per element; any non-zero value selects it
//   maskStep, maskoffset   : row stride / start offset used to index maskMat
//
// NOTE(review): the three mask vector loads (maskidx-4, maskidx, maskidx+4) are
// issued before the per-lane range checks, so they may touch bytes outside the
// mask row - confirm the mask buffer is padded accordingly.
__kernel void set_to_with_mask_C1_D0(
        uchar scalar,
        __global uchar* dstMat,
        int cols,
        int rows,
        int dstStep_in_pixel,
        int dstoffset_in_pixel,
        __global const uchar * restrict maskMat,
        int maskStep,
        int maskoffset)
{
    int x=get_global_id(0)<<2;
    int y=get_global_id(1);

    // [dst_addr_start, dst_addr_end) is the index range of row y inside dstMat.
    int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
    int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
    // Column index rounded down to a multiple of 4 ('+' binds tighter than '&').
    int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);

    // Same construction for the mask row.
    int mask_addr_start = mad24(y,maskStep,maskoffset);
    int mask_addr_end = mad24(y,maskStep,cols+maskoffset);
    int maskidx = mad24(y,maskStep,x+ maskoffset & (int)0xfffffffc);

    // dst lane k corresponds to mask byte maskidx + (maskoffset&3) - (dstoffset&3) + k.
    // trans_mask[3] below holds mask byte maskidx, hence the +3 bias; the resulting
    // index off_mask + k stays within 0..9.
    int off_mask = (maskoffset & 3) - (dstoffset_in_pixel & 3) +3;

    // When the destination offset is not a multiple of 4 the row spills into one
    // more aligned 4-byte window; widening the bound by the misalignment lets the
    // work-item owning that window run. With the former `x < cols` test the
    // trailing columns of such rows were never written.
    if ( (x < cols + (dstoffset_in_pixel & 3)) && (y < rows) )
    {
        uchar4 temp_dst = *(__global uchar4*)(dstMat+dstidx);
        uchar4 temp_mask1 = *(__global uchar4*)(maskMat+maskidx-4);
        uchar4 temp_mask = *(__global uchar4*)(maskMat+maskidx);
        uchar4 temp_mask2 = *(__global uchar4*)(maskMat+maskidx+4);

        // Mask bytes outside the mask row count as 0 ("do not write").
        temp_mask1.x = (maskidx-4 >=mask_addr_start)&(maskidx-4 < mask_addr_end) ? temp_mask1.x : 0;
        temp_mask1.y = (maskidx-3 >=mask_addr_start)&(maskidx-3 < mask_addr_end) ? temp_mask1.y : 0;
        temp_mask1.z = (maskidx-2 >=mask_addr_start)&(maskidx-2 < mask_addr_end) ? temp_mask1.z : 0;
        temp_mask1.w = (maskidx-1 >=mask_addr_start)&(maskidx-1 < mask_addr_end) ? temp_mask1.w : 0;
        temp_mask.x = (maskidx >=mask_addr_start)&(maskidx < mask_addr_end) ? temp_mask.x : 0;
        temp_mask.y = (maskidx+1 >=mask_addr_start)&(maskidx+1 < mask_addr_end) ? temp_mask.y : 0;
        temp_mask.z = (maskidx+2 >=mask_addr_start)&(maskidx+2 < mask_addr_end) ? temp_mask.z : 0;
        temp_mask.w = (maskidx+3 >=mask_addr_start)&(maskidx+3 < mask_addr_end) ? temp_mask.w : 0;
        temp_mask2.x = (maskidx+4 >=mask_addr_start)&(maskidx+4 < mask_addr_end) ? temp_mask2.x : 0;
        temp_mask2.y = (maskidx+5 >=mask_addr_start)&(maskidx+5 < mask_addr_end) ? temp_mask2.y : 0;
        temp_mask2.z = (maskidx+6 >=mask_addr_start)&(maskidx+6 < mask_addr_end) ? temp_mask2.z : 0;
        temp_mask2.w = (maskidx+7 >=mask_addr_start)&(maskidx+7 < mask_addr_end) ? temp_mask2.w : 0;

        // 10-byte window of mask bytes maskidx-3 .. maskidx+6.
        uchar trans_mask[10] = {temp_mask1.y,temp_mask1.z,temp_mask1.w,temp_mask.x,temp_mask.y,temp_mask.z,temp_mask.w,temp_mask2.x,temp_mask2.y,temp_mask2.z};

        // The mask byte is compared against 0 before being combined with the range
        // tests: ANDing the raw byte with 0/1 results rejected non-zero masks whose
        // lowest bit is clear.
        temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end)& (trans_mask[off_mask] != 0) ? scalar : temp_dst.x;
        temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end)& (trans_mask[off_mask+1] != 0) ? scalar : temp_dst.y;
        temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end)& (trans_mask[off_mask+2] != 0) ? scalar : temp_dst.z;
        temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end)& (trans_mask[off_mask+3] != 0) ? scalar : temp_dst.w;

        // Write all four lanes back, including the ones that were left unchanged.
        *(__global uchar4*)(dstMat+dstidx) = temp_dst;
    }
}
__kernel void set_to_with_mask(
GENTYPE scalar,
__global GENTYPE * dstMat,
......@@ -121,10 +46,12 @@ __kernel void set_to_with_mask(
{
int x=get_global_id(0);
int y=get_global_id(1);
x = x< cols ? x: cols-1;
y = y< rows ? y: rows-1;
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
if (mask)
{
dstMat[dstidx] = scalar;
}
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -339,24 +339,22 @@ inline int divUp(int total, int grain)
void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask, string kernelName)
{
CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols &&
src.rows == dst.rows && src.cols == dst.cols);
src.rows == dst.rows && src.cols == dst.cols
&& mask.type() == CV_8UC1);
vector<pair<size_t , const void *> > args;
int vector_lengths[4][7] = {{4, 4, 2, 2, 1, 1, 1},
{2, 2, 1, 1, 1, 1, 1},
{8, 8, 8, 8 , 4, 4, 4}, //vector length is undefined when channels = 3
{1, 1, 1, 1, 1, 1, 1}
std::string string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"},
{"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"},
{"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
{"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
};
char compile_option[32];
sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.channels()-1][dst.depth()].c_str());
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3];
int vector_length = vector_lengths[dst.channels() -1][dst.depth()];
int offset_cols = divUp(dst.offset, dst.elemSize()) & (vector_length - 1);
int cols = vector_length == 1 ? divUp(dst.cols, vector_length) : divUp(dst.cols + offset_cols, vector_length);
globalThreads[0] = divUp(cols, localThreads[0]) * localThreads[0];
globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0];
globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1];
globalThreads[2] = 1;
......@@ -376,7 +374,7 @@ void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask, strin
args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset ));
openCLExecuteKernel(dst.clCxt , &operator_copyToM, kernelName, globalThreads,
localThreads, args, dst.channels(), dst.depth());
localThreads, args, -1, -1,compile_option);
}
void cv::ocl::oclMat::copyTo( oclMat &m ) const
......@@ -679,10 +677,6 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &
globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
globalThreads[2] = 1;
if(dst.type() == CV_8UC1)
{
globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
}
int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
char compile_option[32];
union sc
......@@ -697,7 +691,7 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &
}val;
switch(dst.depth())
{
case 0:
case CV_8U:
val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
......@@ -716,7 +710,7 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &
CV_Error(CV_StsUnsupportedFormat,"unsupported channels");
}
break;
case 1:
case CV_8S:
val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
......@@ -735,7 +729,7 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &
CV_Error(CV_StsUnsupportedFormat,"unsupported channels");
}
break;
case 2:
case CV_16U:
val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
......@@ -754,7 +748,7 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &
CV_Error(CV_StsUnsupportedFormat,"unsupported channels");
}
break;
case 3:
case CV_16S:
val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
......@@ -773,7 +767,7 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &
CV_Error(CV_StsUnsupportedFormat,"unsupported channels");
}
break;
case 4:
case CV_32S:
val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
......@@ -792,7 +786,7 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &
CV_Error(CV_StsUnsupportedFormat,"unsupported channels");
}
break;
case 5:
case CV_32F:
val.fval.s[0] = scalar.val[0];
val.fval.s[1] = scalar.val[1];
val.fval.s[2] = scalar.val[2];
......@@ -811,7 +805,7 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &
CV_Error(CV_StsUnsupportedFormat,"unsupported channels");
}
break;
case 6:
case CV_64F:
val.dval.s[0] = scalar.val[0];
val.dval.s[1] = scalar.val[1];
val.dval.s[2] = scalar.val[2];
......@@ -872,14 +866,7 @@ oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask)
}
else
{
if(type()==CV_8UC1)
{
set_to_withmask_run(*this, scalar, mask,"set_to_with_mask_C1_D0");
}
else
{
set_to_withmask_run(*this, scalar, mask, "set_to_with_mask");
}
set_to_withmask_run(*this, scalar, mask, "set_to_with_mask");
}
return *this;
......@@ -942,6 +929,11 @@ void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
/* core logic */
_type &= TYPE_MASK;
download_channels = CV_MAT_CN(_type);
if(download_channels==3)
{
_type = CV_MAKE_TYPE((CV_MAT_DEPTH(_type)),4);
}
if( rows == _rows && cols == _cols && type() == _type && data )
return;
if( data )
......@@ -986,6 +978,7 @@ void cv::ocl::oclMat::release()
step = rows = cols = 0;
offset = wholerows = wholecols = 0;
refcount = 0;
download_channels=0;
}
#endif /* !defined (HAVE_OPENCL) */
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -122,7 +122,8 @@ namespace
SURF_OCL_Invoker(SURF_OCL& surf, const oclMat& img, const oclMat& mask) :
surf_(surf),
img_cols(img.cols), img_rows(img.rows),
use_mask(!mask.empty())
use_mask(!mask.empty()),
imgTex(NULL), sumTex(NULL), maskSumTex(NULL)
{
CV_Assert(!img.empty() && img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
......@@ -475,6 +476,11 @@ void SURF_OCL_Invoker::bindImgTex(const oclMat& img)
format.image_channel_data_type = CL_UNSIGNED_INT8;
format.image_channel_order = CL_R;
if(imgTex)
{
openCLFree(imgTex);
}
#if CL_VERSION_1_2
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
......@@ -509,6 +515,12 @@ void SURF_OCL_Invoker::bindSumTex(const oclMat& sum)
int err;
format.image_channel_data_type = CL_UNSIGNED_INT32;
format.image_channel_order = CL_R;
if(sumTex)
{
openCLFree(sumTex);
}
#if CL_VERSION_1_2
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
......@@ -542,6 +554,12 @@ void SURF_OCL_Invoker::bindMaskSumTex(const oclMat& maskSum)
int err;
format.image_channel_data_type = CL_UNSIGNED_INT32;
format.image_channel_order = CL_R;
if(maskSumTex)
{
openCLFree(maskSumTex);
}
#if CL_VERSION_1_2
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
......
This diff is collapsed.
......@@ -54,6 +54,8 @@ IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
// Printable names of the six template-matching methods.
// NOTE(review): presumably indexed by the cv::TM_* method value; confirm against the code that reads it.
const char* TEMPLATE_METHOD_NAMES[6] = {"TM_SQDIFF", "TM_SQDIFF_NORMED", "TM_CCORR", "TM_CCORR_NORMED", "TM_CCOEFF", "TM_CCOEFF_NORMED"};
// Image sizes passed as the size generator to the MatchTemplate test instantiations in this file.
#define MTEMP_SIZES testing::Values(cv::Size(128, 256), cv::Size(1024, 768))
PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMethod)
{
cv::Size size;
......@@ -157,7 +159,7 @@ TEST_P(MatchTemplate32F, Accuracy)
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U,
testing::Combine(
DIFFERENT_SIZES,
MTEMP_SIZES,
testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
testing::Values(Channels(1), Channels(3),Channels(4)),
ALL_TEMPLATE_METHODS
......@@ -165,7 +167,7 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U,
);
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
DIFFERENT_SIZES,
MTEMP_SIZES,
testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
testing::Values(Channels(1), Channels(3),Channels(4)),
testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
......
......@@ -263,22 +263,4 @@ void PrintTo(const Inverse &inverse, std::ostream *os)
else
(*os) << "direct";
}
// Allocates an oclMat of the requested size and type for use in tests.
//
// size   - requested matrix size (width x height).
// type   - OpenCV matrix type passed straight to the oclMat constructor.
// useRoi - when true, a padded matrix is allocated first, with each dimension
//          enlarged by a value obtained from randomInt(5, 15).
//
// Returns an oclMat whose dimensions equal `size`.
//
// NOTE(review): when the padded size differs from `size`, the padded matrix is
// replaced by a fresh allocation of the requested size, so the result is not a
// sub-view of the padded buffer. Confirm whether a real ROI was intended.
cv::ocl::oclMat createMat(cv::Size size,int type,bool useRoi)
{
    cv::Size size0 = size;
    if (useRoi)
    {
        size0.width += randomInt(5, 15);
        size0.height += randomInt(5, 15);
    }
    cv::ocl::oclMat d_m(size0, type);
    if (size0 != size)
    {
        // The three-int constructor is oclMat(rows, cols, type). Passing
        // (width, height) transposed the matrix for non-square sizes, so use
        // the Size overload, which maps height->rows and width->cols.
        d_m = cv::ocl::oclMat(size, type);
    }
    return d_m;
}
// Creates a device matrix matching the host matrix `m` (via createMat, which
// receives `useRoi` unchanged) and uploads the contents of `m` into it.
cv::ocl::oclMat loadMat(const cv::Mat& m, bool useRoi)
{
    // Allocate the device-side matrix with the same size and type as the host one.
    cv::ocl::oclMat deviceMat = ::createMat(m.size(), m.type(), useRoi);

    // Copy the host data to the device before handing the matrix back.
    deviceMat.upload(m);

    return deviceMat;
}
......@@ -237,6 +237,4 @@ void run_perf_test();
IMPLEMENT_PARAM_CLASS(Channels, int)
#endif // IMPLEMENT_PARAM_CLASS
cv::ocl::oclMat createMat(cv::Size size,int type,bool useRoi);
cv::ocl::oclMat loadMat(const cv::Mat& m, bool useRoi);
#endif // __OPENCV_TEST_UTILITY_HPP__
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment