Commit 2e685dcf authored by niko's avatar niko

Performance improvements and bug fixes for resize, erode, dilate, Sobel, and remap

parent a1d8091e
......@@ -4,7 +4,7 @@ if(NOT HAVE_OPENCL)
endif()
set(the_description "OpenCL-accelerated Computer Vision")
ocv_add_module(ocl opencv_core opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree)
ocv_add_module(ocl opencv_core opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_ts)
ocv_module_include_directories()
......
......@@ -325,7 +325,7 @@ PARAM_TEST_CASE(LaplacianTestBase, MatType, int)
ksize = GET_PARAM(1);
cv::RNG& rng = TS::ptr()->get_rng();
cv::Size size = cv::Size(2560, 2560);
cv::Size size = cv::Size(MWIDTH, MHEIGHT);
mat = randomMat(rng, size, type, 5, 16, false);
dst = randomMat(rng, size, type, 5, 16, false);
......@@ -468,7 +468,7 @@ PARAM_TEST_CASE(ErodeDilateBase, MatType, bool)
// iterations = GET_PARAM(1);
cv::RNG& rng = TS::ptr()->get_rng();
cv::Size size = cv::Size(2560, 2560);
cv::Size size = cv::Size(MWIDTH, MHEIGHT);
mat1 = randomMat(rng, size, type, 5, 16, false);
dst = randomMat(rng, size, type, 5, 16, false);
......@@ -679,7 +679,7 @@ PARAM_TEST_CASE(Sobel, MatType, int, int, int, int)
dx = 2; dy=0;
cv::RNG& rng = TS::ptr()->get_rng();
cv::Size size = cv::Size(2560, 2560);
cv::Size size = cv::Size(MWIDTH, MHEIGHT);
mat1 = randomMat(rng, size, type, 5, 16, false);
dst = randomMat(rng, size, type, 5, 16, false);
......@@ -817,7 +817,7 @@ PARAM_TEST_CASE(Scharr, MatType, int, int, int)
dx = 1; dy=0;
cv::RNG& rng = TS::ptr()->get_rng();
cv::Size size = cv::Size(2560, 2560);
cv::Size size = cv::Size(MWIDTH, MHEIGHT);
mat1 = randomMat(rng, size, type, 5, 16, false);
dst = randomMat(rng, size, type, 5, 16, false);
......@@ -956,7 +956,7 @@ PARAM_TEST_CASE(GaussianBlur, MatType, cv::Size, int)
bordertype = GET_PARAM(2);
cv::RNG& rng = TS::ptr()->get_rng();
cv::Size size = cv::Size(2560, 2560);
cv::Size size = cv::Size(MWIDTH, MHEIGHT);
sigma1 = rng.uniform(0.1, 1.0);
sigma2 = rng.uniform(0.1, 1.0);
......
This diff is collapsed.
......@@ -260,7 +260,7 @@ namespace cv
CV_Assert((!map2.data || map2.size()== map1.size()));
dst.create(map1.size(), src.type());
string kernelName;
......@@ -394,8 +394,15 @@ namespace cv
args.push_back( make_pair(sizeof(cl_int),(void*)&map1.cols));
args.push_back( make_pair(sizeof(cl_int),(void*)&map1.rows));
args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
args.push_back( make_pair(sizeof(cl_double4),(void*)&borderValue));
}
if(src.clCxt -> impl -> double_support != 0)
{
args.push_back( make_pair(sizeof(cl_double4),(void*)&borderValue));
}
else
{
args.push_back( make_pair(sizeof(cl_float4),(void*)&borderValue));
}
}
openCLExecuteKernel(clCxt,&imgproc_remap,kernelName,globalThreads,localThreads,args,src.channels(),src.depth());
}
......
......@@ -44,9 +44,9 @@
//M*/
#include "precomp.hpp"
#include "threadsafe.h"
#include "Threadsafe.h"
#include <iomanip>
#include "binarycaching.hpp"
#include "binaryCaching.hpp"
using namespace cv;
using namespace cv::ocl;
......
......@@ -90,9 +90,9 @@ Niko
***********************************************************************************/
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C1_D0
(__global const float * restrict src,
__global uchar * dst,
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
(__global const GENTYPE_SRC * restrict src,
__global GENTYPE_DST * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
......@@ -111,10 +111,10 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_
int start_addr = mad24(y,src_step_in_pixel,x);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
int i;
float sum;
float temp[READ_TIMES_COL];
GENTYPE_SRC sum;
GENTYPE_SRC temp[READ_TIMES_COL];
__local float LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
__local GENTYPE_SRC LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
//read pixels from src
for(i = 0;i<READ_TIMES_COL;i++)
......@@ -141,170 +141,6 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
dst[start_addr] = convert_uchar_sat(sum);
}
}
/*
 * Column-direction filter pass: float4 input, saturated uchar4 output.
 *
 * Each work-group stages LSIZE1*READ_TIMES_COL rows by LSIZE0 columns of src
 * in local memory; every work-item then forms the weighted sum of the
 * 2*RADIUSY+1 vertically adjacent staged values starting at its own row,
 * using the coefficients in mat_kernel.
 *
 * src / dst                     input and output buffers, indexed in pixels
 * dst_cols, dst_rows            only work-items inside this range store a result
 * src_whole_cols/_rows,
 * src_step_in_pixel             used to compute the highest readable address
 * dst_step_in_pixel,
 * dst_offset_in_pixel           row stride and start offset of dst, in pixels
 * mat_kernel                    2*RADIUSY+1 filter coefficients
 */
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C4_D0
(__global const float4 * restrict src,
 __global uchar4 * dst,
 const int dst_cols,
 const int dst_rows,
 const int src_whole_cols,
 const int src_whole_rows,
 const int src_step_in_pixel,
 //const int src_offset_x,
 //const int src_offset_y,
 const int dst_step_in_pixel,
 const int dst_offset_in_pixel,
 __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);
    const int lx = get_local_id(0);
    const int ly = get_local_id(1);

    const int first_addr = mad24(gy, src_step_in_pixel, gx);
    const int limit_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);

    // One extra column of padding per row of the tile.
    __local float4 LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];

    // Stage this work-item's share of the tile; reads at or past limit_addr
    // are redirected to element 0 so they stay inside the buffer.
    for (int r = 0; r < READ_TIMES_COL; ++r)
    {
        int addr = first_addr + r*LSIZE1*src_step_in_pixel;
        if (addr >= limit_addr)
        {
            addr = 0;
        }
        LDS_DAT[ly + r*LSIZE1][lx] = src[addr];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Centre tap first, then the symmetric pairs around it.
    float4 acc = LDS_DAT[ly+RADIUSY][lx]*mat_kernel[RADIUSY];
    for (int d = 1; d <= RADIUSY; ++d)
    {
        const float4 upper = LDS_DAT[ly+RADIUSY-d][lx];
        const float4 lower = LDS_DAT[ly+RADIUSY+d][lx];
        acc += upper * mat_kernel[RADIUSY-d]+lower * mat_kernel[RADIUSY+d];
    }

    // Only work-items that map onto a destination pixel store their result.
    if ((gx < dst_cols) & (gy < dst_rows))
    {
        const int out_addr = mad24(gy, dst_step_in_pixel, gx + dst_offset_in_pixel);
        dst[out_addr] = convert_uchar4_sat(acc);
    }
}
/*
 * Column-direction filter pass: single-channel float input and output.
 *
 * A work-group copies LSIZE1*READ_TIMES_COL rows by LSIZE0 columns of src
 * into local memory. Each work-item then accumulates the 2*RADIUSY+1 staged
 * values stacked vertically from its own row, weighted by mat_kernel, and
 * writes the sum unchanged to dst.
 *
 * dst_cols, dst_rows            bounds for the final store
 * src_whole_cols/_rows,
 * src_step_in_pixel             define the upper address limit for reads
 * dst_step_in_pixel,
 * dst_offset_in_pixel           destination stride and start offset (pixels)
 * mat_kernel                    2*RADIUSY+1 filter coefficients
 */
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C1_D5
(__global const float * restrict src,
 __global float * dst,
 const int dst_cols,
 const int dst_rows,
 const int src_whole_cols,
 const int src_whole_rows,
 const int src_step_in_pixel,
 //const int src_offset_x,
 //const int src_offset_y,
 const int dst_step_in_pixel,
 const int dst_offset_in_pixel,
 __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);
    const int lx = get_local_id(0);
    const int ly = get_local_id(1);

    const int first_addr = mad24(gy, src_step_in_pixel, gx);
    const int limit_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);

    __local float LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];

    // Fill the local tile; any address at or beyond limit_addr falls back to 0.
    for (int r = 0; r < READ_TIMES_COL; ++r)
    {
        int addr = first_addr + r*LSIZE1*src_step_in_pixel;
        if (addr >= limit_addr)
        {
            addr = 0;
        }
        LDS_DAT[ly + r*LSIZE1][lx] = src[addr];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Weighted sum: centre coefficient, then pairs at distance d on each side.
    float acc = LDS_DAT[ly+RADIUSY][lx]*mat_kernel[RADIUSY];
    for (int d = 1; d <= RADIUSY; ++d)
    {
        const float upper = LDS_DAT[ly+RADIUSY-d][lx];
        const float lower = LDS_DAT[ly+RADIUSY+d][lx];
        acc += upper * mat_kernel[RADIUSY-d]+lower * mat_kernel[RADIUSY+d];
    }

    if ((gx < dst_cols) & (gy < dst_rows))
    {
        const int out_addr = mad24(gy, dst_step_in_pixel, gx + dst_offset_in_pixel);
        dst[out_addr] = acc;
    }
}
// Column-direction filter pass: float4 input, float4 output.
// Stages a tile of src in local memory, then each work-item sums the
// 2*RADIUSY+1 vertically adjacent staged values weighted by mat_kernel.
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C4_D5
(__global const float4 * restrict src,
__global float4 * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
//const int src_offset_x,
//const int src_offset_y,
const int dst_step_in_pixel,
const int dst_offset_in_pixel,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
// First source element for this work-item and the upper bound for valid reads.
int start_addr = mad24(y,src_step_in_pixel,x);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
int i;
float4 sum;
float4 temp[READ_TIMES_COL];
// Local tile; each row carries one extra padding column.
__local float4 LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
//read pixels from src
for(i = 0;i<READ_TIMES_COL;i++)
{
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
// Addresses at or past end_addr are redirected to element 0.
current_addr = current_addr < end_addr ? current_addr : 0;
temp[i] = src[current_addr];
}
//save pixels to lds
for(i = 0;i<READ_TIMES_COL;i++)
{
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
}
// All work-items must finish writing the tile before any of them reads it.
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
// Centre tap first, then symmetric pairs at distance i above and below.
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
for(i=1;i<=RADIUSY;i++)
{
// temp[] is reused here as scratch for the two taps.
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
// NOTE(review): the two consecutive stores below look like the removed and
// added sides of a diff hunk shown together; confirm which one belongs to
// this kernel (convert_to_DST is not defined in the visible text).
dst[start_addr] = sum;
dst[start_addr] = convert_to_DST(sum);
}
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma OPENCL FP_CONTRACT ON
#define UCHAR_MIN 0
/*
 * Dilation (windowed maximum) for four-channel float pixels, one output
 * pixel per work-item.
 *
 * The ksX x ksY window is anchored at (anX, anY). A source pixel contributes
 * only if its structuring-element entry in mat_kernel is non-zero and its
 * coordinates lie inside [mincols, maxcols] x [minrows, maxrows]; every other
 * position contributes -FLT_MAX.
 *
 * srcOffset/dstOffset, srcStep/dstStep   start offsets and row strides, in pixels
 * cols, rows                             extent of the region that is written
 * src_whole_cols, src_whole_rows         used to bound the read address
 */
__kernel void dilate_C4_D5(__global const float4 * restrict src, __global float4 *dst, int srcOffset, int dstOffset,
                           int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
                           int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
    const int px = get_global_id(0);
    const int py = get_global_id(1);
    const int limit = mad24(src_whole_rows-1, srcStep, src_whole_cols);
    const float4 lowest = (float4)(-FLT_MAX);

    float4 best = lowest;
    for (int wy = 0; wy < ksY; ++wy)
    {
        const int sy = py - anY + wy;
        for (int wx = 0; wx < ksX; ++wx)
        {
            const int sx = px - anX + wx;

            // Addresses outside (0, limit) are redirected to element 0; the
            // value read there is discarded by the range test below when the
            // coordinates are out of range.
            int addr = mad24(sy, srcStep, sx) + srcOffset;
            if (addr <= 0 || addr >= limit)
            {
                addr = 0;
            }
            const float4 pix = src[addr];

            const uchar active = mat_kernel[wy*ksX + wx];
            const int usable = (sx >= mincols & sx <= maxcols & sy >= minrows & sy <= maxrows & active != 0);
            best = max(best, usable ? pix : lowest);
        }
    }

    if (px < cols && py < rows)
    {
        dst[py * dstStep + px + dstOffset] = best;
    }
}
// Dilation (windowed maximum) for single-channel float data.
// Each work-item produces four horizontally adjacent outputs, held in the
// lanes of a float4; src is read as float4 vectors.
__kernel void dilate_C1_D5(__global float4 * src, __global float *dst, int srcOffset, int dstOffset,
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
// Shift left by (dstOffset&3) so that mX + dstOffset is a multiple of 4;
// mX can therefore be negative for the first work-item of a row.
int mX = (get_global_id(0)<<2) - (dstOffset&3);
int mY = get_global_id(1);
// Top-left source coordinate of the window for lane x.
int kX = mX - anX, kY = mY - anY;
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
float4 maxVal = (float4)(-FLT_MAX);
// Running index into mat_kernel, advanced once per window position.
int k=0;
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
{
for(int j=0;j<ksX;j++, kX++)
{
// Element address of the first of four consecutive floats; addresses
// outside (0, end_addr) are redirected to element 0.
int start = mad24(kY,srcStep,kX) + srcOffset;
start = ((start < end_addr) && (start > 0)) ? start : 0;
int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
// Load the two float4 vectors that cover elements start..start+3, then
// pick the four floats beginning at offset (start & 3) within them.
float8 sVal = (float8)(src[start>>2], src[start2>>2]);
float sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
int det = start & 3;
float4 v=(float4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);
uchar now = mat_kernel[k++];
// Row range and structuring element apply to all four lanes at once;
// falling back to maxVal leaves the running maximum unchanged.
float4 flag = (kY >= minrows & kY <= maxrows & now != 0) ? v : maxVal;
// The column range is tested per lane because each lane has its own x.
flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : -FLT_MAX;
flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : -FLT_MAX;
flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : -FLT_MAX;
flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : -FLT_MAX;
maxVal = max(maxVal , flag);
}
}
if(mY < rows && mX < cols)
{
// Read-modify-write: lanes whose column is outside [0, cols) keep the
// value already stored in dst.
__global float4* d = (__global float4*)(dst + mY * dstStep + mX + dstOffset);
float4 dVal = *d;
maxVal.x = (mX >=0 & mX < cols) ? maxVal.x : dVal.x;
maxVal.y = (mX+1 >=0 & mX+1 < cols) ? maxVal.y : dVal.y;
maxVal.z = (mX+2 >=0 & mX+2 < cols) ? maxVal.z : dVal.z;
maxVal.w = (mX+3 >=0 & mX+3 < cols) ? maxVal.w : dVal.w;
*d = (maxVal);
}
}
/*
 * Dilation (windowed maximum) for single-channel 8-bit data.
 * Each work-item produces four horizontally adjacent outputs held in the
 * lanes of a uchar4; src is read as uchar4 vectors.
 *
 * A source pixel contributes only if its mat_kernel entry is non-zero, its
 * row lies in [minrows, maxrows] and its column lies in [mincols, maxcols].
 *
 * The final store is skipped for work-items with mX >= cols. For those, all
 * four lanes would be replaced by the value just read from dst, so the
 * read-modify-write changes nothing and only touches memory past the written
 * region. The guard matches the one in dilate_C1_D5.
 */
__kernel void dilate_C1_D0(__global const uchar4 * restrict src, __global uchar *dst, int srcOffset, int dstOffset,
                           int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
                           int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
    // Shift left by (dstOffset&3) so that mX + dstOffset is a multiple of 4;
    // mX may be negative for the first work-item of a row.
    int mX = (get_global_id(0)<<2) - (dstOffset&3);
    int mY = get_global_id(1);
    int kX = mX - anX, kY = mY - anY;
    int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
    uchar4 maxVal = (uchar4)(UCHAR_MIN);
    int k=0;

    for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
    {
        for(int j=0;j<ksX;j++, kX++)
        {
            // Element address of the first of four consecutive bytes; addresses
            // outside (0, end_addr) are redirected to element 0.
            int start = mad24(kY,srcStep,kX) + srcOffset;
            start = ((start < end_addr) && (start > 0)) ? start : 0;
            int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;

            // Load the two uchar4 vectors covering start..start+3 and pick the
            // four bytes beginning at offset (start & 3).
            uchar8 sVal = (uchar8)(src[start>>2], src[start2>>2]);
            uchar sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
            int det = start & 3;
            uchar4 v=(uchar4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);

            // '&' does not short-circuit, so k advances on every iteration.
            uchar4 flag = (kY >= minrows & kY <= maxrows & mat_kernel[k++] != 0) ? v : maxVal;
            flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : UCHAR_MIN;
            flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : UCHAR_MIN;
            flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : UCHAR_MIN;
            flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : UCHAR_MIN;
            maxVal = max(maxVal , flag);
        }
    }

    if(mY < rows && mX < cols)
    {
        // Read-modify-write: lanes outside [0, cols) keep what dst already holds.
        __global uchar4* d = (__global uchar4*)(dst + mY * dstStep + mX + dstOffset);
        uchar4 dVal = *d;
        maxVal.x = (mX >=0 & mX < cols) ? maxVal.x : dVal.x;
        maxVal.y = (mX+1 >=0 & mX+1 < cols) ? maxVal.y : dVal.y;
        maxVal.z = (mX+2 >=0 & mX+2 < cols) ? maxVal.z : dVal.z;
        maxVal.w = (mX+3 >=0 & mX+3 < cols) ? maxVal.w : dVal.w;
        *d = (maxVal);
    }
}
/*
 * Dilation (windowed maximum) for four-channel 8-bit pixels, one output
 * pixel per work-item.
 *
 * The ksX x ksY window is anchored at (anX, anY). A source pixel is taken
 * into the maximum only when its mat_kernel entry is non-zero and its
 * coordinates lie inside [mincols, maxcols] x [minrows, maxrows]; otherwise
 * the running maximum is left as it is.
 */
__kernel void dilate_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, int srcOffset, int dstOffset,
                           int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
                           int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
    const int px = get_global_id(0);
    const int py = get_global_id(1);
    const int limit = mad24(src_whole_rows-1, srcStep, src_whole_cols);

    uchar4 best = (uchar4)(UCHAR_MIN);
    for (int wy = 0; wy < ksY; ++wy)
    {
        const int sy = py - anY + wy;
        for (int wx = 0; wx < ksX; ++wx)
        {
            const int sx = px - anX + wx;

            // Addresses outside (0, limit) are redirected to element 0.
            int addr = mad24(sy, srcStep, sx) + srcOffset;
            if (addr <= 0 || addr >= limit)
            {
                addr = 0;
            }
            const uchar4 pix = src[addr];

            const uchar active = mat_kernel[wy*ksX + wx];
            const int usable = (sx >= mincols & sx <= maxcols & sy >= minrows & sy <= maxrows & active != 0);
            best = max(best, usable ? pix : best);
        }
    }

    if (px < cols && py < rows)
    {
        dst[py * dstStep + px + dstOffset] = best;
    }
}
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Zero Lin, zero.lin@amd.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
/*
 * Erosion (windowed minimum) for four-channel float pixels, one output pixel
 * per work-item.
 *
 * The ksX x ksY window is anchored at (anX, anY). A source pixel contributes
 * only if its mat_kernel entry is non-zero and its coordinates lie inside
 * [mincols, maxcols] x [minrows, maxrows]; every other position contributes
 * FLT_MAX.
 *
 * FLT_MAX replaces the former literal 3.4e+38. That literal is smaller than
 * FLT_MAX, so it would win the min() against larger finite inputs, and it is
 * an unsuffixed (double-typed) constant. FLT_MAX also mirrors the -FLT_MAX
 * sentinel used by the dilate kernels.
 */
__kernel void erode_C4_D5(__global const float4 * restrict src, __global float4 *dst, int srcOffset, int dstOffset,
                          int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
                          int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
    int mX = get_global_id(0);
    int mY = get_global_id(1);
    int kX = mX - anX, kY = mY - anY;
    int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
    const float4 highest = (float4)(FLT_MAX);
    float4 minVal = highest;
    int k=0;

    for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
    {
        for(int j=0;j<ksX; j++, kX++)
        {
            // Addresses outside (0, end_addr) are redirected to element 0.
            int current_addr = mad24(kY,srcStep,kX) + srcOffset;
            current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
            float4 v = src[current_addr];
            uchar now = mat_kernel[k++];
            float4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : highest;
            minVal = min(minVal , flag);
        }
    }

    if(mX < cols && mY < rows)
        dst[mY * dstStep + mX + dstOffset] = (minVal);
}
/*
 * Erosion (windowed minimum) for single-channel float data.
 * Each work-item produces four horizontally adjacent outputs held in the
 * lanes of a float4; src is read as float4 vectors.
 *
 * A source pixel contributes only if its mat_kernel entry is non-zero, its
 * row lies in [minrows, maxrows] and its column lies in [mincols, maxcols];
 * every other position contributes FLT_MAX.
 *
 * FLT_MAX replaces the former literal 3.4e+38, which is smaller than FLT_MAX
 * (so larger finite inputs were clamped to it) and is an unsuffixed
 * (double-typed) constant. FLT_MAX mirrors the -FLT_MAX sentinel used by the
 * dilate kernels.
 */
__kernel void erode_C1_D5(__global float4 * src, __global float *dst, int srcOffset, int dstOffset,
                          int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
                          int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
    // Shift left by (dstOffset&3) so that mX + dstOffset is a multiple of 4;
    // mX may be negative for the first work-item of a row.
    int mX = (get_global_id(0)<<2) - (dstOffset&3);
    int mY = get_global_id(1);
    int kX = mX - anX, kY = mY - anY;
    int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
    float4 minVal = (float4)(FLT_MAX);
    int k=0;

    for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
    {
        for(int j=0;j<ksX;j++, kX++)
        {
            // Element address of the first of four consecutive floats; addresses
            // outside (0, end_addr) are redirected to element 0.
            int start = mad24(kY,srcStep,kX) + srcOffset;
            start = ((start < end_addr) && (start > 0)) ? start : 0;
            int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;

            // Load the two float4 vectors covering start..start+3 and pick the
            // four floats beginning at offset (start & 3).
            float8 sVal = (float8)(src[start>>2], src[start2>>2]);
            float sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
            int det = start & 3;
            float4 v=(float4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);

            uchar now = mat_kernel[k++];
            float4 flag = (kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(FLT_MAX);
            // The column range is tested per lane because each lane has its own x.
            flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : FLT_MAX;
            flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : FLT_MAX;
            flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : FLT_MAX;
            flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : FLT_MAX;
            minVal = min(minVal , flag);
        }
    }

    if(mY < rows && mX < cols)
    {
        // Read-modify-write: lanes outside [0, cols) keep what dst already holds.
        __global float4* d = (__global float4*)(dst + mY * dstStep + mX + dstOffset);
        float4 dVal = *d;
        minVal.x = (mX >=0 & mX < cols) ? minVal.x : dVal.x;
        minVal.y = (mX+1 >=0 & mX+1 < cols) ? minVal.y : dVal.y;
        minVal.z = (mX+2 >=0 & mX+2 < cols) ? minVal.z : dVal.z;
        minVal.w = (mX+3 >=0 & mX+3 < cols) ? minVal.w : dVal.w;
        *d = (minVal);
    }
}
/*
 * Erosion (windowed minimum) for single-channel 8-bit data.
 * Each work-item produces four horizontally adjacent outputs held in the
 * lanes of a uchar4; src is read as uchar4 vectors.
 *
 * A source pixel contributes only if its mat_kernel entry is non-zero, its
 * row lies in [minrows, maxrows] and its column lies in [mincols, maxcols];
 * every other position contributes 0xff.
 *
 * The final store is skipped for work-items with mX >= cols. For those, all
 * four lanes would be replaced by the value just read from dst, so the
 * read-modify-write changes nothing and only touches memory past the written
 * region. The guard matches the one in erode_C1_D5.
 */
__kernel void erode_C1_D0(__global const uchar4 * restrict src, __global uchar *dst, int srcOffset, int dstOffset,
                          int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
                          int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
    // Shift left by (dstOffset&3) so that mX + dstOffset is a multiple of 4;
    // mX may be negative for the first work-item of a row.
    int mX = (get_global_id(0)<<2) - (dstOffset&3);
    int mY = get_global_id(1);
    int kX = mX - anX, kY = mY - anY;
    int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
    uchar4 minVal = (uchar4)(0xff);
    int k=0;

    for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
    {
        for(int j=0;j<ksX;j++, kX++)
        {
            // Element address of the first of four consecutive bytes; addresses
            // outside (0, end_addr) are redirected to element 0.
            int start = mad24(kY,srcStep,kX) + srcOffset;
            start = ((start < end_addr) && (start > 0)) ? start : 0;
            int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;

            // Load the two uchar4 vectors covering start..start+3 and pick the
            // four bytes beginning at offset (start & 3).
            uchar8 sVal = (uchar8)(src[start>>2], src[start2>>2]);
            uchar sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
            int det = start & 3;
            uchar4 v=(uchar4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);

            // '&' does not short-circuit, so k advances on every iteration.
            uchar4 flag = (kY >= minrows & kY <= maxrows & mat_kernel[k++] != 0) ? v : (uchar4)(0xff);
            flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : 0xff;
            flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : 0xff;
            flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : 0xff;
            flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : 0xff;
            minVal = min(minVal , flag);
        }
    }

    if(mY < rows && mX < cols)
    {
        // Read-modify-write: lanes outside [0, cols) keep what dst already holds.
        __global uchar4* d = (__global uchar4*)(dst + mY * dstStep + mX + dstOffset);
        uchar4 dVal = *d;
        minVal.x = (mX >=0 & mX < cols) ? minVal.x : dVal.x;
        minVal.y = (mX+1 >=0 & mX+1 < cols) ? minVal.y : dVal.y;
        minVal.z = (mX+2 >=0 & mX+2 < cols) ? minVal.z : dVal.z;
        minVal.w = (mX+3 >=0 & mX+3 < cols) ? minVal.w : dVal.w;
        *d = (minVal);
    }
}
/*
 * Erosion (windowed minimum) for four-channel 8-bit pixels, one output pixel
 * per work-item.
 *
 * The ksX x ksY window is anchored at (anX, anY). A source pixel contributes
 * only when its mat_kernel entry is non-zero and its coordinates lie inside
 * [mincols, maxcols] x [minrows, maxrows]; every other position contributes
 * 0xff in all channels.
 */
__kernel void erode_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, int srcOffset, int dstOffset,
                          int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
                          int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
    const int px = get_global_id(0);
    const int py = get_global_id(1);
    const int limit = mad24(src_whole_rows-1, srcStep, src_whole_cols);

    uchar4 best = (uchar4)(0xff);
    for (int wy = 0; wy < ksY; ++wy)
    {
        const int sy = py - anY + wy;
        for (int wx = 0; wx < ksX; ++wx)
        {
            const int sx = px - anX + wx;

            // Addresses outside (0, limit) are redirected to element 0.
            int addr = mad24(sy, srcStep, sx) + srcOffset;
            if (addr <= 0 || addr >= limit)
            {
                addr = 0;
            }
            const uchar4 pix = src[addr];

            const uchar active = mat_kernel[wy*ksX + wx];
            const int usable = (sx >= mincols & sx <= maxcols & sy >= minrows & sy <= maxrows & active != 0);
            best = min(best, usable ? pix : (uchar4)(0xff));
        }
    }

    if (px < cols && py < rows)
    {
        dst[py * dstStep + px + dstOffset] = best;
    }
}
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Zero Lin, zero.lin@amd.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
// MORPH_OP is the per-pixel reduction: min when ERODE is defined, max when
// DILATE is defined.
#ifdef ERODE
#define MORPH_OP(A,B) min((A),(B))
#endif
#ifdef DILATE
#define MORPH_OP(A,B) max((A),(B))
#endif
//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii
// ELEM yields elem1 when i lies outside [l_edge, r_edge) and elem2 otherwise.
// The whole expansion is parenthesised so the macro can be used safely inside
// a larger expression (e.g. 2 * ELEM(...)); the bitwise '|' is kept so both
// comparisons are always evaluated.
#define ELEM(i,l_edge,r_edge,elem1,elem2) ((((i) < (l_edge)) | ((i) >= (r_edge))) ? (elem1) : (elem2))
#ifndef GENTYPE
/*
 * Erode/dilate for single-channel 8-bit data; the operation is chosen by
 * MORPH_OP. Each work-item produces four horizontally adjacent output pixels.
 *
 * A work-group first stages the source window it needs (its LSIZE0*4 x LSIZE1
 * output tile plus RADIUSX / RADIUSY of border) in local memory as uchar4
 * groups, two groups per work-item. Positions outside the whole source image
 * are replaced by VAL. Each work-item then reduces its (2*RADIUSX+1) x
 * (2*RADIUSY+1) neighbourhood with MORPH_OP wherever mat_kernel is non-zero.
 *
 * src_offset_x/_y       position of the processed region inside the whole source
 * cols, rows            size of the region written to dst
 * src_step_in_pixel,
 * dst_step_in_pixel     row strides, in pixels
 * mat_kernel            (2*RADIUSY+1) x (2*RADIUSX+1) structuring element, row-major
 * src_whole_cols/_rows  size of the whole source image
 * dst_offset_in_pixel   start offset of dst, in pixels
 *
 * Fix: the fast-path test used to read `dst_offset_in_pixel&3==0`, which
 * parses as `dst_offset_in_pixel & (3==0)` and is always false, so the
 * vector store was never taken. The test now checks the alignment of the
 * actual output address.
 */
__kernel void morph_C1_D0(__global const uchar * restrict src,
                          __global uchar *dst,
                          int src_offset_x, int src_offset_y,
                          int cols, int rows,
                          int src_step_in_pixel, int dst_step_in_pixel,
                          __constant uchar * mat_kernel,
                          int src_whole_cols, int src_whole_rows,
                          int dst_offset_in_pixel)
{
    int l_x = get_local_id(0);
    int l_y = get_local_id(1);
    int x = get_group_id(0)*4*LSIZE0;
    int y = get_group_id(1)*LSIZE1;

    // Window columns rounded down to 4-pixel groups; width is the number of
    // uchar4 groups per staged row.
    int start_x = (x+src_offset_x-RADIUSX) & 0xfffffffc;
    int end_x = (x + src_offset_x+LSIZE0*4+RADIUSX) & 0xfffffffc;
    int width = (end_x -start_x+4)>>2;
    // Byte position of the window's left edge inside its first 4-pixel group.
    int offset = (src_offset_x-RADIUSX) & 3;
    int start_y = y+src_offset_y-RADIUSY;

    // Every work-item stages two uchar4 groups: linear slots point1 and point2.
    int point1 = mad24(l_y,LSIZE0,l_x);
    int point2 = point1 + LSIZE0*LSIZE1;
    int tl_x = (point1 % width)<<2;
    int tl_y = point1 / width;
    int tl_x2 = (point2 % width)<<2;
    int tl_y2 = point2 / width;
    int cur_x = start_x + tl_x;
    int cur_y = start_y + tl_y;
    int cur_x2 = start_x + tl_x2;
    int cur_y2 = start_y + tl_y2;
    int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
    int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
    uchar4 temp0,temp1;
    __local uchar4 LDS_DAT[2*LSIZE1*LSIZE0];
    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);

    //read pixels from src
    // Addresses outside (0, end_addr) are redirected to element 0.
    start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
    start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
    temp0 = *(__global uchar4*)&src[start_addr];
    temp1 = *(__global uchar4*)&src[start_addr2];

    //judge if read out of boundary
    temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x);
    temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y);
    temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z);
    temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w);
    temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0);
    temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x);
    temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y);
    temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z);
    temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w);
    temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1);

    LDS_DAT[point1] = temp0;
    LDS_DAT[point2] = temp1;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Reduce the neighbourhood; vload4 on a byte pointer fetches four
    // consecutive pixels starting at an arbitrary byte offset in the tile.
    uchar4 res = (uchar4)VAL;
    for(int i=0;i<2*RADIUSY+1;i++)
        for(int j=0;j<2*RADIUSX+1;j++)
        {
            res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j)):res;
        }

    int gidx = get_global_id(0)<<2;
    int gidy = get_global_id(1);
    int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);

    if(gidy<rows)
    {
        if(gidx+3<cols && ((out_addr & 3) == 0))
        {
            // All four pixels are in range and the address is a multiple of 4
            // bytes from dst (assumes dst itself is 4-byte aligned): one store.
            *(__global uchar4*)&dst[out_addr] = res;
        }
        else if(gidx+3<cols)
        {
            dst[out_addr] = res.x;
            dst[out_addr+1] = res.y;
            dst[out_addr+2] = res.z;
            dst[out_addr+3] = res.w;
        }
        else if(gidx+2<cols)
        {
            dst[out_addr] = res.x;
            dst[out_addr+1] = res.y;
            dst[out_addr+2] = res.z;
        }
        else if(gidx+1<cols)
        {
            dst[out_addr] = res.x;
            dst[out_addr+1] = res.y;
        }
        else if(gidx<cols)
        {
            dst[out_addr] = res.x;
        }
    }
}
#else
/*
 * Generic erode/dilate for pixels of type GENTYPE; the operation is chosen by
 * MORPH_OP. One output pixel per work-item.
 *
 * A work-group stages the source window it needs (its LSIZE0 x LSIZE1 output
 * tile plus RADIUSX / RADIUSY of border) in local memory, two pixels per
 * work-item, substituting VAL for positions outside the whole source image.
 * Each work-item then reduces its (2*RADIUSX+1) x (2*RADIUSY+1) neighbourhood
 * with MORPH_OP wherever mat_kernel is non-zero.
 *
 * src_offset_x/_y       position of the processed region inside the whole source
 * cols, rows            size of the region written to dst
 * src_step_in_pixel,
 * dst_step_in_pixel     row strides, in pixels
 * mat_kernel            (2*RADIUSY+1) x (2*RADIUSX+1) structuring element, row-major
 * src_whole_cols/_rows  size of the whole source image
 * dst_offset_in_pixel   start offset of dst, in pixels
 */
__kernel void morph(__global const GENTYPE * restrict src,
                    __global GENTYPE *dst,
                    int src_offset_x, int src_offset_y,
                    int cols, int rows,
                    int src_step_in_pixel, int dst_step_in_pixel,
                    __constant uchar * mat_kernel,
                    int src_whole_cols, int src_whole_rows,
                    int dst_offset_in_pixel)
{
    const int lx = get_local_id(0);
    const int ly = get_local_id(1);

    // Top-left corner of the staged window in whole-image coordinates.
    const int origin_x = get_group_id(0)*LSIZE0 + src_offset_x - RADIUSX;
    const int origin_y = get_group_id(1)*LSIZE1 + src_offset_y - RADIUSY;
    const int last_x   = get_group_id(0)*LSIZE0 + src_offset_x + LSIZE0 + RADIUSX;
    const int tile_w   = last_x - origin_x + 1;

    // Two linear slots of the local tile are filled by every work-item.
    const int slot_a = mad24(ly, LSIZE0, lx);
    const int slot_b = slot_a + LSIZE0*LSIZE1;

    const int ax = origin_x + slot_a % tile_w;
    const int ay = origin_y + slot_a / tile_w;
    const int bx = origin_x + slot_b % tile_w;
    const int by = origin_y + slot_b / tile_w;

    __local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0];
    const int limit = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);

    // Addresses outside (0, limit) are redirected to element 0.
    int addr_a = mad24(ay, src_step_in_pixel, ax);
    int addr_b = mad24(by, src_step_in_pixel, bx);
    if (addr_a <= 0 || addr_a >= limit)
    {
        addr_a = 0;
    }
    if (addr_b <= 0 || addr_b >= limit)
    {
        addr_b = 0;
    }
    GENTYPE pix_a = src[addr_a];
    GENTYPE pix_b = src[addr_b];

    // Positions outside the whole image take the border value VAL.
    if ((ax < 0) | (ax >= src_whole_cols) | (ay < 0) | (ay >= src_whole_rows))
    {
        pix_a = (GENTYPE)VAL;
    }
    if ((bx < 0) | (bx >= src_whole_cols) | (by < 0) | (by >= src_whole_rows))
    {
        pix_b = (GENTYPE)VAL;
    }

    LDS_DAT[slot_a] = pix_a;
    LDS_DAT[slot_b] = pix_b;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Reduce the neighbourhood selected by the structuring element.
    GENTYPE acc = (GENTYPE)VAL;
    for (int ky = 0; ky < 2*RADIUSY+1; ++ky)
    {
        for (int kx = 0; kx < 2*RADIUSX+1; ++kx)
        {
            if (mat_kernel[ky*(2*RADIUSX+1)+kx])
            {
                acc = MORPH_OP(acc, LDS_DAT[mad24(ly+ky, tile_w, lx+kx)]);
            }
        }
    }

    const int out_x = get_global_id(0);
    const int out_y = get_global_id(1);
    if (out_x < cols && out_y < rows)
    {
        dst[mad24(out_y, dst_step_in_pixel, out_x + dst_offset_in_pixel)] = acc;
    }
}
#endif
This diff is collapsed.
......@@ -588,6 +588,13 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern
sprintf(compile_option, "-D GENTYPE=int");
args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
break;
case 2:
sprintf(compile_option, "-D GENTYPE=int2");
cl_int2 i2val;
i2val.s[0] = val.ival.s[0];
i2val.s[1] = val.ival.s[1];
args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=int4");
args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));
......
......@@ -44,7 +44,7 @@
//M*/
#include "precomp.hpp"
#include "threadsafe.h"
#include "Threadsafe.h"
CriticalSection::CriticalSection()
{
......
......@@ -958,7 +958,7 @@ TEST_P(Remap, Mat)
if((interpolation == 1 && map1Type == CV_16SC2) ||(interpolation == 1 && map1Type == CV_16SC1 && map2Type == CV_16SC1))
{
cout << "LINEAR don't support the map1Type and map2Type" << endl;
return;
return;
}
int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment