Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
81c6adb9
Commit
81c6adb9
authored
Jun 18, 2012
by
Marina Kolpakova
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
resize area with block scan
parent
9a9f212d
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
158 additions
and
32 deletions
+158
-32
resize.cu
modules/gpu/src/cuda/resize.cu
+145
-19
test_resize.cpp
modules/gpu/test/test_resize.cpp
+13
-13
No files found.
modules/gpu/src/cuda/resize.cu
View file @
81c6adb9
...
@@ -485,35 +485,134 @@ namespace cv { namespace gpu { namespace device
...
@@ -485,35 +485,134 @@ namespace cv { namespace gpu { namespace device
}
}
}
}
template <typename T>
enum ScanKind { exclusive, inclusive } ;
__global__ void resize_area_scan_x(const DevMem2D_<T> src, DevMem2D_<T> dst, int fx, int fy, DevMem2D_<typename scan_traits<T>::scan_line_type> buffer)
template <ScanKind Kind , class T>
__device__ __forceinline__ T scan_warp ( volatile T *ptr , const unsigned int idx = threadIdx.x )
{
{
typedef typename scan_traits<T>::scan_line_type W;
const unsigned int lane = idx & 31;
extern __shared__ W line[];
scan_x(src,fx,fy, buffer,line, 0);
if ( lane >= 1) ptr [idx ] = ptr [idx - 1] + ptr [idx];
if ( lane >= 2) ptr [idx ] = ptr [idx - 2] + ptr [idx];
if ( lane >= 4) ptr [idx ] = ptr [idx - 4] + ptr [idx];
if ( lane >= 8) ptr [idx ] = ptr [idx - 8] + ptr [idx];
if ( lane >= 16) ptr [idx ] = ptr [idx - 16] + ptr [idx];
if( Kind == inclusive )
return ptr [idx ];
else
return (lane > 0) ? ptr [idx - 1] : 0;
}
}
template <
typename
T>
template <
ScanKind Kind , class
T>
__
global__ void resize_area_scan_y(const DevMem2D_<T> src, DevMem2D_<T> dst, int fx, int fy, DevMem2D_<typename scan_traits<T>::scan_line_type> buffe
r)
__
device__ __forceinline__ T scan_block( volatile T *pt
r)
{
{
typedef typename scan_traits<T>::scan_line_type W;
const unsigned int idx = threadIdx.x;
extern __shared__ W line[];
const unsigned int lane = idx & 31;
scan_y(buffer,fx, fy, dst, line, 0);
const unsigned int warp = idx >> 5;
T val = scan_warp <Kind>( ptr , idx );
__syncthreads ();
if( lane == 31 )
ptr [ warp ] = ptr [idx ];
__syncthreads ();
if( warp == 0 )
scan_warp<inclusive>( ptr , idx );
__syncthreads ();
if ( warp > 0)
val = ptr [warp -1] + val;
__syncthreads ();
ptr[idx] = val;
__syncthreads ();
return val ;
}
}
template <typename T> struct InterAreaDispatcherStream
template<typename T, typename W>
__global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines)
{
{
static void call(const DevMem2D_<T> src, int fx, int fy, DevMem2D_<T> dst, DevMem2D_<typename scan_traits<T>::scan_line_type> buffer, cudaStream_t stream)
extern __shared__ W sbuf[];
const unsigned int tid = threadIdx. x;
// load line-block on shared memory
int y = blockIdx.x / thred_lines;
int input_stride = (blockIdx.x - y * thred_lines) * blockDim.x;
int x = input_stride + tid;
// store global data in shared memory
sbuf[tid] = src(y, x);
__syncthreads();
scan_block<inclusive, W>(sbuf);
float scale = __fdividef(1.f, fx);
int out_stride = input_stride / fx;
int count = blockDim.x / fx;
if (tid < count)
{
{
resize_area_scan_x<T><<<src.rows, (src.cols >> 1), src.cols * sizeof(typename scan_traits<T>::scan_line_type) >>>(src, dst, fx, fy, buffer);
int start_idx = (tid == 0)? 0 : tid * fx - 1;
int end_idx = tid * fx + fx - 1;
resize_area_scan_y<T><<<dst.cols, (src.rows >> 1), src.rows * sizeof(typename scan_traits<T>::scan_line_type) >>>(src, dst, fx, fy, buffer)
;
W start = (tid == 0)? (W)0:sbuf[start_idx]
;
cudaSafeCall( cudaGetLastError() )
;
W end = sbuf[end_idx]
;
if (stream == 0)
if (blockIdx.x == 0)
cudaSafeCall( cudaDeviceSynchronize() );
printf("%d~~~~~~~~ start_idx %d, end_idx %d, start %f, end %f\n",
tid, start_idx, end_idx, start, end);
dst(y, out_stride + tid) = (end - start);
}
}
template<typename T, typename W>
__global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines)
{
extern __shared__ W sbuf[];
const unsigned int tid = threadIdx. x;
// load line-block on shared memory
int x = blockIdx.x / thred_lines;
int global_stride = (blockIdx.x % thred_lines) * blockDim.x;
if (!tid) printf("STRIDE : %d", global_stride);
int y = global_stride + tid;
// store global data in shared memory
sbuf[tid] = src(y, x);
__syncthreads();
scan_block<inclusive, W>(sbuf);
float scale = __fdividef(1.f, fx * fy);
int out_stride = global_stride / fx;
int count = blockDim.x / fx;
if (tid < count)
{
int start_idx = (tid == 0)? 0 : tid * fx - 1;
int end_idx = tid * fx + fx - 1;
W start = (tid == 0)? (W)0:sbuf[start_idx];
W end = sbuf[end_idx];
if (blockIdx.x == 0)
printf("!!!!!!!!%d~~~~~~~~ start_idx %d, end_idx %d, start %f, end %f\n",
tid, start_idx, end_idx, start, end);
dst(out_stride + tid, x) = saturate_cast<T>((end - start) * scale);
}
}
}
};
template <typename T>
template <typename T>
void resize_area_gpu(const DevMem2Db src, DevMem2Db dst,float fx, float fy,
void resize_area_gpu(const DevMem2Db src, DevMem2Db dst,float fx, float fy,
...
@@ -521,10 +620,37 @@ namespace cv { namespace gpu { namespace device
...
@@ -521,10 +620,37 @@ namespace cv { namespace gpu { namespace device
{
{
(void)interpolation;
(void)interpolation;
//TODO: add assert to picture size
int iscale_x = round(fx);
int iscale_x = round(fx);
int iscale_y = round(fy);
int iscale_y = round(fy);
InterAreaDispatcherStream<T>::call(src, iscale_x, iscale_y, dst, buffer, stream);
const int warps = 4;
const int threads = 32 * warps;
int thred_lines = divUp(src.cols, threads);
int blocks = src.rows * thred_lines;
printf("device code executed for X coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d\n",
src.cols, warps, threads, thred_lines, blocks);
typedef typename scan_traits<T>::scan_line_type smem_type;
resise_scan_fast_x<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
(src, buffer, iscale_x, iscale_y, thred_lines);
thred_lines = divUp(src.rows, threads);
blocks = dst.cols * thred_lines;
printf("device code executed for Y coordinate with:\nwarps %d, threads %d, thred_lines %d, blocks %d\n",
warps, threads, thred_lines, blocks);
resise_scan_fast_y<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
(buffer, dst, iscale_x, iscale_y, thred_lines);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
template void resize_area_gpu<uchar>(DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Df buffer, cudaStream_t stream);
template void resize_area_gpu<uchar>(DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Df buffer, cudaStream_t stream);
...
...
modules/gpu/test/test_resize.cpp
View file @
81c6adb9
...
@@ -195,19 +195,19 @@ TEST_P(ResizeArea, Accuracy)
...
@@ -195,19 +195,19 @@ TEST_P(ResizeArea, Accuracy)
cv
::
resize
(
src
,
dst_cpu
,
cv
::
Size
(),
coeff
,
coeff
,
interpolation
);
cv
::
resize
(
src
,
dst_cpu
,
cv
::
Size
(),
coeff
,
coeff
,
interpolation
);
//
cv::Mat gpu_buff;
cv
::
Mat
gpu_buff
;
//
buffer.download(gpu_buff);
buffer
.
download
(
gpu_buff
);
//
cv::Mat gpu;
cv
::
Mat
gpu
;
//
dst.download(gpu);
dst
.
download
(
gpu
);
//
std::cout << src
//
std::cout << src
//
<< std::endl << std::endl
//
<< std::endl << std::endl
//
<< gpu_buff
//
<< gpu_buff
//
<< std::endl << std::endl
//
<< std::endl << std::endl
//
<< gpu
//
<< gpu
//
<< std::endl << std::endl
//
<< std::endl << std::endl
//
<< dst_cpu<< std::endl;
//
<< dst_cpu<< std::endl;
EXPECT_MAT_NEAR
(
dst_cpu
,
dst
,
src
.
depth
()
==
CV_32F
?
1e-2
:
1.0
);
EXPECT_MAT_NEAR
(
dst_cpu
,
dst
,
src
.
depth
()
==
CV_32F
?
1e-2
:
1.0
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment