opencv / Commits

Commit f56d9c34
Authored Dec 15, 2010 by Alexey Spizhevoy
Added support for the remaining numbers of image channels in gpu::sum
Parent: d8a7ff1e
Showing 2 changed files with 277 additions and 1 deletion:

modules/gpu/src/cuda/mathfunc.cu    +258  -0
tests/gpu/src/arithm.cpp             +19  -1
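For context, a minimal usage sketch of the feature this commit completes: gpu::sum can now be checked against the CPU cv::sum for 3- and 4-channel images as well as 1- and 2-channel ones. The sketch assumes the OpenCV 2.x GPU module API (cv::gpu::GpuMat, cv::gpu::sum returning a per-channel cv::Scalar) and mirrors the comparison performed in the updated test further below; it is not part of the commit itself.

// Usage sketch (assumed API: OpenCV 2.x GPU module; not part of this commit).
#include <cstdio>
#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // A 4-channel 8-bit image; with this commit, 3- and 4-channel inputs
    // are summed on the GPU as well as 1- and 2-channel ones.
    cv::Mat src(480, 640, CV_8UC4, cv::Scalar(1, 2, 3, 4));

    cv::Scalar cpu = cv::sum(src);                        // reference result on the CPU
    cv::Scalar gpu = cv::gpu::sum(cv::gpu::GpuMat(src));  // per-channel sums on the GPU

    for (int c = 0; c < 4; ++c)
        std::printf("channel %d: cpu=%f gpu=%f\n", c, cpu[c], gpu[c]);
    return 0;
}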
modules/gpu/src/cuda/mathfunc.cu
...
...
@@ -1645,6 +1645,246 @@ namespace cv { namespace gpu { namespace mathfunc
        }
    }

    template <typename T, typename R, typename Op, int nthreads>
    __global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
    {
        typedef typename TypeVec<T, 3>::vec_t SrcType;
        typedef typename TypeVec<R, 3>::vec_t DstType;

        __shared__ R smem[nthreads * 3];

        const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        SrcType val;
        DstType sum = VecTraits<DstType>::all(0);
        for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
        {
            const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
            for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
            {
                val = ptr[x0 + x * blockDim.x];
                sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), Op::call(val.z));
            }
        }

        smem[tid] = sum.x;
        smem[tid + nthreads] = sum.y;
        smem[tid + 2 * nthreads] = sum.z;
        __syncthreads();

        sum_in_smem<nthreads, R>(smem, tid);
        sum_in_smem<nthreads, R>(smem + nthreads, tid);
        sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
        __shared__ bool is_last;

        if (tid == 0)
        {
            DstType res;
            res.x = smem[0];
            res.y = smem[nthreads];
            res.z = smem[2 * nthreads];
            result[bid] = res;
            __threadfence();

            unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
            is_last = (ticket == gridDim.x * gridDim.y - 1);
        }
        __syncthreads();

        if (is_last)
        {
            DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
            smem[tid] = res.x;
            smem[tid + nthreads] = res.y;
            smem[tid + 2 * nthreads] = res.z;
            __syncthreads();

            sum_in_smem<nthreads, R>(smem, tid);
            sum_in_smem<nthreads, R>(smem + nthreads, tid);
            sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);

            if (tid == 0)
            {
                res.x = smem[0];
                res.y = smem[nthreads];
                res.z = smem[2 * nthreads];
                result[0] = res;
                blocks_finished = 0;
            }
        }
#else
        if (tid == 0)
        {
            DstType res;
            res.x = smem[0];
            res.y = smem[nthreads];
            res.z = smem[2 * nthreads];
            result[bid] = res;
        }
#endif
    }
    template <typename T, typename R, int nthreads>
    __global__ void sum_pass2_kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
    {
        typedef typename TypeVec<R, 3>::vec_t DstType;

        __shared__ R smem[nthreads * 3];

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
        smem[tid] = res.x;
        smem[tid + nthreads] = res.y;
        smem[tid + 2 * nthreads] = res.z;
        __syncthreads();

        sum_in_smem<nthreads, R>(smem, tid);
        sum_in_smem<nthreads, R>(smem + nthreads, tid);
        sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);

        if (tid == 0)
        {
            res.x = smem[0];
            res.y = smem[nthreads];
            res.z = smem[2 * nthreads];
            result[0] = res;
        }
    }
    template <typename T, typename R, typename Op, int nthreads>
    __global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
    {
        typedef typename TypeVec<T, 4>::vec_t SrcType;
        typedef typename TypeVec<R, 4>::vec_t DstType;

        __shared__ R smem[nthreads * 4];

        const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        SrcType val;
        DstType sum = VecTraits<DstType>::all(0);
        for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
        {
            const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
            for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
            {
                val = ptr[x0 + x * blockDim.x];
                sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y),
                                                     Op::call(val.z), Op::call(val.w));
            }
        }

        smem[tid] = sum.x;
        smem[tid + nthreads] = sum.y;
        smem[tid + 2 * nthreads] = sum.z;
        smem[tid + 3 * nthreads] = sum.w;
        __syncthreads();

        sum_in_smem<nthreads, R>(smem, tid);
        sum_in_smem<nthreads, R>(smem + nthreads, tid);
        sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
        sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
        __shared__ bool is_last;

        if (tid == 0)
        {
            DstType res;
            res.x = smem[0];
            res.y = smem[nthreads];
            res.z = smem[2 * nthreads];
            res.w = smem[3 * nthreads];
            result[bid] = res;
            __threadfence();

            unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
            is_last = (ticket == gridDim.x * gridDim.y - 1);
        }
        __syncthreads();

        if (is_last)
        {
            DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
            smem[tid] = res.x;
            smem[tid + nthreads] = res.y;
            smem[tid + 2 * nthreads] = res.z;
            smem[tid + 3 * nthreads] = res.w;
            __syncthreads();

            sum_in_smem<nthreads, R>(smem, tid);
            sum_in_smem<nthreads, R>(smem + nthreads, tid);
            sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
            sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);

            if (tid == 0)
            {
                res.x = smem[0];
                res.y = smem[nthreads];
                res.z = smem[2 * nthreads];
                res.w = smem[3 * nthreads];
                result[0] = res;
                blocks_finished = 0;
            }
        }
#else
        if (tid == 0)
        {
            DstType res;
            res.x = smem[0];
            res.y = smem[nthreads];
            res.z = smem[2 * nthreads];
            res.w = smem[3 * nthreads];
            result[bid] = res;
        }
#endif
    }
    template <typename T, typename R, int nthreads>
    __global__ void sum_pass2_kernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
    {
        typedef typename TypeVec<R, 4>::vec_t DstType;

        __shared__ R smem[nthreads * 4];

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
        smem[tid] = res.x;
        smem[tid + nthreads] = res.y;
        smem[tid + 2 * nthreads] = res.z;
        smem[tid + 3 * nthreads] = res.w;
        __syncthreads();

        sum_in_smem<nthreads, R>(smem, tid);
        sum_in_smem<nthreads, R>(smem + nthreads, tid);
        sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
        sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);

        if (tid == 0)
        {
            res.x = smem[0];
            res.y = smem[nthreads];
            res.z = smem[2 * nthreads];
            res.w = smem[3 * nthreads];
            result[0] = res;
        }
    }
} // namespace sum
...
...
@@ -1670,6 +1910,16 @@ namespace cv { namespace gpu { namespace mathfunc
                src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
            sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
                (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
        case 3:
            sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
                src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
            sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
                (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
            break;
        case 4:
            sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
                src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
            sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
                (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
            break;
        }

        cudaSafeCall(cudaThreadSynchronize());
...
...
@@ -1710,6 +1960,14 @@ namespace cv { namespace gpu { namespace mathfunc
            sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
                src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
            break;
        case 3:
            sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
                src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
            break;
        case 4:
            sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
                src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
            break;
        }

        cudaSafeCall(cudaThreadSynchronize());
...
...
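Both pass-1 kernels above accumulate a per-thread partial sum for each channel, keep the three or four channel values in separate nthreads-sized slices of shared memory, and reduce each slice with sum_in_smem; the per-block results are then combined either by the last-block path guarded by __CUDA_ARCH__ >= 110 or by the pass-2 kernels launched from the multipass caller. The body of sum_in_smem is not part of this diff; the standalone sketch below shows a conventional shared-memory tree reduction of the kind such a helper performs, assuming a power-of-two block size. It is an illustration, not the OpenCV implementation.

// Illustrative sketch only, not the OpenCV helper: a conventional
// shared-memory tree reduction similar in spirit to what
// sum_in_smem<nthreads, R> is used for above (one call per channel slice).
#include <cstdio>
#include <cuda_runtime.h>

template <int nthreads, typename R>
__device__ void tree_reduce(R* smem, int tid)
{
    for (int step = nthreads / 2; step > 0; step /= 2)
    {
        if (tid < step)
            smem[tid] += smem[tid + step];  // fold the upper half onto the lower half
        __syncthreads();                    // make the partial sums visible to all threads
    }
    // smem[0] now holds the sum of the nthreads input values.
}

template <int nthreads>
__global__ void reduce_demo(const float* in, float* out)
{
    __shared__ float smem[nthreads];
    const int tid = threadIdx.x;
    smem[tid] = in[blockIdx.x * nthreads + tid];
    __syncthreads();

    tree_reduce<nthreads, float>(smem, tid);

    if (tid == 0)
        out[blockIdx.x] = smem[0];  // one partial sum per block, as in the pass-1 kernels
}

int main()
{
    const int nthreads = 256, nblocks = 4, n = nthreads * nblocks;
    float *d_in, *d_out, h_out[nblocks];
    cudaMalloc((void**)&d_in, n * sizeof(float));
    cudaMalloc((void**)&d_out, nblocks * sizeof(float));

    // Fill the input with ones so each block should sum to nthreads.
    float* h_in = new float[n];
    for (int i = 0; i < n; ++i) h_in[i] = 1.0f;
    cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

    reduce_demo<nthreads><<<nblocks, nthreads>>>(d_in, d_out);
    cudaMemcpy(h_out, d_out, nblocks * sizeof(float), cudaMemcpyDeviceToHost);

    for (int b = 0; b < nblocks; ++b)
        std::printf("block %d sum = %f\n", b, h_out[b]);

    cudaFree(d_in); cudaFree(d_out); delete[] h_in;
    return 0;
}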
tests/gpu/src/arithm.cpp
...
...
@@ -950,7 +950,25 @@ struct CV_GpuSumTest: CvTest
        b = sum(GpuMat(src));
        if (abs(a[0] - b[0]) + abs(a[1] - b[1]) > src.size().area() * max_err)
        {
            ts->printf(CvTS::CONSOLE, "cols: %d, rows: %d, expected: %f, actual: %f\n", src.cols, src.rows, a[0], b[0]);
            ts->printf(CvTS::CONSOLE, "2 cols: %d, rows: %d, expected: %f, actual: %f\n", src.cols, src.rows, a[1], b[1]);
            ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
            return;
        }

        gen(1 + rand() % 500, 1 + rand() % 500, CV_MAKETYPE(type, 3), src);
        a = sum(src);
        b = sum(GpuMat(src));
        if (abs(a[0] - b[0]) + abs(a[1] - b[1]) + abs(a[2] - b[2]) > src.size().area() * max_err)
        {
            ts->printf(CvTS::CONSOLE, "3 cols: %d, rows: %d, expected: %f, actual: %f\n", src.cols, src.rows, a[2], b[2]);
            ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
            return;
        }

        gen(1 + rand() % 500, 1 + rand() % 500, CV_MAKETYPE(type, 4), src);
        a = sum(src);
        b = sum(GpuMat(src));
        if (abs(a[0] - b[0]) + abs(a[1] - b[1]) + abs(a[2] - b[2]) + abs(a[3] - b[3]) > src.size().area() * max_err)
        {
            ts->printf(CvTS::CONSOLE, "4 cols: %d, rows: %d, expected: %f, actual: %f\n", src.cols, src.rows, a[3], b[3]);
            ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
            return;
        }
...
...