Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
dcdbaef3
Commit
dcdbaef3
authored
Jan 21, 2019
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #13658 from nglee:dev_CudaShflUpCompat
parents
20d82cf6
970293a2
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
385 additions
and
34 deletions
+385
-34
clahe.cu
modules/cudaimgproc/src/cuda/clahe.cu
+9
-15
scan.hpp
modules/cudev/include/opencv2/cudev/block/scan.hpp
+141
-4
integral.hpp
modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+4
-4
scan.hpp
modules/cudev/include/opencv2/cudev/warp/scan.hpp
+39
-10
shuffle.hpp
modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
+52
-1
test_scan.cu
modules/cudev/test/test_scan.cu
+140
-0
No files found.
modules/cudaimgproc/src/cuda/clahe.cu
View file @
dcdbaef3
...
@@ -42,15 +42,9 @@
...
@@ -42,15 +42,9 @@
#if !defined CUDA_DISABLER
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/scan.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
using namespace cv::cuda;
using namespace cv::cudev;
using namespace cv::cuda::device;
namespace clahe
namespace clahe
{
{
...
@@ -73,7 +67,7 @@ namespace clahe
...
@@ -73,7 +67,7 @@ namespace clahe
for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
{
{
const int data = srcPtr[j];
const int data = srcPtr[j];
Emulation::smem
::atomicAdd(&smem[data], 1);
::atomicAdd(&smem[data], 1);
}
}
}
}
...
@@ -96,7 +90,7 @@ namespace clahe
...
@@ -96,7 +90,7 @@ namespace clahe
// find number of overall clipped samples
// find number of overall clipped samples
r
educe<256>(smem, clipped, tid, plus<int>());
blockR
educe<256>(smem, clipped, tid, plus<int>());
// broadcast evaluated value
// broadcast evaluated value
...
@@ -128,10 +122,10 @@ namespace clahe
...
@@ -128,10 +122,10 @@ namespace clahe
calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
cudaSafeCall
( cudaGetLastError() );
CV_CUDEV_SAFE_CALL
( cudaGetLastError() );
if (stream == 0)
if (stream == 0)
cudaSafeCall
( cudaDeviceSynchronize() );
CV_CUDEV_SAFE_CALL
( cudaDeviceSynchronize() );
}
}
__global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
__global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
...
@@ -173,13 +167,13 @@ namespace clahe
...
@@ -173,13 +167,13 @@ namespace clahe
const dim3 block(32, 8);
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
cudaSafeCall
( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
CV_CUDEV_SAFE_CALL
( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
transformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
transformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
cudaSafeCall
( cudaGetLastError() );
CV_CUDEV_SAFE_CALL
( cudaGetLastError() );
if (stream == 0)
if (stream == 0)
cudaSafeCall
( cudaDeviceSynchronize() );
CV_CUDEV_SAFE_CALL
( cudaDeviceSynchronize() );
}
}
}
}
...
...
modules/cudev/include/opencv2/cudev/block/scan.hpp
View file @
dcdbaef3
...
@@ -48,12 +48,134 @@
...
@@ -48,12 +48,134 @@
#include "../common.hpp"
#include "../common.hpp"
#include "../warp/scan.hpp"
#include "../warp/scan.hpp"
#include "../warp/warp.hpp"
namespace
cv
{
namespace
cudev
{
namespace
cv
{
namespace
cudev
{
//! @addtogroup cudev
//! @addtogroup cudev
//! @{
//! @{
#if __CUDACC_VER_MAJOR__ >= 9
// Usage Note
// - THREADS_NUM should be equal to the number of threads in this block.
// - smem must be able to contain at least n elements of type T, where n is equal to the number
// of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
//
// Dev Note
// - Starting from CUDA 9.0, support for Fermi is dropped. So CV_CUDEV_ARCH >= 300 is implied.
// - "For Pascal and earlier architectures (CV_CUDEV_ARCH < 700), all threads in mask must execute
// the same warp intrinsic instruction in convergence, and the union of all values in mask must
// be equal to the warp's active mask."
// (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#independent-thread-scheduling-7-x)
// - Above restriction does not apply starting from Volta (CV_CUDEV_ARCH >= 700). We just need to
// take care so that "all non-exited threads named in mask must execute the same intrinsic with
// the same mask."
// (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#warp-description)
template
<
int
THREADS_NUM
,
typename
T
>
__device__
T
blockScanInclusive
(
T
data
,
volatile
T
*
smem
,
uint
tid
)
{
const
int
residual
=
THREADS_NUM
&
(
WARP_SIZE
-
1
);
#if CV_CUDEV_ARCH < 700
const
uint
residual_mask
=
(
1U
<<
residual
)
-
1
;
#endif
if
(
THREADS_NUM
>
WARP_SIZE
)
{
// bottom-level inclusive warp scan
#if CV_CUDEV_ARCH >= 700
T
warpResult
=
warpScanInclusive
(
0xFFFFFFFFU
,
data
);
#else
T
warpResult
;
if
(
0
==
residual
)
warpResult
=
warpScanInclusive
(
0xFFFFFFFFU
,
data
);
else
{
const
int
n_warps
=
divUp
(
THREADS_NUM
,
WARP_SIZE
);
const
int
warp_num
=
Warp
::
warpId
();
if
(
warp_num
<
n_warps
-
1
)
warpResult
=
warpScanInclusive
(
0xFFFFFFFFU
,
data
);
else
{
// We are at the last threads of a block whose number of threads
// is not a multiple of the warp size
warpResult
=
warpScanInclusive
(
residual_mask
,
data
);
}
}
#endif
__syncthreads
();
// save top elements of each warp for exclusive warp scan
// sync to wait for warp scans to complete (because smem is being overwritten)
if
((
tid
&
(
WARP_SIZE
-
1
))
==
(
WARP_SIZE
-
1
))
{
smem
[
tid
>>
LOG_WARP_SIZE
]
=
warpResult
;
}
__syncthreads
();
int
quot
=
THREADS_NUM
/
WARP_SIZE
;
if
(
tid
<
quot
)
{
// grab top warp elements
T
val
=
smem
[
tid
];
uint
mask
=
(
1LLU
<<
quot
)
-
1
;
if
(
0
==
residual
)
{
// calculate exclusive scan and write back to shared memory
smem
[
tid
]
=
warpScanExclusive
(
mask
,
val
);
}
else
{
// calculate inclusive scan and write back to shared memory with offset 1
smem
[
tid
+
1
]
=
warpScanInclusive
(
mask
,
val
);
if
(
tid
==
0
)
smem
[
0
]
=
0
;
}
}
__syncthreads
();
// return updated warp scans
return
warpResult
+
smem
[
tid
>>
LOG_WARP_SIZE
];
}
else
{
#if CV_CUDEV_ARCH >= 700
return
warpScanInclusive
(
0xFFFFFFFFU
,
data
);
#else
if
(
THREADS_NUM
==
WARP_SIZE
)
return
warpScanInclusive
(
0xFFFFFFFFU
,
data
);
else
return
warpScanInclusive
(
residual_mask
,
data
);
#endif
}
}
template
<
int
THREADS_NUM
,
typename
T
>
__device__
__forceinline__
T
blockScanExclusive
(
T
data
,
volatile
T
*
smem
,
uint
tid
)
{
return
blockScanInclusive
<
THREADS_NUM
>
(
data
,
smem
,
tid
)
-
data
;
}
#else // __CUDACC_VER_MAJOR__ >= 9
// Usage Note
// - THREADS_NUM should be equal to the number of threads in this block.
// - (>= Kepler) smem must be able to contain at least n elements of type T, where n is equal to the number
// of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
// - (Fermi) smem must be able to contain at least n elements of type T, where n is equal to the number
// of threads in this block (= THREADS_NUM).
template
<
int
THREADS_NUM
,
typename
T
>
template
<
int
THREADS_NUM
,
typename
T
>
__device__
T
blockScanInclusive
(
T
data
,
volatile
T
*
smem
,
uint
tid
)
__device__
T
blockScanInclusive
(
T
data
,
volatile
T
*
smem
,
uint
tid
)
{
{
...
@@ -73,18 +195,31 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
...
@@ -73,18 +195,31 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
__syncthreads
();
__syncthreads
();
if
(
tid
<
(
THREADS_NUM
/
WARP_SIZE
))
int
quot
=
THREADS_NUM
/
WARP_SIZE
;
if
(
tid
<
quot
)
{
{
// grab top warp elements
// grab top warp elements
T
val
=
smem
[
tid
];
T
val
=
smem
[
tid
];
// calculate exclusive scan and write back to shared memory
if
(
0
==
(
THREADS_NUM
&
(
WARP_SIZE
-
1
)))
smem
[
tid
]
=
warpScanExclusive
(
val
,
smem
,
tid
);
{
// calculate exclusive scan and write back to shared memory
smem
[
tid
]
=
warpScanExclusive
(
val
,
smem
,
tid
);
}
else
{
// calculate inclusive scan and write back to shared memory with offset 1
smem
[
tid
+
1
]
=
warpScanInclusive
(
val
,
smem
,
tid
);
if
(
tid
==
0
)
smem
[
0
]
=
0
;
}
}
}
__syncthreads
();
__syncthreads
();
// return updated warp scans
with exclusive scan results
// return updated warp scans
return
warpResult
+
smem
[
tid
>>
LOG_WARP_SIZE
];
return
warpResult
+
smem
[
tid
>>
LOG_WARP_SIZE
];
}
}
else
else
...
@@ -99,6 +234,8 @@ __device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint t
...
@@ -99,6 +234,8 @@ __device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint t
return
blockScanInclusive
<
THREADS_NUM
>
(
data
,
smem
,
tid
)
-
data
;
return
blockScanInclusive
<
THREADS_NUM
>
(
data
,
smem
,
tid
)
-
data
;
}
}
#endif // __CUDACC_VER_MAJOR__ >= 9
//! @}
//! @}
}}
}}
...
...
modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
View file @
dcdbaef3
...
@@ -215,7 +215,7 @@ namespace integral_detail
...
@@ -215,7 +215,7 @@ namespace integral_detail
#pragma unroll
#pragma unroll
for
(
int
i
=
1
;
i
<
32
;
i
*=
2
)
for
(
int
i
=
1
;
i
<
32
;
i
*=
2
)
{
{
const
int
n
=
shfl_up
(
sum
,
i
,
32
);
const
int
n
=
compatible_
shfl_up
(
sum
,
i
,
32
);
if
(
lane_id
>=
i
)
if
(
lane_id
>=
i
)
{
{
...
@@ -245,9 +245,9 @@ namespace integral_detail
...
@@ -245,9 +245,9 @@ namespace integral_detail
int
warp_sum
=
sums
[
lane_id
];
int
warp_sum
=
sums
[
lane_id
];
#pragma unroll
#pragma unroll
for
(
int
i
=
1
;
i
<
=
32
;
i
*=
2
)
for
(
int
i
=
1
;
i
<
32
;
i
*=
2
)
{
{
const
int
n
=
shfl_up
(
warp_sum
,
i
,
32
);
const
int
n
=
compatible_
shfl_up
(
warp_sum
,
i
,
32
);
if
(
lane_id
>=
i
)
if
(
lane_id
>=
i
)
warp_sum
+=
n
;
warp_sum
+=
n
;
...
@@ -453,7 +453,7 @@ namespace integral_detail
...
@@ -453,7 +453,7 @@ namespace integral_detail
for
(
int
i
=
1
;
i
<=
8
;
i
*=
2
)
for
(
int
i
=
1
;
i
<=
8
;
i
*=
2
)
{
{
T
n
=
shfl_up
(
partial_sum
,
i
,
32
);
T
n
=
compatible_
shfl_up
(
partial_sum
,
i
,
32
);
if
(
lane_id
>=
i
)
if
(
lane_id
>=
i
)
partial_sum
+=
n
;
partial_sum
+=
n
;
...
...
modules/cudev/include/opencv2/cudev/warp/scan.hpp
View file @
dcdbaef3
...
@@ -55,6 +55,36 @@ namespace cv { namespace cudev {
...
@@ -55,6 +55,36 @@ namespace cv { namespace cudev {
//! @addtogroup cudev
//! @addtogroup cudev
//! @{
//! @{
#if __CUDACC_VER_MAJOR__ >= 9
// Starting from CUDA 9.0, support for Fermi is dropped.
// So CV_CUDEV_ARCH >= 300 is implied.
template
<
typename
T
>
__device__
T
warpScanInclusive
(
uint
mask
,
T
data
)
{
const
uint
laneId
=
Warp
::
laneId
();
// scan on shufl functions
#pragma unroll
for
(
int
i
=
1
;
i
<=
(
WARP_SIZE
/
2
);
i
*=
2
)
{
const
T
val
=
shfl_up_sync
(
mask
,
data
,
i
);
if
(
laneId
>=
i
)
data
+=
val
;
}
return
data
;
}
template
<
typename
T
>
__device__
__forceinline__
T
warpScanExclusive
(
uint
mask
,
T
data
)
{
return
warpScanInclusive
(
mask
,
data
)
-
data
;
}
#else // __CUDACC_VER_MAJOR__ >= 9
template
<
typename
T
>
template
<
typename
T
>
__device__
T
warpScanInclusive
(
T
data
,
volatile
T
*
smem
,
uint
tid
)
__device__
T
warpScanInclusive
(
T
data
,
volatile
T
*
smem
,
uint
tid
)
{
{
...
@@ -75,19 +105,16 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
...
@@ -75,19 +105,16 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
return
data
;
return
data
;
#else
#else
uint
pos
=
2
*
tid
-
(
tid
&
(
WARP_SIZE
-
1
));
const
uint
laneId
=
Warp
::
laneId
();
smem
[
pos
]
=
0
;
pos
+=
WARP_SIZE
;
smem
[
tid
]
=
data
;
smem
[
pos
]
=
data
;
smem
[
pos
]
+=
smem
[
pos
-
1
];
#pragma unroll
smem
[
pos
]
+=
smem
[
pos
-
2
];
for
(
int
i
=
1
;
i
<=
(
WARP_SIZE
/
2
);
i
*=
2
)
smem
[
pos
]
+=
smem
[
pos
-
4
];
if
(
laneId
>=
i
)
smem
[
pos
]
+=
smem
[
pos
-
8
];
smem
[
tid
]
+=
smem
[
tid
-
i
];
smem
[
pos
]
+=
smem
[
pos
-
16
];
return
smem
[
pos
];
return
smem
[
tid
];
#endif
#endif
}
}
...
@@ -97,6 +124,8 @@ __device__ __forceinline__ T warpScanExclusive(T data, volatile T* smem, uint ti
...
@@ -97,6 +124,8 @@ __device__ __forceinline__ T warpScanExclusive(T data, volatile T* smem, uint ti
return
warpScanInclusive
(
data
,
smem
,
tid
)
-
data
;
return
warpScanInclusive
(
data
,
smem
,
tid
)
-
data
;
}
}
#endif // __CUDACC_VER_MAJOR__ >= 9
//! @}
//! @}
}}
}}
...
...
modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
View file @
dcdbaef3
...
@@ -48,6 +48,8 @@
...
@@ -48,6 +48,8 @@
#include "../common.hpp"
#include "../common.hpp"
#include "../util/vec_traits.hpp"
#include "../util/vec_traits.hpp"
#include "../block/block.hpp"
#include "warp.hpp"
namespace
cv
{
namespace
cudev
{
namespace
cv
{
namespace
cudev
{
...
@@ -59,7 +61,7 @@ namespace cv { namespace cudev {
...
@@ -59,7 +61,7 @@ namespace cv { namespace cudev {
#if __CUDACC_VER_MAJOR__ >= 9
#if __CUDACC_VER_MAJOR__ >= 9
# define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
# define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_xor(x, y, z) __shfl_xor_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_xor(x, y, z) __shfl_xor_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
//
# define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
# define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
#endif
#endif
...
@@ -155,6 +157,53 @@ CV_CUDEV_SHFL_VEC_INST(double)
...
@@ -155,6 +157,53 @@ CV_CUDEV_SHFL_VEC_INST(double)
// shfl_up
// shfl_up
template
<
typename
T
>
__device__
__forceinline__
T
compatible_shfl_up
(
T
val
,
uint
delta
,
int
width
=
warpSize
)
{
#if __CUDACC_VER_MAJOR__ < 9
return
shfl_up
(
val
,
delta
,
width
);
#else // __CUDACC_VER_MAJOR__ < 9
#if CV_CUDEV_ARCH >= 700
return
shfl_up_sync
(
0xFFFFFFFFU
,
val
,
delta
,
width
);
#else
const
int
block_size
=
Block
::
blockSize
();
const
int
residual
=
block_size
&
(
warpSize
-
1
);
if
(
0
==
residual
)
return
shfl_up_sync
(
0xFFFFFFFFU
,
val
,
delta
,
width
);
else
{
const
int
n_warps
=
divUp
(
block_size
,
warpSize
);
const
int
warp_id
=
Warp
::
warpId
();
if
(
warp_id
<
n_warps
-
1
)
return
shfl_up_sync
(
0xFFFFFFFFU
,
val
,
delta
,
width
);
else
{
// We are at the last threads of a block whose number of threads
// is not a multiple of the warp size
uint
mask
=
(
1LU
<<
residual
)
-
1
;
return
shfl_up_sync
(
mask
,
val
,
delta
,
width
);
}
}
#endif
#endif // __CUDACC_VER_MAJOR__ < 9
}
#if __CUDACC_VER_MAJOR__ >= 9
template
<
typename
T
>
__device__
__forceinline__
T
shfl_up_sync
(
uint
mask
,
T
val
,
uint
delta
,
int
width
=
warpSize
)
{
return
(
T
)
__shfl_up_sync
(
mask
,
val
,
delta
,
width
);
}
#else
__device__
__forceinline__
uchar
shfl_up
(
uchar
val
,
uint
delta
,
int
width
=
warpSize
)
__device__
__forceinline__
uchar
shfl_up
(
uchar
val
,
uint
delta
,
int
width
=
warpSize
)
{
{
return
(
uchar
)
__shfl_up
((
int
)
val
,
delta
,
width
);
return
(
uchar
)
__shfl_up
((
int
)
val
,
delta
,
width
);
...
@@ -244,6 +293,8 @@ CV_CUDEV_SHFL_UP_VEC_INST(double)
...
@@ -244,6 +293,8 @@ CV_CUDEV_SHFL_UP_VEC_INST(double)
#undef CV_CUDEV_SHFL_UP_VEC_INST
#undef CV_CUDEV_SHFL_UP_VEC_INST
#endif
// shfl_down
// shfl_down
__device__
__forceinline__
uchar
shfl_down
(
uchar
val
,
uint
delta
,
int
width
=
warpSize
)
__device__
__forceinline__
uchar
shfl_down
(
uchar
val
,
uint
delta
,
int
width
=
warpSize
)
...
...
modules/cudev/test/test_scan.cu
0 → 100644
View file @
dcdbaef3
#include "test_precomp.hpp"
using namespace cv;
using namespace cv::cudev;
using namespace cvtest;
// BlockScanInt
template <int THREADS_NUM>
__global__ void int_kernel(int* data)
{
uint tid = Block::threadLineId();
#if CV_CUDEV_ARCH >= 300
const int n_warps = (THREADS_NUM - 1) / WARP_SIZE + 1;
__shared__ int smem[n_warps];
#else
__shared__ int smem[THREADS_NUM];
#endif
data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
}
#define BLOCK_SCAN_INT_TEST(block_size) \
TEST(BlockScanInt, BlockSize##block_size) \
{ \
Mat src = randomMat(Size(block_size, 1), CV_32SC1, 0, 1024); \
\
GpuMat d_src; \
d_src.upload(src); \
\
for (int col = 1; col < block_size; col++) \
src.at<int>(0, col) += src.at<int>(0, col - 1); \
\
int_kernel<block_size><<<1, block_size>>>((int*)d_src.data); \
\
CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize()); \
\
EXPECT_MAT_NEAR(d_src, src, 0); \
}
BLOCK_SCAN_INT_TEST(29)
BLOCK_SCAN_INT_TEST(30)
BLOCK_SCAN_INT_TEST(32)
BLOCK_SCAN_INT_TEST(40)
BLOCK_SCAN_INT_TEST(41)
BLOCK_SCAN_INT_TEST(59)
BLOCK_SCAN_INT_TEST(60)
BLOCK_SCAN_INT_TEST(64)
BLOCK_SCAN_INT_TEST(70)
BLOCK_SCAN_INT_TEST(71)
BLOCK_SCAN_INT_TEST(109)
BLOCK_SCAN_INT_TEST(110)
BLOCK_SCAN_INT_TEST(128)
BLOCK_SCAN_INT_TEST(130)
BLOCK_SCAN_INT_TEST(131)
BLOCK_SCAN_INT_TEST(189)
BLOCK_SCAN_INT_TEST(200)
BLOCK_SCAN_INT_TEST(256)
BLOCK_SCAN_INT_TEST(300)
BLOCK_SCAN_INT_TEST(311)
BLOCK_SCAN_INT_TEST(489)
BLOCK_SCAN_INT_TEST(500)
BLOCK_SCAN_INT_TEST(512)
BLOCK_SCAN_INT_TEST(600)
BLOCK_SCAN_INT_TEST(611)
BLOCK_SCAN_INT_TEST(1024)
// BlockScanDouble
template <int THREADS_NUM>
__global__ void double_kernel(double* data)
{
uint tid = Block::threadLineId();
#if CV_CUDEV_ARCH >= 300
const int n_warps = (THREADS_NUM - 1) / WARP_SIZE + 1;
__shared__ double smem[n_warps];
#else
__shared__ double smem[THREADS_NUM];
#endif
data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
}
#define BLOCK_SCAN_DOUBLE_TEST(block_size) \
TEST(BlockScanDouble, BlockSize##block_size) \
{ \
Mat src = randomMat(Size(block_size, 1), CV_64FC1, 0.0, 1.0); \
\
GpuMat d_src; \
d_src.upload(src); \
\
for (int col = 1; col < block_size; col++) \
src.at<double>(0, col) += src.at<double>(0, col - 1); \
\
double_kernel<block_size><<<1, block_size>>>((double*)d_src.data); \
\
CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize()); \
\
EXPECT_MAT_NEAR(d_src, src, 1e-10); \
}
BLOCK_SCAN_DOUBLE_TEST(29)
BLOCK_SCAN_DOUBLE_TEST(30)
BLOCK_SCAN_DOUBLE_TEST(32)
BLOCK_SCAN_DOUBLE_TEST(40)
BLOCK_SCAN_DOUBLE_TEST(41)
BLOCK_SCAN_DOUBLE_TEST(59)
BLOCK_SCAN_DOUBLE_TEST(60)
BLOCK_SCAN_DOUBLE_TEST(64)
BLOCK_SCAN_DOUBLE_TEST(70)
BLOCK_SCAN_DOUBLE_TEST(71)
BLOCK_SCAN_DOUBLE_TEST(109)
BLOCK_SCAN_DOUBLE_TEST(110)
BLOCK_SCAN_DOUBLE_TEST(128)
BLOCK_SCAN_DOUBLE_TEST(130)
BLOCK_SCAN_DOUBLE_TEST(131)
BLOCK_SCAN_DOUBLE_TEST(189)
BLOCK_SCAN_DOUBLE_TEST(200)
BLOCK_SCAN_DOUBLE_TEST(256)
BLOCK_SCAN_DOUBLE_TEST(300)
BLOCK_SCAN_DOUBLE_TEST(311)
BLOCK_SCAN_DOUBLE_TEST(489)
BLOCK_SCAN_DOUBLE_TEST(500)
BLOCK_SCAN_DOUBLE_TEST(512)
BLOCK_SCAN_DOUBLE_TEST(600)
BLOCK_SCAN_DOUBLE_TEST(611)
BLOCK_SCAN_DOUBLE_TEST(1024)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment