// test_scan.cu - tests for the block-level inclusive scan (blockScanInclusive).

#include "test_precomp.hpp"

using namespace cv;
using namespace cv::cudev;
using namespace cvtest;

// BlockScanInt

// Test kernel for the block-wide inclusive scan over int values.
// Each thread reads data[i], where i is the index returned by Block::threadLineId(),
// passes it through blockScanInclusive<THREADS_NUM>, and writes the result back to data[i].
// The launch in this file uses one block of exactly THREADS_NUM threads.
template <int THREADS_NUM>
__global__ void int_kernel(int* data)
{
    // Length of the shared scratch buffer handed to the scan:
    // one slot per warp when CV_CUDEV_ARCH >= 300, otherwise one slot per thread.
#if CV_CUDEV_ARCH >= 300
    const int scratch_len = (THREADS_NUM - 1) / WARP_SIZE + 1;
#else
    const int scratch_len = THREADS_NUM;
#endif
    __shared__ int scratch[scratch_len];

    const uint idx = Block::threadLineId();
    const int mine = data[idx];

    // In-place: the scanned value overwrites the element this thread loaded.
    data[idx] = blockScanInclusive<THREADS_NUM>(mine, scratch, idx);
}

// Defines the gtest case BlockScanInt.BlockSize<block_size>.
//
// The test fills a 1 x block_size CV_32SC1 row via randomMat(..., 0, 1024), uploads it,
// turns the host copy into its running (inclusive) sum, runs int_kernel on the device
// copy with one block of block_size threads, and requires an exact match.
//
// block_size must be an integer literal: it is pasted into the test name and used as
// the kernel's template argument.
//
// Note: comments inside the macro body must use /* */ because a // comment would
// swallow the continued lines.
#define BLOCK_SCAN_INT_TEST(block_size)                                       \
    TEST(BlockScanInt, BlockSize##block_size)                                 \
    {                                                                         \
        Mat src = randomMat(Size(block_size, 1), CV_32SC1, 0, 1024);          \
                                                                              \
        GpuMat d_src;                                                         \
        d_src.upload(src);                                                    \
                                                                              \
        /* Turn src into the host reference: a sequential prefix sum. */      \
        for (int col = 1; col < block_size; col++)                            \
            src.at<int>(0, col) += src.at<int>(0, col - 1);                   \
                                                                              \
        int_kernel<block_size><<<1, block_size>>>(d_src.ptr<int>());          \
                                                                              \
        /* Surface launch errors before waiting for the kernel to finish. */  \
        CV_CUDEV_SAFE_CALL(cudaGetLastError());                               \
        CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());                          \
                                                                              \
        EXPECT_MAT_NEAR(d_src, src, 0);                                       \
    }

// Instantiate one BlockScanInt test per block size. The sizes are grouped around
// powers of two, with values below and above each one so that blocks that are not
// an exact power of two are covered as well.

// around 32
BLOCK_SCAN_INT_TEST(29)
BLOCK_SCAN_INT_TEST(30)
BLOCK_SCAN_INT_TEST(32)
BLOCK_SCAN_INT_TEST(40)
BLOCK_SCAN_INT_TEST(41)

// around 64
BLOCK_SCAN_INT_TEST(59)
BLOCK_SCAN_INT_TEST(60)
BLOCK_SCAN_INT_TEST(64)
BLOCK_SCAN_INT_TEST(70)
BLOCK_SCAN_INT_TEST(71)

// around 128
BLOCK_SCAN_INT_TEST(109)
BLOCK_SCAN_INT_TEST(110)
BLOCK_SCAN_INT_TEST(128)
BLOCK_SCAN_INT_TEST(130)
BLOCK_SCAN_INT_TEST(131)

// around 256
BLOCK_SCAN_INT_TEST(189)
BLOCK_SCAN_INT_TEST(200)
BLOCK_SCAN_INT_TEST(256)
BLOCK_SCAN_INT_TEST(300)
BLOCK_SCAN_INT_TEST(311)

// around 512
BLOCK_SCAN_INT_TEST(489)
BLOCK_SCAN_INT_TEST(500)
BLOCK_SCAN_INT_TEST(512)
BLOCK_SCAN_INT_TEST(600)
BLOCK_SCAN_INT_TEST(611)

// largest size tested (presumably the maximum threads per block - TODO confirm)
BLOCK_SCAN_INT_TEST(1024)

// BlockScanDouble

// Test kernel for the block-wide inclusive scan over double values.
// Each thread reads data[i], where i is the index returned by Block::threadLineId(),
// passes it through blockScanInclusive<THREADS_NUM>, and writes the result back to data[i].
// The launch in this file uses one block of exactly THREADS_NUM threads.
template <int THREADS_NUM>
__global__ void double_kernel(double* data)
{
    // Length of the shared scratch buffer handed to the scan:
    // one slot per warp when CV_CUDEV_ARCH >= 300, otherwise one slot per thread.
#if CV_CUDEV_ARCH >= 300
    const int scratch_len = (THREADS_NUM - 1) / WARP_SIZE + 1;
#else
    const int scratch_len = THREADS_NUM;
#endif
    __shared__ double scratch[scratch_len];

    const uint idx = Block::threadLineId();
    const double mine = data[idx];

    // In-place: the scanned value overwrites the element this thread loaded.
    data[idx] = blockScanInclusive<THREADS_NUM>(mine, scratch, idx);
}

// Defines the gtest case BlockScanDouble.BlockSize<block_size>.
//
// The test fills a 1 x block_size CV_64FC1 row via randomMat(..., 0.0, 1.0), uploads it,
// turns the host copy into its running (inclusive) sum, runs double_kernel on the device
// copy with one block of block_size threads, and compares with a tolerance of 1e-10.
//
// block_size must be an integer literal: it is pasted into the test name and used as
// the kernel's template argument.
//
// Note: comments inside the macro body must use /* */ because a // comment would
// swallow the continued lines.
#define BLOCK_SCAN_DOUBLE_TEST(block_size)                                    \
    TEST(BlockScanDouble, BlockSize##block_size)                              \
    {                                                                         \
        Mat src = randomMat(Size(block_size, 1), CV_64FC1, 0.0, 1.0);         \
                                                                              \
        GpuMat d_src;                                                         \
        d_src.upload(src);                                                    \
                                                                              \
        /* Turn src into the host reference: a sequential prefix sum. */      \
        for (int col = 1; col < block_size; col++)                            \
            src.at<double>(0, col) += src.at<double>(0, col - 1);             \
                                                                              \
        double_kernel<block_size><<<1, block_size>>>(d_src.ptr<double>());    \
                                                                              \
        /* Surface launch errors before waiting for the kernel to finish. */  \
        CV_CUDEV_SAFE_CALL(cudaGetLastError());                               \
        CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());                          \
                                                                              \
        EXPECT_MAT_NEAR(d_src, src, 1e-10);                                   \
    }

// Instantiate one BlockScanDouble test per block size. The sizes are grouped around
// powers of two, with values below and above each one so that blocks that are not
// an exact power of two are covered as well.

// around 32
BLOCK_SCAN_DOUBLE_TEST(29)
BLOCK_SCAN_DOUBLE_TEST(30)
BLOCK_SCAN_DOUBLE_TEST(32)
BLOCK_SCAN_DOUBLE_TEST(40)
BLOCK_SCAN_DOUBLE_TEST(41)

// around 64
BLOCK_SCAN_DOUBLE_TEST(59)
BLOCK_SCAN_DOUBLE_TEST(60)
BLOCK_SCAN_DOUBLE_TEST(64)
BLOCK_SCAN_DOUBLE_TEST(70)
BLOCK_SCAN_DOUBLE_TEST(71)

// around 128
BLOCK_SCAN_DOUBLE_TEST(109)
BLOCK_SCAN_DOUBLE_TEST(110)
BLOCK_SCAN_DOUBLE_TEST(128)
BLOCK_SCAN_DOUBLE_TEST(130)
BLOCK_SCAN_DOUBLE_TEST(131)

// around 256
BLOCK_SCAN_DOUBLE_TEST(189)
BLOCK_SCAN_DOUBLE_TEST(200)
BLOCK_SCAN_DOUBLE_TEST(256)
BLOCK_SCAN_DOUBLE_TEST(300)
BLOCK_SCAN_DOUBLE_TEST(311)

// around 512
BLOCK_SCAN_DOUBLE_TEST(489)
BLOCK_SCAN_DOUBLE_TEST(500)
BLOCK_SCAN_DOUBLE_TEST(512)
BLOCK_SCAN_DOUBLE_TEST(600)
BLOCK_SCAN_DOUBLE_TEST(611)

// largest size tested (presumably the maximum threads per block - TODO confirm)
BLOCK_SCAN_DOUBLE_TEST(1024)