Commit f7b40cdc authored by peng xiao's avatar peng xiao

Add a macro to call additional barrier function on the fly

parent 55c9a7c8
...@@ -747,21 +747,42 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc ...@@ -747,21 +747,42 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc
#define op(A, B) (*A)+(B) #define op(A, B) (*A)+(B)
data[tid] = *partial_reduction; data[tid] = *partial_reduction;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
#endif
if (tid < 16) if (tid < 16)
{
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
#if WAVE_SIZE < 16
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8) if (tid < 8)
{
#endif
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
#if WAVE_SIZE < 8
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4) if (tid < 4)
{
#endif
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
#if WAVE_SIZE < 4
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2) if (tid < 2)
{
#endif
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
#if WAVE_SIZE < 2
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1) if (tid < 1)
{
#endif
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
}
#undef WAVE_SIZE
#undef op #undef op
} }
...@@ -1087,44 +1108,67 @@ void reduce_sum25( ...@@ -1087,44 +1108,67 @@ void reduce_sum25(
int tid int tid
) )
{ {
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
#endif
// first step is to reduce from 25 to 16 // first step is to reduce from 25 to 16
if (tid < 9) // use 9 threads if (tid < 9)
{ {
sdata1[tid] += sdata1[tid + 16]; sdata1[tid] += sdata1[tid + 16];
sdata2[tid] += sdata2[tid + 16]; sdata2[tid] += sdata2[tid + 16];
sdata3[tid] += sdata3[tid + 16]; sdata3[tid] += sdata3[tid + 16];
sdata4[tid] += sdata4[tid + 16]; sdata4[tid] += sdata4[tid + 16];
#if WAVE_SIZE < 16
} }
barrier(CLK_LOCAL_MEM_FENCE);
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
if (tid < 8) if (tid < 8)
{ {
#endif
sdata1[tid] += sdata1[tid + 8]; sdata1[tid] += sdata1[tid + 8];
sdata1[tid] += sdata1[tid + 4];
sdata1[tid] += sdata1[tid + 2];
sdata1[tid] += sdata1[tid + 1];
sdata2[tid] += sdata2[tid + 8]; sdata2[tid] += sdata2[tid + 8];
sdata2[tid] += sdata2[tid + 4];
sdata2[tid] += sdata2[tid + 2];
sdata2[tid] += sdata2[tid + 1];
sdata3[tid] += sdata3[tid + 8]; sdata3[tid] += sdata3[tid + 8];
sdata3[tid] += sdata3[tid + 4];
sdata3[tid] += sdata3[tid + 2];
sdata3[tid] += sdata3[tid + 1];
sdata4[tid] += sdata4[tid + 8]; sdata4[tid] += sdata4[tid + 8];
#if WAVE_SIZE < 8
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
#endif
sdata1[tid] += sdata1[tid + 4];
sdata2[tid] += sdata2[tid + 4];
sdata3[tid] += sdata3[tid + 4];
sdata4[tid] += sdata4[tid + 4]; sdata4[tid] += sdata4[tid + 4];
#if WAVE_SIZE < 4
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
#endif
sdata1[tid] += sdata1[tid + 2];
sdata2[tid] += sdata2[tid + 2];
sdata3[tid] += sdata3[tid + 2];
sdata4[tid] += sdata4[tid + 2]; sdata4[tid] += sdata4[tid + 2];
#if WAVE_SIZE < 2
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
#endif
sdata1[tid] += sdata1[tid + 1];
sdata2[tid] += sdata2[tid + 1];
sdata3[tid] += sdata3[tid + 1];
sdata4[tid] += sdata4[tid + 1]; sdata4[tid] += sdata4[tid + 1];
} }
#undef WAVE_SIZE
} }
__kernel __kernel
void compute_descriptors64( void compute_descriptors64(
IMAGE_INT8 imgTex, IMAGE_INT8 imgTex,
volatile __global float * descriptors, __global float * descriptors,
__global const float * keypoints, __global const float * keypoints,
int descriptors_step, int descriptors_step,
int keypoints_step, int keypoints_step,
...@@ -1158,14 +1202,13 @@ __kernel ...@@ -1158,14 +1202,13 @@ __kernel
sdyabs[tid] = fabs(sdy[tid]); // |dy| array sdyabs[tid] = fabs(sdy[tid]); // |dy| array
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
{
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25) if (tid < 25)
{ {
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2); __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
// write dx, dy, |dx|, |dy| // write dx, dy, |dx|, |dy|
if (tid == 0) if (tid == 0)
...@@ -1180,7 +1223,7 @@ __kernel ...@@ -1180,7 +1223,7 @@ __kernel
__kernel __kernel
void compute_descriptors128( void compute_descriptors128(
IMAGE_INT8 imgTex, IMAGE_INT8 imgTex,
__global volatile float * descriptors, __global float * descriptors,
__global float * keypoints, __global float * keypoints,
int descriptors_step, int descriptors_step,
int keypoints_step, int keypoints_step,
...@@ -1229,13 +1272,15 @@ __kernel ...@@ -1229,13 +1272,15 @@ __kernel
sd2[tid] = sdx[tid]; sd2[tid] = sdx[tid];
sdabs2[tid] = fabs(sdx[tid]); sdabs2[tid] = fabs(sdx[tid]);
} }
//barrier(CLK_LOCAL_MEM_FENCE); }
barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
//barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
__global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
if (tid < 25)
{
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0) // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (tid == 0) if (tid == 0)
{ {
...@@ -1259,11 +1304,14 @@ __kernel ...@@ -1259,11 +1304,14 @@ __kernel
sd2[tid] = sdy[tid]; sd2[tid] = sdy[tid];
sdabs2[tid] = fabs(sdy[tid]); sdabs2[tid] = fabs(sdy[tid]);
} }
//barrier(CLK_LOCAL_MEM_FENCE); }
barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
//barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
{
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (tid == 0) if (tid == 0)
{ {
...@@ -1274,6 +1322,103 @@ __kernel ...@@ -1274,6 +1322,103 @@ __kernel
} }
} }
} }
void reduce_sum128(volatile __local float* smem, int tid)
{
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
#endif
if (tid < 64)
{
smem[tid] += smem[tid + 64];
#if WAVE_SIZE < 64
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
{
#endif
smem[tid] += smem[tid + 32];
#if WAVE_SIZE < 32
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
#endif
smem[tid] += smem[tid + 16];
#if WAVE_SIZE < 16
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
#endif
smem[tid] += smem[tid + 8];
#if WAVE_SIZE < 8
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
#endif
smem[tid] += smem[tid + 4];
#if WAVE_SIZE < 4
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
#endif
smem[tid] += smem[tid + 2];
#if WAVE_SIZE < 2
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
#endif
smem[tid] += smem[tid + 1];
}
}
void reduce_sum64(volatile __local float* smem, int tid)
{
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
#endif
if (tid < 32)
{
smem[tid] += smem[tid + 32];
#if WAVE_SIZE < 32
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
#endif
smem[tid] += smem[tid + 16];
#if WAVE_SIZE < 16
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
#endif
smem[tid] += smem[tid + 8];
#if WAVE_SIZE < 8
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
#endif
smem[tid] += smem[tid + 4];
#if WAVE_SIZE < 4
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
#endif
smem[tid] += smem[tid + 2];
#if WAVE_SIZE < 2
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
#endif
smem[tid] += smem[tid + 1];
}
}
__kernel __kernel
void normalize_descriptors128(__global float * descriptors, int descriptors_step) void normalize_descriptors128(__global float * descriptors, int descriptors_step)
...@@ -1288,22 +1433,10 @@ __kernel ...@@ -1288,22 +1433,10 @@ __kernel
sqDesc[get_local_id(0)] = lookup * lookup; sqDesc[get_local_id(0)] = lookup * lookup;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < 64) reduce_sum128(sqDesc, get_local_id(0));
sqDesc[get_local_id(0)] += sqDesc[get_local_id(0) + 64];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// reduction to get total
if (get_local_id(0) < 32)
{
volatile __local float* smem = sqDesc;
smem[get_local_id(0)] += smem[get_local_id(0) + 32];
smem[get_local_id(0)] += smem[get_local_id(0) + 16];
smem[get_local_id(0)] += smem[get_local_id(0) + 8];
smem[get_local_id(0)] += smem[get_local_id(0) + 4];
smem[get_local_id(0)] += smem[get_local_id(0) + 2];
smem[get_local_id(0)] += smem[get_local_id(0) + 1];
}
// compute length (square root) // compute length (square root)
volatile __local float len; volatile __local float len;
...@@ -1329,18 +1462,9 @@ __kernel ...@@ -1329,18 +1462,9 @@ __kernel
sqDesc[get_local_id(0)] = lookup * lookup; sqDesc[get_local_id(0)] = lookup * lookup;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// reduction to get total
if (get_local_id(0) < 32)
{
volatile __local float* smem = sqDesc;
smem[get_local_id(0)] += smem[get_local_id(0) + 32]; reduce_sum64(sqDesc, get_local_id(0));
smem[get_local_id(0)] += smem[get_local_id(0) + 16]; barrier(CLK_LOCAL_MEM_FENCE);
smem[get_local_id(0)] += smem[get_local_id(0) + 8];
smem[get_local_id(0)] += smem[get_local_id(0) + 4];
smem[get_local_id(0)] += smem[get_local_id(0) + 2];
smem[get_local_id(0)] += smem[get_local_id(0) + 1];
}
// compute length (square root) // compute length (square root)
volatile __local float len; volatile __local float len;
......
...@@ -75,7 +75,7 @@ namespace cv ...@@ -75,7 +75,7 @@ namespace cv
} }
static inline int divUp(size_t total, size_t grain) static inline size_t divUp(size_t total, size_t grain)
{ {
return (total + grain - 1) / grain; return (total + grain - 1) / grain;
} }
......
...@@ -144,9 +144,10 @@ PARAM_TEST_CASE(SURF, HessianThreshold, Octaves, OctaveLayers, Extended, Upright ...@@ -144,9 +144,10 @@ PARAM_TEST_CASE(SURF, HessianThreshold, Octaves, OctaveLayers, Extended, Upright
} }
}; };
TEST_P(SURF, DISABLED_Detector) TEST_P(SURF, Detector)
{ {
cv::Mat image = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "shared/fruits.png", cv::IMREAD_GRAYSCALE); // the data path should be opencv/samples
cv::Mat image = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "c/fruits.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty()); ASSERT_FALSE(image.empty());
cv::ocl::SURF_OCL surf; cv::ocl::SURF_OCL surf;
...@@ -179,7 +180,7 @@ TEST_P(SURF, DISABLED_Detector) ...@@ -179,7 +180,7 @@ TEST_P(SURF, DISABLED_Detector)
TEST_P(SURF, DISABLED_Descriptor) TEST_P(SURF, DISABLED_Descriptor)
{ {
cv::Mat image = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "shared/fruits.png", cv::IMREAD_GRAYSCALE); cv::Mat image = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "c/fruits.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty()); ASSERT_FALSE(image.empty());
cv::ocl::SURF_OCL surf; cv::ocl::SURF_OCL surf;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment