Commit 504008db authored by yao's avatar yao

Fix ocl::bruteforcematcher crash on Intel OCL

parent 620c6994
...@@ -51,7 +51,6 @@ using namespace cv; ...@@ -51,7 +51,6 @@ using namespace cv;
using namespace cv::ocl; using namespace cv::ocl;
using namespace std; using namespace std;
using namespace std;
namespace cv namespace cv
{ {
namespace ocl namespace ocl
...@@ -62,7 +61,7 @@ namespace cv ...@@ -62,7 +61,7 @@ namespace cv
} }
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask, void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, int distType) const oclMat &trainIdx, const oclMat &distance, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
...@@ -77,7 +76,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat ...@@ -77,7 +76,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( smemSize, (void *)NULL));
...@@ -103,7 +102,7 @@ void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int ...@@ -103,7 +102,7 @@ void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int
} }
template < int BLOCK_SIZE, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void match(const oclMat &query, const oclMat &train, const oclMat &mask, void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, int distType) const oclMat &trainIdx, const oclMat &distance, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
...@@ -117,7 +116,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &mask, ...@@ -117,7 +116,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &mask,
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( smemSize, (void *)NULL));
...@@ -143,7 +142,7 @@ void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const o ...@@ -143,7 +142,7 @@ void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const o
//radius_matchUnrolledCached //radius_matchUnrolledCached
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
...@@ -159,7 +158,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist ...@@ -159,7 +158,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance )); args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
...@@ -183,7 +182,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist ...@@ -183,7 +182,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
//radius_match //radius_match
template < int BLOCK_SIZE, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
...@@ -198,7 +197,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c ...@@ -198,7 +197,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance )); args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
...@@ -472,7 +471,7 @@ void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxD ...@@ -472,7 +471,7 @@ void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxD
//knn match Dispatcher //knn match Dispatcher
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask, void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, int distType) const oclMat &trainIdx, const oclMat &distance, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
...@@ -487,7 +486,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl ...@@ -487,7 +486,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( smemSize, (void *)NULL));
...@@ -507,7 +506,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl ...@@ -507,7 +506,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
} }
template < int BLOCK_SIZE, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask, void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, int distType) const oclMat &trainIdx, const oclMat &distance, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
...@@ -521,7 +520,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask, ...@@ -521,7 +520,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( smemSize, (void *)NULL));
...@@ -540,7 +539,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask, ...@@ -540,7 +539,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
} }
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType) void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
...@@ -554,7 +553,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat ...@@ -554,7 +553,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data ));
args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( smemSize, (void *)NULL));
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
...@@ -573,7 +572,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat ...@@ -573,7 +572,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
} }
template < int BLOCK_SIZE, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType) void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
...@@ -586,7 +585,7 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask, ...@@ -586,7 +585,7 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask,
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data ));
args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( smemSize, (void *)NULL));
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
...@@ -691,7 +690,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o ...@@ -691,7 +690,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o
} }
} }
static void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType) void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
{ {
findKnnMatch<256>(k, trainIdx, distance, allDist, distType); findKnnMatch<256>(k, trainIdx, distance, allDist, distType);
} }
...@@ -1007,6 +1006,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat &trainIdx, cons ...@@ -1007,6 +1006,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat &trainIdx, cons
void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, const oclMat &train, vector<DMatch> &matches, const oclMat &mask) void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, const oclMat &train, vector<DMatch> &matches, const oclMat &mask)
{ {
assert(mask.empty()); // mask is not supported at the moment
oclMat trainIdx, distance; oclMat trainIdx, distance;
matchSingle(query, train, trainIdx, distance, mask); matchSingle(query, train, trainIdx, distance, mask);
matchDownload(trainIdx, distance, matches); matchDownload(trainIdx, distance, matches);
...@@ -1696,4 +1696,6 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, vecto ...@@ -1696,4 +1696,6 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, vecto
oclMat trainIdx, imgIdx, distance, nMatches; oclMat trainIdx, imgIdx, distance, nMatches;
radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks); radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult); radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
} }
\ No newline at end of file
...@@ -3,14 +3,16 @@ ...@@ -3,14 +3,16 @@
int bit1Count(float x) int bit1Count(float x)
{ {
int c = 0; int c = 0;
int ix = (int)x; int ix = (int)x;
for (int i = 0 ; i < 32 ; i++)
{ for (int i = 0 ; i < 32 ; i++)
c += ix & 0x1; {
ix >>= 1; c += ix & 0x1;
} ix >>= 1;
return (float)c; }
return (float)c;
} }
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size /* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
local size: dim0 is block_size, dim1 is block_size. local size: dim0 is block_size, dim1 is block_size.
...@@ -18,7 +20,7 @@ local size: dim0 is block_size, dim1 is block_size. ...@@ -18,7 +20,7 @@ local size: dim0 is block_size, dim1 is block_size.
__kernel void BruteForceMatch_UnrollMatch( __kernel void BruteForceMatch_UnrollMatch(
__global float *query, __global float *query,
__global float *train, __global float *train,
__global float *mask, //__global float *mask,
__global int *bestTrainIdx, __global int *bestTrainIdx,
__global float *bestDistance, __global float *bestDistance,
__local float *sharebuffer, __local float *sharebuffer,
...@@ -30,113 +32,122 @@ __kernel void BruteForceMatch_UnrollMatch( ...@@ -30,113 +32,122 @@ __kernel void BruteForceMatch_UnrollMatch(
int train_cols, int train_cols,
int step, int step,
int distType int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
const int groupidx = get_group_id(0); const int groupidx = get_group_id(0);
__local float *s_query = sharebuffer; __local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * max_desc_len; __local float *s_train = sharebuffer + block_size * max_desc_len;
int queryIdx = groupidx * block_size + lidy; int queryIdx = groupidx * block_size + lidy;
// load the query into local memory.
for (int i = 0 ; i < max_desc_len / block_size; i ++) // load the query into local memory.
{ for (int i = 0 ; i < max_desc_len / block_size; i ++)
int loadx = lidx + i * block_size; {
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; int loadx = lidx + i * block_size;
} s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
}
float myBestDistance = MAX_FLOAT;
int myBestTrainIdx = -1; float myBestDistance = MAX_FLOAT;
int myBestTrainIdx = -1;
// loopUnrolledCached to find the best trainIdx and best distance.
volatile int imgIdx = 0; // loopUnrolledCached to find the best trainIdx and best distance.
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) volatile int imgIdx = 0;
{
float result = 0; for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
for (int i = 0 ; i < max_desc_len / block_size ; i++) {
{ float result = 0;
//load a block_size * block_size block into local train.
const int loadx = lidx + i * block_size; for (int i = 0 ; i < max_desc_len / block_size ; i++)
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; {
//load a block_size * block_size block into local train.
//synchronize to make sure each elem for reduceIteration in share memory is written already. const int loadx = lidx + i * block_size;
barrier(CLK_LOCAL_MEM_FENCE); s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to //synchronize to make sure each elem for reduceIteration in share memory is written already.
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ barrier(CLK_LOCAL_MEM_FENCE);
switch(distType) /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
{ sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
case 0:
for (int j = 0 ; j < block_size ; j++) switch (distType)
{ {
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); case 0:
}
break; for (int j = 0 ; j < block_size ; j++)
case 1: {
for (int j = 0 ; j < block_size ; j++) result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
{ }
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr; break;
} case 1:
break;
case 2: for (int j = 0 ; j < block_size ; j++)
for (int j = 0 ; j < block_size ; j++) {
{ float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); result += qr * qr;
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); }
}
break; break;
} case 2:
barrier(CLK_LOCAL_MEM_FENCE); for (int j = 0 ; j < block_size ; j++)
} {
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
int trainIdx = t * block_size + lidx; result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
{ break;
//bestImgIdx = imgIdx; }
myBestDistance = result;
myBestTrainIdx = trainIdx; barrier(CLK_LOCAL_MEM_FENCE);
} }
}
int trainIdx = t * block_size + lidx;
barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float*)(sharebuffer); if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
__local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); {
//bestImgIdx = imgIdx;
//find BestMatch myBestDistance = result;
s_distance += lidy * block_size; myBestTrainIdx = trainIdx;
s_trainIdx += lidy * block_size; }
s_distance[lidx] = myBestDistance; }
s_trainIdx[lidx] = myBestTrainIdx;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE); __local float *s_distance = (__local float *)(sharebuffer);
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
//reduce -- now all reduce implement in each threads.
for (int k = 0 ; k < block_size; k++) //find BestMatch
{ s_distance += lidy * block_size;
if (myBestDistance > s_distance[k]) s_trainIdx += lidy * block_size;
{ s_distance[lidx] = myBestDistance;
myBestDistance = s_distance[k]; s_trainIdx[lidx] = myBestTrainIdx;
myBestTrainIdx = s_trainIdx[k];
} barrier(CLK_LOCAL_MEM_FENCE);
}
//reduce -- now all reduce implement in each threads.
if (queryIdx < query_rows && lidx == 0) for (int k = 0 ; k < block_size; k++)
{ {
bestTrainIdx[queryIdx] = myBestTrainIdx; if (myBestDistance > s_distance[k])
bestDistance[queryIdx] = myBestDistance; {
} myBestDistance = s_distance[k];
myBestTrainIdx = s_trainIdx[k];
}
}
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
} }
__kernel void BruteForceMatch_Match( __kernel void BruteForceMatch_Match(
__global float *query, __global float *query,
__global float *train, __global float *train,
__global float *mask, //__global float *mask,
__global int *bestTrainIdx, __global int *bestTrainIdx,
__global float *bestDistance, __global float *bestDistance,
__local float *sharebuffer, __local float *sharebuffer,
...@@ -147,108 +158,115 @@ __kernel void BruteForceMatch_Match( ...@@ -147,108 +158,115 @@ __kernel void BruteForceMatch_Match(
int train_cols, int train_cols,
int step, int step,
int distType int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
const int groupidx = get_group_id(0); const int groupidx = get_group_id(0);
const int queryIdx = groupidx * block_size + lidy; const int queryIdx = groupidx * block_size + lidy;
float myBestDistance = MAX_FLOAT; float myBestDistance = MAX_FLOAT;
int myBestTrainIdx = -1; int myBestTrainIdx = -1;
__local float *s_query = sharebuffer; __local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size; __local float *s_train = sharebuffer + block_size * block_size;
// loop // loop
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{ {
//Dist dist; //Dist dist;
float result = 0; float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
{ for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
const int loadx = lidx + i * block_size; {
//load query and train into local memory const int loadx = lidx + i * block_size;
s_query[lidy * block_size + lidx] = 0; //load query and train into local memory
s_train[lidx * block_size + lidy] = 0; s_query[lidy * block_size + lidx] = 0;
s_train[lidx * block_size + lidy] = 0;
if (loadx < query_cols)
{ if (loadx < query_cols)
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; {
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
} s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
switch(distType)
{ switch (distType)
case 0: {
for (int j = 0 ; j < block_size ; j++) case 0:
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); for (int j = 0 ; j < block_size ; j++)
} {
break; result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
case 1: }
for (int j = 0 ; j < block_size ; j++)
{ break;
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; case 1:
result += qr * qr;
} for (int j = 0 ; j < block_size ; j++)
break; {
case 2: float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
for (int j = 0 ; j < block_size ; j++) result += qr * qr;
{ }
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]); break;
} case 2:
break;
} for (int j = 0 ; j < block_size ; j++)
{
barrier(CLK_LOCAL_MEM_FENCE); //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
} result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
}
const int trainIdx = t * block_size + lidx;
break;
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) }
{
//myBestImgidx = imgIdx; barrier(CLK_LOCAL_MEM_FENCE);
myBestDistance = result; }
myBestTrainIdx = trainIdx;
} const int trainIdx = t * block_size + lidx;
}
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
barrier(CLK_LOCAL_MEM_FENCE); {
//myBestImgidx = imgIdx;
__local float *s_distance = (__local float *)sharebuffer; myBestDistance = result;
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); myBestTrainIdx = trainIdx;
}
//findBestMatch }
s_distance += lidy * block_size;
s_trainIdx += lidy * block_size; barrier(CLK_LOCAL_MEM_FENCE);
s_distance[lidx] = myBestDistance;
s_trainIdx[lidx] = myBestTrainIdx; __local float *s_distance = (__local float *)sharebuffer;
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
barrier(CLK_LOCAL_MEM_FENCE);
//findBestMatch
//reduce -- now all reduce implement in each threads. s_distance += lidy * block_size;
for (int k = 0 ; k < block_size; k++) s_trainIdx += lidy * block_size;
{ s_distance[lidx] = myBestDistance;
if (myBestDistance > s_distance[k]) s_trainIdx[lidx] = myBestTrainIdx;
{
myBestDistance = s_distance[k]; barrier(CLK_LOCAL_MEM_FENCE);
myBestTrainIdx = s_trainIdx[k];
} //reduce -- now all reduce implement in each threads.
} for (int k = 0 ; k < block_size; k++)
{
if (queryIdx < query_rows && lidx == 0) if (myBestDistance > s_distance[k])
{ {
bestTrainIdx[queryIdx] = myBestTrainIdx; myBestDistance = s_distance[k];
bestDistance[queryIdx] = myBestDistance; myBestTrainIdx = s_trainIdx[k];
} }
}
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
} }
//radius_unrollmatch //radius_unrollmatch
...@@ -256,7 +274,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( ...@@ -256,7 +274,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
__global float *query, __global float *query,
__global float *train, __global float *train,
float maxDistance, float maxDistance,
__global float *mask, //__global float *mask,
__global int *bestTrainIdx, __global int *bestTrainIdx,
__global float *bestDistance, __global float *bestDistance,
__global int *nMatches, __global int *nMatches,
...@@ -271,71 +289,78 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( ...@@ -271,71 +289,78 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
int step, int step,
int ostep, int ostep,
int distType int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
const int groupidx = get_group_id(0); const int groupidx = get_group_id(0);
const int groupidy = get_group_id(1); const int groupidy = get_group_id(1);
const int queryIdx = groupidy * block_size + lidy; const int queryIdx = groupidy * block_size + lidy;
const int trainIdx = groupidx * block_size + lidx; const int trainIdx = groupidx * block_size + lidx;
__local float *s_query = sharebuffer; __local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size; __local float *s_train = sharebuffer + block_size * block_size;
float result = 0; float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; ++i)
{ for (int i = 0 ; i < max_desc_len / block_size ; ++i)
//load a block_size * block_size block into local train. {
const int loadx = lidx + i * block_size; //load a block_size * block_size block into local train.
const int loadx = lidx + i * block_size;
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE); //synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE);
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
switch(distType)
{ switch (distType)
case 0: {
for (int j = 0 ; j < block_size ; ++j) case 0:
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); for (int j = 0 ; j < block_size ; ++j)
} {
break; result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
case 1: }
for (int j = 0 ; j < block_size ; ++j)
{ break;
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; case 1:
result += qr * qr;
} for (int j = 0 ; j < block_size ; ++j)
break; {
case 2: float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
for (int j = 0 ; j < block_size ; ++j) result += qr * qr;
{ }
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
} break;
break; case 2:
}
for (int j = 0 ; j < block_size ; ++j)
barrier(CLK_LOCAL_MEM_FENCE); {
} result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
{ break;
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); }
if(ind < bestTrainIdx_cols) barrier(CLK_LOCAL_MEM_FENCE);
{ }
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; {
} unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
}
if (ind < bestTrainIdx_cols)
{
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
}
}
} }
//radius_match //radius_match
...@@ -343,7 +368,7 @@ __kernel void BruteForceMatch_RadiusMatch( ...@@ -343,7 +368,7 @@ __kernel void BruteForceMatch_RadiusMatch(
__global float *query, __global float *query,
__global float *train, __global float *train,
float maxDistance, float maxDistance,
__global float *mask, //__global float *mask,
__global int *bestTrainIdx, __global int *bestTrainIdx,
__global float *bestDistance, __global float *bestDistance,
__global int *nMatches, __global int *nMatches,
...@@ -357,78 +382,85 @@ __kernel void BruteForceMatch_RadiusMatch( ...@@ -357,78 +382,85 @@ __kernel void BruteForceMatch_RadiusMatch(
int step, int step,
int ostep, int ostep,
int distType int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
const int groupidx = get_group_id(0); const int groupidx = get_group_id(0);
const int groupidy = get_group_id(1); const int groupidy = get_group_id(1);
const int queryIdx = groupidy * block_size + lidy; const int queryIdx = groupidy * block_size + lidy;
const int trainIdx = groupidx * block_size + lidx; const int trainIdx = groupidx * block_size + lidx;
__local float *s_query = sharebuffer; __local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size; __local float *s_train = sharebuffer + block_size * block_size;
float result = 0; float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
{ for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
//load a block_size * block_size block into local train. {
const int loadx = lidx + i * block_size; //load a block_size * block_size block into local train.
const int loadx = lidx + i * block_size;
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE); //synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE);
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
switch(distType)
{ switch (distType)
case 0: {
for (int j = 0 ; j < block_size ; ++j) case 0:
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); for (int j = 0 ; j < block_size ; ++j)
} {
break; result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
case 1: }
for (int j = 0 ; j < block_size ; ++j)
{ break;
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; case 1:
result += qr * qr;
} for (int j = 0 ; j < block_size ; ++j)
break; {
case 2: float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
for (int j = 0 ; j < block_size ; ++j) result += qr * qr;
{ }
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
} break;
break; case 2:
}
for (int j = 0 ; j < block_size ; ++j)
barrier(CLK_LOCAL_MEM_FENCE); {
} result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
{ break;
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); }
if(ind < bestTrainIdx_cols) barrier(CLK_LOCAL_MEM_FENCE);
{ }
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; {
} unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
}
if (ind < bestTrainIdx_cols)
{
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
}
}
} }
__kernel void BruteForceMatch_knnUnrollMatch( __kernel void BruteForceMatch_knnUnrollMatch(
__global float *query, __global float *query,
__global float *train, __global float *train,
__global float *mask, //__global float *mask,
__global int2 *bestTrainIdx, __global int2 *bestTrainIdx,
__global float2 *bestDistance, __global float2 *bestDistance,
__local float *sharebuffer, __local float *sharebuffer,
...@@ -440,169 +472,178 @@ __kernel void BruteForceMatch_knnUnrollMatch( ...@@ -440,169 +472,178 @@ __kernel void BruteForceMatch_knnUnrollMatch(
int train_cols, int train_cols,
int step, int step,
int distType int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
const int groupidx = get_group_id(0); const int groupidx = get_group_id(0);
const int queryIdx = groupidx * block_size + lidy; const int queryIdx = groupidx * block_size + lidy;
local float *s_query = sharebuffer; local float *s_query = sharebuffer;
local float *s_train = sharebuffer + block_size * max_desc_len; local float *s_train = sharebuffer + block_size * max_desc_len;
// load the query into local memory. // load the query into local memory.
for (int i = 0 ; i < max_desc_len / block_size; i ++) for (int i = 0 ; i < max_desc_len / block_size; i ++)
{ {
int loadx = lidx + i * block_size; int loadx = lidx + i * block_size;
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
} }
float myBestDistance1 = MAX_FLOAT; float myBestDistance1 = MAX_FLOAT;
float myBestDistance2 = MAX_FLOAT; float myBestDistance2 = MAX_FLOAT;
int myBestTrainIdx1 = -1; int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1; int myBestTrainIdx2 = -1;
//loopUnrolledCached //loopUnrolledCached
volatile int imgIdx = 0; volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{ for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
float result = 0; {
for (int i = 0 ; i < max_desc_len / block_size ; i++) float result = 0;
{
const int loadX = lidx + i * block_size; for (int i = 0 ; i < max_desc_len / block_size ; i++)
//load a block_size * block_size block into local train. {
const int loadx = lidx + i * block_size; const int loadX = lidx + i * block_size;
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; //load a block_size * block_size block into local train.
const int loadx = lidx + i * block_size;
//synchronize to make sure each elem for reduceIteration in share memory is written already. s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
barrier(CLK_LOCAL_MEM_FENCE);
//synchronize to make sure each elem for reduceIteration in share memory is written already.
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to barrier(CLK_LOCAL_MEM_FENCE);
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
switch(distType) sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
{
case 0: switch (distType)
for (int j = 0 ; j < block_size ; j++) {
{ case 0:
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
} for (int j = 0 ; j < block_size ; j++)
break; {
case 1: result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
for (int j = 0 ; j < block_size ; j++) }
{
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; break;
result += qr * qr; case 1:
}
break; for (int j = 0 ; j < block_size ; j++)
case 2: {
for (int j = 0 ; j < block_size ; j++) float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
{ result += qr * qr;
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); }
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
} break;
break; case 2:
}
for (int j = 0 ; j < block_size ; j++)
barrier(CLK_LOCAL_MEM_FENCE); {
} //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
const int trainIdx = t * block_size + lidx; }
if (queryIdx < query_rows && trainIdx < train_rows) break;
{ }
if (result < myBestDistance1)
{ barrier(CLK_LOCAL_MEM_FENCE);
myBestDistance2 = myBestDistance1; }
myBestTrainIdx2 = myBestTrainIdx1;
myBestDistance1 = result; const int trainIdx = t * block_size + lidx;
myBestTrainIdx1 = trainIdx;
} if (queryIdx < query_rows && trainIdx < train_rows)
else if (result < myBestDistance2) {
{ if (result < myBestDistance1)
myBestDistance2 = result; {
myBestTrainIdx2 = trainIdx; myBestDistance2 = myBestDistance1;
} myBestTrainIdx2 = myBestTrainIdx1;
} myBestDistance1 = result;
} myBestTrainIdx1 = trainIdx;
}
barrier(CLK_LOCAL_MEM_FENCE); else if (result < myBestDistance2)
{
local float *s_distance = (local float *)sharebuffer; myBestDistance2 = result;
local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size); myBestTrainIdx2 = trainIdx;
}
// find BestMatch }
s_distance += lidy * block_size; }
s_trainIdx += lidy * block_size;
barrier(CLK_LOCAL_MEM_FENCE);
s_distance[lidx] = myBestDistance1;
s_trainIdx[lidx] = myBestTrainIdx1; local float *s_distance = (local float *)sharebuffer;
local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
float bestDistance1 = MAX_FLOAT;
float bestDistance2 = MAX_FLOAT; // find BestMatch
int bestTrainIdx1 = -1; s_distance += lidy * block_size;
int bestTrainIdx2 = -1; s_trainIdx += lidy * block_size;
barrier(CLK_LOCAL_MEM_FENCE);
s_distance[lidx] = myBestDistance1;
if (lidx == 0) s_trainIdx[lidx] = myBestTrainIdx1;
{
for (int i = 0 ; i < block_size ; i++) float bestDistance1 = MAX_FLOAT;
{ float bestDistance2 = MAX_FLOAT;
float val = s_distance[i]; int bestTrainIdx1 = -1;
if (val < bestDistance1) int bestTrainIdx2 = -1;
{ barrier(CLK_LOCAL_MEM_FENCE);
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1; if (lidx == 0)
{
bestDistance1 = val; for (int i = 0 ; i < block_size ; i++)
bestTrainIdx1 = s_trainIdx[i]; {
} float val = s_distance[i];
else if (val < bestDistance2)
{ if (val < bestDistance1)
bestDistance2 = val; {
bestTrainIdx2 = s_trainIdx[i]; bestDistance2 = bestDistance1;
} bestTrainIdx2 = bestTrainIdx1;
}
} bestDistance1 = val;
bestTrainIdx1 = s_trainIdx[i];
barrier(CLK_LOCAL_MEM_FENCE); }
else if (val < bestDistance2)
s_distance[lidx] = myBestDistance2; {
s_trainIdx[lidx] = myBestTrainIdx2; bestDistance2 = val;
bestTrainIdx2 = s_trainIdx[i];
barrier(CLK_LOCAL_MEM_FENCE); }
}
if (lidx == 0) }
{
for (int i = 0 ; i < block_size ; i++) barrier(CLK_LOCAL_MEM_FENCE);
{
float val = s_distance[i]; s_distance[lidx] = myBestDistance2;
s_trainIdx[lidx] = myBestTrainIdx2;
if (val < bestDistance2)
{ barrier(CLK_LOCAL_MEM_FENCE);
bestDistance2 = val;
bestTrainIdx2 = s_trainIdx[i]; if (lidx == 0)
} {
} for (int i = 0 ; i < block_size ; i++)
} {
float val = s_distance[i];
myBestDistance1 = bestDistance1;
myBestDistance2 = bestDistance2; if (val < bestDistance2)
{
myBestTrainIdx1 = bestTrainIdx1; bestDistance2 = val;
myBestTrainIdx2 = bestTrainIdx2; bestTrainIdx2 = s_trainIdx[i];
}
if (queryIdx < query_rows && lidx == 0) }
{ }
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); myBestDistance1 = bestDistance1;
} myBestDistance2 = bestDistance2;
myBestTrainIdx1 = bestTrainIdx1;
myBestTrainIdx2 = bestTrainIdx2;
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
}
} }
__kernel void BruteForceMatch_knnMatch( __kernel void BruteForceMatch_knnMatch(
__global float *query, __global float *query,
__global float *train, __global float *train,
__global float *mask, //__global float *mask,
__global int2 *bestTrainIdx, __global int2 *bestTrainIdx,
__global float2 *bestDistance, __global float2 *bestDistance,
__local float *sharebuffer, __local float *sharebuffer,
...@@ -613,166 +654,174 @@ __kernel void BruteForceMatch_knnMatch( ...@@ -613,166 +654,174 @@ __kernel void BruteForceMatch_knnMatch(
int train_cols, int train_cols,
int step, int step,
int distType int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
const int groupidx = get_group_id(0); const int groupidx = get_group_id(0);
const int queryIdx = groupidx * block_size + lidy; const int queryIdx = groupidx * block_size + lidy;
local float *s_query = sharebuffer; local float *s_query = sharebuffer;
local float *s_train = sharebuffer + block_size * block_size; local float *s_train = sharebuffer + block_size * block_size;
float myBestDistance1 = MAX_FLOAT; float myBestDistance1 = MAX_FLOAT;
float myBestDistance2 = MAX_FLOAT; float myBestDistance2 = MAX_FLOAT;
int myBestTrainIdx1 = -1; int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1; int myBestTrainIdx2 = -1;
//loop //loop
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{ {
float result = 0.0f; float result = 0.0f;
for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++)
{ for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
const int loadx = lidx + i * block_size; {
//load query and train into local memory const int loadx = lidx + i * block_size;
s_query[lidy * block_size + lidx] = 0; //load query and train into local memory
s_train[lidx * block_size + lidy] = 0; s_query[lidy * block_size + lidx] = 0;
s_train[lidx * block_size + lidy] = 0;
if (loadx < query_cols)
{ if (loadx < query_cols)
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; {
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
} s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
switch(distType)
{ switch (distType)
case 0: {
for (int j = 0 ; j < block_size ; j++) case 0:
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); for (int j = 0 ; j < block_size ; j++)
} {
break; result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
case 1: }
for (int j = 0 ; j < block_size ; j++)
{ break;
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; case 1:
result += qr * qr;
} for (int j = 0 ; j < block_size ; j++)
break; {
case 2: float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
for (int j = 0 ; j < block_size ; j++) result += qr * qr;
{ }
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]); break;
} case 2:
break;
} for (int j = 0 ; j < block_size ; j++)
{
barrier(CLK_LOCAL_MEM_FENCE); //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
} result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
}
const int trainIdx = t * block_size + lidx;
break;
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/) }
{
if (result < myBestDistance1) barrier(CLK_LOCAL_MEM_FENCE);
{ }
myBestDistance2 = myBestDistance1;
myBestTrainIdx2 = myBestTrainIdx1; const int trainIdx = t * block_size + lidx;
myBestDistance1 = result;
myBestTrainIdx1 = trainIdx; if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
} {
else if (result < myBestDistance2) if (result < myBestDistance1)
{ {
myBestDistance2 = result; myBestDistance2 = myBestDistance1;
myBestTrainIdx2 = trainIdx; myBestTrainIdx2 = myBestTrainIdx1;
} myBestDistance1 = result;
} myBestTrainIdx1 = trainIdx;
} }
else if (result < myBestDistance2)
barrier(CLK_LOCAL_MEM_FENCE); {
myBestDistance2 = result;
__local float *s_distance = (__local float *)sharebuffer; myBestTrainIdx2 = trainIdx;
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); }
}
//findBestMatch }
s_distance += lidy * block_size;
s_trainIdx += lidy * block_size; barrier(CLK_LOCAL_MEM_FENCE);
s_distance[lidx] = myBestDistance1; __local float *s_distance = (__local float *)sharebuffer;
s_trainIdx[lidx] = myBestTrainIdx1; __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
float bestDistance1 = MAX_FLOAT; //findBestMatch
float bestDistance2 = MAX_FLOAT; s_distance += lidy * block_size;
int bestTrainIdx1 = -1; s_trainIdx += lidy * block_size;
int bestTrainIdx2 = -1;
barrier(CLK_LOCAL_MEM_FENCE); s_distance[lidx] = myBestDistance1;
s_trainIdx[lidx] = myBestTrainIdx1;
if (lidx == 0)
{ float bestDistance1 = MAX_FLOAT;
for (int i = 0 ; i < block_size ; i++) float bestDistance2 = MAX_FLOAT;
{ int bestTrainIdx1 = -1;
float val = s_distance[i]; int bestTrainIdx2 = -1;
if (val < bestDistance1) barrier(CLK_LOCAL_MEM_FENCE);
{
bestDistance2 = bestDistance1; if (lidx == 0)
bestTrainIdx2 = bestTrainIdx1; {
for (int i = 0 ; i < block_size ; i++)
bestDistance1 = val; {
bestTrainIdx1 = s_trainIdx[i]; float val = s_distance[i];
}
else if (val < bestDistance2) if (val < bestDistance1)
{ {
bestDistance2 = val; bestDistance2 = bestDistance1;
bestTrainIdx2 = s_trainIdx[i]; bestTrainIdx2 = bestTrainIdx1;
}
} bestDistance1 = val;
} bestTrainIdx1 = s_trainIdx[i];
}
barrier(CLK_LOCAL_MEM_FENCE); else if (val < bestDistance2)
{
s_distance[lidx] = myBestDistance2; bestDistance2 = val;
s_trainIdx[lidx] = myBestTrainIdx2; bestTrainIdx2 = s_trainIdx[i];
}
barrier(CLK_LOCAL_MEM_FENCE); }
}
if (lidx == 0)
{ barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 0 ; i < block_size ; i++)
{ s_distance[lidx] = myBestDistance2;
float val = s_distance[i]; s_trainIdx[lidx] = myBestTrainIdx2;
if (val < bestDistance2) barrier(CLK_LOCAL_MEM_FENCE);
{
bestDistance2 = val; if (lidx == 0)
bestTrainIdx2 = s_trainIdx[i]; {
} for (int i = 0 ; i < block_size ; i++)
} {
} float val = s_distance[i];
myBestDistance1 = bestDistance1; if (val < bestDistance2)
myBestDistance2 = bestDistance2; {
bestDistance2 = val;
myBestTrainIdx1 = bestTrainIdx1; bestTrainIdx2 = s_trainIdx[i];
myBestTrainIdx2 = bestTrainIdx2; }
}
if (queryIdx < query_rows && lidx == 0) }
{
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); myBestDistance1 = bestDistance1;
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); myBestDistance2 = bestDistance2;
}
myBestTrainIdx1 = bestTrainIdx1;
myBestTrainIdx2 = bestTrainIdx2;
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
}
} }
kernel void BruteForceMatch_calcDistanceUnrolled( kernel void BruteForceMatch_calcDistanceUnrolled(
__global float *query, __global float *query,
__global float *train, __global float *train,
__global float *mask, //__global float *mask,
__global float *allDist, __global float *allDist,
__local float *sharebuffer, __local float *sharebuffer,
int block_size, int block_size,
...@@ -784,13 +833,13 @@ kernel void BruteForceMatch_calcDistanceUnrolled( ...@@ -784,13 +833,13 @@ kernel void BruteForceMatch_calcDistanceUnrolled(
int step, int step,
int distType) int distType)
{ {
/* Todo */ /* Todo */
} }
kernel void BruteForceMatch_calcDistance( kernel void BruteForceMatch_calcDistance(
__global float *query, __global float *query,
__global float *train, __global float *train,
__global float *mask, //__global float *mask,
__global float *allDist, __global float *allDist,
__local float *sharebuffer, __local float *sharebuffer,
int block_size, int block_size,
...@@ -801,16 +850,16 @@ kernel void BruteForceMatch_calcDistance( ...@@ -801,16 +850,16 @@ kernel void BruteForceMatch_calcDistance(
int step, int step,
int distType) int distType)
{ {
/* Todo */ /* Todo */
} }
kernel void BruteForceMatch_findBestMatch( kernel void BruteForceMatch_findBestMatch(
__global float *allDist, __global float *allDist,
__global int *bestTrainIdx, __global int *bestTrainIdx,
__global float *bestDistance, __global float *bestDistance,
int k, int k,
int block_size int block_size
) )
{ {
/* Todo */ /* Todo */
} }
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment