Commit 504008db authored by yao

Fix ocl::bruteforcematcher crash on Intel OCL

parent 620c6994
...@@ -51,7 +51,6 @@ using namespace cv; ...@@ -51,7 +51,6 @@ using namespace cv;
using namespace cv::ocl; using namespace cv::ocl;
using namespace std; using namespace std;
using namespace std;
namespace cv namespace cv
{ {
namespace ocl namespace ocl
...@@ -62,7 +61,7 @@ namespace cv ...@@ -62,7 +61,7 @@ namespace cv
} }
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask, void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, int distType) const oclMat &trainIdx, const oclMat &distance, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
...@@ -77,7 +76,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat ...@@ -77,7 +76,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( smemSize, (void *)NULL));
...@@ -103,7 +102,7 @@ void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int ...@@ -103,7 +102,7 @@ void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int
} }
template < int BLOCK_SIZE, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void match(const oclMat &query, const oclMat &train, const oclMat &mask, void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, int distType) const oclMat &trainIdx, const oclMat &distance, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
...@@ -117,7 +116,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &mask, ...@@ -117,7 +116,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &mask,
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( smemSize, (void *)NULL));
...@@ -143,7 +142,7 @@ void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const o ...@@ -143,7 +142,7 @@ void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const o
//radius_matchUnrolledCached //radius_matchUnrolledCached
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
...@@ -159,7 +158,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist ...@@ -159,7 +158,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance )); args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
...@@ -183,7 +182,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist ...@@ -183,7 +182,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
//radius_match //radius_match
template < int BLOCK_SIZE, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask, void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType) const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
...@@ -198,7 +197,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c ...@@ -198,7 +197,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance )); args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
...@@ -472,7 +471,7 @@ void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxD ...@@ -472,7 +471,7 @@ void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxD
//knn match Dispatcher //knn match Dispatcher
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask, void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, int distType) const oclMat &trainIdx, const oclMat &distance, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
...@@ -487,7 +486,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl ...@@ -487,7 +486,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( smemSize, (void *)NULL));
...@@ -507,7 +506,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl ...@@ -507,7 +506,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
} }
template < int BLOCK_SIZE, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask, void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, int distType) const oclMat &trainIdx, const oclMat &distance, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
...@@ -521,7 +520,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask, ...@@ -521,7 +520,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( smemSize, (void *)NULL));
...@@ -540,7 +539,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask, ...@@ -540,7 +539,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
} }
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType) void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
...@@ -554,7 +553,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat ...@@ -554,7 +553,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data ));
args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( smemSize, (void *)NULL));
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
...@@ -573,7 +572,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat ...@@ -573,7 +572,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
} }
template < int BLOCK_SIZE, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType) void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
{ {
cv::ocl::Context *ctx = query.clCxt; cv::ocl::Context *ctx = query.clCxt;
size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1}; size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
...@@ -586,7 +585,7 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask, ...@@ -586,7 +585,7 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask,
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data ));
args.push_back( make_pair( smemSize, (void *)NULL)); args.push_back( make_pair( smemSize, (void *)NULL));
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size )); args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
...@@ -691,7 +690,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o ...@@ -691,7 +690,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o
} }
} }
// Thin dispatcher for the k-NN selection step: forwards k, trainIdx, distance, allDist and
// distType unchanged to findKnnMatch instantiated with the template argument 256.
// Side-by-side diff view: the left column declares the function `static`, the right column does not.
static void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType) void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
{ {
findKnnMatch<256>(k, trainIdx, distance, allDist, distType); findKnnMatch<256>(k, trainIdx, distance, allDist, distType);
} }
...@@ -1007,6 +1006,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat &trainIdx, cons ...@@ -1007,6 +1006,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat &trainIdx, cons
void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, const oclMat &train, vector<DMatch> &matches, const oclMat &mask) void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, const oclMat &train, vector<DMatch> &matches, const oclMat &mask)
{ {
assert(mask.empty()); // mask is not supported at the moment
oclMat trainIdx, distance; oclMat trainIdx, distance;
matchSingle(query, train, trainIdx, distance, mask); matchSingle(query, train, trainIdx, distance, mask);
matchDownload(trainIdx, distance, matches); matchDownload(trainIdx, distance, matches);
...@@ -1697,3 +1697,5 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, vecto ...@@ -1697,3 +1697,5 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, vecto
radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks); radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult); radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
} }
...@@ -5,11 +5,13 @@ int bit1Count(float x) ...@@ -5,11 +5,13 @@ int bit1Count(float x)
{ {
int c = 0; int c = 0;
int ix = (int)x; int ix = (int)x;
for (int i = 0 ; i < 32 ; i++) for (int i = 0 ; i < 32 ; i++)
{ {
c += ix & 0x1; c += ix & 0x1;
ix >>= 1; ix >>= 1;
} }
return (float)c; return (float)c;
} }
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size /* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
...@@ -18,7 +20,7 @@ local size: dim0 is block_size, dim1 is block_size. ...@@ -18,7 +20,7 @@ local size: dim0 is block_size, dim1 is block_size.
__kernel void BruteForceMatch_UnrollMatch( __kernel void BruteForceMatch_UnrollMatch(
__global float *query, __global float *query,
__global float *train, __global float *train,
__global float *mask, //__global float *mask,
__global int *bestTrainIdx, __global int *bestTrainIdx,
__global float *bestDistance, __global float *bestDistance,
__local float *sharebuffer, __local float *sharebuffer,
...@@ -30,7 +32,7 @@ __kernel void BruteForceMatch_UnrollMatch( ...@@ -30,7 +32,7 @@ __kernel void BruteForceMatch_UnrollMatch(
int train_cols, int train_cols,
int step, int step,
int distType int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
...@@ -40,6 +42,7 @@ __kernel void BruteForceMatch_UnrollMatch( ...@@ -40,6 +42,7 @@ __kernel void BruteForceMatch_UnrollMatch(
__local float *s_train = sharebuffer + block_size * max_desc_len; __local float *s_train = sharebuffer + block_size * max_desc_len;
int queryIdx = groupidx * block_size + lidy; int queryIdx = groupidx * block_size + lidy;
// load the query into local memory. // load the query into local memory.
for (int i = 0 ; i < max_desc_len / block_size; i ++) for (int i = 0 ; i < max_desc_len / block_size; i ++)
{ {
...@@ -52,9 +55,11 @@ __kernel void BruteForceMatch_UnrollMatch( ...@@ -52,9 +55,11 @@ __kernel void BruteForceMatch_UnrollMatch(
// loopUnrolledCached to find the best trainIdx and best distance. // loopUnrolledCached to find the best trainIdx and best distance.
volatile int imgIdx = 0; volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{ {
float result = 0; float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; i++) for (int i = 0 ; i < max_desc_len / block_size ; i++)
{ {
//load a block_size * block_size block into local train. //load a block_size * block_size block into local train.
...@@ -67,27 +72,33 @@ __kernel void BruteForceMatch_UnrollMatch( ...@@ -67,27 +72,33 @@ __kernel void BruteForceMatch_UnrollMatch(
/* There are three types in the reducer. The first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which /* There are three types in the reducer. The first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1. */ sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1. */
switch(distType) switch (distType)
{ {
case 0: case 0:
for (int j = 0 ; j < block_size ; j++) for (int j = 0 ; j < block_size ; j++)
{ {
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
} }
break; break;
case 1: case 1:
for (int j = 0 ; j < block_size ; j++) for (int j = 0 ; j < block_size ; j++)
{ {
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr; result += qr * qr;
} }
break; break;
case 2: case 2:
for (int j = 0 ; j < block_size ; j++) for (int j = 0 ; j < block_size ; j++)
{ {
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
} }
break; break;
} }
...@@ -105,8 +116,8 @@ __kernel void BruteForceMatch_UnrollMatch( ...@@ -105,8 +116,8 @@ __kernel void BruteForceMatch_UnrollMatch(
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float*)(sharebuffer); __local float *s_distance = (__local float *)(sharebuffer);
__local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
//find BestMatch //find BestMatch
s_distance += lidy * block_size; s_distance += lidy * block_size;
...@@ -136,7 +147,7 @@ __kernel void BruteForceMatch_UnrollMatch( ...@@ -136,7 +147,7 @@ __kernel void BruteForceMatch_UnrollMatch(
__kernel void BruteForceMatch_Match( __kernel void BruteForceMatch_Match(
__global float *query, __global float *query,
__global float *train, __global float *train,
__global float *mask, //__global float *mask,
__global int *bestTrainIdx, __global int *bestTrainIdx,
__global float *bestDistance, __global float *bestDistance,
__local float *sharebuffer, __local float *sharebuffer,
...@@ -147,7 +158,7 @@ __kernel void BruteForceMatch_Match( ...@@ -147,7 +158,7 @@ __kernel void BruteForceMatch_Match(
int train_cols, int train_cols,
int step, int step,
int distType int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
...@@ -166,6 +177,7 @@ __kernel void BruteForceMatch_Match( ...@@ -166,6 +177,7 @@ __kernel void BruteForceMatch_Match(
{ {
//Dist dist; //Dist dist;
float result = 0; float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
{ {
const int loadx = lidx + i * block_size; const int loadx = lidx + i * block_size;
...@@ -184,27 +196,33 @@ __kernel void BruteForceMatch_Match( ...@@ -184,27 +196,33 @@ __kernel void BruteForceMatch_Match(
/* There are three types in the reducer. The first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which /* There are three types in the reducer. The first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1. */ sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1. */
switch(distType) switch (distType)
{ {
case 0: case 0:
for (int j = 0 ; j < block_size ; j++) for (int j = 0 ; j < block_size ; j++)
{ {
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
} }
break; break;
case 1: case 1:
for (int j = 0 ; j < block_size ; j++) for (int j = 0 ; j < block_size ; j++)
{ {
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr; result += qr * qr;
} }
break; break;
case 2: case 2:
for (int j = 0 ; j < block_size ; j++) for (int j = 0 ; j < block_size ; j++)
{ {
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]); result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
} }
break; break;
} }
...@@ -256,7 +274,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( ...@@ -256,7 +274,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
__global float *query, __global float *query,
__global float *train, __global float *train,
float maxDistance, float maxDistance,
__global float *mask, //__global float *mask,
__global int *bestTrainIdx, __global int *bestTrainIdx,
__global float *bestDistance, __global float *bestDistance,
__global int *nMatches, __global int *nMatches,
...@@ -271,7 +289,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( ...@@ -271,7 +289,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
int step, int step,
int ostep, int ostep,
int distType int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
...@@ -285,6 +303,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( ...@@ -285,6 +303,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
__local float *s_train = sharebuffer + block_size * block_size; __local float *s_train = sharebuffer + block_size * block_size;
float result = 0; float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; ++i) for (int i = 0 ; i < max_desc_len / block_size ; ++i)
{ {
//load a block_size * block_size block into local train. //load a block_size * block_size block into local train.
...@@ -299,26 +318,32 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( ...@@ -299,26 +318,32 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
/* There are three types in the reducer. The first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which /* There are three types in the reducer. The first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1. */ sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1. */
switch(distType) switch (distType)
{ {
case 0: case 0:
for (int j = 0 ; j < block_size ; ++j) for (int j = 0 ; j < block_size ; ++j)
{ {
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
} }
break; break;
case 1: case 1:
for (int j = 0 ; j < block_size ; ++j) for (int j = 0 ; j < block_size ; ++j)
{ {
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr; result += qr * qr;
} }
break; break;
case 2: case 2:
for (int j = 0 ; j < block_size ; ++j) for (int j = 0 ; j < block_size ; ++j)
{ {
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
} }
break; break;
} }
...@@ -329,7 +354,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch( ...@@ -329,7 +354,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
{ {
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
if(ind < bestTrainIdx_cols) if (ind < bestTrainIdx_cols)
{ {
//bestImgIdx = imgIdx; //bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
...@@ -343,7 +368,7 @@ __kernel void BruteForceMatch_RadiusMatch( ...@@ -343,7 +368,7 @@ __kernel void BruteForceMatch_RadiusMatch(
__global float *query, __global float *query,
__global float *train, __global float *train,
float maxDistance, float maxDistance,
__global float *mask, //__global float *mask,
__global int *bestTrainIdx, __global int *bestTrainIdx,
__global float *bestDistance, __global float *bestDistance,
__global int *nMatches, __global int *nMatches,
...@@ -357,7 +382,7 @@ __kernel void BruteForceMatch_RadiusMatch( ...@@ -357,7 +382,7 @@ __kernel void BruteForceMatch_RadiusMatch(
int step, int step,
int ostep, int ostep,
int distType int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
...@@ -371,6 +396,7 @@ __kernel void BruteForceMatch_RadiusMatch( ...@@ -371,6 +396,7 @@ __kernel void BruteForceMatch_RadiusMatch(
__local float *s_train = sharebuffer + block_size * block_size; __local float *s_train = sharebuffer + block_size * block_size;
float result = 0; float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i) for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
{ {
//load a block_size * block_size block into local train. //load a block_size * block_size block into local train.
...@@ -385,26 +411,32 @@ __kernel void BruteForceMatch_RadiusMatch( ...@@ -385,26 +411,32 @@ __kernel void BruteForceMatch_RadiusMatch(
/* There are three types in the reducer. The first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which /* There are three types in the reducer. The first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1. */ sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1. */
switch(distType) switch (distType)
{ {
case 0: case 0:
for (int j = 0 ; j < block_size ; ++j) for (int j = 0 ; j < block_size ; ++j)
{ {
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
} }
break; break;
case 1: case 1:
for (int j = 0 ; j < block_size ; ++j) for (int j = 0 ; j < block_size ; ++j)
{ {
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr; result += qr * qr;
} }
break; break;
case 2: case 2:
for (int j = 0 ; j < block_size ; ++j) for (int j = 0 ; j < block_size ; ++j)
{ {
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
} }
break; break;
} }
...@@ -415,7 +447,7 @@ __kernel void BruteForceMatch_RadiusMatch( ...@@ -415,7 +447,7 @@ __kernel void BruteForceMatch_RadiusMatch(
{ {
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
if(ind < bestTrainIdx_cols) if (ind < bestTrainIdx_cols)
{ {
//bestImgIdx = imgIdx; //bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
...@@ -428,7 +460,7 @@ __kernel void BruteForceMatch_RadiusMatch( ...@@ -428,7 +460,7 @@ __kernel void BruteForceMatch_RadiusMatch(
__kernel void BruteForceMatch_knnUnrollMatch( __kernel void BruteForceMatch_knnUnrollMatch(
__global float *query, __global float *query,
__global float *train, __global float *train,
__global float *mask, //__global float *mask,
__global int2 *bestTrainIdx, __global int2 *bestTrainIdx,
__global float2 *bestDistance, __global float2 *bestDistance,
__local float *sharebuffer, __local float *sharebuffer,
...@@ -440,7 +472,7 @@ __kernel void BruteForceMatch_knnUnrollMatch( ...@@ -440,7 +472,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
int train_cols, int train_cols,
int step, int step,
int distType int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
...@@ -464,9 +496,11 @@ __kernel void BruteForceMatch_knnUnrollMatch( ...@@ -464,9 +496,11 @@ __kernel void BruteForceMatch_knnUnrollMatch(
//loopUnrolledCached //loopUnrolledCached
volatile int imgIdx = 0; volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{ {
float result = 0; float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; i++) for (int i = 0 ; i < max_desc_len / block_size ; i++)
{ {
const int loadX = lidx + i * block_size; const int loadX = lidx + i * block_size;
...@@ -480,27 +514,33 @@ __kernel void BruteForceMatch_knnUnrollMatch( ...@@ -480,27 +514,33 @@ __kernel void BruteForceMatch_knnUnrollMatch(
            /* There are three distance types in the reducer. The first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which /* There are three distance types in the reducer. The first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
            sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1. */ sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1. */
switch(distType) switch (distType)
{ {
case 0: case 0:
for (int j = 0 ; j < block_size ; j++) for (int j = 0 ; j < block_size ; j++)
{ {
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
} }
break; break;
case 1: case 1:
for (int j = 0 ; j < block_size ; j++) for (int j = 0 ; j < block_size ; j++)
{ {
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr; result += qr * qr;
} }
break; break;
case 2: case 2:
for (int j = 0 ; j < block_size ; j++) for (int j = 0 ; j < block_size ; j++)
{ {
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
} }
break; break;
} }
...@@ -549,6 +589,7 @@ __kernel void BruteForceMatch_knnUnrollMatch( ...@@ -549,6 +589,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
for (int i = 0 ; i < block_size ; i++) for (int i = 0 ; i < block_size ; i++)
{ {
float val = s_distance[i]; float val = s_distance[i];
if (val < bestDistance1) if (val < bestDistance1)
{ {
bestDistance2 = bestDistance1; bestDistance2 = bestDistance1;
...@@ -602,7 +643,7 @@ __kernel void BruteForceMatch_knnUnrollMatch( ...@@ -602,7 +643,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
__kernel void BruteForceMatch_knnMatch( __kernel void BruteForceMatch_knnMatch(
__global float *query, __global float *query,
__global float *train, __global float *train,
__global float *mask, //__global float *mask,
__global int2 *bestTrainIdx, __global int2 *bestTrainIdx,
__global float2 *bestDistance, __global float2 *bestDistance,
__local float *sharebuffer, __local float *sharebuffer,
...@@ -613,7 +654,7 @@ __kernel void BruteForceMatch_knnMatch( ...@@ -613,7 +654,7 @@ __kernel void BruteForceMatch_knnMatch(
int train_cols, int train_cols,
int step, int step,
int distType int distType
) )
{ {
const int lidx = get_local_id(0); const int lidx = get_local_id(0);
const int lidy = get_local_id(1); const int lidy = get_local_id(1);
...@@ -632,7 +673,8 @@ __kernel void BruteForceMatch_knnMatch( ...@@ -632,7 +673,8 @@ __kernel void BruteForceMatch_knnMatch(
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{ {
float result = 0.0f; float result = 0.0f;
for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++)
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
{ {
const int loadx = lidx + i * block_size; const int loadx = lidx + i * block_size;
//load query and train into local memory //load query and train into local memory
...@@ -650,27 +692,33 @@ __kernel void BruteForceMatch_knnMatch( ...@@ -650,27 +692,33 @@ __kernel void BruteForceMatch_knnMatch(
            /* There are three distance types in the reducer. The first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which /* There are three distance types in the reducer. The first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
            sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1. */ sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1. */
switch(distType) switch (distType)
{ {
case 0: case 0:
for (int j = 0 ; j < block_size ; j++) for (int j = 0 ; j < block_size ; j++)
{ {
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
} }
break; break;
case 1: case 1:
for (int j = 0 ; j < block_size ; j++) for (int j = 0 ; j < block_size ; j++)
{ {
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr; result += qr * qr;
} }
break; break;
case 2: case 2:
for (int j = 0 ; j < block_size ; j++) for (int j = 0 ; j < block_size ; j++)
{ {
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]); result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
} }
break; break;
} }
...@@ -719,6 +767,7 @@ __kernel void BruteForceMatch_knnMatch( ...@@ -719,6 +767,7 @@ __kernel void BruteForceMatch_knnMatch(
for (int i = 0 ; i < block_size ; i++) for (int i = 0 ; i < block_size ; i++)
{ {
float val = s_distance[i]; float val = s_distance[i];
if (val < bestDistance1) if (val < bestDistance1)
{ {
bestDistance2 = bestDistance1; bestDistance2 = bestDistance1;
...@@ -772,7 +821,7 @@ __kernel void BruteForceMatch_knnMatch( ...@@ -772,7 +821,7 @@ __kernel void BruteForceMatch_knnMatch(
kernel void BruteForceMatch_calcDistanceUnrolled( kernel void BruteForceMatch_calcDistanceUnrolled(
__global float *query, __global float *query,
__global float *train, __global float *train,
__global float *mask, //__global float *mask,
__global float *allDist, __global float *allDist,
__local float *sharebuffer, __local float *sharebuffer,
int block_size, int block_size,
...@@ -790,7 +839,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled( ...@@ -790,7 +839,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled(
kernel void BruteForceMatch_calcDistance( kernel void BruteForceMatch_calcDistance(
__global float *query, __global float *query,
__global float *train, __global float *train,
__global float *mask, //__global float *mask,
__global float *allDist, __global float *allDist,
__local float *sharebuffer, __local float *sharebuffer,
int block_size, int block_size,
...@@ -810,7 +859,7 @@ kernel void BruteForceMatch_findBestMatch( ...@@ -810,7 +859,7 @@ kernel void BruteForceMatch_findBestMatch(
__global float *bestDistance, __global float *bestDistance,
int k, int k,
int block_size int block_size
) )
{ {
/* Todo */ /* Todo */
} }
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment