Fix ocl::bruteforcematcher crash on Intel OCL

504008db · yao · 620c6994 · 504008db · 504008db
Commit 504008db authored Feb 28, 2013 by yao
Hide whitespace changes
Inline Side-by-side

Showing with 731 additions and 680 deletions

brute_force_matcher.cpp modules/ocl/src/brute_force_matcher.cpp +21 -20

brute_force_match.cl modules/ocl/src/kernels/brute_force_match.cl +710 -660

No files found.
--- a/modules/ocl/src/brute_force_matcher.cpp
+++ b/modules/ocl/src/brute_force_matcher.cpp
@@ -51,7 +51,6 @@ using namespace cv;
 using namespace cv::ocl;
 using namespace std;

-using namespace std;
 namespace cv
 {
    namespace ocl
@@ -62,7 +61,7 @@ namespace cv
 }

 template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
-void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask,
+void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
                         const oclMat &trainIdx, const oclMat &distance, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@@ -77,7 +76,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
    {
        args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
        args.push_back( make_pair( smemSize, (void *)NULL));
@@ -103,7 +102,7 @@ void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int
 }

 template < int BLOCK_SIZE,  typename T/*, typename Mask*/ >
-void match(const oclMat &query, const oclMat &train, const oclMat &mask,
+void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
           const oclMat &trainIdx, const oclMat &distance, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@@ -117,7 +116,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &mask,
    {
        args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
        args.push_back( make_pair( smemSize, (void *)NULL));
@@ -143,7 +142,7 @@ void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const o

 //radius_matchUnrolledCached
 template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
-void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
+void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
                         const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@@ -159,7 +158,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
        args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
        args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance ));
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
@@ -183,7 +182,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist

 //radius_match
 template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
-void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
+void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
                  const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@@ -198,7 +197,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
        args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
        args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance ));
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
@@ -472,7 +471,7 @@ void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxD

 //knn match Dispatcher
 template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
-void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask,
+void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
                             const oclMat &trainIdx, const oclMat &distance, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@@ -487,7 +486,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
    {
        args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
        args.push_back( make_pair( smemSize, (void *)NULL));
@@ -507,7 +506,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
 }

 template < int BLOCK_SIZE,  typename T/*, typename Mask*/ >
-void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
+void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
               const oclMat &trainIdx, const oclMat &distance, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@@ -521,7 +520,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
    {
        args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
        args.push_back( make_pair( smemSize, (void *)NULL));
@@ -540,7 +539,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
 }

 template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
-void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType)
+void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
@@ -554,7 +553,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
    {
        args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data ));
        args.push_back( make_pair( smemSize, (void *)NULL));
        args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
@@ -573,7 +572,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
 }

 template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
-void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType)
+void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
@@ -586,7 +585,7 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask,
    {
        args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data ));
        args.push_back( make_pair( smemSize, (void *)NULL));
        args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
@@ -691,7 +690,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o
    }
 }

-static void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
+void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
 {
    findKnnMatch<256>(k, trainIdx, distance, allDist, distType);
 }
@@ -1007,6 +1006,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat &trainIdx, cons

 void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, const oclMat &train, vector<DMatch> &matches, const oclMat &mask)
 {
+	assert(mask.empty()); // mask is not supported at the moment
    oclMat trainIdx, distance;
    matchSingle(query, train, trainIdx, distance, mask);
    matchDownload(trainIdx, distance, matches);
@@ -1696,4 +1696,6 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, vecto
    oclMat trainIdx, imgIdx, distance, nMatches;
    radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
    radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
-}
\ No newline at end of file
+}
+
+
--- a/modules/ocl/src/kernels/brute_force_match.cl
+++ b/modules/ocl/src/kernels/brute_force_match.cl
@@ -3,14 +3,16 @@

 int bit1Count(float x)
 {
-    int c = 0;
-    int ix = (int)x;
-    for (int i = 0 ; i < 32 ; i++)
-    {
-        c += ix & 0x1;
-        ix >>= 1;
-    }
-    return (float)c;
+	int c = 0;
+	int ix = (int)x;
+	
+	for (int i = 0 ; i < 32 ; i++)
+	{
+		c += ix & 0x1;
+		ix >>= 1;
+	}
+	
+	return (float)c;
 }
 /* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
 local size: dim0 is block_size, dim1 is block_size.
@@ -18,7 +20,7 @@ local size: dim0 is block_size, dim1 is block_size.
 __kernel void BruteForceMatch_UnrollMatch(
    __global float *query,
    __global float *train,
-    __global float *mask,
+    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __local float *sharebuffer,
@@ -30,113 +32,122 @@ __kernel void BruteForceMatch_UnrollMatch(
    int train_cols,
    int step,
    int distType
-    )
+)
 {
-    const int lidx = get_local_id(0);
-    const int lidy = get_local_id(1);
-    const int groupidx = get_group_id(0);
-
-    __local float *s_query = sharebuffer;
-    __local float *s_train = sharebuffer + block_size * max_desc_len;
-
-    int queryIdx = groupidx * block_size + lidy;
-    // load the query into local memory.
-    for (int i = 0 ;  i <  max_desc_len / block_size; i ++)
-    {
-        int loadx = lidx + i * block_size;
-        s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-    }
-
-    float myBestDistance = MAX_FLOAT;
-    int myBestTrainIdx = -1;
-
-    // loopUnrolledCached to find the best trainIdx and best distance.
-    volatile int imgIdx = 0;
-    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
-    {
-        float result = 0;
-        for (int i = 0 ; i < max_desc_len / block_size ; i++)
-        {
-            //load a block_size * block_size block into local train.
-            const int loadx = lidx + i * block_size;
-            s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-
-            //synchronize to make sure each elem for reduceIteration in share memory is written already.
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-            switch(distType)
-            {
-            case 0:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
-                }
-                break;
-            case 1:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
-                    result += qr * qr;
-                }
-                break;
-            case 2:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                    result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                }
-                break;
-            }
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-
-        int trainIdx = t * block_size + lidx;
-
-        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
-        {
-            //bestImgIdx = imgIdx;
-            myBestDistance = result;
-            myBestTrainIdx = trainIdx;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    __local float *s_distance = (__local float*)(sharebuffer);
-    __local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
-
-    //find BestMatch
-    s_distance += lidy * block_size;
-    s_trainIdx += lidy * block_size;
-    s_distance[lidx] = myBestDistance;
-    s_trainIdx[lidx] = myBestTrainIdx;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    //reduce -- now all reduce implement in each threads.
-    for (int k = 0 ; k < block_size; k++)
-    {
-        if (myBestDistance > s_distance[k])
-        {
-            myBestDistance = s_distance[k];
-            myBestTrainIdx = s_trainIdx[k];
-        }
-    }
-
-    if (queryIdx < query_rows && lidx == 0)
-    {
-        bestTrainIdx[queryIdx] = myBestTrainIdx;
-        bestDistance[queryIdx] = myBestDistance;
-    }
+	const int lidx = get_local_id(0);
+	const int lidy = get_local_id(1);
+	const int groupidx = get_group_id(0);
+	
+	__local float *s_query = sharebuffer;
+	__local float *s_train = sharebuffer + block_size * max_desc_len;
+	
+	int queryIdx = groupidx * block_size + lidy;
+	
+	// load the query into local memory.
+	for (int i = 0 ;  i <  max_desc_len / block_size; i ++)
+	{
+		int loadx = lidx + i * block_size;
+		s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+	}
+	
+	float myBestDistance = MAX_FLOAT;
+	int myBestTrainIdx = -1;
+	
+	// loopUnrolledCached to find the best trainIdx and best distance.
+	volatile int imgIdx = 0;
+	
+	for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+	{
+		float result = 0;
+		
+		for (int i = 0 ; i < max_desc_len / block_size ; i++)
+		{
+			//load a block_size * block_size block into local train.
+			const int loadx = lidx + i * block_size;
+			s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+			
+			//synchronize to make sure each elem for reduceIteration in share memory is written already.
+			barrier(CLK_LOCAL_MEM_FENCE);
+			
+			/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+			sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+			
+			switch (distType)
+			{
+				case 0:
+				
+					for (int j = 0 ; j < block_size ; j++)
+					{
+						result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
+					}
+					
+					break;
+				case 1:
+				
+					for (int j = 0 ; j < block_size ; j++)
+					{
+						float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
+						result += qr * qr;
+					}
+					
+					break;
+				case 2:
+				
+					for (int j = 0 ; j < block_size ; j++)
+					{
+						//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+						result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+					}
+					
+					break;
+			}
+			
+			barrier(CLK_LOCAL_MEM_FENCE);
+		}
+		
+		int trainIdx = t * block_size + lidx;
+		
+		if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
+		{
+			//bestImgIdx = imgIdx;
+			myBestDistance = result;
+			myBestTrainIdx = trainIdx;
+		}
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	__local float *s_distance = (__local float *)(sharebuffer);
+	__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+	
+	//find BestMatch
+	s_distance += lidy * block_size;
+	s_trainIdx += lidy * block_size;
+	s_distance[lidx] = myBestDistance;
+	s_trainIdx[lidx] = myBestTrainIdx;
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	//reduce -- now all reduce implement in each threads.
+	for (int k = 0 ; k < block_size; k++)
+	{
+		if (myBestDistance > s_distance[k])
+		{
+			myBestDistance = s_distance[k];
+			myBestTrainIdx = s_trainIdx[k];
+		}
+	}
+	
+	if (queryIdx < query_rows && lidx == 0)
+	{
+		bestTrainIdx[queryIdx] = myBestTrainIdx;
+		bestDistance[queryIdx] = myBestDistance;
+	}
 }

 __kernel void BruteForceMatch_Match(
    __global float *query,
    __global float *train,
-    __global float *mask,
+    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __local float *sharebuffer,
@@ -147,108 +158,115 @@ __kernel void BruteForceMatch_Match(
    int train_cols,
    int step,
    int distType
-    )
+)
 {
-    const int lidx = get_local_id(0);
-    const int lidy = get_local_id(1);
-    const int groupidx = get_group_id(0);
-
-    const int queryIdx = groupidx * block_size + lidy;
-
-    float myBestDistance = MAX_FLOAT;
-    int myBestTrainIdx = -1;
-
-    __local float *s_query = sharebuffer;
-    __local float *s_train = sharebuffer + block_size * block_size;
-
-    // loop
-    for (int t = 0 ;  t < (train_rows + block_size - 1) / block_size ; t++)
-    {
-        //Dist dist;
-        float result = 0;
-        for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
-        {
-            const int loadx = lidx + i * block_size;
-            //load query and train into local memory
-            s_query[lidy * block_size + lidx] = 0;
-            s_train[lidx * block_size + lidy] = 0;
-
-            if (loadx < query_cols)
-            {
-                s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
-                s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
-            }
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-            switch(distType)
-            {
-            case 0:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
-                }
-                break;
-            case 1:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
-                    result += qr * qr;
-                }
-                break;
-            case 2:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                    result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
-                }
-                break;
-            }
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-
-        const int trainIdx = t * block_size + lidx;
-
-        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
-        {
-            //myBestImgidx = imgIdx;
-            myBestDistance = result;
-            myBestTrainIdx = trainIdx;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    __local float *s_distance = (__local float *)sharebuffer;
-    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
-
-    //findBestMatch
-    s_distance += lidy * block_size;
-    s_trainIdx += lidy * block_size;
-    s_distance[lidx] = myBestDistance;
-    s_trainIdx[lidx] = myBestTrainIdx;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    //reduce -- now all reduce implement in each threads.
-    for (int k = 0 ; k < block_size; k++)
-    {
-        if (myBestDistance > s_distance[k])
-        {
-            myBestDistance = s_distance[k];
-            myBestTrainIdx = s_trainIdx[k];
-        }
-    }
-
-    if (queryIdx < query_rows && lidx == 0)
-    {
-        bestTrainIdx[queryIdx] = myBestTrainIdx;
-        bestDistance[queryIdx] = myBestDistance;
-    }
+	const int lidx = get_local_id(0);
+	const int lidy = get_local_id(1);
+	const int groupidx = get_group_id(0);
+	
+	const int queryIdx = groupidx * block_size + lidy;
+	
+	float myBestDistance = MAX_FLOAT;
+	int myBestTrainIdx = -1;
+	
+	__local float *s_query = sharebuffer;
+	__local float *s_train = sharebuffer + block_size * block_size;
+	
+	// loop
+	for (int t = 0 ;  t < (train_rows + block_size - 1) / block_size ; t++)
+	{
+		//Dist dist;
+		float result = 0;
+		
+		for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
+		{
+			const int loadx = lidx + i * block_size;
+			//load query and train into local memory
+			s_query[lidy * block_size + lidx] = 0;
+			s_train[lidx * block_size + lidy] = 0;
+			
+			if (loadx < query_cols)
+			{
+				s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+				s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+			}
+			
+			barrier(CLK_LOCAL_MEM_FENCE);
+			
+			/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+			sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+			
+			switch (distType)
+			{
+				case 0:
+				
+					for (int j = 0 ; j < block_size ; j++)
+					{
+						result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
+					}
+					
+					break;
+				case 1:
+				
+					for (int j = 0 ; j < block_size ; j++)
+					{
+						float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
+						result += qr * qr;
+					}
+					
+					break;
+				case 2:
+				
+					for (int j = 0 ; j < block_size ; j++)
+					{
+						//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+						result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
+					}
+					
+					break;
+			}
+			
+			barrier(CLK_LOCAL_MEM_FENCE);
+		}
+		
+		const int trainIdx = t * block_size + lidx;
+		
+		if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
+		{
+			//myBestImgidx = imgIdx;
+			myBestDistance = result;
+			myBestTrainIdx = trainIdx;
+		}
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	__local float *s_distance = (__local float *)sharebuffer;
+	__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+	
+	//findBestMatch
+	s_distance += lidy * block_size;
+	s_trainIdx += lidy * block_size;
+	s_distance[lidx] = myBestDistance;
+	s_trainIdx[lidx] = myBestTrainIdx;
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	//reduce -- now all reduce implement in each threads.
+	for (int k = 0 ; k < block_size; k++)
+	{
+		if (myBestDistance > s_distance[k])
+		{
+			myBestDistance = s_distance[k];
+			myBestTrainIdx = s_trainIdx[k];
+		}
+	}
+	
+	if (queryIdx < query_rows && lidx == 0)
+	{
+		bestTrainIdx[queryIdx] = myBestTrainIdx;
+		bestDistance[queryIdx] = myBestDistance;
+	}
 }

 //radius_unrollmatch
@@ -256,7 +274,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
    __global float *query,
    __global float *train,
    float maxDistance,
-    __global float *mask,
+    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
@@ -271,71 +289,78 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
    int step,
    int ostep,
    int distType
-    )
+)
 {
-    const int lidx = get_local_id(0);
-    const int lidy = get_local_id(1);
-    const int groupidx = get_group_id(0);
-    const int groupidy = get_group_id(1);
-
-    const int queryIdx = groupidy * block_size + lidy;
-    const int trainIdx = groupidx * block_size + lidx;
-
-    __local float *s_query = sharebuffer;
-    __local float *s_train = sharebuffer + block_size * block_size;
-
-    float result = 0;
-    for (int i = 0 ; i < max_desc_len / block_size ; ++i)
-    {
-        //load a block_size * block_size block into local train.
-        const int loadx = lidx + i * block_size;
-
-        s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-        s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-
-        //synchronize to make sure each elem for reduceIteration in share memory is written already.
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-        sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-        switch(distType)
-        {
-        case 0:
-            for (int j = 0 ; j < block_size ; ++j)
-            {
-                result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
-            }
-            break;
-        case 1:
-            for (int j = 0 ; j < block_size ; ++j)
-            {
-                float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
-                result += qr * qr;
-            }
-            break;
-        case 2:
-            for (int j = 0 ; j < block_size ; ++j)
-            {
-                result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-            }
-            break;
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
-    {
-        unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
-
-        if(ind < bestTrainIdx_cols)
-        {
-            //bestImgIdx = imgIdx;
-            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
-            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
-        }
-    }
+	const int lidx = get_local_id(0);
+	const int lidy = get_local_id(1);
+	const int groupidx = get_group_id(0);
+	const int groupidy = get_group_id(1);
+	
+	const int queryIdx = groupidy * block_size + lidy;
+	const int trainIdx = groupidx * block_size + lidx;
+	
+	__local float *s_query = sharebuffer;
+	__local float *s_train = sharebuffer + block_size * block_size;
+	
+	float result = 0;
+	
+	for (int i = 0 ; i < max_desc_len / block_size ; ++i)
+	{
+		//load a block_size * block_size block into local train.
+		const int loadx = lidx + i * block_size;
+		
+		s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+		s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+		
+		//synchronize to make sure each elem for reduceIteration in share memory is written already.
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+		sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+		
+		switch (distType)
+		{
+			case 0:
+			
+				for (int j = 0 ; j < block_size ; ++j)
+				{
+					result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
+				}
+				
+				break;
+			case 1:
+			
+				for (int j = 0 ; j < block_size ; ++j)
+				{
+					float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
+					result += qr * qr;
+				}
+				
+				break;
+			case 2:
+			
+				for (int j = 0 ; j < block_size ; ++j)
+				{
+					result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+				}
+				
+				break;
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	
+	if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
+	{
+		unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
+		
+		if (ind < bestTrainIdx_cols)
+		{
+			//bestImgIdx = imgIdx;
+			bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
+			bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+		}
+	}
 }

 //radius_match
@@ -343,7 +368,7 @@ __kernel void BruteForceMatch_RadiusMatch(
    __global float *query,
    __global float *train,
    float maxDistance,
-    __global float *mask,
+    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
@@ -357,78 +382,85 @@ __kernel void BruteForceMatch_RadiusMatch(
    int step,
    int ostep,
    int distType
-    )
+)
 {
-    const int lidx = get_local_id(0);
-    const int lidy = get_local_id(1);
-    const int groupidx = get_group_id(0);
-    const int groupidy = get_group_id(1);
-
-    const int queryIdx = groupidy * block_size + lidy;
-    const int trainIdx = groupidx * block_size + lidx;
-
-    __local float *s_query = sharebuffer;
-    __local float *s_train = sharebuffer + block_size * block_size;
-
-    float result = 0;
-    for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
-    {
-        //load a block_size * block_size block into local train.
-        const int loadx = lidx + i * block_size;
-
-        s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-        s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-
-        //synchronize to make sure each elem for reduceIteration in share memory is written already.
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-        sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-        switch(distType)
-        {
-        case 0:
-            for (int j = 0 ; j < block_size ; ++j)
-            {
-                result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
-            }
-            break;
-        case 1:
-            for (int j = 0 ; j < block_size ; ++j)
-            {
-                float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
-                result += qr * qr;
-            }
-            break;
-        case 2:
-            for (int j = 0 ; j < block_size ; ++j)
-            {
-                result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-            }
-            break;
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
-    {
-        unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
-
-        if(ind < bestTrainIdx_cols)
-        {
-            //bestImgIdx = imgIdx;
-            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
-            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
-        }
-    }
+	const int lidx = get_local_id(0);
+	const int lidy = get_local_id(1);
+	const int groupidx = get_group_id(0);
+	const int groupidy = get_group_id(1);
+	
+	const int queryIdx = groupidy * block_size + lidy;
+	const int trainIdx = groupidx * block_size + lidx;
+	
+	__local float *s_query = sharebuffer;
+	__local float *s_train = sharebuffer + block_size * block_size;
+	
+	float result = 0;
+	
+	for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
+	{
+		//load a block_size * block_size block into local train.
+		const int loadx = lidx + i * block_size;
+		
+		s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+		s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+		
+		//synchronize to make sure each elem for reduceIteration in share memory is written already.
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+		sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+		
+		switch (distType)
+		{
+			case 0:
+			
+				for (int j = 0 ; j < block_size ; ++j)
+				{
+					result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
+				}
+				
+				break;
+			case 1:
+			
+				for (int j = 0 ; j < block_size ; ++j)
+				{
+					float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
+					result += qr * qr;
+				}
+				
+				break;
+			case 2:
+			
+				for (int j = 0 ; j < block_size ; ++j)
+				{
+					result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+				}
+				
+				break;
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	
+	if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
+	{
+		unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
+		
+		if (ind < bestTrainIdx_cols)
+		{
+			//bestImgIdx = imgIdx;
+			bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
+			bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+		}
+	}
 }


 __kernel void BruteForceMatch_knnUnrollMatch(
    __global float *query,
    __global float *train,
-    __global float *mask,
+    //__global float *mask,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    __local float *sharebuffer,
@@ -440,169 +472,178 @@ __kernel void BruteForceMatch_knnUnrollMatch(
    int train_cols,
    int step,
    int distType
-    )
+)
 {
-    const int lidx = get_local_id(0);
-    const int lidy = get_local_id(1);
-    const int groupidx = get_group_id(0);
-
-    const int queryIdx = groupidx * block_size + lidy;
-    local float *s_query = sharebuffer;
-    local float *s_train = sharebuffer + block_size * max_desc_len;
-
-    // load the query into local memory.
-    for (int i = 0 ;  i <  max_desc_len / block_size; i ++)
-    {
-        int loadx = lidx + i * block_size;
-        s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-    }
-
-    float myBestDistance1 = MAX_FLOAT;
-    float myBestDistance2 = MAX_FLOAT;
-    int myBestTrainIdx1 = -1;
-    int myBestTrainIdx2 = -1;
-
-    //loopUnrolledCached
-    volatile int imgIdx = 0;
-    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
-    {
-        float result = 0;
-        for (int i = 0 ; i < max_desc_len / block_size ; i++)
-        {
-            const int loadX = lidx + i * block_size;
-            //load a block_size * block_size block into local train.
-            const int loadx = lidx + i * block_size;
-            s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-
-            //synchronize to make sure each elem for reduceIteration in share memory is written already.
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-            switch(distType)
-            {
-            case 0:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
-                }
-                break;
-            case 1:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
-                    result += qr * qr;
-                }
-                break;
-            case 2:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                    result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                }
-                break;
-            }
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-
-        const int trainIdx = t * block_size + lidx;
-
-        if (queryIdx < query_rows && trainIdx < train_rows)
-        {
-            if (result < myBestDistance1)
-            {
-                myBestDistance2 = myBestDistance1;
-                myBestTrainIdx2 = myBestTrainIdx1;
-                myBestDistance1 = result;
-                myBestTrainIdx1 = trainIdx;
-            }
-            else if (result < myBestDistance2)
-            {
-                myBestDistance2 = result;
-                myBestTrainIdx2 = trainIdx;
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    local float *s_distance = (local float *)sharebuffer;
-    local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
-
-    // find BestMatch
-    s_distance += lidy * block_size;
-    s_trainIdx += lidy * block_size;
-
-    s_distance[lidx] = myBestDistance1;
-    s_trainIdx[lidx] = myBestTrainIdx1;
-
-    float bestDistance1 = MAX_FLOAT;
-    float bestDistance2 = MAX_FLOAT;
-    int bestTrainIdx1 = -1;
-    int bestTrainIdx2 = -1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lidx == 0)
-    {
-        for (int i = 0 ; i < block_size ; i++)
-        {
-            float val = s_distance[i];
-            if (val < bestDistance1)
-            {
-                bestDistance2 = bestDistance1;
-                bestTrainIdx2 = bestTrainIdx1;
-
-                bestDistance1 = val;
-                bestTrainIdx1 = s_trainIdx[i];
-            }
-            else if (val < bestDistance2)
-            {
-                bestDistance2 = val;
-                bestTrainIdx2 = s_trainIdx[i];
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    s_distance[lidx] = myBestDistance2;
-    s_trainIdx[lidx] = myBestTrainIdx2;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lidx == 0)
-    {
-        for (int i = 0 ; i < block_size ; i++)
-        {
-            float val = s_distance[i];
-
-            if (val < bestDistance2)
-            {
-                bestDistance2 = val;
-                bestTrainIdx2 = s_trainIdx[i];
-            }
-        }
-    }
-
-    myBestDistance1 = bestDistance1;
-    myBestDistance2 = bestDistance2;
-
-    myBestTrainIdx1 = bestTrainIdx1;
-    myBestTrainIdx2 = bestTrainIdx2;
-
-    if (queryIdx < query_rows && lidx == 0)
-    {
-        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
-        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
-    }
+	const int lidx = get_local_id(0);
+	const int lidy = get_local_id(1);
+	const int groupidx = get_group_id(0);
+	
+	const int queryIdx = groupidx * block_size + lidy;
+	local float *s_query = sharebuffer;
+	local float *s_train = sharebuffer + block_size * max_desc_len;
+	
+	// load the query into local memory.
+	for (int i = 0 ;  i <  max_desc_len / block_size; i ++)
+	{
+		int loadx = lidx + i * block_size;
+		s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+	}
+	
+	float myBestDistance1 = MAX_FLOAT;
+	float myBestDistance2 = MAX_FLOAT;
+	int myBestTrainIdx1 = -1;
+	int myBestTrainIdx2 = -1;
+	
+	//loopUnrolledCached
+	volatile int imgIdx = 0;
+	
+	for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+	{
+		float result = 0;
+		
+		for (int i = 0 ; i < max_desc_len / block_size ; i++)
+		{
+			const int loadX = lidx + i * block_size;
+			//load a block_size * block_size block into local train.
+			const int loadx = lidx + i * block_size;
+			s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+			
+			//synchronize to make sure each elem for reduceIteration in share memory is written already.
+			barrier(CLK_LOCAL_MEM_FENCE);
+			
+			/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+			sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+			
+			switch (distType)
+			{
+				case 0:
+				
+					for (int j = 0 ; j < block_size ; j++)
+					{
+						result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
+					}
+					
+					break;
+				case 1:
+				
+					for (int j = 0 ; j < block_size ; j++)
+					{
+						float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
+						result += qr * qr;
+					}
+					
+					break;
+				case 2:
+				
+					for (int j = 0 ; j < block_size ; j++)
+					{
+						//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+						result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+					}
+					
+					break;
+			}
+			
+			barrier(CLK_LOCAL_MEM_FENCE);
+		}
+		
+		const int trainIdx = t * block_size + lidx;
+		
+		if (queryIdx < query_rows && trainIdx < train_rows)
+		{
+			if (result < myBestDistance1)
+			{
+				myBestDistance2 = myBestDistance1;
+				myBestTrainIdx2 = myBestTrainIdx1;
+				myBestDistance1 = result;
+				myBestTrainIdx1 = trainIdx;
+			}
+			else if (result < myBestDistance2)
+			{
+				myBestDistance2 = result;
+				myBestTrainIdx2 = trainIdx;
+			}
+		}
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	local float *s_distance = (local float *)sharebuffer;
+	local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
+	
+	// find BestMatch
+	s_distance += lidy * block_size;
+	s_trainIdx += lidy * block_size;
+	
+	s_distance[lidx] = myBestDistance1;
+	s_trainIdx[lidx] = myBestTrainIdx1;
+	
+	float bestDistance1 = MAX_FLOAT;
+	float bestDistance2 = MAX_FLOAT;
+	int bestTrainIdx1 = -1;
+	int bestTrainIdx2 = -1;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	if (lidx == 0)
+	{
+		for (int i = 0 ; i < block_size ; i++)
+		{
+			float val = s_distance[i];
+			
+			if (val < bestDistance1)
+			{
+				bestDistance2 = bestDistance1;
+				bestTrainIdx2 = bestTrainIdx1;
+				
+				bestDistance1 = val;
+				bestTrainIdx1 = s_trainIdx[i];
+			}
+			else if (val < bestDistance2)
+			{
+				bestDistance2 = val;
+				bestTrainIdx2 = s_trainIdx[i];
+			}
+		}
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	s_distance[lidx] = myBestDistance2;
+	s_trainIdx[lidx] = myBestTrainIdx2;
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	if (lidx == 0)
+	{
+		for (int i = 0 ; i < block_size ; i++)
+		{
+			float val = s_distance[i];
+			
+			if (val < bestDistance2)
+			{
+				bestDistance2 = val;
+				bestTrainIdx2 = s_trainIdx[i];
+			}
+		}
+	}
+	
+	myBestDistance1 = bestDistance1;
+	myBestDistance2 = bestDistance2;
+	
+	myBestTrainIdx1 = bestTrainIdx1;
+	myBestTrainIdx2 = bestTrainIdx2;
+	
+	if (queryIdx < query_rows && lidx == 0)
+	{
+		bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
+		bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
+	}
 }

 __kernel void BruteForceMatch_knnMatch(
    __global float *query,
    __global float *train,
-    __global float *mask,
+    //__global float *mask,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    __local float *sharebuffer,
@@ -613,166 +654,174 @@ __kernel void BruteForceMatch_knnMatch(
    int train_cols,
    int step,
    int distType
-    )
+)
 {
-    const int lidx = get_local_id(0);
-    const int lidy = get_local_id(1);
-    const int groupidx = get_group_id(0);
-
-    const int queryIdx = groupidx * block_size + lidy;
-    local float *s_query = sharebuffer;
-    local float *s_train = sharebuffer + block_size * block_size;
-
-    float myBestDistance1 = MAX_FLOAT;
-    float myBestDistance2 = MAX_FLOAT;
-    int myBestTrainIdx1 = -1;
-    int myBestTrainIdx2 = -1;
-
-    //loop
-    for (int  t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
-    {
-        float result = 0.0f;
-        for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++)
-        {
-            const int loadx = lidx + i * block_size;
-            //load query and train into local memory
-            s_query[lidy * block_size + lidx] = 0;
-            s_train[lidx * block_size + lidy] = 0;
-
-            if (loadx < query_cols)
-            {
-                s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
-                s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
-            }
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-            switch(distType)
-            {
-            case 0:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
-                }
-                break;
-            case 1:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
-                    result += qr * qr;
-                }
-                break;
-            case 2:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                    result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
-                }
-                break;
-            }
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-
-        const int trainIdx = t * block_size + lidx;
-
-        if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
-        {
-            if (result < myBestDistance1)
-            {
-                myBestDistance2 = myBestDistance1;
-                myBestTrainIdx2 = myBestTrainIdx1;
-                myBestDistance1 = result;
-                myBestTrainIdx1 = trainIdx;
-            }
-            else if (result < myBestDistance2)
-            {
-                myBestDistance2 = result;
-                myBestTrainIdx2 = trainIdx;
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    __local float *s_distance = (__local float *)sharebuffer;
-    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
-
-    //findBestMatch
-    s_distance += lidy * block_size;
-    s_trainIdx += lidy * block_size;
-
-    s_distance[lidx] = myBestDistance1;
-    s_trainIdx[lidx] = myBestTrainIdx1;
-
-    float bestDistance1 = MAX_FLOAT;
-    float bestDistance2 = MAX_FLOAT;
-    int bestTrainIdx1 = -1;
-    int bestTrainIdx2 = -1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lidx == 0)
-    {
-        for (int i = 0 ; i < block_size ; i++)
-        {
-            float val = s_distance[i];
-            if (val < bestDistance1)
-            {
-                bestDistance2 = bestDistance1;
-                bestTrainIdx2 = bestTrainIdx1;
-
-                bestDistance1 = val;
-                bestTrainIdx1 = s_trainIdx[i];
-            }
-            else if (val < bestDistance2)
-            {
-                bestDistance2 = val;
-                bestTrainIdx2 = s_trainIdx[i];
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    s_distance[lidx] = myBestDistance2;
-    s_trainIdx[lidx] = myBestTrainIdx2;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lidx == 0)
-    {
-        for (int i = 0 ; i < block_size ; i++)
-        {
-            float val = s_distance[i];
-
-            if (val < bestDistance2)
-            {
-                bestDistance2 = val;
-                bestTrainIdx2 = s_trainIdx[i];
-            }
-        }
-    }
-
-    myBestDistance1 = bestDistance1;
-    myBestDistance2 = bestDistance2;
-
-    myBestTrainIdx1 = bestTrainIdx1;
-    myBestTrainIdx2 = bestTrainIdx2;
-
-    if (queryIdx < query_rows && lidx == 0)
-    {
-        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
-        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
-    }
+	const int lidx = get_local_id(0);
+	const int lidy = get_local_id(1);
+	const int groupidx = get_group_id(0);
+	
+	const int queryIdx = groupidx * block_size + lidy;
+	local float *s_query = sharebuffer;
+	local float *s_train = sharebuffer + block_size * block_size;
+	
+	float myBestDistance1 = MAX_FLOAT;
+	float myBestDistance2 = MAX_FLOAT;
+	int myBestTrainIdx1 = -1;
+	int myBestTrainIdx2 = -1;
+	
+	//loop
+	for (int  t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+	{
+		float result = 0.0f;
+		
+		for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
+		{
+			const int loadx = lidx + i * block_size;
+			//load query and train into local memory
+			s_query[lidy * block_size + lidx] = 0;
+			s_train[lidx * block_size + lidy] = 0;
+			
+			if (loadx < query_cols)
+			{
+				s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+				s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+			}
+			
+			barrier(CLK_LOCAL_MEM_FENCE);
+			
+			/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+			sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+			
+			switch (distType)
+			{
+				case 0:
+				
+					for (int j = 0 ; j < block_size ; j++)
+					{
+						result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
+					}
+					
+					break;
+				case 1:
+				
+					for (int j = 0 ; j < block_size ; j++)
+					{
+						float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
+						result += qr * qr;
+					}
+					
+					break;
+				case 2:
+				
+					for (int j = 0 ; j < block_size ; j++)
+					{
+						//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+						result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
+					}
+					
+					break;
+			}
+			
+			barrier(CLK_LOCAL_MEM_FENCE);
+		}
+		
+		const int trainIdx = t * block_size + lidx;
+		
+		if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
+		{
+			if (result < myBestDistance1)
+			{
+				myBestDistance2 = myBestDistance1;
+				myBestTrainIdx2 = myBestTrainIdx1;
+				myBestDistance1 = result;
+				myBestTrainIdx1 = trainIdx;
+			}
+			else if (result < myBestDistance2)
+			{
+				myBestDistance2 = result;
+				myBestTrainIdx2 = trainIdx;
+			}
+		}
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	__local float *s_distance = (__local float *)sharebuffer;
+	__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+	
+	//findBestMatch
+	s_distance += lidy * block_size;
+	s_trainIdx += lidy * block_size;
+	
+	s_distance[lidx] = myBestDistance1;
+	s_trainIdx[lidx] = myBestTrainIdx1;
+	
+	float bestDistance1 = MAX_FLOAT;
+	float bestDistance2 = MAX_FLOAT;
+	int bestTrainIdx1 = -1;
+	int bestTrainIdx2 = -1;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	if (lidx == 0)
+	{
+		for (int i = 0 ; i < block_size ; i++)
+		{
+			float val = s_distance[i];
+			
+			if (val < bestDistance1)
+			{
+				bestDistance2 = bestDistance1;
+				bestTrainIdx2 = bestTrainIdx1;
+				
+				bestDistance1 = val;
+				bestTrainIdx1 = s_trainIdx[i];
+			}
+			else if (val < bestDistance2)
+			{
+				bestDistance2 = val;
+				bestTrainIdx2 = s_trainIdx[i];
+			}
+		}
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	s_distance[lidx] = myBestDistance2;
+	s_trainIdx[lidx] = myBestTrainIdx2;
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	
+	if (lidx == 0)
+	{
+		for (int i = 0 ; i < block_size ; i++)
+		{
+			float val = s_distance[i];
+			
+			if (val < bestDistance2)
+			{
+				bestDistance2 = val;
+				bestTrainIdx2 = s_trainIdx[i];
+			}
+		}
+	}
+	
+	myBestDistance1 = bestDistance1;
+	myBestDistance2 = bestDistance2;
+	
+	myBestTrainIdx1 = bestTrainIdx1;
+	myBestTrainIdx2 = bestTrainIdx2;
+	
+	if (queryIdx < query_rows && lidx == 0)
+	{
+		bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
+		bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
+	}
 }

 kernel void BruteForceMatch_calcDistanceUnrolled(
    __global float *query,
    __global float *train,
-    __global float *mask,
+    //__global float *mask,
    __global float *allDist,
    __local float *sharebuffer,
    int block_size,
@@ -784,13 +833,13 @@ kernel void BruteForceMatch_calcDistanceUnrolled(
    int step,
    int distType)
 {
-    /* Todo */
+	/* Todo */
 }

 kernel void BruteForceMatch_calcDistance(
    __global float *query,
    __global float *train,
-    __global float *mask,
+    //__global float *mask,
    __global float *allDist,
    __local float *sharebuffer,
    int block_size,
@@ -801,16 +850,16 @@ kernel void BruteForceMatch_calcDistance(
    int step,
    int distType)
 {
-    /* Todo */
+	/* Todo */
 }

 kernel void BruteForceMatch_findBestMatch(
    __global float *allDist,
    __global int *bestTrainIdx,
    __global float *bestDistance,
-     int k,
-     int block_size
-    )
+    int k,
+    int block_size
+)
 {
-    /* Todo */
+	/* Todo */
 }
\ No newline at end of file