Commit ed1b293d authored by Marina Kolpakova

Refactored GPU LBP cascade. Added support for big images. Fixed a bug in the connected components function.
parent 469ec7c5
@@ -315,7 +315,24 @@ namespace cv { namespace gpu { namespace device
                        DevMem2D_<int4> objects,
                        unsigned int* classified);

    void classifyStumpFixed(const DevMem2Db& mstages,
                            const int nstages,
                            const DevMem2Di& mnodes,
                            const DevMem2Df& mleaves,
                            const DevMem2Di& msubsets,
                            const DevMem2Db& mfeatures,
                            const int workWidth,
                            const int workHeight,
                            const int clWidth,
                            const int clHeight,
                            float scale,
                            int step,
                            int subsetSize,
                            DevMem2D_<int4> objects,
                            unsigned int* classified,
                            const int maxX);

    int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);

    void bindIntegral(DevMem2Di integral);
    void unbindIntegral();
}
@@ -337,8 +354,8 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
    GpuMat candidates(1 , image.cols >> 1, CV_32SC4);
//  GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4);
    // used for debug
    candidates.setTo(cv::Scalar::all(0));
    objects.setTo(cv::Scalar::all(0));

    if (maxObjectSize == cv::Size())
        maxObjectSize = image.size();
@@ -349,16 +366,50 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
    unsigned int* dclassified;
    cudaMalloc(&dclassified, sizeof(int));
    cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice);
    int step = 2;

    cv::gpu::device::lbp::bindIntegral(integral);

    cv::Size scaledImageSize(image.cols, image.rows);
    cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
    cv::Size windowSize(NxM.width, NxM.height);

    double factor = 1;

    for (; processingRectSize.width / step >= 256;)
    {
        // std::cout << "IN FIXED: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl;
        // if (factor > 2.0) break;
        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
            break;

        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
            break;

        // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
        //     continue;

        GpuMat scaledImg(resuzeBuffer, cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
        GpuMat scaledIntegral(integral, cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
        GpuMat currBuff = integralBuffer;

        cv::gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
        cv::gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);

        step = (factor <= 2.) + 1;

        cv::gpu::device::lbp::classifyStumpFixed(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
            processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified, processingRectSize.width);

        factor *= scaleFactor;
        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
        scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
    }

    for (; /*processingRectSize.width / step >= 128*/;)
    {
        // std::cout << "In FLOATING: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl;
        // if (factor > 2.0) break;
        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
            break;
@@ -379,12 +430,19 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
        cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
            processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);

        factor *= scaleFactor;
        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
        scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
    }

    cv::gpu::device::lbp::unbindIntegral();
    if (groupThreshold <= 0 || objects.empty())
        return 0;

    cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
    // std::cout << "!!! CLASSIFIED " << *classified << std::endl;
    cv::gpu::device::lbp::connectedConmonents(candidates, *classified, objects, groupThreshold, grouping_eps, dclassified);
    cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
    cudaSafeCall( cudaDeviceSynchronize() );
    step = *classified;
...
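The refactoring replaces the single scale loop with two passes: a fixed-geometry pass that runs while one row of search windows still needs at least 256 threads (`processingRectSize.width / step >= 256`), and the original per-window pass for the remaining, coarser scales. A host-only sketch of that schedule, using illustrative image/window sizes and scale factor rather than values from a real cascade:

```cpp
#include <cmath>
#include <cstdio>

// Host-only sketch of the two-pass scale schedule used in detectMultiScale above.
// The image size, window size (NxM) and scaleFactor are illustrative assumptions.
int main()
{
    const int imgW = 1280, imgH = 720;   // assumed input image size
    const int nxmW = 24,   nxmH = 24;    // assumed trained window size (NxM)
    const double scaleFactor = 1.1;      // assumed per-level scale step

    double factor = 1.0;
    int step = 2;
    int scaledW = imgW, scaledH = imgH;
    int procW = scaledW - nxmW + 1, procH = scaledH - nxmH + 1;

    // Pass 1: fixed-geometry kernel while one row of windows still needs >= 256 threads.
    while (procW > 0 && procH > 0 && procW / step >= 256)
    {
        step = (factor <= 2.0) + 1;      // window stride 2 while factor <= 2, then 1
        std::printf("fixed    factor=%5.2f  windows=%4dx%-4d  step=%d\n", factor, procW, procH, step);

        factor *= scaleFactor;
        scaledW = (int)std::lround(imgW / factor);
        scaledH = (int)std::lround(imgH / factor);
        procW = scaledW - nxmW + 1;
        procH = scaledH - nxmH + 1;
    }

    // Pass 2: the original per-window kernel handles the remaining scales.
    while (procW > 0 && procH > 0)
    {
        step = (factor <= 2.0) + 1;      // the same step rule is assumed here
        std::printf("floating factor=%5.2f  windows=%4dx%-4d  step=%d\n", factor, procW, procH, step);

        factor *= scaleFactor;
        scaledW = (int)std::lround(imgW / factor);
        scaledH = (int)std::lround(imgH / factor);
        procW = scaledW - nxmW + 1;
        procH = scaledH - nxmH + 1;
    }
    return 0;
}
```

This split appears to be what the commit message means by support for big images: the original launch used one thread per window column (`threads = ceilf(workWidth / (float)step)`), which stops working once a row needs more threads than a single block provides, while the fixed pass always launches 256-thread blocks.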
@@ -53,28 +53,27 @@ namespace cv { namespace gpu { namespace device
    struct LBP
    {
        __host__ __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
        __host__ __device__ __forceinline__ LBP() {}

        __device__ __forceinline__ int operator() (int ty, int tx, int fh, int fw, int& shift) const
        {
            int anchors[9];

            anchors[0] = tex2D(tintegral, tx, ty);
            anchors[1] = tex2D(tintegral, tx + fw, ty);
            anchors[0] -= anchors[1];
            anchors[2] = tex2D(tintegral, tx + fw * 2, ty);
            anchors[1] -= anchors[2];
            anchors[2] -= tex2D(tintegral, tx + fw * 3, ty);

            ty += fh;
            anchors[3] = tex2D(tintegral, tx, ty);
            anchors[4] = tex2D(tintegral, tx + fw, ty);
            anchors[3] -= anchors[4];
            anchors[5] = tex2D(tintegral, tx + fw * 2, ty);
            anchors[4] -= anchors[5];
            anchors[5] -= tex2D(tintegral, tx + fw * 3, ty);

            anchors[0] -= anchors[3];
            anchors[1] -= anchors[4];
@@ -83,11 +82,11 @@ namespace cv { namespace gpu { namespace device
            ty += fh;
            anchors[6] = tex2D(tintegral, tx, ty);
            anchors[7] = tex2D(tintegral, tx + fw, ty);
            anchors[6] -= anchors[7];
            anchors[8] = tex2D(tintegral, tx + fw * 2, ty);
            anchors[7] -= anchors[8];
            anchors[8] -= tex2D(tintegral, tx + fw * 3, ty);

            anchors[3] -= anchors[6];
            anchors[4] -= anchors[7];
@@ -109,11 +108,11 @@ namespace cv { namespace gpu { namespace device
            ty += fh;
            anchors[0] = tex2D(tintegral, tx, ty);
            anchors[1] = tex2D(tintegral, tx + fw, ty);
            anchors[0] -= anchors[1];
            anchors[2] = tex2D(tintegral, tx + fw * 2, ty);
            anchors[1] -= anchors[2];
            anchors[2] -= tex2D(tintegral, tx + fw * 3, ty);

            anchors[6] -= anchors[0];
            anchors[7] -= anchors[1];
@@ -142,54 +141,90 @@ namespace cv { namespace gpu { namespace device
            cudaSafeCall( cudaUnbindTexture(&tintegral));
        }
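For reference, the block-LBP feature that `LBP::operator()` evaluates through `tex2D` lookups can be written on the CPU directly against an integral image: the kernel reuses a 4x4 grid of 16 samples via running differences to form nine `fw` x `fh` block sums, and the eight outer sums are compared with the central one to build an 8-bit code. The sketch below is a plain C++ reference under those assumptions; the bit order and the `>=` comparison are guesses, not read off the device code, and the `shift` output used with the `subsets` table is omitted.

```cpp
#include <vector>

// CPU reference for a multi-scale block LBP feature over an integral image.
// Assumed layout: data[y * (w + 1) + x] holds the sum of all pixels above and
// to the left of (x, y), as produced by cv::integral.
struct IntegralImage
{
    int w, h;                    // size of the source image; integral is (w+1) x (h+1)
    std::vector<int> data;

    int at(int y, int x) const { return data[y * (w + 1) + x]; }

    // Sum of the fw x fh block whose top-left corner is (x, y).
    int blockSum(int y, int x, int fh, int fw) const
    {
        return at(y, x) - at(y, x + fw) - at(y + fh, x) + at(y + fh, x + fw);
    }
};

// ty, tx: top-left of the 3x3 block grid; fh, fw: size of one block.
inline int lbpCode(const IntegralImage& integral, int ty, int tx, int fh, int fw)
{
    const int center = integral.blockSum(ty + fh, tx + fw, fh, fw);

    // Offsets of the eight surrounding blocks, clockwise from the top-left (assumed order).
    const int dy[8] = { 0, 0, 0, 1, 2, 2, 2, 1 };
    const int dx[8] = { 0, 1, 2, 2, 2, 1, 0, 0 };

    int code = 0;
    for (int i = 0; i < 8; ++i)
        code = (code << 1) | (integral.blockSum(ty + dy[i] * fh, tx + dx[i] * fw, fh, fw) >= center);
    return code;
}
```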
        struct Classifier
        {
            __host__ __device__ __forceinline__ Classifier(const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
                const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
            : stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
              scale(_scale), step(_step), subsetSize(_subsetSize){}

            __device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
            {
                int current_node = 0;
                int current_leave = 0;

                for (int s = 0; s < nstages; ++s)
                {
                    float sum = 0;
                    Stage stage = stages[s];

                    for (int t = 0; t < stage.ntrees; t++)
                    {
                        ClNode node = nodes[current_node];
                        uchar4 feature = features[node.featureIdx];

                        int shift;
                        int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
                        int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
                        sum += leaves[idx];

                        current_node += 1;
                        current_leave += 2;
                    }

                    if (sum < stage.threshold)
                        return;
                }

                int4 rect;
                rect.x = roundf(x * scale);
                rect.y = roundf(y * scale);
                rect.z = clWidth;
                rect.w = clHeight;
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                int res = __atomicInc(n, maxN);
#else
                int res = atomicInc(n, maxN);
#endif
                objects(0, res) = rect;
            }

            const Stage* stages;
            const ClNode* nodes;
            const float* leaves;
            const int* subsets;
            const uchar4* features;

            const int nstages;
            const int clWidth;
            const int clHeight;
            const float scale;
            const int step;
            const int subsetSize;

            const LBP evaluator;
        };
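The main structural change in this file is that the long kernel argument list is folded into the `Classifier` functor, which is passed to the kernels by value. A minimal, self-contained illustration of that pattern with a stand-in functor (`Saxpy` is hypothetical, not part of the cascade code):

```cuda
// A POD functor built on the host and passed to the kernel by value: its
// pointers and scalars travel in the kernel parameter buffer, so no explicit
// cudaMemcpy of the struct is needed. "Saxpy" stands in for the real Classifier.
struct Saxpy
{
    float a;
    const float* x;
    float* y;

    __device__ void operator()(int i) const { y[i] = a * x[i] + y[i]; }
};

__global__ void apply(const Saxpy op, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) op(i);
}

int main()
{
    const int n = 1024;
    float *x, *y;
    cudaMalloc(&x, n * sizeof(float));
    cudaMalloc(&y, n * sizeof(float));
    // ... fill x and y ...

    Saxpy op = { 2.0f, x, y };                 // constructed on the host
    apply<<<(n + 255) / 256, 256>>>(op, n);    // copied by value at launch
    cudaDeviceSynchronize();

    cudaFree(x);
    cudaFree(y);
    return 0;
}
```

Because the functor is trivially copyable, the two `lbp_classify_stump` overloads below can share the per-window evaluation while differing only in how they derive the window coordinates.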
        __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n)
        {
            int x = threadIdx.x * classifier.step;
            int y = blockIdx.x * classifier.step;

            classifier(y, x, objects, maxN, n);
        }

        __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n, int lines, int maxX)
        {
            int x = threadIdx.x * lines * classifier.step;
            if (x >= maxX) return;

            int y = blockIdx.x * classifier.step / lines;

            classifier(y, x, objects, maxN, n);
        }
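The second overload exists so that rows wider than one thread block can still be covered with a fixed block size of 256 threads: the host side (`classifyStumpFixed` further down) launches `blocks * lines` blocks with `lines = divUp(threads, 256)` and a `maxX` guard. The sketch below shows one way such a decomposition can map `(blockIdx, threadIdx)` back to a window position; the index arithmetic is a simplified illustration of the idea, not a copy of the kernel above.

```cuda
// Illustrative decomposition only: each row of candidate windows is split
// across `lines` blocks of blockDim.x threads.
__global__ void classify_fixed_sketch(int step, int lines, int maxX, int maxY)
{
    int slice = blockIdx.x % lines;            // which 256-wide slice of the row
    int row   = blockIdx.x / lines;            // which row of windows

    int x = (slice * blockDim.x + threadIdx.x) * step;
    int y = row * step;
    if (x >= maxX || y >= maxY) return;

    // classifier(y, x, objects, maxN, n) would be evaluated here.
}

int main()
{
    const int workWidth = 600, workHeight = 400, step = 2;   // assumed sizes
    const int threads = 256;
    const int columns = (workWidth  + step - 1) / step;
    const int rows    = (workHeight + step - 1) / step;
    const int lines   = (columns + threads - 1) / threads;   // divUp(columns, 256)

    classify_fixed_sketch<<<rows * lines, threads>>>(step, lines, workWidth, workHeight);
    cudaDeviceSynchronize();
    return 0;
}
```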
        template<typename Pr>
        __global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
            unsigned int tid = threadIdx.x;
            extern __shared__ int sbuff[];
@@ -207,23 +242,26 @@ namespace cv { namespace gpu { namespace device
            int cls = labels[tid];
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
            __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
            __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
            __atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
#else
            atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
            atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
            atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
            atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
#endif
            __syncthreads();
            labels[tid] = 0;

            __syncthreads();
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            __atomicInc((unsigned int*)labels + cls, n);
#else
            atomicInc((unsigned int*)labels + cls, n);
#endif
            __syncthreads();
            *nclasses = 0;

            int active = labels[tid];
@@ -235,61 +273,54 @@ namespace cv { namespace gpu { namespace device
                r1[1] = saturate_cast<int>(r1[1] * s);
                r1[2] = saturate_cast<int>(r1[2] * s);
                r1[3] = saturate_cast<int>(r1[3] * s);
            }
            __syncthreads();

            if (active && active >= groupThreshold)
            {
                int* r1 = rrects + tid * 4;
                int4 r_out;
                r_out.x = r1[0];
                r_out.y = r1[1];
                r_out.z = r1[2];
                r_out.w = r1[3];

#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                objects[__atomicInc(nclasses, n)] = r_out;
#else
                int aidx = atomicInc(nclasses, n);
                objects[aidx] = r_out;
#endif
            }
        }
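Stripped of the shared-memory and atomics details, what `disjoin` computes per equivalence class is an averaged rectangle plus a membership count, and only classes with at least `groupThreshold` members are emitted. A CPU sketch of that reduction; the labels are assumed to come from the same "same component" predicate the kernel uses (`InSameComponint`), and computing them is out of scope here:

```cpp
#include <cmath>
#include <vector>

struct Rect4 { int x, y, z, w; };   // x, y = top-left; z, w = width, height, as in the int4 use above

static std::vector<Rect4> groupCandidates(const std::vector<Rect4>& candidates,
                                          const std::vector<int>& labels,
                                          int numClasses, int groupThreshold)
{
    std::vector<long long> sum(numClasses * 4, 0);
    std::vector<int> count(numClasses, 0);

    // Accumulate the four rectangle fields per class (the atomicAdd phase).
    for (size_t i = 0; i < candidates.size(); ++i)
    {
        int cls = labels[i];
        sum[cls * 4 + 0] += candidates[i].x;
        sum[cls * 4 + 1] += candidates[i].y;
        sum[cls * 4 + 2] += candidates[i].z;
        sum[cls * 4 + 3] += candidates[i].w;
        count[cls]++;
    }

    // Average each class and keep the sufficiently populated ones.
    std::vector<Rect4> objects;
    for (int cls = 0; cls < numClasses; ++cls)
    {
        if (count[cls] < groupThreshold) continue;
        float s = 1.f / count[cls];
        Rect4 r;
        r.x = (int)std::lround(sum[cls * 4 + 0] * s);
        r.y = (int)std::lround(sum[cls * 4 + 1] * s);
        r.z = (int)std::lround(sum[cls * 4 + 2] * s);
        r.w = (int)std::lround(sum[cls * 4 + 3] * s);
        objects.push_back(r);
    }
    return objects;
}
```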
        void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
            const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
        {
            int blocks  = ceilf(workHeight / (float)step);
            int threads = ceilf(workWidth / (float)step);
            Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
            lbp_classify_stump<<<blocks, threads>>>(clr, objects, objects.cols, classified);
        }

        void classifyStumpFixed(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
            const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified,
            int maxX)
        {
            const int THREADS_BLOCK = 256;
            int blocks  = ceilf(workHeight / (float)step);
            int threads = ceilf(workWidth / (float)step);
            Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
            int lines = divUp(threads, THREADS_BLOCK);
            lbp_classify_stump<<<blocks * lines, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, lines, maxX);
        }
        int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
            int threads = ncandidates;
            int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
            disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), ncandidates, groupThreshold, grouping_eps, nclasses);
            return 0;
        }
    }
...
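`connectedConmonents` launches `disjoin` as a single block with one thread and 20 bytes of shared memory (an `int` label plus an `int4` rectangle) per candidate, so the number of candidates has to stay within one block's thread and shared-memory limits. A quick host-side sanity check under assumed limits (1024 threads, 48 KB) rather than values queried from `cudaDeviceProp`:

```cpp
#include <cstdio>

// Back-of-the-envelope check of the single-block disjoin launch above.
// The thread and shared-memory limits are typical values and an assumption here.
int main()
{
    const int maxThreadsPerBlock = 1024;          // assumed device limit
    const int maxSharedBytes     = 48 * 1024;     // assumed device limit

    for (int ncandidates = 256; ncandidates <= 4096; ncandidates *= 2)
    {
        int smem = ncandidates * (int)(sizeof(int) + sizeof(int) * 4);   // int label + int4 rect
        bool ok = ncandidates <= maxThreadsPerBlock && smem <= maxSharedBytes;
        std::printf("%4d candidates -> %6d bytes of smem, fits in one block: %s\n",
                    ncandidates, smem, ok ? "yes" : "no");
    }
    return 0;
}
```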
@@ -61,6 +61,7 @@ __device__ __forceinline__ T __atomicInc(T* address, T val)
        count = tag | (count + 1);
        *address = count;
    } while (*address != count);

    return (count & TAG_MASK) - 1;
}
@@ -85,6 +86,7 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
    {
        *address = count;
    } while (*address > count);

    return count;
}
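These emulations back the `atomicInc` calls used by the classifier and by `disjoin` on pre-SM 1.2 devices. The value handed back by `__atomicInc`, `(count & TAG_MASK) - 1`, is the pre-increment count, following the hardware convention of returning the old value; the wrap-at-`val` behaviour sketched below is that of the hardware intrinsic, which the emulation is presumably meant to mirror:

```cpp
#include <cassert>

// Single-threaded reference for the semantics of atomicInc(addr, val):
// store ((old >= val) ? 0 : old + 1) and return the old value.
static unsigned int atomicIncRef(unsigned int* address, unsigned int val)
{
    unsigned int old = *address;
    *address = (old >= val) ? 0 : old + 1;
    return old;
}

int main()
{
    unsigned int counter = 0;
    assert(atomicIncRef(&counter, 3) == 0 && counter == 1);
    assert(atomicIncRef(&counter, 3) == 1 && counter == 2);
    assert(atomicIncRef(&counter, 3) == 2 && counter == 3);
    assert(atomicIncRef(&counter, 3) == 3 && counter == 0);   // wraps once old >= val
    return 0;
}
```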
@@ -151,7 +153,6 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
            }
        }
        __syncthreads();
    }
} // lbp
...