submodule / opencv / Commits

Commit e63ab8de, authored Jul 17, 2012 by Marina Kolpakova

LBP: multiscale approach; refactored atomics usage

parent 5dc7752d

Showing 5 changed files with 272 additions and 119 deletions (+272 -119)
modules/gpu/include/opencv2/gpu/gpu.hpp              +1    -0
modules/gpu/src/cascadeclassifier.cpp                +102  -30
modules/gpu/src/cuda/lbp.cu                          +127  -26
modules/gpu/src/opencv2/gpu/device/emulation.hpp     +33   -11
modules/gpu/src/opencv2/gpu/device/lbp.hpp           +9    -52
modules/gpu/include/opencv2/gpu/gpu.hpp

@@ -1464,6 +1464,7 @@ private:
    GpuMat resuzeBuffer;

    GpuMat candidates;
    static const int integralFactor = 4;
};

////////////////////////////////// SURF //////////////////////////////////////////
...
modules/gpu/src/cascadeclassifier.cpp

@@ -86,7 +86,7 @@ void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size frame)
{
    resuzeBuffer.create(frame, CV_8UC1);

    integral.create(frame.height + 1, frame.width + 1, CV_32SC1);
    integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);

    NcvSize32u roiSize;
    roiSize.width = frame.width;
    roiSize.height = frame.height;
...
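The single new line in gpu.hpp (integralFactor = 4) and the wider allocation above work together: the one integral buffer is made four frame-widths wide so the integrals of several scaled frames can sit side by side, each starting at the column offset that the prev/acc bookkeeping in detectMultiScale tracks further down. A minimal host-side sketch of that packing arithmetic (not part of the commit; frame, window, and scale values are illustrative):

    #include <cstdio>

    // Sketch of the prev/acc bookkeeping: prints the column offset at which the
    // integral of each scaled frame would be packed into the single buffer of
    // width integralFactor * (frameWidth + 1). All sizes are illustrative.
    int main()
    {
        const int integralFactor = 4;              // matches the new constant in gpu.hpp
        const int frameWidth = 640, frameHeight = 480;
        const int windowW = 24, windowH = 24;      // assumed classifier window
        const double scaleFactor = 1.2;            // assumed scale step

        double scale = 1.0;
        int prev = 0;                              // column offset of the current level
        int acc = static_cast<int>(frameWidth / scale) + 1;   // width consumed so far

        while (acc <= integralFactor * (frameWidth + 1))
        {
            const int sW = static_cast<int>(frameWidth / scale);
            const int sH = static_cast<int>(frameHeight / scale);
            if (sW < windowW || sH < windowH)      // level no longer feasible
                break;

            std::printf("scale %.3f: %dx%d integral stored at column %d\n",
                        scale, sW + 1, sH + 1, prev);

            prev = acc;                            // next level starts right after this one
            scale *= scaleFactor;
            acc += static_cast<int>(frameWidth / scale) + 1;
        }
        return 0;
    }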
@@ -284,14 +284,83 @@ namespace cv { namespace gpu { namespace device
                                DevMem2D_<int4> objects, unsigned int* classified);

        void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initalScale, float factor, int total,
            const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
            const int subsetSize, DevMem2D_<int4> objects, unsigned int* classified, DevMem2Di integral);

        void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);

        void bindIntegral(DevMem2Di integral);

        void unbindIntegral();
    }
}}}
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)
cv::Size operator -(const cv::Size& a, const cv::Size& b)
{
    return cv::Size(a.width - b.width, a.height - b.height);
}

cv::Size operator +(const cv::Size& a, const int& i)
{
    return cv::Size(a.width + i, a.height + i);
}

cv::Size operator *(const cv::Size& a, const float& f)
{
    return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
}

cv::Size operator /(const cv::Size& a, const float& f)
{
    return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
}

bool operator <=(const cv::Size& a, const cv::Size& b)
{
    return a.width <= b.width && a.height <= b.width;
}
struct PyrLavel
{
    PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window) : order(_order)
    {
        scale = pow(_scale, order);
        sFrame = frame / scale;
        workArea = sFrame - window + 1;
        sWindow = window * scale;
    }

    bool isFeasible(cv::Size maxObj)
    {
        return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;
    }

    PyrLavel next(float factor, cv::Size frame, cv::Size window)
    {
        return PyrLavel(order + 1, factor, frame, window);
    }

    int order;
    float scale;

    cv::Size sFrame;
    cv::Size workArea;
    cv::Size sWindow;
};
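PyrLavel captures the per-level geometry: scale grows as pow(scaleFactor, order), sFrame is the frame shrunk by that scale, workArea is the set of window positions available at that level, and sWindow is the window mapped back into original-image coordinates. A standalone sketch of the same arithmetic (not part of the commit; frame/window sizes are assumptions, std::lround stands in for cvRound):

    #include <cmath>
    #include <cstdio>

    // Standalone illustration of the PyrLavel arithmetic: prints every feasible
    // pyramid level for an assumed frame and classifier window.
    struct Level
    {
        int order;
        double scale;
        int sFrameW, sFrameH;     // scaled frame
        int workW, workH;         // scan positions at this level
        int sWindowW, sWindowH;   // window size in original-image coordinates

        Level(int _order, double factor, int frameW, int frameH, int winW, int winH)
            : order(_order), scale(std::pow(factor, _order))
        {
            sFrameW  = static_cast<int>(std::lround(frameW / scale));
            sFrameH  = static_cast<int>(std::lround(frameH / scale));
            workW    = sFrameW - winW + 1;
            workH    = sFrameH - winH + 1;
            sWindowW = static_cast<int>(std::lround(winW * scale));
            sWindowH = static_cast<int>(std::lround(winH * scale));
        }

        bool isFeasible(int maxW, int maxH) const
        {
            return workW > 0 && workH > 0 && sWindowW <= maxW && sWindowH <= maxH;
        }
    };

    int main()
    {
        const int frameW = 640, frameH = 480;   // assumed input frame
        const int winW = 24, winH = 24;         // assumed classifier window
        const double factor = 1.2;              // assumed scale step

        for (int order = 0; ; ++order)
        {
            Level level(order, factor, frameW, frameH, winW, winH);
            if (!level.isFeasible(frameW, frameH)) break;
            std::printf("level %d: scale %.3f, scaled frame %dx%d, work area %dx%d, window %dx%d\n",
                        order, level.scale, level.sFrameW, level.sFrameH,
                        level.workW, level.workH, level.sWindowW, level.sWindowH);
        }
        return 0;
    }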
int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, double scaleFactor, int groupThreshold, cv::Size maxObjectSize)
{
    CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);
...
@@ -306,6 +375,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
// used for debug
// candidates.setTo(cv::Scalar::all(0));
// objects.setTo(cv::Scalar::all(0));
    if (maxObjectSize == cv::Size())
        maxObjectSize = image.size();
...
@@ -315,52 +385,54 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
    GpuMat dclassified(1, 1, CV_32S);
    cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );

    // cv::gpu::device::lbp::bindIntegral(integral);

    Size scaledImageSize(image.cols, image.rows);
    Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1);
    Size windowSize(NxM.width, NxM.height);

    PyrLavel level(0, 1.0f, image.size(), NxM);

    float factor = 1;

    for (;;)
    while (level.isFeasible(maxObjectSize))
    {
        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
            break;

        int acc = level.sFrame.width + 1;
        float iniScale = level.scale;
        cv::Size area = level.workArea;
        float step = (float)(1 + (level.scale <= 2.f));

        if (windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height)
            break;

        int total = 0, prev = 0;

        // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
        //     continue;

        while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize))
        {
            // create sutable matrix headers
            GpuMat src  = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height));
            GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1));
            GpuMat buff = integralBuffer;

            GpuMat scaledImg = resuzeBuffer(cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
            GpuMat scaledIntegral = integral(cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
            GpuMat currBuff = integralBuffer;

            // generate integral for scale
            gpu::resize(image, src, level.sFrame, 0, 0, CV_INTER_LINEAR);
            gpu::integralBuffered(src, sint, buff);

            gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
            gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);

            total += cvCeil(area.width / step) * cvCeil(area.height / step);
            // std::cout << "Total for scale: " << total << " this step contribution " << cvCeil(area.width / step) * cvCeil(area.height / step) << " previous width shift " << prev << " acc " << acc << " scales: " << cvCeil(area.width / step) << std::endl;

            int step = factor <= 2.f ? 2 : 1;

            // increment pyr lavel
            level = level.next(scaleFactor, image.size(), NxM);
            area = level.workArea;

            device::lbp::classifyStumpFixed(integral, integral.step1(), stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
                processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified.ptr<unsigned int>());

            step = (float)(1 + (level.scale <= 2.f));
            prev = acc;
            acc += level.sFrame.width + 1;
        }

        factor *= scaleFactor;
        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
        scaledImageSize = cv::Size(cvRound(image.cols / factor), cvRound(image.rows / factor));
        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );

        device::lbp::classifyPyramid(image.cols, image.rows, NxM.width, NxM.height, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
            leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
    }

    // cv::gpu::device::lbp::unbindIntegral();
    if (groupThreshold <= 0 || objects.empty())
        return 0;

    cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
    device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());

    // candidates.copyTo(objects);
    cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
    cudaSafeCall( cudaDeviceSynchronize() );

    // std::cout << classified << " !!!!!!!!!!" << std::endl;
    return classified;
}
...
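For each batch of levels that fits into the shared integral buffer, the host accumulates total += cvCeil(area.width / step) * cvCeil(area.height / step), with step = 2 while the scale is at most 2 and 1 afterwards; that sum is the work amount handed to classifyPyramid, one CUDA thread per candidate window position. A compact sketch of that accumulation (not part of the commit; sizes are assumptions, std::ceil stands in for cvCeil):

    #include <cmath>
    #include <cstdio>

    // Sketch of the work-amount accumulation in detectMultiScale. Each counted
    // position becomes one thread of the lbp_cascade kernel launched below.
    int main()
    {
        const int frameW = 640, frameH = 480, winW = 24, winH = 24;  // assumptions
        const double factor = 1.2;                                   // assumed scale step

        int total = 0;
        double scale = 1.0;
        for (;;)
        {
            int workW = static_cast<int>(frameW / scale) - winW + 1;
            int workH = static_cast<int>(frameH / scale) - winH + 1;
            if (workW <= 0 || workH <= 0) break;

            double step = (scale <= 2.0) ? 2.0 : 1.0;   // stride 2 while scale <= 2, else 1
            total += static_cast<int>(std::ceil(workW / step)) *
                     static_cast<int>(std::ceil(workH / step));
            scale *= factor;
        }
        std::printf("workAmount = %d threads\n", total);
        return 0;
    }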
modules/gpu/src/cuda/lbp.cu
...
@@ -255,11 +255,7 @@ namespace cv { namespace gpu { namespace device
rect.z = clWidth;
rect.w = clHeight;
#if (__CUDA_ARCH__ < 120)
int res = __atomicInc(n, maxN);
#else
int res = atomicInc(n, maxN);
#endif
int res = Emulation::smem::atomicInc(n, maxN);
objects(0, res) = rect;
}
...
...
@@ -317,26 +313,17 @@ namespace cv { namespace gpu { namespace device
__syncthreads();
int cls = labels[tid];
#if (__CUDA_ARCH__ < 120)
__atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
__atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
__atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
__atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
#else
atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
#endif
Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
__syncthreads();
labels[tid] = 0;
__syncthreads();
#if (__CUDA_ARCH__ < 120)
__atomicInc((unsigned int*)labels + cls, n);
#else
atomicInc((unsigned int*)labels + cls, n);
#endif
Emulation::smem::atomicInc((unsigned int*)labels + cls, n);
__syncthreads();
*nclasses = 0;
...
...
@@ -357,12 +344,8 @@ namespace cv { namespace gpu { namespace device
int* r1 = rrects + tid * 4;
int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);
#if (__CUDA_ARCH__ < 120)
objects[__atomicInc(nclasses, n)] = r_out;
#else
int aidx = atomicInc(nclasses, n);
int aidx = Emulation::smem::atomicInc(nclasses, n);
objects[aidx] = r_out;
#endif
}
}
...
...
@@ -387,5 +370,122 @@ namespace cv { namespace gpu { namespace device
disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
cudaSafeCall( cudaGetLastError() );
}
struct Cascade
{
__host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves,
const int* _subsets, const uchar4* _features, int _subsetSize)
: stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize){}
__device__ __forceinline__ bool operator() (int y, int x, int* integral, const int pitch/*, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n*/) const
{
int current_node = 0;
int current_leave = 0;
for (int s = 0; s < nstages; ++s)
{
float sum = 0;
Stage stage = stages[s];
for (int t = 0; t < stage.ntrees; t++)
{
ClNode node = nodes[current_node];
uchar4 feature = features[node.featureIdx];
int shift;
int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift);
int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
sum += leaves[idx];
current_node += 1;
current_leave += 2;
}
if (sum < stage.threshold)
return false;
}
return true;
}
const Stage* stages;
const int nstages;
const ClNode* nodes;
const float* leaves;
const int* subsets;
const uchar4* features;
const int subsetSize;
const LBP evaluator;
};
// stepShift, scale, width_k, sum_prev => y = sum_prev + tid_k / width_k, x = tid_k - tid_k / width_k
__global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
const int workAmount, int* integral, const int pitch, DevMem2D_<int4> objects, unsigned int* classified)
{
int ftid = blockIdx.x * blockDim.x + threadIdx.x;
if (ftid >= workAmount ) return;
int sum = 0;
// float scale = 1.0f;
float stepShift = (scale <= 2.f) ? 2.0 : 1.0;
int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift);
int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift);
// if (!ftid)
// printf("!!!!: %d %d", w, h);
int framTid = ftid;
int i = 0;
while (1)
{
if (framTid < (w - 1) * (h - 1)) break;
i++;
sum += __float2int_rn(frameW / scale) + 1;
framTid -= w * h;
scale *= factor;
stepShift = (scale <= 2.f) ? 2.0 : 1.0;
int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift);
int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift);
}
int y = (framTid / w);
int x = (framTid - y * w) * stepShift;
y *= stepShift;
x += sum;
// if (i == 2)
// printf("!!!!!!!!!!!!!! %f %d %d %d\n", windowW * scale, sum, y, x);
if (cascade(y, x, integral, pitch))
{
int4 rect;
rect.x = roundf( (x - sum) * scale);
rect.y = roundf(y * scale);
rect.z = roundf(windowW * scale);
rect.w = roundf(windowH * scale);
if (rect.x > frameW || rect.y > frameH) return;
// printf("OUTLAUER %d %d %d %d %d %d %d %d %d %f %f\n", x, y, ftid, framTid, rect.x, rect.y, sum, w, h, stepShift, scale);
// printf("passed: %d %d ---- %d %d %d %d %d\n", y, x, rect.x, rect.y, rect.z, rect.w, sum);
int res = Emulation::smem::atomicInc(classified, (unsigned int)objects.cols);
objects(0, res) = rect;
}
}
void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
const int subsetSize, DevMem2D_<int4> objects, unsigned int* classified, DevMem2Di integral)
{
const int block = 256;
int grid = divUp(workAmount, block);
Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), integral.step / sizeof(int), objects, classified);
}
}
}}}
\ No newline at end of file
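Each lbp_cascade thread converts its flat index into a pyramid level plus an (x, y) scan position, then offsets x by the column at which that level's integral was packed. A simplified CPU re-implementation of that mapping (not part of the commit; sizes are assumptions, stride and rounding details of the kernel are reduced to the essentials):

    #include <cmath>
    #include <cstdio>

    // CPU sketch of the flat-thread-id -> (scale, x, y) mapping used by lbp_cascade.
    int main()
    {
        const int frameW = 640, frameH = 480, winW = 24, winH = 24;  // assumptions
        const float factor = 1.2f;                                   // assumed scale step

        int ftid = 50000;          // example flat thread index
        float scale = 1.0f;
        int columnOffset = 0;      // where this level's integral starts in the packed buffer

        for (;;)
        {
            float stepShift = (scale <= 2.f) ? 2.0f : 1.0f;
            int w = static_cast<int>(std::ceil((static_cast<int>(frameW / scale) - winW + 1) / stepShift));
            int h = static_cast<int>(std::ceil((static_cast<int>(frameH / scale) - winH + 1) / stepShift));
            if (w <= 0 || h <= 0) break;           // past the last feasible level

            if (ftid < w * h)
            {
                int y = static_cast<int>((ftid / w) * stepShift);
                int x = static_cast<int>((ftid - (ftid / w) * w) * stepShift);
                std::printf("scale %.3f: window at (%d, %d), integral column offset %d\n",
                            scale, x, y, columnOffset);
                break;
            }

            ftid -= w * h;                                           // skip this level's work
            columnOffset += static_cast<int>(frameW / scale) + 1;    // skip this level's integral
            scale *= factor;
        }
        return 0;
    }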
modules/gpu/src/opencv2/gpu/device/emulation.hpp
...
@@ -44,6 +44,7 @@
#define OPENCV_GPU_EMULATION_HPP_
#include "warp_reduce.hpp"
#include <stdio.h>
namespace cv { namespace gpu { namespace device
{
...
@@ -52,10 +53,10 @@ namespace cv { namespace gpu { namespace device
        template<int CTA_SIZE>
        static __forceinline__ __device__ int Ballot(int predicate)
        {
#if (__CUDA_ARCH__ >= 200)
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
            return __ballot(predicate);
#else
            __shared__ volatile int cta_buffer[CTA_SIZE]
            __shared__ volatile int cta_buffer[CTA_SIZE];

            int tid = threadIdx.x;
            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
...
@@ -70,31 +71,52 @@ namespace cv { namespace gpu { namespace device
        template<typename T>
        static __device__ __forceinline__ T atomicInc(T* address, T val)
        {
#if (__CUDA_ARCH__ < 120)
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            T count;
            unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
            do
            {
                count = *address & TAG_MASK;
                count = tag | (count + 1);
                *address = count;
            } while (*address != count);

            return (count & TAG_MASK) - 1;
#else
            return ::atomicInc(address, val);
#endif
        }

        template<typename T>
        static __device__ __forceinline__ void atomicAdd(T* address, T val)
        {
#if (__CUDA_ARCH__ < 120)
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            T count;
            unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
            do
            {
                count = *address & TAG_MASK;
                count = tag | (count + val);
                *address = count;
            } while (*address != count);
#else
            ::atomicAdd(address, val);
#endif
        }

        template<typename T>
        __device__ __forceinline__ T __atomicMin(T* address, T val)
        static __device__ __forceinline__ T atomicMin(T* address, T val)
        {
#if (__CUDA_ARCH__ < 120)
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            T count = min(*address, val);
            do
            {
                *address = count;
            } while (*address > count);

            return count;
#else
            return ::atomicMin(address, val);
#endif
        }
    };
...
...
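The pre-SM 1.2 emulation works by tagging: each thread stamps the top 5 bits of the shared word with its thread id and retries until its own tagged value survives, so exactly one writer wins each update, while the low bits (TAG_MASK) carry the actual counter. A small host-side illustration of that bit layout (not part of the commit; single-threaded, so only the packing and unpacking are shown, not the race):

    #include <cstdio>

    // Host-side illustration of the tag/count packing used by the emulated
    // atomics above: the top 5 bits hold the writing thread's id, the rest
    // hold the counter value.
    #define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )

    int main()
    {
        unsigned int shared_word = 41;                 // pretend current counter value
        unsigned int tid = 13;                         // pretend threadIdx.x
        unsigned int tag = tid << ((sizeof(unsigned int) << 3) - 5U);

        unsigned int count = shared_word & TAG_MASK;   // strip any previous tag
        count = tag | (count + 1);                     // tagged, incremented value
        shared_word = count;                           // the "winning" write

        std::printf("stored word   : 0x%08x\n", shared_word);
        std::printf("counter value : %u\n", shared_word & TAG_MASK);                         // 42
        std::printf("winning thread: %u\n", shared_word >> ((sizeof(unsigned int) << 3) - 5U)); // 13
        std::printf("atomicInc returns %u (pre-increment value)\n", (count & TAG_MASK) - 1); // 41
        return 0;
    }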
modules/gpu/src/opencv2/gpu/device/lbp.hpp
...
@@ -44,52 +44,11 @@
#define __OPENCV_GPU_DEVICE_LBP_HPP_
#include "internal_shared.hpp"
#include <opencv2/gpu/device/emulation.hpp>
namespace cv { namespace gpu { namespace device
{
namespace lbp {

#define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )

    template<typename T>
    __device__ __forceinline__ T __atomicInc(T* address, T val)
    {
        T count;
        unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
        do
        {
            count = *address & TAG_MASK;
            count = tag | (count + 1);
            *address = count;
        } while (*address != count);

        return (count & TAG_MASK) - 1;
    }

    template<typename T>
    __device__ __forceinline__ void __atomicAdd(T* address, T val)
    {
        T count;
        unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
        do
        {
            count = *address & TAG_MASK;
            count = tag | (count + val);
            *address = count;
        } while (*address != count);
    }

    template<typename T>
    __device__ __forceinline__ T __atomicMin(T* address, T val)
    {
        T count = min(*address, val);
        do
        {
            *address = count;
        } while (*address > count);

        return count;
    }

namespace lbp {

    struct Stage
    {
...
@@ -127,27 +86,25 @@ namespace lbp{
        unsigned tid = threadIdx.x;
        labels[tid] = tid;
        __syncthreads();

        for (unsigned int id = 0; id < n; id++)
        {
            if (tid != id && predicate(vec[tid], vec[id]))
            {
                int p = labels[tid];
                int q = labels[id];

                if (p != q)
                if (p < q)
                {
                    int m = min(p, q);
#if (__CUDA_ARCH__ < 120)
                    __atomicMin(labels + id, m);
#else
                    atomicMin(labels + id, m);
#endif
                    Emulation::smem::atomicMin(labels + id, p);
                }
                else if (p > q)
                {
                    Emulation::smem::atomicMin(labels + tid, q);
                }
            }
        }
        __syncthreads();
    }
} // lbp
} } } // namespaces
...
...
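The partition loop above starts every candidate with its own label and then repeatedly pulls each label down to the smallest label of any candidate it matches (now via Emulation::smem::atomicMin), so matching rectangles end up sharing one class id before grouping. A serial sketch of that min-label propagation (not part of the commit; the predicate is an illustrative 1-D distance test, whereas the GPU version compares rectangles with the grouping epsilon):

    #include <algorithm>
    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    // Serial sketch of the min-label propagation done in parallel above.
    int main()
    {
        std::vector<int> vec = {10, 12, 50, 52, 53, 90};   // stand-ins for candidate rects
        std::vector<int> labels(vec.size());
        for (size_t i = 0; i < vec.size(); ++i) labels[i] = static_cast<int>(i);

        bool changed = true;
        while (changed)                      // iterate until labels stabilize
        {
            changed = false;
            for (size_t tid = 0; tid < vec.size(); ++tid)
                for (size_t id = 0; id < vec.size(); ++id)
                    if (tid != id && std::abs(vec[tid] - vec[id]) <= 3)  // "same component"
                    {
                        int m = std::min(labels[tid], labels[id]);
                        if (labels[tid] != m || labels[id] != m)
                        {
                            labels[tid] = labels[id] = m;
                            changed = true;
                        }
                    }
        }

        for (size_t i = 0; i < vec.size(); ++i)
            std::printf("candidate %d -> class %d\n", vec[i], labels[i]);
        return 0;
    }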