opencv / Commits / 8274ed22

Commit 8274ed22 authored Jan 31, 2011 by Vladislav Vinogradov
fixed gpu tests (BruteForceMatcher_GPU, divide, phase, cartToPolar, async)
minor code refactoring

parent 7a29d96c
Showing 9 changed files with 417 additions and 533 deletions (+417 -533)
modules/gpu/include/opencv2/gpu/gpu.hpp        +6    -4
modules/gpu/src/brute_force_matcher.cpp        +19   -19
modules/gpu/src/cuda/brute_force_matcher.cu    +209  -316
modules/gpu/src/imgproc_gpu.cpp                +58   -29
tests/gpu/src/arithm.cpp                       +27   -14
tests/gpu/src/brute_force_matcher.cpp          +36   -23
tests/gpu/src/gputest_main.cpp                 +0    -1
tests/gpu/src/imgproc_gpu.cpp                  +25   -25
tests/gpu/src/operator_async_call.cpp          +37   -102
modules/gpu/include/opencv2/gpu/gpu.hpp

@@ -671,10 +671,12 @@ namespace cv
     //! output will have CV_32FC1 type
     CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect);

-    //! applies Canny edge detector and produces the edge map
-    //! supprots only CV_8UC1 source type
-    //! disabled until fix crash
-    CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
+    // applies Canny edge detector and produces the edge map
+    // disabled until fix crash
+    //CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
+    //CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, GpuMat& buffer, double threshold1, double threshold2, int apertureSize = 3);
+    //CV_EXPORTS void Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
+    //CV_EXPORTS void Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, GpuMat& buffer, double threshold1, double threshold2, int apertureSize = 3);

     //! computes Harris cornerness criteria at each image pixel
     CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);
modules/gpu/src/brute_force_matcher.cpp

@@ -104,6 +104,18 @@ namespace cv { namespace gpu { namespace bfmatcher
         const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
 }}}

+namespace
+{
+    class ImgIdxSetter
+    {
+    public:
+        ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
+        void operator()(DMatch& m) const { m.imgIdx = imgIdx; }
+    private:
+        int imgIdx;
+    };
+}
+
 cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)
 {
 }
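Aside: the hunk above hoists the ImgIdxSetter functor to the top of the file (it is deleted from its old location further down) so the match-download paths can share it. A minimal standalone sketch, not the commit's code, of how such a functor is applied — std::for_each stamps the owning image index onto every match, with SimpleMatch standing in for cv::DMatch:

// Build with: nvcc -o imgidx imgidx.cu   (plain C++ works too)
#include <algorithm>
#include <vector>
#include <cstdio>

struct SimpleMatch { int queryIdx, trainIdx, imgIdx; };  // stand-in for cv::DMatch

class ImgIdxSetter
{
public:
    explicit ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
    void operator()(SimpleMatch& m) const { m.imgIdx = imgIdx; }  // stamp one match
private:
    int imgIdx;
};

int main()
{
    std::vector<SimpleMatch> curMatches(3);
    // Matches produced against train image #2 get their imgIdx stamped in one pass.
    std::for_each(curMatches.begin(), curMatches.end(), ImgIdxSetter(2));
    std::printf("imgIdx of first match: %d\n", curMatches[0].imgIdx);
    return 0;
}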
@@ -185,7 +197,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
         return;

     CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous());
-    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && distance.size().area() == trainIdx.size().area());
+    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && distance.cols == trainIdx.cols);

     const int nQuery = trainIdx.cols;

@@ -309,8 +321,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
         return;

     CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous());
-    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.isContinuous());
-    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous());
+    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.isContinuous() && imgIdx.cols == trainIdx.cols);
+    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && imgIdx.cols == trainIdx.cols);

     const int nQuery = trainIdx.cols;

@@ -390,7 +402,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
     trainIdx.setTo(Scalar::all(-1));
     distance.create(nQuery, k, CV_32F);

-    allDist.create(nQuery, nTrain, CV_32F);
+    ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);

     match_caller_t func = match_callers[distType][queryDescs.depth()];
     CV_Assert(func != 0);
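Aside: the ensureSizeIsEnough() calls introduced above (and again in radiusMatch below) replace unconditional create() so scratch buffers are reallocated only when they must grow, letting repeated matcher calls reuse one device allocation. A minimal standalone sketch of that idea, assuming a bare CUDA buffer rather than cv::gpu::GpuMat; DeviceBuffer and this ensureSizeIsEnough are illustrative stand-ins, not the OpenCV implementation:

// Build with: nvcc -o ensure ensure.cu
#include <cuda_runtime.h>
#include <cstdio>

struct DeviceBuffer
{
    float* data;
    size_t capacity;  // elements currently allocated
    DeviceBuffer() : data(0), capacity(0) {}
};

// Grows buf if needed; a no-op when rows*cols already fits.
void ensureSizeIsEnough(size_t rows, size_t cols, DeviceBuffer& buf)
{
    size_t needed = rows * cols;
    if (needed > buf.capacity)
    {
        cudaFree(buf.data);  // safe on a null pointer
        cudaMalloc((void**)&buf.data, needed * sizeof(float));
        buf.capacity = needed;
    }
}

int main()
{
    DeviceBuffer allDist;
    ensureSizeIsEnough(300, 256, allDist);  // first call allocates
    ensureSizeIsEnough(300, 128, allDist);  // reuses the existing allocation
    std::printf("capacity: %zu elements\n", allDist.capacity);
    cudaFree(allDist.data);
    return 0;
}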
@@ -451,18 +463,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
     knnMatchDownload(trainIdx, distance, matches, compactResult);
 }

-namespace
-{
-    class ImgIdxSetter
-    {
-    public:
-        ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
-        void operator()(DMatch& m) const { m.imgIdx = imgIdx; }
-    private:
-        int imgIdx;
-    };
-}
-
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs,
     vector< vector<DMatch> >& matches, int knn, const vector<GpuMat>& masks, bool compactResult)
 {

@@ -538,9 +538,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
     CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);
     CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols);
-    CV_Assert(trainIdx.empty() || trainIdx.rows == nQuery);
+    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));

-    nMatches.create(1, nQuery, CV_32SC1);
+    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
     nMatches.setTo(Scalar::all(0));

     if (trainIdx.empty())
     {

@@ -561,7 +561,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
         return;

     CV_Assert(trainIdx.type() == CV_32SC1);
-    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.size().area() == trainIdx.rows);
+    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows);
     CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());

     const int nQuery = trainIdx.rows;
modules/gpu/src/cuda/brute_force_matcher.cu

@@ -64,6 +64,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
             return mask.ptr(queryIdx)[trainIdx] != 0;
         }
+    private:
         PtrStep mask;
     };

@@ -82,6 +83,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
             return curMask.data == 0 || curMask.ptr(queryIdx)[trainIdx] != 0;
         }
+    private:
         PtrStep* maskCollection;
         PtrStep curMask;

@@ -102,172 +104,55 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////
     // Reduce Sum

-    template <int BLOCK_DIM_X>
-    __device__ void reduceSum(float* sdiff, float mySum, int tid)
-    {
-        sdiff[tid] = mySum;
-        __syncthreads();
-
-        if (BLOCK_DIM_X == 512)
-        {
-            if (tid < 256)
-            {
-                sdiff[tid] = mySum += sdiff[tid + 256]; __syncthreads();
-                sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads();
-                sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();
-            }
-            volatile float* smem = sdiff;
-            smem[tid] = mySum += smem[tid + 32];
-            smem[tid] = mySum += smem[tid + 16];
-            smem[tid] = mySum += smem[tid + 8];
-            smem[tid] = mySum += smem[tid + 4];
-            smem[tid] = mySum += smem[tid + 2];
-            smem[tid] = mySum += smem[tid + 1];
-        }
-        if (BLOCK_DIM_X == 256)
-        {
-            if (tid < 128)
-            {
-                sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads();
-                sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();
-            }
-            volatile float* smem = sdiff;
-            smem[tid] = mySum += smem[tid + 32];
-            smem[tid] = mySum += smem[tid + 16];
-            smem[tid] = mySum += smem[tid + 8];
-            smem[tid] = mySum += smem[tid + 4];
-            smem[tid] = mySum += smem[tid + 2];
-            smem[tid] = mySum += smem[tid + 1];
-        }
-        if (BLOCK_DIM_X == 128)
-        {
-            if (tid < 64)
-            {
-                sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();
-            }
-            volatile float* smem = sdiff;
-            smem[tid] = mySum += smem[tid + 32];
-            smem[tid] = mySum += smem[tid + 16];
-            smem[tid] = mySum += smem[tid + 8];
-            smem[tid] = mySum += smem[tid + 4];
-            smem[tid] = mySum += smem[tid + 2];
-            smem[tid] = mySum += smem[tid + 1];
-        }
-
-        volatile float* smem = sdiff;
-        if (BLOCK_DIM_X == 64)
-        {
-            if (tid < 32)
-            {
-                smem[tid] = mySum += smem[tid + 32];
-                smem[tid] = mySum += smem[tid + 16];
-                smem[tid] = mySum += smem[tid + 8];
-                smem[tid] = mySum += smem[tid + 4];
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 32)
-        {
-            if (tid < 16)
-            {
-                smem[tid] = mySum += smem[tid + 16];
-                smem[tid] = mySum += smem[tid + 8];
-                smem[tid] = mySum += smem[tid + 4];
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 16)
-        {
-            if (tid < 8)
-            {
-                smem[tid] = mySum += smem[tid + 8];
-                smem[tid] = mySum += smem[tid + 4];
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 8)
-        {
-            if (tid < 4)
-            {
-                smem[tid] = mySum += smem[tid + 4];
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 4)
-        {
-            if (tid < 2)
-            {
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 2)
-        {
-            if (tid < 1)
-            {
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-    }
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // loadDescsVals
-
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, typename T>
-    __device__ void loadDescsVals(const T* descs, int desc_len, float* smem, float* queryVals)
-    {
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-        if (tid < desc_len)
-        {
-            smem[tid] = (float)descs[tid];
-        }
-        __syncthreads();
-
-        #pragma unroll
-        for (int i = threadIdx.x; i < MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X)
-        {
-            *queryVals = smem[i];
-            ++queryVals;
-        }
-    }
+    template <int BLOCK_DIM_X> __device__ void reduceSum(float* sdiff_row, float& mySum);
+
+    template <> __device__ void reduceSum<16>(float* sdiff_row, float& mySum)
+    {
+        volatile float* smem = sdiff_row;
+
+        smem[threadIdx.x] = mySum;
+
+        if (threadIdx.x < 8)
+        {
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 8];
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 4];
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 2];
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 1];
+        }
+    }

     ///////////////////////////////////////////////////////////////////////////////
     // Distance

-    template <int BLOCK_DIM_X>
     class L1Dist
     {
     public:
-        __device__ L1Dist() : mySum(0) {}
+        __device__ L1Dist() : mySum(0.0f) {}

         __device__ void reduceIter(float val1, float val2)
         {
             mySum += fabs(val1 - val2);
         }

-        __device__ void reduceAll(float* sdiff, int tid)
+        template <int BLOCK_DIM_X>
+        __device__ void reduceAll(float* sdiff_row)
         {
-            reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);
+            reduceSum<BLOCK_DIM_X>(sdiff_row, mySum);
         }

-        static __device__ float finalResult(float res)
+        __device__ operator float() const
         {
-            return res;
+            return mySum;
         }
     private:
         float mySum;
     };

-    template <int BLOCK_DIM_X>
     class L2Dist
     {
     public:
-        __device__ L2Dist() : mySum(0) {}
+        __device__ L2Dist() : mySum(0.0f) {}

         __device__ void reduceIter(float val1, float val2)
         {
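Aside: the new reduceSum<16> relies on warp-synchronous execution — with only 16 active threads, shared-memory traffic through a volatile pointer needs no __syncthreads() on the pre-Volta GPUs this code targets. A standalone sketch, not from the commit, of the same pattern inside a complete kernel; the kernel name and launch shape are illustrative:

// Build with: nvcc -o rowsum rowsum.cu   (pattern assumes pre-Volta warp lockstep)
#include <cuda_runtime.h>
#include <cstdio>

__global__ void rowSum16(const float* rows, float* sums)
{
    // One 16-thread block reduces one 16-element input row.
    __shared__ float sdiff[16];
    volatile float* smem = sdiff;

    float mySum = rows[blockIdx.x * 16 + threadIdx.x];
    smem[threadIdx.x] = mySum;

    if (threadIdx.x < 8)  // tree reduction, no __syncthreads() inside a warp slice
    {
        smem[threadIdx.x] = mySum += smem[threadIdx.x + 8];
        smem[threadIdx.x] = mySum += smem[threadIdx.x + 4];
        smem[threadIdx.x] = mySum += smem[threadIdx.x + 2];
        smem[threadIdx.x] = mySum += smem[threadIdx.x + 1];
    }

    if (threadIdx.x == 0)
        sums[blockIdx.x] = smem[0];  // thread 0 holds the row total
}

int main()
{
    float h[16], *d_rows, *d_sums, result;
    for (int i = 0; i < 16; ++i) h[i] = 1.0f;  // expected sum: 16
    cudaMalloc((void**)&d_rows, sizeof(h));
    cudaMalloc((void**)&d_sums, sizeof(float));
    cudaMemcpy(d_rows, h, sizeof(h), cudaMemcpyHostToDevice);
    rowSum16<<<1, 16>>>(d_rows, d_sums);
    cudaMemcpy(&result, d_sums, sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("sum = %f\n", result);
    cudaFree(d_rows); cudaFree(d_sums);
    return 0;
}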
@@ -275,15 +160,17 @@ namespace cv { namespace gpu { namespace bfmatcher
             mySum += reg * reg;
         }

-        __device__ void reduceAll(float* sdiff, int tid)
+        template <int BLOCK_DIM_X>
+        __device__ void reduceAll(float* sdiff_row)
         {
-            reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);
+            reduceSum<BLOCK_DIM_X>(sdiff_row, mySum);
         }

-        static __device__ float finalResult(float res)
+        __device__ operator float() const
         {
-            return sqrtf(res);
+            return sqrtf(mySum);
         }
     private:
         float mySum;
     };

@@ -292,56 +179,81 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////
     // reduceDescDiff

     template <int BLOCK_DIM_X, typename Dist, typename T>
-    __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, float* sdiff)
+    __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist,
+        float* sdiff_row)
     {
-        const int tid = threadIdx.x;
-
-        Dist dist;
-
-        for (int i = tid; i < desc_len; i += BLOCK_DIM_X)
-            dist.reduceIter(queryDescs[i], trainDescs[i]);
-
-        dist.reduceAll(sdiff, tid);
+        for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X)
+            dist.reduceIter(queryDescs[i], trainDescs[i]);
+
+        dist.reduceAll<BLOCK_DIM_X>(sdiff_row);
     }

+    ///////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////// Match //////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // loadDescsVals
+
+    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, typename T>
+    __device__ void loadDescsVals(const T* descs, int desc_len, float* queryVals, float* smem)
+    {
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        if (tid < desc_len)
+        {
+            smem[tid] = (float)descs[tid];
+        }
+        __syncthreads();
+
+        #pragma unroll
+        for (int i = threadIdx.x; i < MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X)
+        {
+            *queryVals = smem[i];
+            ++queryVals;
+        }
+    }
+
     ///////////////////////////////////////////////////////////////////////////////
-    // reduceDescDiff_smem
+    // reduceDescDiffCached

     template <int N> struct UnrollDescDiff
     {
         template <typename Dist, typename T>
-        static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs,
-            int ind, int desc_len)
+        static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len,
+            Dist& dist, int ind)
         {
             if (ind < desc_len)
             {
                 dist.reduceIter(*queryVals, trainDescs[ind]);

                 ++queryVals;

-                UnrollDescDiff<N - 1>::calcCheck(dist, queryVals, trainDescs, ind + blockDim.x, desc_len);
+                UnrollDescDiff<N - 1>::calcCheck(queryVals, trainDescs, desc_len, dist, ind + blockDim.x);
             }
         }

         template <typename Dist, typename T>
-        static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)
+        static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist)
         {
             dist.reduceIter(*queryVals, *trainDescs);

             ++queryVals;
             trainDescs += blockDim.x;

-            UnrollDescDiff<N - 1>::calcWithoutCheck(dist, queryVals, trainDescs);
+            UnrollDescDiff<N - 1>::calcWithoutCheck(queryVals, trainDescs, dist);
         }
     };
     template <> struct UnrollDescDiff<0>
     {
         template <typename Dist, typename T>
-        static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs,
-            int ind, int desc_len)
+        static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len,
+            Dist& dist, int ind)
         {
         }

         template <typename Dist, typename T>
-        static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)
+        static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist)
         {
         }
     };
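Aside: UnrollDescDiff above unrolls the per-thread accumulation loop at compile time by recursing on a count template parameter, with the N == 0 specialization terminating the recursion, so the compiler emits N copies of the body with no loop overhead. A standalone host-side sketch, not from the commit, of the same trick in the shape of calcWithoutCheck:

// Build with: nvcc -o unroll unroll.cu   (plain C++ works too)
#include <cstdio>

template <int N> struct Unroll
{
    // Consumes one element, advances by the stride, then recurses with N - 1.
    static void accumulate(const float* vals, int stride, float& sum)
    {
        sum += *vals;
        Unroll<N - 1>::accumulate(vals + stride, stride, sum);
    }
};

template <> struct Unroll<0>
{
    static void accumulate(const float*, int, float&) {}  // recursion anchor
};

int main()
{
    const float vals[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float sum = 0.0f;
    Unroll<4>::accumulate(vals, 2, sum);  // vals[0] + vals[2] + vals[4] + vals[6]
    std::printf("sum = %f\n", sum);       // 16
    return 0;
}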
@@ -351,106 +263,82 @@ namespace cv { namespace gpu { namespace bfmatcher
     struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, false>
     {
         template <typename Dist, typename T>
-        static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)
+        static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist)
         {
-            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(dist, queryVals, trainDescs,
-                threadIdx.x, desc_len);
+            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(queryVals, trainDescs, desc_len,
+                dist, threadIdx.x);
         }
     };
     template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN>
     struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, true>
     {
         template <typename Dist, typename T>
-        static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)
+        static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist)
         {
-            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(dist, queryVals,
-                trainDescs + threadIdx.x);
+            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(queryVals,
+                trainDescs + threadIdx.x, dist);
         }
     };

     template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T>
-    __device__ void reduceDescDiff_smem(const float* queryVals, const T* trainDescs, int desc_len, float* sdiff)
+    __device__ void reduceDescDiffCached(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist,
+        float* sdiff_row)
     {
-        const int tid = threadIdx.x;
-
-        Dist dist;
-
-        DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(dist, queryVals,
-            trainDescs, desc_len);
-
-        dist.reduceAll(sdiff, tid);
+        DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(queryVals,
+            trainDescs, desc_len, dist);
+
+        dist.reduceAll<BLOCK_DIM_X>(sdiff_row);
     }

-    ///////////////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////////// Match //////////////////////////////////////
-    ///////////////////////////////////////////////////////////////////////////////////
-
     ///////////////////////////////////////////////////////////////////////////////
-    // warpReduceMin
+    // warpReduceMinIdxIdx

-    template <int BLOCK_DIM_Y>
-    __device__ void warpReduceMin(int tid, volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx)
-    {
-        float minSum = sdata[tid];
-
-        if (BLOCK_DIM_Y >= 64)
-        {
-            float reg = sdata[tid + 32];
-            if (reg < minSum)
-            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 32];
-                simgIdx[tid] = simgIdx[tid + 32];
-            }
-        }
-        if (BLOCK_DIM_Y >= 32)
-        {
-            float reg = sdata[tid + 16];
-            if (reg < minSum)
-            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 16];
-                simgIdx[tid] = simgIdx[tid + 16];
-            }
-        }
-        if (BLOCK_DIM_Y >= 16)
-        {
-            float reg = sdata[tid + 8];
-            if (reg < minSum)
-            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 8];
-                simgIdx[tid] = simgIdx[tid + 8];
-            }
-        }
-        if (BLOCK_DIM_Y >= 8)
-        {
-            float reg = sdata[tid + 4];
-            if (reg < minSum)
-            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 4];
-                simgIdx[tid] = simgIdx[tid + 4];
-            }
-        }
-        if (BLOCK_DIM_Y >= 4)
-        {
-            float reg = sdata[tid + 2];
-            if (reg < minSum)
-            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 2];
-                simgIdx[tid] = simgIdx[tid + 2];
-            }
-        }
-        if (BLOCK_DIM_Y >= 2)
-        {
-            float reg = sdata[tid + 1];
-            if (reg < minSum)
-            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 1];
-                simgIdx[tid] = simgIdx[tid + 1];
-            }
-        }
-    }
+    template <int BLOCK_DIM_Y>
+    __device__ void warpReduceMinIdxIdx(float& myMin, int& myBestTrainIdx, int& myBestImgIdx,
+        volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx);
+
+    template <>
+    __device__ void warpReduceMinIdxIdx<16>(float& myMin, int& myBestTrainIdx, int& myBestImgIdx,
+        volatile float* smin, volatile int* strainIdx, volatile int* simgIdx)
+    {
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        if (tid < 8)
+        {
+            myMin = smin[tid];
+            myBestTrainIdx = strainIdx[tid];
+            myBestImgIdx = simgIdx[tid];
+
+            float reg = smin[tid + 8];
+            if (reg < myMin)
+            {
+                smin[tid] = myMin = reg;
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 8];
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 8];
+            }
+
+            reg = smin[tid + 4];
+            if (reg < myMin)
+            {
+                smin[tid] = myMin = reg;
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 4];
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 4];
+            }
+
+            reg = smin[tid + 2];
+            if (reg < myMin)
+            {
+                smin[tid] = myMin = reg;
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 2];
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 2];
+            }
+
+            reg = smin[tid + 1];
+            if (reg < myMin)
+            {
+                smin[tid] = myMin = reg;
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 1];
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 1];
+            }
+        }
+    }
@@ -458,9 +346,9 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////
     // findBestMatch

-    template <int BLOCK_DIM_Y, typename Dist>
-    __device__ void findBestMatch(int queryIdx, float myMin, int myBestTrainIdx, int myBestImgIdx,
-        float* smin, int* strainIdx, int* simgIdx, int* trainIdx, int* imgIdx, float* distance)
+    template <int BLOCK_DIM_Y>
+    __device__ void findBestMatch(float& myMin, int& myBestTrainIdx, int& myBestImgIdx,
+        float* smin, int* strainIdx, int* simgIdx)
     {
         if (threadIdx.x == 0)
         {

@@ -470,27 +358,13 @@ namespace cv { namespace gpu { namespace bfmatcher
         }
         __syncthreads();

-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        if (tid < 32)
-            warpReduceMin<BLOCK_DIM_Y>(tid, smin, strainIdx, simgIdx);
-
-        if (threadIdx.x == 0 && threadIdx.y == 0)
-        {
-            float minSum = smin[0];
-            int bestTrainIdx = strainIdx[0];
-            int bestImgIdx = simgIdx[0];
-
-            imgIdx[queryIdx] = bestImgIdx;
-            trainIdx[queryIdx] = bestTrainIdx;
-            distance[queryIdx] = Dist::finalResult(minSum);
-        }
+        warpReduceMinIdxIdx<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx);
     }

     ///////////////////////////////////////////////////////////////////////////////
     // ReduceDescCalculator

-    template <int BLOCK_DIM_X, typename Dist, typename T>
+    template <int BLOCK_DIM_X, typename T>
     class ReduceDescCalculatorSimple
     {
     public:

@@ -499,29 +373,30 @@ namespace cv { namespace gpu { namespace bfmatcher
             queryDescs = queryDescs_;
         }

-        __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const
+        template <typename Dist>
+        __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const
         {
-            reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, desc_len, sdiff_row);
+            reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, desc_len, dist, sdiff_row);
        }

     private:
         const T* queryDescs;
     };

-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN,
-        typename Dist, typename T>
-    class ReduceDescCalculatorSmem
+    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename T>
+    class ReduceDescCalculatorCached
     {
     public:
         __device__ void prepare(const T* queryDescs, int desc_len, float* smem)
         {
-            loadDescsVals<BLOCK_DIM_X, BLOCK_DIM_Y, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, smem, queryVals);
+            loadDescsVals<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, queryVals, smem);
         }

-        __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const
+        template <typename Dist>
+        __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const
         {
-            reduceDescDiff_smem<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist>(queryVals, trainDescs,
-                desc_len, sdiff_row);
+            reduceDescDiffCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>(queryVals, trainDescs,
+                desc_len, dist, sdiff_row);
         }

     private:

@@ -531,26 +406,26 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////
     // matchDescs loop

-    template <typename ReduceDescCalculator, typename T, typename Mask>
-    __device__ void matchDescs(int queryIdx, const int imgIdx, const DevMem2D_<T>& trainDescs_,
+    template <typename Dist, typename ReduceDescCalculator, typename T, typename Mask>
+    __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& trainDescs_,
         const Mask& m, const ReduceDescCalculator& reduceDescCalc,
-        float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx)
+        float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row)
     {
-        const T* trainDescs = trainDescs_.ptr(threadIdx.y);
-        const int trainDescsStep = blockDim.y * trainDescs_.step / sizeof(T);
-        for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows;
-             trainIdx += blockDim.y, trainDescs += trainDescsStep)
+        for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; trainIdx += blockDim.y)
         {
             if (m(queryIdx, trainIdx))
             {
-                reduceDescCalc.calc(trainDescs, trainDescs_.cols, sdiff_row);
+                const T* trainDescs = trainDescs_.ptr(trainIdx);
+
+                Dist dist;
+
+                reduceDescCalc.calc(trainDescs, trainDescs_.cols, dist, sdiff_row);

                 if (threadIdx.x == 0)
                 {
-                    float reg = sdiff_row[0];
-                    if (reg < myMin)
+                    if (dist < myMin)
                     {
-                        myMin = reg;
+                        myMin = dist;
                         myBestTrainIdx = trainIdx;
                         myBestImgIdx = imgIdx;
                     }

@@ -570,18 +445,19 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
         }

-        template <typename ReduceDescCalculator, typename Mask>
+        template <typename Dist, typename ReduceDescCalculator, typename Mask>
         __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc,
-            float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const
+            float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const
         {
-            matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc,
-                sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);
+            matchDescs<Dist>(queryIdx, 0, trainDescs, m, reduceDescCalc,
+                myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
         }

         __device__ int desc_len() const
         {
             return trainDescs.cols;
         }
     private:
         DevMem2D_<T> trainDescs;
     };

@@ -595,16 +471,16 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
         }

-        template <typename ReduceDescCalculator, typename Mask>
+        template <typename Dist, typename ReduceDescCalculator, typename Mask>
         __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc,
-            float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const
+            float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const
         {
             for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
             {
                 DevMem2D_<T> trainDescs = trainCollection[imgIdx];
                 m.nextMask();
-                matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc,
-                    sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);
+                matchDescs<Dist>(queryIdx, imgIdx, trainDescs, m, reduceDescCalc,
+                    myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
             }
         }

@@ -612,6 +488,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
             return desclen;
         }
+    private:
         const DevMem2D_<T>* trainCollection;
         int nImg;

@@ -623,12 +500,10 @@ namespace cv { namespace gpu { namespace bfmatcher
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename ReduceDescCalculator, typename Dist, typename T,
         typename Train, typename Mask>
-    __global__ void match(PtrStep_<T> queryDescs_, Train train, Mask mask, int* trainIdx, int* imgIdx, float* distance)
+    __global__ void match(const PtrStep_<T> queryDescs_, const Train train, const Mask mask,
+        int* trainIdx, int* imgIdx, float* distance)
     {
-        __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
-        __shared__ float smin[64];
-        __shared__ int strainIdx[64];
-        __shared__ int simgIdx[64];
+        __shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y];

         const int queryIdx = blockIdx.x;

@@ -637,24 +512,39 @@ namespace cv { namespace gpu { namespace bfmatcher
         float myMin = numeric_limits_gpu<float>::max();

         {
-            float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
+            float* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;

             Mask m = mask;

             ReduceDescCalculator reduceDescCalc;

-            reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), sdiff);
+            reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), smem);

-            train.loop(queryIdx, m, reduceDescCalc, sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);
+            train.template loop<Dist>(queryIdx, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
         }
         __syncthreads();

-        findBestMatch<BLOCK_DIM_Y, Dist>(queryIdx, myMin, myBestTrainIdx, myBestImgIdx,
-            smin, strainIdx, simgIdx, trainIdx, imgIdx, distance);
+        float* smin = smem;
+        int* strainIdx = (int*)(smin + BLOCK_DIM_Y);
+        int* simgIdx = strainIdx + BLOCK_DIM_Y;
+
+        findBestMatch<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx,
+            smin, strainIdx, simgIdx);
+
+        if (threadIdx.x == 0 && threadIdx.y == 0)
+        {
+            imgIdx[queryIdx] = myBestImgIdx;
+            trainIdx[queryIdx] = myBestTrainIdx;
+            distance[queryIdx] = myMin;
+        }
    }
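Aside: a recurring change in this file is that the distance accumulator is now constructed at the call site and converts itself to float via operator float(), replacing the old static finalResult() applied to sdiff_row[0] — that is what lets the kernels write `if (dist < myMin)` directly. A standalone host-side sketch, not the commit's __device__ code, of that functor design:

// Build with: nvcc -o dist dist.cu   (plain C++ works too)
#include <cmath>
#include <cstdio>

class L2DistSketch
{
public:
    L2DistSketch() : mySum(0.0f) {}

    // Accumulates one squared difference per call, like reduceIter.
    void reduceIter(float val1, float val2)
    {
        float reg = val1 - val2;
        mySum += reg * reg;
    }

    // Implicit conversion finishes the distance (sqrt for L2).
    operator float() const { return std::sqrt(mySum); }

private:
    float mySum;
};

int main()
{
    const float a[3] = {0, 0, 0}, b[3] = {3, 4, 0};
    L2DistSketch dist;
    for (int i = 0; i < 3; ++i)
        dist.reduceIter(a[i], b[i]);

    float myMin = 10.0f;
    if (dist < myMin)  // reads like the kernel's comparison
        myMin = dist;
    std::printf("distance = %f\n", myMin);  // 5
    return 0;
}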
    ///////////////////////////////////////////////////////////////////////////////
    // Match kernel callers

-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T,
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T,
         typename Train, typename Mask>
-    void match_caller(const DevMem2D_<T>& queryDescs, const Train& train,
+    void matchSimple_caller(const DevMem2D_<T>& queryDescs, const Train& train,
         const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
     {
         StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp

@@ -662,15 +552,15 @@ namespace cv { namespace gpu { namespace bfmatcher
         dim3 grid(queryDescs.rows, 1, 1);
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

-        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, Dist<BLOCK_DIM_X>, T>,
-            Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
+        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>,
+            Dist, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
             imgIdx.data, distance.data);
         cudaSafeCall( cudaThreadSynchronize() );
     }

     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN,
-        template <int> class Dist, typename T, typename Train, typename Mask>
-    void match_smem_caller(const DevMem2D_<T>& queryDescs, const Train& train,
+        typename Dist, typename T, typename Train, typename Mask>
+    void matchCached_caller(const DevMem2D_<T>& queryDescs, const Train& train,
         const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
     {
         StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp

@@ -680,9 +570,10 @@ namespace cv { namespace gpu { namespace bfmatcher
         dim3 grid(queryDescs.rows, 1, 1);
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

-        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSmem<BLOCK_DIM_X, BLOCK_DIM_Y,
-            MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist<BLOCK_DIM_X>, T>,
-            Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
+        match<BLOCK_DIM_X, BLOCK_DIM_Y,
+            ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T>,
+            Dist, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
             imgIdx.data, distance.data);
         cudaSafeCall( cudaThreadSynchronize() );

@@ -691,24 +582,24 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////
     // Match kernel chooser

-    template <template <int> class Dist, typename T, typename Train, typename Mask>
+    template <typename Dist, typename T, typename Train, typename Mask>
     void match_chooser(const DevMem2D_<T>& queryDescs, const Train& train,
         const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
     {
         if (queryDescs.cols < 64)
-            match_smem_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
         else if (queryDescs.cols == 64)
-            match_smem_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
         else if (queryDescs.cols < 128)
-            match_smem_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
         else if (queryDescs.cols == 128)
-            match_smem_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
         else if (queryDescs.cols < 256)
-            match_smem_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
         else if (queryDescs.cols == 256)
-            match_smem_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
         else
-            match_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);

         cudaSafeCall( cudaThreadSynchronize() );
     }
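Aside: match_chooser maps a runtime descriptor length onto kernels compiled for fixed MAX_DESCRIPTORS_LEN bounds, with a compile-time flag marking the exact-length cases that can skip bounds checks. A standalone sketch, not from the commit, of that dispatch shape with hypothetical process/chooser names:

// Build with: nvcc -o chooser chooser.cu   (plain C++ works too)
#include <cstdio>

template <int MAX_LEN, bool EXACT>
void process(int len)  // stand-in for a matchCached_caller instantiation
{
    std::printf("len=%d -> MAX_LEN=%d, EXACT=%s\n", len, MAX_LEN, EXACT ? "true" : "false");
}

void chooser(int len)
{
    if (len < 64)        process<64,  false>(len);
    else if (len == 64)  process<64,  true >(len);
    else if (len < 128)  process<128, false>(len);
    else if (len == 128) process<128, true >(len);
    else                 process<0,   false>(len);  // fallback, like matchSimple_caller
}

int main()
{
    chooser(50);
    chooser(64);
    chooser(100);
    return 0;
}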
@@ -828,41 +719,41 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
             const T* trainDescs = trainDescs_.ptr(trainIdx);

-            float dist = numeric_limits_gpu<float>::max();
+            float myDist = numeric_limits_gpu<float>::max();

             if (mask(queryIdx, trainIdx))
             {
-                reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, trainDescs_.cols, sdiff_row);
+                Dist dist;
+
+                reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, trainDescs_.cols, dist, sdiff_row);

                 if (threadIdx.x == 0)
-                {
-                    dist = Dist::finalResult(sdiff_row[0]);
-                }
+                    myDist = dist;
             }

             if (threadIdx.x == 0)
-                distance.ptr(queryIdx)[trainIdx] = dist;
+                distance.ptr(queryIdx)[trainIdx] = myDist;
         }
     }

     ///////////////////////////////////////////////////////////////////////////////
     // Calc distance kernel caller

-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, typename Mask>
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
     void calcDistance_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
         const Mask& mask, const DevMem2Df& distance)
     {
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
         dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);

-        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(
+        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(
             queryDescs, trainDescs, mask, distance);
         cudaSafeCall( cudaThreadSynchronize() );
     }

     ///////////////////////////////////////////////////////////////////////////////
-    // reduceMin
+    // warpReduceMinIdx

     template <int BLOCK_SIZE>
     __device__ void warpReduceMinIdx(volatile float* sdist, volatile int* strainIdx, float& myMin, int tid)

@@ -1103,25 +994,27 @@ namespace cv { namespace gpu { namespace bfmatcher
     {
 #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110

-        __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
+        __shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y];

-        float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
+        float* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;

         const int queryIdx = blockIdx.x;
         const T* queryDescs = queryDescs_.ptr(queryIdx);

         const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;

         if (trainIdx < trainDescs_.rows)
         {
             const T* trainDescs = trainDescs_.ptr(trainIdx);

             if (mask(queryIdx, trainIdx))
             {
-                reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, trainDescs_.cols, sdiff_row);
+                Dist dist;
+
+                reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, trainDescs_.cols, dist, sdiff_row);

                 if (threadIdx.x == 0)
                 {
-                    float dist = Dist::finalResult(sdiff_row[0]);
                     if (dist < maxDistance)
                     {
                         unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1);

@@ -1141,7 +1034,7 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////
     // Radius Match kernel caller

-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, typename Mask>
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
     void radiusMatch_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
         float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches,
         const DevMem2Df& distance)

@@ -1149,7 +1042,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
         dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);

-        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(
+        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(
             queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance);
         cudaSafeCall( cudaThreadSynchronize() );
modules/gpu/src/imgproc_gpu.cpp

@@ -66,7 +66,10 @@ void cv::gpu::integral(const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::columnSum(const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&) { throw_nogpu(); }
-void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }
+//void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }
+//void cv::gpu::Canny(const GpuMat&, GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }
+//void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }
+//void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }
 void cv::gpu::evenLevels(GpuMat&, int, int, int) { throw_nogpu(); }
 void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }
 void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*) { throw_nogpu(); }
@@ -655,34 +658,60 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
 ////////////////////////////////////////////////////////////////////////
 // Canny

-void cv::gpu::Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize)
-{
-    CV_Assert(!"disabled until fix crash");
-
-    CV_Assert(image.type() == CV_8UC1);
-
-    GpuMat srcDx, srcDy;
-
-    Sobel(image, srcDx, -1, 1, 0, apertureSize);
-    Sobel(image, srcDy, -1, 0, 1, apertureSize);
-
-    srcDx.convertTo(srcDx, CV_32F);
-    srcDy.convertTo(srcDy, CV_32F);
-
-    edges.create(image.size(), CV_8UC1);
-
-    NppiSize sz;
-    sz.height = image.rows;
-    sz.width = image.cols;
-
-    int bufsz;
-    nppSafeCall( nppiCannyGetBufferSize(sz, &bufsz) );
-    GpuMat buf(1, bufsz, CV_8UC1);
-
-    nppSafeCall( nppiCanny_32f8u_C1R(srcDx.ptr<Npp32f>(), srcDx.step, srcDy.ptr<Npp32f>(), srcDy.step,
-        edges.ptr<Npp8u>(), edges.step, sz, (Npp32f)threshold1, (Npp32f)threshold2, buf.ptr<Npp8u>()) );
-
-    cudaSafeCall( cudaThreadSynchronize() );
-}
+//void cv::gpu::Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize)
+//{
+//    CV_Assert(!"disabled until fix crash");
+//
+//    GpuMat srcDx, srcDy;
+//
+//    Sobel(image, srcDx, CV_32F, 1, 0, apertureSize);
+//    Sobel(image, srcDy, CV_32F, 0, 1, apertureSize);
+//
+//    GpuMat buf;
+//
+//    Canny(srcDx, srcDy, edges, buf, threshold1, threshold2, apertureSize);
+//}
+//
+//void cv::gpu::Canny(const GpuMat& image, GpuMat& edges, GpuMat& buf, double threshold1, double threshold2, int apertureSize)
+//{
+//    CV_Assert(!"disabled until fix crash");
+//
+//    GpuMat srcDx, srcDy;
+//
+//    Sobel(image, srcDx, CV_32F, 1, 0, apertureSize);
+//    Sobel(image, srcDy, CV_32F, 0, 1, apertureSize);
+//
+//    Canny(srcDx, srcDy, edges, buf, threshold1, threshold2, apertureSize);
+//}
+//
+//void cv::gpu::Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, double threshold1, double threshold2, int apertureSize)
+//{
+//    CV_Assert(!"disabled until fix crash");
+//
+//    GpuMat buf;
+//    Canny(srcDx, srcDy, edges, buf, threshold1, threshold2, apertureSize);
+//}
+//
+//void cv::gpu::Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, GpuMat& buf, double threshold1, double threshold2, int apertureSize)
+//{
+//    CV_Assert(!"disabled until fix crash");
+//    CV_Assert(srcDx.type() == CV_32FC1 && srcDy.type() == CV_32FC1 && srcDx.size() == srcDy.size());
+//
+//    edges.create(srcDx.size(), CV_8UC1);
+//
+//    NppiSize sz;
+//    sz.height = srcDx.rows;
+//    sz.width = srcDx.cols;
+//
+//    int bufsz;
+//    nppSafeCall( nppiCannyGetBufferSize(sz, &bufsz) );
+//    ensureSizeIsEnough(1, bufsz, CV_8UC1, buf);
+//
+//    nppSafeCall( nppiCanny_32f8u_C1R(srcDx.ptr<Npp32f>(), srcDx.step, srcDy.ptr<Npp32f>(), srcDy.step,
+//        edges.ptr<Npp8u>(), edges.step, sz, (Npp32f)threshold1, (Npp32f)threshold2, buf.ptr<Npp8u>()) );
+//
+//    cudaSafeCall( cudaThreadSynchronize() );
+//}

 ////////////////////////////////////////////////////////////////////////
 // Histogram
tests/gpu/src/arithm.cpp

@@ -66,45 +66,58 @@ protected:
     virtual int test(const Mat& mat1, const Mat& mat2) = 0;

-    int CheckNorm(const Mat& m1, const Mat& m2);
-    int CheckNorm(const Scalar& s1, const Scalar& s2);
-    int CheckNorm(double d1, double d2);
+    int CheckNorm(const Mat& m1, const Mat& m2, double eps = 1e-5);
+    int CheckNorm(const Scalar& s1, const Scalar& s2, double eps = 1e-5);
+    int CheckNorm(double d1, double d2, double eps = 1e-5);
 };

 int CV_GpuArithmTest::test(int type)
 {
     cv::Size sz(200, 200);
     cv::Mat mat1(sz, type), mat2(sz, type);
+
     cv::RNG rng(*ts->get_rng());
-    rng.fill(mat1, cv::RNG::UNIFORM, cv::Scalar::all(1), cv::Scalar::all(20));
-    rng.fill(mat2, cv::RNG::UNIFORM, cv::Scalar::all(1), cv::Scalar::all(20));
+
+    if (type != CV_32FC1)
+    {
+        rng.fill(mat1, cv::RNG::UNIFORM, cv::Scalar::all(1), cv::Scalar::all(20));
+        rng.fill(mat2, cv::RNG::UNIFORM, cv::Scalar::all(1), cv::Scalar::all(20));
+    }
+    else
+    {
+        rng.fill(mat1, cv::RNG::UNIFORM, cv::Scalar::all(0.1), cv::Scalar::all(1.0));
+        rng.fill(mat2, cv::RNG::UNIFORM, cv::Scalar::all(0.1), cv::Scalar::all(1.0));
+    }

     return test(mat1, mat2);
 }

-int CV_GpuArithmTest::CheckNorm(const Mat& m1, const Mat& m2)
+int CV_GpuArithmTest::CheckNorm(const Mat& m1, const Mat& m2, double eps)
 {
     double ret = norm(m1, m2, NORM_INF);

-    if (ret < 1e-5)
+    if (ret < eps)
         return CvTS::OK;

     ts->printf(CvTS::LOG, "\nNorm: %f\n", ret);
     return CvTS::FAIL_GENERIC;
 }

-int CV_GpuArithmTest::CheckNorm(const Scalar& s1, const Scalar& s2)
+int CV_GpuArithmTest::CheckNorm(const Scalar& s1, const Scalar& s2, double eps)
 {
-    double ret0 = CheckNorm(s1[0], s2[0]), ret1 = CheckNorm(s1[1], s2[1]), ret2 = CheckNorm(s1[2], s2[2]), ret3 = CheckNorm(s1[3], s2[3]);
+    int ret0 = CheckNorm(s1[0], s2[0], eps), ret1 = CheckNorm(s1[1], s2[1], eps), ret2 = CheckNorm(s1[2], s2[2], eps), ret3 = CheckNorm(s1[3], s2[3], eps);

     return (ret0 == CvTS::OK && ret1 == CvTS::OK && ret2 == CvTS::OK && ret3 == CvTS::OK) ? CvTS::OK : CvTS::FAIL_GENERIC;
 }

-int CV_GpuArithmTest::CheckNorm(double d1, double d2)
+int CV_GpuArithmTest::CheckNorm(double d1, double d2, double eps)
 {
     double ret = ::fabs(d1 - d2);

-    if (ret < 1e-5)
+    if (ret < eps)
         return CvTS::OK;

     ts->printf(CvTS::LOG, "\nNorm: %f\n", ret);
@@ -245,7 +258,7 @@ struct CV_GpuNppImageDivideTest : public CV_GpuArithmTest
         GpuMat gpuRes;
         cv::gpu::divide(gpu1, gpu2, gpuRes);

-        return CheckNorm(cpuRes, gpuRes);
+        return CheckNorm(cpuRes, gpuRes, 1.01f);
     }
 };

@@ -584,7 +597,7 @@ struct CV_GpuNppImagePhaseTest : public CV_GpuArithmTest
         GpuMat gpuRes;
         cv::gpu::phase(gpu1, gpu2, gpuRes, true);

-        return CheckNorm(cpuRes, gpuRes);
+        return CheckNorm(cpuRes, gpuRes, 0.3f);
     }
 };

@@ -611,7 +624,7 @@ struct CV_GpuNppImageCartToPolarTest : public CV_GpuArithmTest
         cv::gpu::cartToPolar(gpu1, gpu2, gpuMag, gpuAngle);

         int magRes = CheckNorm(cpuMag, gpuMag);
-        int angleRes = CheckNorm(cpuAngle, gpuAngle);
+        int angleRes = CheckNorm(cpuAngle, gpuAngle, 0.005f);

         return magRes == CvTS::OK && angleRes == CvTS::OK ? CvTS::OK : CvTS::FAIL_GENERIC;
     }
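Aside: the eps parameter added to CheckNorm above is what lets the divide, phase, and cartToPolar tests tolerate known CPU/GPU rounding differences instead of the blanket 1e-5 bound. A standalone sketch of the relaxed comparison, with a hypothetical checkNorm helper standing in for CV_GpuArithmTest::CheckNorm(double, double, double):

// Build with: nvcc -o checknorm checknorm.cu   (plain C++ works too)
#include <cmath>
#include <cstdio>

enum Result { OK = 0, FAIL_GENERIC = -1 };

// Hypothetical stand-in for the test helper: passes when |d1 - d2| < eps.
Result checkNorm(double d1, double d2, double eps = 1e-5)
{
    double ret = std::fabs(d1 - d2);
    if (ret < eps)
        return OK;
    std::printf("\nNorm: %f\n", ret);
    return FAIL_GENERIC;
}

int main()
{
    // The default bound rejects a 0.2 discrepancy...
    std::printf("default eps: %d\n", checkNorm(1.0, 1.2));
    // ...while the phase test's relaxed 0.3 bound accepts it.
    std::printf("eps = 0.3:   %d\n", checkNorm(1.0, 1.2, 0.3));
    return 0;
}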
tests/gpu/src/brute_force_matcher.cpp

@@ -51,24 +51,27 @@ class CV_GpuBruteForceMatcherTest : public CvTest
 {
 public:
     CV_GpuBruteForceMatcherTest() :
-        CvTest("GPU-BruteForceMatcher", "BruteForceMatcher"), badPart(0.01f)
+        CvTest("GPU-BruteForceMatcher", "BruteForceMatcher")
     {
     }

 protected:
-    static const int dim = 500;
-    static const int queryDescCount = 300; // must be even number because we split train data in some cases in two
-    static const int countFactor = 4; // do not change it
-    const float badPart;
-
-    virtual void run(int);
-    void generateData(GpuMat& query, GpuMat& train);
-
     void emptyDataTest();
+    void dataTest(int dim);
+
+    void generateData(GpuMat& query, GpuMat& train, int dim);
+
     void matchTest(const GpuMat& query, const GpuMat& train);
     void knnMatchTest(const GpuMat& query, const GpuMat& train);
     void radiusMatchTest(const GpuMat& query, const GpuMat& train);
+
+    virtual void run(int);
+
 private:
     BruteForceMatcher_GPU< L2<float> > dmatcher;
+
+    static const int queryDescCount = 300; // must be even number because we split train data in some cases in two
+    static const int countFactor = 4; // do not change it
 };

 void CV_GpuBruteForceMatcherTest::emptyDataTest()

@@ -150,7 +153,7 @@ void CV_GpuBruteForceMatcherTest::emptyDataTest()
 }

-void CV_GpuBruteForceMatcherTest::generateData(GpuMat& queryGPU, GpuMat& trainGPU)
+void CV_GpuBruteForceMatcherTest::generateData(GpuMat& queryGPU, GpuMat& trainGPU, int dim)
 {
     Mat query, train;
     RNG rng(*ts->get_rng());

@@ -209,7 +212,7 @@ void CV_GpuBruteForceMatcherTest::matchTest( const GpuMat& query, const GpuMat&
         if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i*countFactor) || (match.imgIdx != 0))
             badCount++;
     }

-    if ((float)badCount > (float)queryDescCount * badPart)
+    if (badCount > 0)
     {
         ts->printf(CvTS::LOG, "%f - too large bad matches part while test match() function (1).\n",
             (float)badCount / (float)queryDescCount);

@@ -260,7 +263,7 @@ void CV_GpuBruteForceMatcherTest::matchTest( const GpuMat& query, const GpuMat&
                 }
             }
         }
-    if ((float)badCount > (float)queryDescCount * badPart)
+    if (badCount > 0)
     {
         ts->printf(CvTS::LOG, "%f - too large bad matches part while test match() function (2).\n",
             (float)badCount / (float)queryDescCount);

@@ -305,7 +308,7 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMa
             badCount += localBadCount > 0 ? 1 : 0;
         }
     }

-    if ((float)badCount > (float)queryDescCount * badPart)
+    if (badCount > 0)
     {
         ts->printf(CvTS::LOG, "%f - too large bad matches part while test knnMatch() function (1).\n",
             (float)badCount / (float)queryDescCount);

@@ -369,7 +372,7 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMa
             badCount += localBadCount > 0 ? 1 : 0;
         }
     }

-    if ((float)badCount > (float)queryDescCount * badPart)
+    if (badCount > 0)
     {
         ts->printf(CvTS::LOG, "%f - too large bad matches part while test knnMatch() function (2).\n",
             (float)badCount / (float)queryDescCount);

@@ -407,7 +410,7 @@ void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const Gp
                 badCount++;
         }
     }

-    if ((float)badCount > (float)queryDescCount * badPart)
+    if (badCount > 0)
    {
         ts->printf(CvTS::LOG, "%f - too large bad matches part while test radiusMatch() function (1).\n",
             (float)badCount / (float)queryDescCount);

@@ -473,7 +476,8 @@ void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const Gp
             badCount += localBadCount > 0 ? 1 : 0;
         }
     }

-    if ((float)badCount > (float)queryDescCount * badPart)
+    if (badCount > 0)
     {
+        curRes = CvTS::FAIL_INVALID_OUTPUT;
         ts->printf(CvTS::LOG, "%f - too large bad matches part while test radiusMatch() function (2).\n",

@@ -483,20 +487,29 @@ void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const Gp
     }
 }

-void CV_GpuBruteForceMatcherTest::run(int)
+void CV_GpuBruteForceMatcherTest::dataTest(int dim)
 {
-    emptyDataTest();
-
     GpuMat query, train;
-    generateData(query, train);
+    generateData(query, train, dim);

     matchTest(query, train);
     knnMatchTest(query, train);
     radiusMatchTest(query, train);
+
+    dmatcher.clear();
+}
+
+void CV_GpuBruteForceMatcherTest::run(int)
+{
+    emptyDataTest();
+
+    dataTest(50);
+    dataTest(64);
+    dataTest(100);
+    dataTest(128);
+    dataTest(200);
+    dataTest(256);
+    dataTest(300);
 }

 CV_GpuBruteForceMatcherTest CV_GpuBruteForceMatcher_test;
tests/gpu/src/gputest_main.cpp

@@ -45,7 +45,6 @@ CvTS test_system("gpu");
 const char* blacklist[] =
 {
-    "GPU-AsyncGpuMatOperator",  // crash
     "GPU-NppImageCanny",        // NPP_TEXTURE_BIND_ERROR
     0
 };
tests/gpu/src/imgproc_gpu.cpp

@@ -408,30 +408,30 @@ struct CV_GpuNppImageIntegralTest : public CV_GpuImageProcTest
 ////////////////////////////////////////////////////////////////////////////////
 // Canny
-struct CV_GpuNppImageCannyTest : public CV_GpuImageProcTest
-{
-    CV_GpuNppImageCannyTest() : CV_GpuImageProcTest( "GPU-NppImageCanny", "Canny" ) {}
-
-    int test(const Mat& img)
-    {
-        if (img.type() != CV_8UC1)
-        {
-            ts->printf(CvTS::LOG, "\nUnsupported type\n");
-            return CvTS::OK;
-        }
-
-        const double threshold1 = 1.0, threshold2 = 10.0;
-
-        Mat cpudst;
-        cv::Canny(img, cpudst, threshold1, threshold2);
-
-        GpuMat gpu1(img);
-        GpuMat gpudst;
-        cv::gpu::Canny(gpu1, gpudst, threshold1, threshold2);
-
-        return CheckNorm(cpudst, gpudst);
-    }
-};
+//struct CV_GpuNppImageCannyTest : public CV_GpuImageProcTest
+//{
+//    CV_GpuNppImageCannyTest() : CV_GpuImageProcTest( "GPU-NppImageCanny", "Canny" ) {}
+//
+//    int test(const Mat& img)
+//    {
+//        if (img.type() != CV_8UC1)
+//        {
+//            ts->printf(CvTS::LOG, "\nUnsupported type\n");
+//            return CvTS::OK;
+//        }
+//
+//        const double threshold1 = 1.0, threshold2 = 10.0;
+//
+//        Mat cpudst;
+//        cv::Canny(img, cpudst, threshold1, threshold2);
+//
+//        GpuMat gpu1(img);
+//        GpuMat gpudst;
+//        cv::gpu::Canny(gpu1, gpudst, threshold1, threshold2);
+//
+//        return CheckNorm(cpudst, gpudst);
+//    }
+//};

 ////////////////////////////////////////////////////////////////////////////////
 // cvtColor

@@ -839,7 +839,7 @@ CV_GpuNppImageCopyMakeBorderTest CV_GpuNppImageCopyMakeBorder_test;
 CV_GpuNppImageWarpAffineTest CV_GpuNppImageWarpAffine_test;
 CV_GpuNppImageWarpPerspectiveTest CV_GpuNppImageWarpPerspective_test;
 CV_GpuNppImageIntegralTest CV_GpuNppImageIntegral_test;
-CV_GpuNppImageCannyTest CV_GpuNppImageCanny_test;
+//CV_GpuNppImageCannyTest CV_GpuNppImageCanny_test;
 CV_GpuCvtColorTest CV_GpuCvtColor_test;
 CV_GpuHistogramsTest CV_GpuHistograms_test;
 CV_GpuCornerHarrisTest CV_GpuCornerHarris_test;
tests/gpu/src/operator_async_call.cpp

@@ -40,119 +40,54 @@
 //M*/

 #include "gputest.hpp"
 #include <string>
-#include <iostream>
-#include <fstream>
-#include <iterator>
-#include <limits>
-#include <numeric>
-#include <iomanip> // for cout << setw()

 using namespace cv;
-using namespace std;
-using namespace gpu;
+using namespace cv::gpu;

-class CV_AsyncGpuMatTest : public CvTest
-{
-    public:
-        CV_AsyncGpuMatTest() : CvTest("GPU-AsyncGpuMatOperator", "async")
-        {
-            rows = 234;
-            cols = 123;
-        }
-
-        ~CV_AsyncGpuMatTest() {}
-
-    protected:
-        void run(int);
-
-        template <typename T>
-        void print_mat(const T& mat, const std::string& name) const;
-
-        bool compare_matrix(cv::Mat& cpumat);
-
-    private:
-        int rows;
-        int cols;
-};
-
-template <typename T>
-void CV_AsyncGpuMatTest::print_mat(const T& mat, const std::string& name) const
-{
-    cv::imshow(name, mat);
-}
-
-bool CV_AsyncGpuMatTest::compare_matrix(cv::Mat& cpumat)
-{
-    Mat cmat(cpumat.size(), cpumat.type(), Scalar::all(0));
-    GpuMat gmat0(cmat);
-    GpuMat gmat1;
-    GpuMat gmat2;
-    GpuMat gmat3;
-
-    //int64 time = getTickCount();
-
-    Stream stream;
-    stream.enqueueMemSet(gmat0, cv::Scalar::all(1), gmat1);
-    stream.enqueueMemSet(gmat0, cv::Scalar::all(1), gmat2);
-    stream.enqueueMemSet(gmat0, cv::Scalar::all(1), gmat3);
-    stream.waitForCompletion();
-
-    //int64 time1 = getTickCount();
-
-    gmat1.copyTo(gmat0);
-    gmat2.copyTo(gmat0);
-    gmat3.copyTo(gmat0);
-
-    //int64 time2 = getTickCount();
-
-    //std::cout << "\ntime async: " << std::fixed << std::setprecision(12) << double((time1 - time) / (double)getTickFrequency());
-    //std::cout << "\ntime sync: " << std::fixed << std::setprecision(12) << double((time2 - time1) / (double)getTickFrequency());
-    //std::cout << "\n";
-
-#ifdef PRINT_MATRIX
-    print_mat(cmat, "cpu mat");
-    print_mat(gmat0, "gpu mat 0");
-    print_mat(gmat1, "gpu mat 1");
-    print_mat(gmat2, "gpu mat 2");
-    print_mat(gmat3, "gpu mat 3");
-    cv::waitKey(0);
-#endif
-
-    double ret = norm(cmat, gmat0) + norm(cmat, gmat1) + norm(cmat, gmat2) + norm(cmat, gmat3);
-
-    if (ret < 1.0)
-        return true;
-    else
-    {
-        ts->printf(CvTS::LOG, "\nNorm: %f\n", ret);
-        return false;
-    }
-}
-
-void CV_AsyncGpuMatTest::run(int /* start_from */)
-{
-    bool is_test_good = true;
-
-    Mat cpumat(rows, cols, CV_8U);
-    cpumat.setTo(Scalar::all(127));
-
-    try
-    {
-        is_test_good &= compare_matrix(cpumat);
-    }
-    catch (cv::Exception& e)
-    {
-        if (!check_and_treat_gpu_exception(e, ts))
-            throw;
-        return;
-    }
-
-    if (is_test_good == true)
-        ts->set_failed_test_info(CvTS::OK);
-    else
-        ts->set_failed_test_info(CvTS::FAIL_GENERIC);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-/////////////////// tests registration  /////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////
-
-CV_AsyncGpuMatTest CV_AsyncGpuMatTest_test;
+struct CV_AsyncGpuMatTest : public CvTest
+{
+    CV_AsyncGpuMatTest() : CvTest("GPU-AsyncGpuMatOperator", "async")
+    {
+    }
+
+    void run(int)
+    {
+        try
+        {
+            CudaMem src(Mat::zeros(100, 100, CV_8UC1));
+
+            GpuMat gpusrc;
+            GpuMat gpudst0, gpudst1(100, 100, CV_8UC1);
+
+            CudaMem cpudst0;
+            CudaMem cpudst1;
+
+            Stream stream0, stream1;
+
+            stream0.enqueueUpload(src, gpusrc);
+            bitwise_not(gpusrc, gpudst0, GpuMat(), stream0);
+            stream0.enqueueDownload(gpudst0, cpudst0);
+
+            stream1.enqueueMemSet(gpudst1, Scalar::all(128));
+            stream1.enqueueDownload(gpudst1, cpudst1);
+
+            stream0.waitForCompletion();
+            stream1.waitForCompletion();
+
+            Mat cpu_gold0(100, 100, CV_8UC1, Scalar::all(255));
+            Mat cpu_gold1(100, 100, CV_8UC1, Scalar::all(128));
+
+            if (norm(cpudst0, cpu_gold0, NORM_INF) > 0 || norm(cpudst1, cpu_gold1, NORM_INF) > 0)
+                ts->set_failed_test_info(CvTS::FAIL_GENERIC);
+            else
+                ts->set_failed_test_info(CvTS::OK);
+        }
+        catch (cv::Exception& e)
+        {
+            if (!check_and_treat_gpu_exception(e, ts))
+                throw;
+            return;
+        }
+    }
+} CV_AsyncGpuMatTest_test;
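Aside: the rewritten test above drives two independent cv::gpu::Stream queues and only synchronizes at the end. A standalone CUDA runtime sketch, not the commit's code, of the same two-stream upload/compute/download pattern, with pinned host memory standing in for CudaMem:

// Build with: nvcc -o streams streams.cu
#include <cuda_runtime.h>
#include <cstdio>

__global__ void bitwiseNot(unsigned char* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] = ~data[i];
}

int main()
{
    const int n = 100 * 100;
    unsigned char *src, *dst0, *dst1, *d0, *d1;
    cudaHostAlloc((void**)&src,  n, cudaHostAllocDefault);  // pinned, like CudaMem
    cudaHostAlloc((void**)&dst0, n, cudaHostAllocDefault);
    cudaHostAlloc((void**)&dst1, n, cudaHostAllocDefault);
    cudaMalloc((void**)&d0, n);
    cudaMalloc((void**)&d1, n);
    for (int i = 0; i < n; ++i) src[i] = 0;

    cudaStream_t s0, s1;
    cudaStreamCreate(&s0);
    cudaStreamCreate(&s1);

    // Stream 0: upload, invert, download (zeros become 255).
    cudaMemcpyAsync(d0, src, n, cudaMemcpyHostToDevice, s0);
    bitwiseNot<<<(n + 255) / 256, 256, 0, s0>>>(d0, n);
    cudaMemcpyAsync(dst0, d0, n, cudaMemcpyDeviceToHost, s0);

    // Stream 1: memset to 128 and download, overlapping with stream 0.
    cudaMemsetAsync(d1, 128, n, s1);
    cudaMemcpyAsync(dst1, d1, n, cudaMemcpyDeviceToHost, s1);

    cudaStreamSynchronize(s0);
    cudaStreamSynchronize(s1);

    std::printf("dst0[0] = %d (expect 255), dst1[0] = %d (expect 128)\n", dst0[0], dst1[0]);

    cudaStreamDestroy(s0); cudaStreamDestroy(s1);
    cudaFree(d0); cudaFree(d1);
    cudaFreeHost(src); cudaFreeHost(dst0); cudaFreeHost(dst1);
    return 0;
}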