Commit deac5d97 authored by Vladislav Vinogradov

fixed errors in the GPU module on old video cards (SURF_GPU, BruteForceMatcher_GPU, min/max, setTo, convertTo)

added assertions after all kernel calls
parent 5f175f95
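The change that repeats through this commit is a pair of host-side checks around every kernel launch: cudaSafeCall( cudaGetLastError() ) immediately after the launch, and a blocking synchronize only on the default stream. A minimal sketch of that pattern outside OpenCV; the kernel and the check() helper are illustrative stand-ins for the real cudaSafeCall macro from safe_call.hpp:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

__global__ void scaleKernel(int* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2;
}

static void check(cudaError_t err, const char* ctx)
{
    if (err != cudaSuccess)
    {
        std::fprintf(stderr, "%s: %s\n", ctx, cudaGetErrorString(err));
        std::exit(EXIT_FAILURE);
    }
}

void runScale(int* d_data, int n, cudaStream_t stream)
{
    dim3 threads(256);
    dim3 grid((n + threads.x - 1) / threads.x);
    scaleKernel<<<grid, threads, 0, stream>>>(d_data, n);

    // A launch that is invalid on the current device (e.g. a feature the
    // old card lacks) is reported here, not at the launch statement itself.
    check(cudaGetLastError(), "scaleKernel launch");

    // Matches the diff's `if (stream == 0)` convention: only the default
    // stream is synchronized, so work on user streams stays asynchronous.
    if (stream == 0)
        check(cudaThreadSynchronize(), "scaleKernel execution"); // cudaDeviceSynchronize() in modern CUDA
}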
@@ -435,8 +435,8 @@ namespace cv
        void enqueueCopy(const GpuMat& src, GpuMat& dst);
-       void enqueueMemSet(const GpuMat& src, Scalar val);
-       void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask);
+       void enqueueMemSet(GpuMat& src, Scalar val);
+       void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
        // converts matrix type, ex from float to uchar depending on type
        void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);
...
@@ -76,18 +76,22 @@ namespace cv { namespace gpu { namespace bfmatcher
{
    template <typename T>
    void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-       const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+       const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+       bool cc_12);
    template <typename T>
    void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-       const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+       const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+       bool cc_12);
    template <typename T>
    void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-       const DevMem2Df& distance);
+       const DevMem2Df& distance,
+       bool cc_12);
    template <typename T>
    void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-       const DevMem2Df& distance);
+       const DevMem2Df& distance,
+       bool cc_12);
    template <typename T>
    void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
@@ -160,17 +164,20 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
    using namespace cv::gpu::bfmatcher;
    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-       const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+       const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+       bool cc_12);
    static const match_caller_t match_callers[2][8] =
    {
        {
-           matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<char>, matchSingleL1_gpu<unsigned short>,
-           matchSingleL1_gpu<short>, matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
+           matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<signed char>,
+           matchSingleL1_gpu<unsigned short>, matchSingleL1_gpu<short>,
+           matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
        },
        {
-           matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<char>, matchSingleL2_gpu<unsigned short>,
-           matchSingleL2_gpu<short>, matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
+           matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<signed char>,
+           matchSingleL2_gpu<unsigned short>, matchSingleL2_gpu<short>,
+           matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
        }
    };
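Note the companion fix in these dispatch tables: char becomes signed char. The row is indexed by the matrix depth, and CV_8S is explicitly signed 8-bit, but plain char is a distinct type whose signedness is implementation-defined, so instantiating matchSingleL1_gpu<char> is not guaranteed to give signed arithmetic. A small standalone illustration (the C++11 traits and the matchSingle stub are used only for demonstration):

#include <type_traits>

// char, signed char and unsigned char are three distinct C++ types;
// whether plain char behaves as the signed one varies by platform/ABI.
static_assert(!std::is_same<char, signed char>::value,   "always distinct types");
static_assert(!std::is_same<char, unsigned char>::value, "always distinct types");

template <typename T> void matchSingle(); // hypothetical dispatch target, one instantiation per depth

void dispatchForCV8S()
{
    matchSingle<signed char>(); // correct: CV_8S data is explicitly signed 8-bit
    // matchSingle<char>();     // risky: may compute with unsigned char semantics on some platforms
}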
@@ -185,9 +192,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
    match_caller_t func = match_callers[distType][queryDescs.depth()];
    CV_Assert(func != 0);
+   bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
    // For a single train image there is no need to save imgIdx, so we simply write imgIdx into trainIdx.
    // trainIdx is stored after imgIdx, so its value is not lost.
-   func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance);
+   func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12);
}
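cc_12 is computed once on the host: the binary was built with code for compute capability 1.2 and the active device supports it. The flag is then threaded into the .cu side so the matcher can take a code path that is safe on pre-1.2 cards. A hedged sketch of how such a flag typically selects between kernel variants; the kernel names here are hypothetical, not the ones in bfmatcher.cu:

#include <cuda_runtime.h>

// Hypothetical kernel pair: the cc-1.2 variant may rely on features that
// compute capability 1.0/1.1 devices lack, while the fallback stays
// within the older limits.
__global__ void matchKernel_cc12(const float* query, const float* train, int n, float* dist) { /* ... */ }
__global__ void matchKernel_cc10(const float* query, const float* train, int n, float* dist) { /* ... */ }

void matchDispatch(const float* query, const float* train, int n, float* dist, bool cc_12)
{
    dim3 threads(256);
    dim3 grid((n + threads.x - 1) / threads.x);
    if (cc_12)
        matchKernel_cc12<<<grid, threads>>>(query, train, n, dist);
    else
        matchKernel_cc10<<<grid, threads>>>(query, train, n, dist);
    cudaGetLastError(); // wrapped in cudaSafeCall in the OpenCV sources
}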
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance,
@@ -284,17 +293,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-       const DevMem2Df& distance);
+       const DevMem2Df& distance, bool cc_12);
    static const match_caller_t match_callers[2][8] =
    {
        {
-           matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<char>,
+           matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<signed char>,
            matchCollectionL1_gpu<unsigned short>, matchCollectionL1_gpu<short>,
            matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0
        },
        {
-           matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<char>,
+           matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<signed char>,
            matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>,
            matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0
        }
@@ -311,7 +320,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
    match_caller_t func = match_callers[distType][queryDescs.depth()];
    CV_Assert(func != 0);
-   func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance);
+   bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+   func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12);
}
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx,
@@ -383,11 +394,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
    static const match_caller_t match_callers[2][8] =
    {
        {
-           knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<char>, knnMatchL1_gpu<unsigned short>,
+           knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<signed char>, knnMatchL1_gpu<unsigned short>,
            knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0
        },
        {
-           knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<char>, knnMatchL2_gpu<unsigned short>,
+           knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<signed char>, knnMatchL2_gpu<unsigned short>,
            knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0
        }
    };
@@ -522,11 +533,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
    static const radiusMatch_caller_t radiusMatch_callers[2][8] =
    {
        {
-           radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<char>, radiusMatchL1_gpu<unsigned short>,
+           radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<signed char>, radiusMatchL1_gpu<unsigned short>,
            radiusMatchL1_gpu<short>, radiusMatchL1_gpu<int>, radiusMatchL1_gpu<float>, 0, 0
        },
        {
-           radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<char>, radiusMatchL2_gpu<unsigned short>,
+           radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<signed char>, radiusMatchL2_gpu<unsigned short>,
            radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0
        }
    };
...
This diff is collapsed.
@@ -43,6 +43,7 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/limits_gpu.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
@@ -51,13 +52,9 @@ using namespace cv::gpu::device;
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
#endif
-#ifndef FLT_EPSILON
-#define FLT_EPSILON 1.192092896e-07F
-#endif
namespace cv { namespace gpu { namespace color
{
-   template<typename T> struct ColorChannel {};
+   template<typename T> struct ColorChannel;
    template<> struct ColorChannel<uchar>
    {
        typedef float worktype_f;
@@ -133,6 +130,7 @@ namespace cv { namespace gpu { namespace color
        RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols, bidx);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -276,6 +274,7 @@ namespace cv { namespace gpu { namespace color
        RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols, bidx);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -304,6 +303,7 @@ namespace cv { namespace gpu { namespace color
        RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols, bidx);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -385,6 +385,7 @@ namespace cv { namespace gpu { namespace color
        Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -425,6 +426,7 @@ namespace cv { namespace gpu { namespace color
        Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -533,6 +535,7 @@ namespace cv { namespace gpu { namespace color
        RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols, bidx);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -573,6 +576,7 @@ namespace cv { namespace gpu { namespace color
        RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -698,6 +702,7 @@ namespace cv { namespace gpu { namespace color
        RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols, bidx);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -756,6 +761,7 @@ namespace cv { namespace gpu { namespace color
        YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols, bidx);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace color
        RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -960,6 +967,7 @@ namespace cv { namespace gpu { namespace color
        XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -1063,8 +1071,8 @@ namespace cv { namespace gpu { namespace color
            vmin = fmin(vmin, b);
            diff = v - vmin;
-           s = diff / (float)(fabs(v) + FLT_EPSILON);
-           diff = (float)(60. / (diff + FLT_EPSILON));
+           s = diff / (float)(fabs(v) + numeric_limits_gpu<float>::epsilon());
+           diff = (float)(60. / (diff + numeric_limits_gpu<float>::epsilon()));
            if (v == r)
                h = (g - b) * diff;
@@ -1199,6 +1207,8 @@ namespace cv { namespace gpu { namespace color
        RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols, bidx);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
    }
@@ -1281,6 +1291,8 @@ namespace cv { namespace gpu { namespace color
        HSV2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols, bidx);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
    }
@@ -1342,7 +1354,7 @@ namespace cv { namespace gpu { namespace color
            diff = vmax - vmin;
            l = (vmax + vmin) * 0.5f;
-           if (diff > FLT_EPSILON)
+           if (diff > numeric_limits_gpu<float>::epsilon())
            {
                s = l < 0.5f ? diff / (vmax + vmin) : diff / (2.0f - vmax - vmin);
                diff = 60.f / diff;
@@ -1550,6 +1562,8 @@ namespace cv { namespace gpu { namespace color
        HLS2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
            dst.data, dst.step, src.rows, src.cols, bidx);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
    }
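The other change in this file replaces the hand-rolled FLT_EPSILON fallback macro (deleted near the top) with numeric_limits_gpu<float>::epsilon() from the newly included limits_gpu.hpp, a device-callable stand-in for std::numeric_limits, which CUDA device code of this era could not use. A simplified sketch of what such a trait looks like, not OpenCV's exact definition:

// Simplified device-side numeric_limits replacement; the real one lives
// in opencv2/gpu/device/limits_gpu.hpp.
template <typename T> struct numeric_limits_gpu;

template <> struct numeric_limits_gpu<float>
{
    // Smallest x with 1.0f + x != 1.0f, the same value the deleted macro used.
    static __device__ __forceinline__ float epsilon() { return 1.192092896e-07f; }
};

// Typical use inside a kernel: keep a denominator away from zero, as in
// the HSV saturation formula above.
__device__ float saturation(float diff, float v)
{
    return diff / (fabsf(v) + numeric_limits_gpu<float>::epsilon());
}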
...
This diff is collapsed.
@@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/border_interpolate.hpp"
#include "safe_call.hpp"
#include "internal_shared.hpp"
@@ -51,192 +52,6 @@
using namespace cv::gpu;
using namespace cv::gpu::device;
-namespace cv
-{
-    namespace gpu
-    {
-        namespace device
-        {
-            struct BrdReflect101
-            {
-                explicit BrdReflect101(int len): last(len - 1) {}
-                __device__ int idx_low(int i) const
-                {
-                    return abs(i);
-                }
-                __device__ int idx_high(int i) const
-                {
-                    return last - abs(last - i);
-                }
-                __device__ int idx(int i) const
-                {
-                    return abs(idx_high(i));
-                }
-                bool is_range_safe(int mini, int maxi) const
-                {
-                    return -last <= mini && maxi <= 2 * last;
-                }
-                int last;
-            };
-            template <typename D>
-            struct BrdRowReflect101: BrdReflect101
-            {
-                explicit BrdRowReflect101(int len): BrdReflect101(len) {}
-                template <typename T>
-                __device__ D at_low(int i, const T* data) const
-                {
-                    return saturate_cast<D>(data[idx_low(i)]);
-                }
-                template <typename T>
-                __device__ D at_high(int i, const T* data) const
-                {
-                    return saturate_cast<D>(data[idx_high(i)]);
-                }
-            };
-            template <typename D>
-            struct BrdColReflect101: BrdReflect101
-            {
-                BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}
-                template <typename T>
-                __device__ D at_low(int i, const T* data) const
-                {
-                    return saturate_cast<D>(data[idx_low(i) * step]);
-                }
-                template <typename T>
-                __device__ D at_high(int i, const T* data) const
-                {
-                    return saturate_cast<D>(data[idx_high(i) * step]);
-                }
-                int step;
-            };
-            struct BrdReplicate
-            {
-                explicit BrdReplicate(int len): last(len - 1) {}
-                __device__ int idx_low(int i) const
-                {
-                    return max(i, 0);
-                }
-                __device__ int idx_high(int i) const
-                {
-                    return min(i, last);
-                }
-                __device__ int idx(int i) const
-                {
-                    return max(min(i, last), 0);
-                }
-                bool is_range_safe(int mini, int maxi) const
-                {
-                    return true;
-                }
-                int last;
-            };
-            template <typename D>
-            struct BrdRowReplicate: BrdReplicate
-            {
-                explicit BrdRowReplicate(int len): BrdReplicate(len) {}
-                template <typename T>
-                __device__ D at_low(int i, const T* data) const
-                {
-                    return saturate_cast<D>(data[idx_low(i)]);
-                }
-                template <typename T>
-                __device__ D at_high(int i, const T* data) const
-                {
-                    return saturate_cast<D>(data[idx_high(i)]);
-                }
-            };
-            template <typename D>
-            struct BrdColReplicate: BrdReplicate
-            {
-                BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}
-                template <typename T>
-                __device__ D at_low(int i, const T* data) const
-                {
-                    return saturate_cast<D>(data[idx_low(i) * step]);
-                }
-                template <typename T>
-                __device__ D at_high(int i, const T* data) const
-                {
-                    return saturate_cast<D>(data[idx_high(i) * step]);
-                }
-                int step;
-            };
-            template <typename D>
-            struct BrdRowConstant
-            {
-                explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}
-                template <typename T>
-                __device__ D at_low(int i, const T* data) const
-                {
-                    return i >= 0 ? saturate_cast<D>(data[i]) : val;
-                }
-                template <typename T>
-                __device__ D at_high(int i, const T* data) const
-                {
-                    return i < len ? saturate_cast<D>(data[i]) : val;
-                }
-                bool is_range_safe(int mini, int maxi) const
-                {
-                    return true;
-                }
-                int len;
-                D val;
-            };
-            template <typename D>
-            struct BrdColConstant
-            {
-                BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
-                template <typename T>
-                __device__ D at_low(int i, const T* data) const
-                {
-                    return i >= 0 ? saturate_cast<D>(data[i * step]) : val;
-                }
-                template <typename T>
-                __device__ D at_high(int i, const T* data) const
-                {
-                    return i < len ? saturate_cast<D>(data[i * step]) : val;
-                }
-                bool is_range_safe(int mini, int maxi) const
-                {
-                    return true;
-                }
-                int len;
-                int step;
-                D val;
-            };
-        }
-    }
-}
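These border-handling structs are not gone: they moved into the shared header opencv2/gpu/device/border_interpolate.hpp added above, so other .cu files can reuse them. The heart of each policy is its index mapping; below is a host-side sketch of the two mirrored schemes, using the same arithmetic as BrdReflect101::idx and BrdReplicate::idx (valid, as is_range_safe implies, for offsets within one image width of the border):

#include <algorithm>
#include <cstdlib>

// BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba -- mirror about the edge
// pixel without repeating it, as BrdReflect101 computes on the device.
int reflect101(int i, int len)
{
    int last = len - 1;
    return std::abs(last - std::abs(last - i)); // idx_high, then abs
}

// BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh -- clamp into the valid range.
int replicate(int i, int len)
{
    return std::max(std::min(i, len - 1), 0);
}

// reflect101(-2, 8) == 2, reflect101(9, 8) == 5, replicate(-2, 8) == 0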
/////////////////////////////////////////////////////////////////////////////////////////////////
// Linear filters
@@ -329,6 +144,7 @@ namespace cv { namespace gpu { namespace filters
    }
    filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall( cudaThreadSynchronize() );
}
@@ -467,6 +283,7 @@ namespace cv { namespace gpu { namespace filters
    }
    filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall( cudaThreadSynchronize() );
}
@@ -705,14 +522,18 @@ namespace cv { namespace gpu { namespace bf
        for (int i = 0; i < iters; ++i)
        {
            bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
+           cudaSafeCall( cudaGetLastError() );
            bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
+           cudaSafeCall( cudaGetLastError() );
        }
        break;
    case 3:
        for (int i = 0; i < iters; ++i)
        {
            bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
+           cudaSafeCall( cudaGetLastError() );
            bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
+           cudaSafeCall( cudaGetLastError() );
        }
        break;
    default:
...
@@ -222,6 +222,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y,
    int smem = hists_size + final_hists_size;
    compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(
        img_block_width, grad, qangle, scale, block_hists);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -325,6 +326,8 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
    else
        cv::gpu::error("normalize_hists: histogram's size is too big, try to decrease number of bins", __FILE__, __LINE__);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -421,6 +424,8 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
    classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
        img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
        block_hists, coefs, free_coef, threshold, labels);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -467,6 +472,8 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i
        block_stride_x;
    extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
        img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -515,6 +522,8 @@ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, i
        block_stride_x;
    extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
        img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -640,6 +649,8 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im
    compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(
        height, width, img, angle_scale, grad, qangle);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -713,6 +724,8 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
    compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(
        height, width, img, angle_scale, grad, qangle);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -749,6 +762,8 @@ void resize_8UC4(const DevMem2D& src, DevMem2D dst)
    float sx = (float)src.cols / dst.cols;
    float sy = (float)src.rows / dst.rows;
    resize_8UC4_kernel<<<grid, threads>>>(sx, sy, dst);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall(cudaUnbindTexture(resize8UC4_tex));
@@ -776,6 +791,8 @@ void resize_8UC1(const DevMem2D& src, DevMem2D dst)
    float sx = (float)src.cols / dst.cols;
    float sy = (float)src.rows / dst.rows;
    resize_8UC1_kernel<<<grid, threads>>>(sx, sy, dst);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall(cudaUnbindTexture(resize8UC1_tex));
...
@@ -137,6 +137,7 @@ namespace cv { namespace gpu { namespace imgproc
    cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );
    remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall( cudaThreadSynchronize() );
    cudaSafeCall( cudaUnbindTexture(tex_remap) );
@@ -150,6 +151,7 @@ namespace cv { namespace gpu { namespace imgproc
    grid.y = divUp(dst.rows, threads.y);
    remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall( cudaThreadSynchronize() );
}
@@ -259,6 +261,8 @@ namespace cv { namespace gpu { namespace imgproc
    cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
    meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall( cudaThreadSynchronize() );
    cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
@@ -273,6 +277,8 @@ namespace cv { namespace gpu { namespace imgproc
    cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
    meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall( cudaThreadSynchronize() );
    cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
@@ -388,6 +394,7 @@ namespace cv { namespace gpu { namespace imgproc
    grid.y = divUp(src.rows, threads.y);
    drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
+   cudaSafeCall( cudaGetLastError() );
    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
@@ -401,6 +408,7 @@ namespace cv { namespace gpu { namespace imgproc
    grid.y = divUp(src.rows, threads.y);
    drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
+   cudaSafeCall( cudaGetLastError() );
    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
@@ -451,6 +459,7 @@ namespace cv { namespace gpu { namespace imgproc
    cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );
    reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);
+   cudaSafeCall( cudaGetLastError() );
    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
@@ -491,6 +500,8 @@ namespace cv { namespace gpu { namespace imgproc
    dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));
    extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -598,6 +609,8 @@ namespace cv { namespace gpu { namespace imgproc
        break;
    }
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall(cudaUnbindTexture(harrisDxTex));
    cudaSafeCall(cudaUnbindTexture(harrisDyTex));
@@ -712,6 +725,8 @@ namespace cv { namespace gpu { namespace imgproc
        break;
    }
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
    cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
    cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
@@ -746,6 +761,8 @@ namespace cv { namespace gpu { namespace imgproc
    dim3 grid(divUp(src.cols, threads.x));
    column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -772,6 +789,8 @@ namespace cv { namespace gpu { namespace imgproc
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
    mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -799,6 +818,8 @@ namespace cv { namespace gpu { namespace imgproc
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
    mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -827,6 +848,8 @@ namespace cv { namespace gpu { namespace imgproc
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
    mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -855,6 +878,8 @@ namespace cv { namespace gpu { namespace imgproc
    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
    mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
...
@@ -132,6 +132,8 @@ void matchTemplateNaive_CCORR_32F(const DevMem2D image, const DevMem2D templ,
            templ.cols, templ.rows, image, templ, result);
        break;
    }
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -161,6 +163,8 @@ void matchTemplateNaive_CCORR_8U(const DevMem2D image, const DevMem2D templ,
            templ.cols, templ.rows, image, templ, result);
        break;
    }
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -222,6 +226,8 @@ void matchTemplateNaive_SQDIFF_32F(const DevMem2D image, const DevMem2D templ,
            templ.cols, templ.rows, image, templ, result);
        break;
    }
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -251,6 +257,8 @@ void matchTemplateNaive_SQDIFF_8U(const DevMem2D image, const DevMem2D templ,
            templ.cols, templ.rows, image, templ, result);
        break;
    }
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -299,6 +307,8 @@ void matchTemplatePrepared_SQDIFF_8U(
            w, h, image_sqsum, templ_sqsum, result);
        break;
    }
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -348,6 +358,8 @@ void matchTemplatePrepared_SQDIFF_NORMED_8U(
            w, h, image_sqsum, templ_sqsum, result);
        break;
    }
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -378,6 +390,8 @@ void matchTemplatePrepared_CCOFF_8U(
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
    matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads>>>(
        w, h, (float)templ_sum / (w * h), image_sum, result);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -418,6 +432,8 @@ void matchTemplatePrepared_CCOFF_8UC2(
    matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads>>>(
        w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
        image_sum_r, image_sum_g, result);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -472,6 +488,8 @@ void matchTemplatePrepared_CCOFF_8UC3(
        (float)templ_sum_g / (w * h),
        (float)templ_sum_b / (w * h),
        image_sum_r, image_sum_g, image_sum_b, result);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -536,6 +554,8 @@ void matchTemplatePrepared_CCOFF_8UC4(
        (float)templ_sum_a / (w * h),
        image_sum_r, image_sum_g, image_sum_b, image_sum_a,
        result);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -580,6 +600,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
    matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads>>>(
        w, h, weight, templ_sum_scale, templ_sqsum_scale,
        image_sum, image_sqsum, result);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -641,6 +663,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
        image_sum_r, image_sqsum_r,
        image_sum_g, image_sqsum_g,
        result);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -716,6 +740,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
        image_sum_g, image_sqsum_g,
        image_sum_b, image_sqsum_b,
        result);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -805,6 +831,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
        image_sum_b, image_sqsum_b,
        image_sum_a, image_sqsum_a,
        result);
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -847,6 +875,8 @@ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
        normalizeKernel_8U<4><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
        break;
    }
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
@@ -887,6 +917,8 @@ void extractFirstChannel_32F(const DevMem2D image, DevMem2Df result, int cn)
        extractFirstChannel_32F<4><<<grid, threads>>>(image, result);
        break;
    }
+   cudaSafeCall( cudaGetLastError() );
    cudaSafeCall(cudaThreadSynchronize());
}
...
@@ -150,6 +150,7 @@ namespace cv { namespace gpu { namespace mathfunc
    cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
        x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
        mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
+   cudaSafeCall( cudaGetLastError() );
    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
@@ -198,6 +199,7 @@ namespace cv { namespace gpu { namespace mathfunc
    polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
        angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
+   cudaSafeCall( cudaGetLastError() );
    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
...
This diff is collapsed.
This diff is collapsed.
@@ -233,6 +233,8 @@ namespace cv { namespace gpu { namespace split_merge {
            src[0].data, src[0].step,
            src[1].data, src[1].step,
            dst.rows, dst.cols, dst.data, dst.step);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
    }
@@ -248,6 +250,8 @@ namespace cv { namespace gpu { namespace split_merge {
            src[1].data, src[1].step,
            src[2].data, src[2].step,
            dst.rows, dst.cols, dst.data, dst.step);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
    }
@@ -264,6 +268,8 @@ namespace cv { namespace gpu { namespace split_merge {
            src[2].data, src[2].step,
            src[3].data, src[3].step,
            dst.rows, dst.cols, dst.data, dst.step);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
    }
@@ -436,6 +442,8 @@ namespace cv { namespace gpu { namespace split_merge {
            src.data, src.step, src.rows, src.cols,
            dst[0].data, dst[0].step,
            dst[1].data, dst[1].step);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
    }
@@ -451,6 +459,8 @@ namespace cv { namespace gpu { namespace split_merge {
            dst[0].data, dst[0].step,
            dst[1].data, dst[1].step,
            dst[2].data, dst[2].step);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
    }
@@ -467,6 +477,8 @@ namespace cv { namespace gpu { namespace split_merge {
            dst[1].data, dst[1].step,
            dst[2].data, dst[2].step,
            dst[3].data, dst[3].step);
+       cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall(cudaThreadSynchronize());
    }
...
@@ -325,6 +325,8 @@ template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& ri
    size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
    stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
+   cudaSafeCall( cudaGetLastError() );
    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
};
@@ -402,6 +404,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output,
    grid.y = divUp(input.rows, threads.y);
    prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
+   cudaSafeCall( cudaGetLastError() );
    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
@@ -526,6 +529,7 @@ extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float a
    size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
    textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
+   cudaSafeCall( cudaGetLastError() );
    if (stream == 0)
        cudaSafeCall( cudaThreadSynchronize() );
...
@@ -172,6 +172,7 @@ namespace cv { namespace gpu { namespace bp
         grid.y = divUp(left.rows, threads.y);
         comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -185,6 +186,7 @@ namespace cv { namespace gpu { namespace bp
         grid.y = divUp(left.rows, threads.y);
         comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -199,6 +201,7 @@ namespace cv { namespace gpu { namespace bp
         grid.y = divUp(left.rows, threads.y);
         comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -212,6 +215,7 @@ namespace cv { namespace gpu { namespace bp
         grid.y = divUp(left.rows, threads.y);
         comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -226,6 +230,7 @@ namespace cv { namespace gpu { namespace bp
         grid.y = divUp(left.rows, threads.y);
         comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -239,6 +244,7 @@ namespace cv { namespace gpu { namespace bp
         grid.y = divUp(left.rows, threads.y);
         comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -278,6 +284,7 @@ namespace cv { namespace gpu { namespace bp
         grid.y = divUp(dst_rows, threads.y);
         data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -321,9 +328,13 @@ namespace cv { namespace gpu { namespace bp
         int src_idx = (dst_idx + 1) & 1;
         level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);
+        cudaSafeCall( cudaGetLastError() );
         level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);
+        cudaSafeCall( cudaGetLastError() );
         level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);
+        cudaSafeCall( cudaGetLastError() );
         level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -443,6 +454,7 @@ namespace cv { namespace gpu { namespace bp
         for(int t = 0; t < iters; ++t)
         {
             one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);
+            cudaSafeCall( cudaGetLastError() );
             if (stream == 0)
                 cudaSafeCall( cudaThreadSynchronize() );
@@ -505,6 +517,7 @@ namespace cv { namespace gpu { namespace bp
         grid.y = divUp(disp.rows, threads.y);
         output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
...
@@ -382,6 +382,8 @@ namespace cv { namespace gpu { namespace csbp
         cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );
         init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -395,6 +397,9 @@ namespace cv { namespace gpu { namespace csbp
             get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
         else
             get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
     }
@@ -578,6 +583,7 @@ namespace cv { namespace gpu { namespace csbp
         cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );
         callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -700,10 +706,11 @@ namespace cv { namespace gpu { namespace csbp
         grid.y = divUp(h, threads.y);
         init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
                                                    u_cur, d_cur, l_cur, r_cur,
                                                    selected_disp_pyr_new, selected_disp_pyr_cur,
                                                    data_cost_selected, data_cost,
                                                    h, w, nr_plane, h2, w2, nr_plane2);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -805,6 +812,7 @@ namespace cv { namespace gpu { namespace csbp
         for(int t = 0; t < iters; ++t)
         {
             compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
+            cudaSafeCall( cudaGetLastError() );
             if (stream == 0)
                 cudaSafeCall( cudaThreadSynchronize() );
@@ -873,7 +881,9 @@ namespace cv { namespace gpu { namespace csbp
         grid.y = divUp(disp.rows, threads.y);
         compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,
                                                    disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
     }
...
@@ -61,8 +61,8 @@ void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, CudaMem& /*dst*/) {
 void cv::gpu::Stream::enqueueUpload(const CudaMem& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
 void cv::gpu::Stream::enqueueUpload(const Mat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
 void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
-void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
 void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); }
 #else /* !defined (HAVE_CUDA) */
@@ -77,8 +77,10 @@ namespace cv
     {
         void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
-        void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
-        void set_to_with_mask    (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
+        template <typename T>
+        void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
+        template <typename T>
+        void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
         void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
     }
@@ -99,6 +101,20 @@ namespace
         size_t bwidth = src.cols * src.elemSize();
         cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, k, s) );
     };
+
+    template <typename T>
+    void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
+    {
+        Scalar_<T> sf = s;
+        matrix_operations::set_to_gpu(src, sf.val, src.channels(), stream);
+    }
+
+    template <typename T>
+    void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
+    {
+        Scalar_<T> sf = s;
+        matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), stream);
+    }
 }
 CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl->stream; };
@@ -172,14 +188,26 @@ void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(sr
 void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyHostToDevice); }
 void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }
-void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val)
+void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
 {
-    matrix_operations::set_to_without_mask(src, src.depth(), val.val, src.channels(), impl->stream);
+    typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
+    static const set_caller_t set_callers[] =
+    {
+        kernelSet<uchar>, kernelSet<schar>, kernelSet<ushort>, kernelSet<short>,
+        kernelSet<int>, kernelSet<float>, kernelSet<double>
+    };
+    set_callers[src.depth()](src, val, impl->stream);
 }
-void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask)
+void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
 {
-    matrix_operations::set_to_with_mask(src, src.depth(), val.val, mask, src.channels(), impl->stream);
+    typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream);
+    static const set_caller_t set_callers[] =
+    {
+        kernelSetMask<uchar>, kernelSetMask<schar>, kernelSetMask<ushort>, kernelSetMask<short>,
+        kernelSetMask<int>, kernelSetMask<float>, kernelSetMask<double>
+    };
+    set_callers[src.depth()](src, val, mask, impl->stream);
 }
 void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
...
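enqueueMemSet now indexes the set_callers table by src.depth(), so a CV_32FC1 matrix ends up in kernelSet<float> rather than in the old depth-switching kernel. A short usage sketch, with an illustrative matrix and stream:

    // Illustrative only: enqueue an asynchronous fill on a user stream.
    cv::gpu::Stream stream;
    cv::gpu::GpuMat image(480, 640, CV_32FC1);

    stream.enqueueMemSet(image, cv::Scalar::all(0));   // dispatches to kernelSet<float>
    stream.waitForCompletion();                        // block only when the result is needed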
@@ -128,6 +128,8 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
 void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria)
 {
+    CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));
+
     if( src.empty() )
         CV_Error( CV_StsBadArg, "The input image is empty" );
@@ -154,6 +156,8 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
 void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria)
 {
+    CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));
+
     if( src.empty() )
         CV_Error( CV_StsBadArg, "The input image is empty" );
...
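The new assertions guard both halves of the requirement: TargetArchs::builtWith(COMPUTE_12) checks that the library binary was compiled with code for compute capability 1.2, while DeviceInfo().supports(COMPUTE_12) checks the device actually installed. A caller-side sketch of the same guard; the matrices and the CPU fallback are hypothetical:

    // Illustrative guard at a call site; meanShiftFilteringCpu is a
    // hypothetical fallback, not part of this commit.
    using namespace cv::gpu;

    if (TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12))
        meanShiftFiltering(d_src, d_dst, 20, 20);    // GPU path, CC >= 1.2
    else
        meanShiftFilteringCpu(h_src, h_dst, 20, 20); // hypothetical CPU fallback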
@@ -87,8 +87,10 @@ namespace cv
     {
         void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
-        void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
-        void set_to_with_mask    (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
+        template <typename T>
+        void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
+        template <typename T>
+        void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
         void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
     }
@@ -363,9 +365,11 @@ namespace
         }
     };
+    template <typename T>
     void kernelSet(GpuMat& src, const Scalar& s)
     {
-        matrix_operations::set_to_without_mask(src, src.depth(), s.val, src.channels());
+        Scalar_<T> sf = s;
+        matrix_operations::set_to_gpu(src, sf.val, src.channels(), 0);
     }
     template<int SDEPTH, int SCN> struct NppSetMaskFunc
@@ -412,9 +416,11 @@ namespace
         }
     };
+    template <typename T>
     void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)
     {
-        matrix_operations::set_to_with_mask(src, src.depth(), s.val, mask, src.channels());
+        Scalar_<T> sf = s;
+        matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), 0);
     }
 }
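The point of routing through Scalar_<T> is that each channel of the double-valued cv::Scalar is saturate_cast to the matrix depth on the host, so the kernel receives values that are already representable in the target type. For example:

    // Host-side effect of the Scalar_<T> conversion used by kernelSet<uchar>:
    cv::Scalar s(300.0, -5.0, 12.7, 0.0);
    cv::Scalar_<uchar> sf = s;   // saturates per channel: (255, 0, 13, 0)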
@@ -433,13 +439,13 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
     typedef void (*set_caller_t)(GpuMat& src, const Scalar& s);
     static const set_caller_t set_callers[8][4] =
     {
-        {NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet,kernelSet,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
-        {kernelSet,kernelSet,kernelSet,kernelSet},
-        {NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,kernelSet,kernelSet,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
-        {NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,kernelSet,kernelSet,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
-        {NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet,kernelSet,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
-        {NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet,kernelSet,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
-        {kernelSet,kernelSet,kernelSet,kernelSet},
+        {NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<uchar>,kernelSet<uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
+        {kernelSet<schar>,kernelSet<schar>,kernelSet<schar>,kernelSet<schar>},
+        {NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,kernelSet<ushort>,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
+        {NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,kernelSet<short>,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
+        {NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
+        {NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
+        {kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>},
         {0,0,0,0}
     };
     set_callers[depth()][channels()-1](*this, s);
@@ -449,13 +455,13 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
     typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask);
     static const set_caller_t set_callers[8][4] =
     {
-        {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
-        {kernelSetMask,kernelSetMask,kernelSetMask,kernelSetMask},
-        {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
-        {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
-        {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
-        {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask,kernelSetMask,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
-        {kernelSetMask,kernelSetMask,kernelSetMask,kernelSetMask},
+        {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask<uchar>,kernelSetMask<uchar>,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
+        {kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>},
+        {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask<ushort>,kernelSetMask<ushort>,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
+        {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask<short>,kernelSetMask<short>,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
+        {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask<int>,kernelSetMask<int>,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
+        {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask<float>,kernelSetMask<float>,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
+        {kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>},
         {0,0,0,0}
     };
     set_callers[depth()][channels()-1](*this, s, mask);
...
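The two [8][4] tables are indexed by depth() and channels()-1: NPP's nppiSet covers only the 1- and 4-channel layouts of the depths it supports, and every other combination (plus CV_8S and CV_64F entirely) falls through to the templated CUDA kernel. For instance:

    // Which entry of set_callers runs depends on (depth, channels):
    cv::gpu::GpuMat a(100, 100, CV_8UC1);
    a.setTo(cv::Scalar(7));                // NppSet<CV_8U, 1, nppiSet_8u_C1R>::set

    cv::gpu::GpuMat b(100, 100, CV_32FC3);
    b.setTo(cv::Scalar(1.0, 2.0, 3.0));    // kernelSet<float>: NPP has no 3-channel set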
@@ -227,6 +227,8 @@ inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs)
 void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria)
 {
+    CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));
+
     CV_Assert(src.type() == CV_8UC4);
     const int nrows = src.rows;
     const int ncols = src.cols;
...
@@ -40,6 +40,9 @@
 //
 //M*/
+
+#include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/vecmath.hpp"
 namespace cv
 {
     namespace gpu
@@ -48,7 +51,7 @@ namespace cv
     {
         struct BrdReflect101
         {
-            BrdReflect101(int len): last(len - 1) {}
+            explicit BrdReflect101(int len): last(len - 1) {}
             __device__ int idx_low(int i) const
             {
@@ -62,7 +65,7 @@ namespace cv
             __device__ int idx(int i) const
             {
-                return abs(idx_high(i));
+                return idx_low(idx_high(i));
             }
             bool is_range_safe(int mini, int maxi) const
@@ -70,49 +73,55 @@ namespace cv
                 return -last <= mini && maxi <= 2 * last;
             }
+        private:
             int last;
         };
-        template <typename T>
+        template <typename D>
         struct BrdRowReflect101: BrdReflect101
         {
-            BrdRowReflect101(int len): BrdReflect101(len) {}
+            explicit BrdRowReflect101(int len): BrdReflect101(len) {}
-            __device__ float at_low(int i, const T* data) const
+            template <typename T>
+            __device__ D at_low(int i, const T* data) const
             {
-                return data[idx_low(i)];
+                return saturate_cast<D>(data[idx_low(i)]);
             }
-            __device__ float at_high(int i, const T* data) const
+            template <typename T>
+            __device__ D at_high(int i, const T* data) const
             {
-                return data[idx_high(i)];
+                return saturate_cast<D>(data[idx_high(i)]);
             }
         };
-        template <typename T>
+        template <typename D>
         struct BrdColReflect101: BrdReflect101
         {
             BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}
-            __device__ float at_low(int i, const T* data) const
+            template <typename T>
+            __device__ D at_low(int i, const T* data) const
             {
-                return data[idx_low(i) * step];
+                return saturate_cast<D>(data[idx_low(i) * step]);
             }
-            __device__ float at_high(int i, const T* data) const
+            template <typename T>
+            __device__ D at_high(int i, const T* data) const
             {
-                return data[idx_high(i) * step];
+                return saturate_cast<D>(data[idx_high(i) * step]);
             }
+        private:
             int step;
         };
         struct BrdReplicate
         {
-            BrdReplicate(int len): last(len - 1) {}
+            explicit BrdReplicate(int len): last(len - 1) {}
             __device__ int idx_low(int i) const
             {
@@ -126,7 +135,7 @@ namespace cv
             __device__ int idx(int i) const
             {
-                return max(min(i, last), 0);
+                return idx_low(idx_high(i));
             }
             bool is_range_safe(int mini, int maxi) const
@@ -134,42 +143,104 @@ namespace cv
                 return true;
             }
+        private:
             int last;
         };
-        template <typename T>
+        template <typename D>
         struct BrdRowReplicate: BrdReplicate
         {
-            BrdRowReplicate(int len): BrdReplicate(len) {}
+            explicit BrdRowReplicate(int len): BrdReplicate(len) {}
-            __device__ float at_low(int i, const T* data) const
+            template <typename T>
+            __device__ D at_low(int i, const T* data) const
             {
-                return data[idx_low(i)];
+                return saturate_cast<D>(data[idx_low(i)]);
             }
-            __device__ float at_high(int i, const T* data) const
+            template <typename T>
+            __device__ D at_high(int i, const T* data) const
             {
-                return data[idx_high(i)];
+                return saturate_cast<D>(data[idx_high(i)]);
             }
         };
-        template <typename T>
+        template <typename D>
         struct BrdColReplicate: BrdReplicate
         {
             BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}
-            __device__ float at_low(int i, const T* data) const
+            template <typename T>
+            __device__ D at_low(int i, const T* data) const
             {
-                return data[idx_low(i) * step];
+                return saturate_cast<D>(data[idx_low(i) * step]);
             }
-            __device__ float at_high(int i, const T* data) const
-            {
-                return data[idx_high(i) * step];
-            }
-            int step;
-        };
+            template <typename T>
+            __device__ D at_high(int i, const T* data) const
+            {
+                return saturate_cast<D>(data[idx_high(i) * step]);
+            }
+        private:
+            int step;
+        };
+
+        template <typename D>
+        struct BrdRowConstant
+        {
+            explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}
+
+            template <typename T>
+            __device__ D at_low(int i, const T* data) const
+            {
+                return i >= 0 ? saturate_cast<D>(data[i]) : val;
+            }
+
+            template <typename T>
+            __device__ D at_high(int i, const T* data) const
+            {
+                return i < len ? saturate_cast<D>(data[i]) : val;
+            }
+
+            bool is_range_safe(int mini, int maxi) const
+            {
+                return true;
+            }
+
+        private:
+            int len;
+            D val;
+        };
+
+        template <typename D>
+        struct BrdColConstant
+        {
+            BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
+
+            template <typename T>
+            __device__ D at_low(int i, const T* data) const
+            {
+                return i >= 0 ? saturate_cast<D>(data[i * step]) : val;
+            }
+
+            template <typename T>
+            __device__ D at_high(int i, const T* data) const
+            {
+                return i < len ? saturate_cast<D>(data[i * step]) : val;
+            }
+
+            bool is_range_safe(int mini, int maxi) const
+            {
+                return true;
+            }
+
+        private:
+            int len;
+            int step;
+            D val;
+        };
     }
 }
...
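These border objects separate index folding (idx_low/idx_high, inherited from the base struct) from element access (at_low/at_high, which now saturate_cast the source type T to the work type D). A device-side sketch of how a filter might consume one; the kernel below is illustrative, not code from this commit:

    // Illustrative three-tap row-filter pattern built on a border handler B
    // (e.g. BrdRowReflect101<float> or BrdRowConstant<float>).
    template <typename T, typename B>
    __global__ void threeTapSketch(const T* src, float* dst, int cols, B border)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        if (x >= cols)
            return;

        // The border object decides what reading index -1 or cols yields:
        // a reflected element, a replicated edge element, or a constant.
        float sum = border.at_low(x - 1, src)
                  + saturate_cast<float>(src[x])
                  + border.at_high(x + 1, src);
        dst[x] = sum / 3.0f;
    }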
@@ -329,6 +329,7 @@ namespace cv
         grid.y = divUp(src.rows, threads.y);
         device::transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -345,6 +346,7 @@ namespace cv
         grid.y = divUp(src1.rows, threads.y);
         device::transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -365,6 +367,7 @@ namespace cv
         grid.y = divUp(src.rows, threads.y);
         device::transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -383,6 +386,7 @@ namespace cv
         grid.y = divUp(src1.rows, threads.y);
         device::transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
+        cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
...
@@ -65,6 +65,7 @@ namespace cv { namespace gpu { namespace surf
     dim3 calcBlockSize(int nIntervals);
     void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
+    void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld);
     void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
         int x_size, int y_size, bool use_mask, const dim3& threads);
@@ -75,6 +76,7 @@ namespace cv { namespace gpu { namespace surf
     void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures);
     void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
+    void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
 }}}
 using namespace cv::gpu::surf;
@@ -170,6 +172,10 @@ namespace
         void detectKeypoints(GpuMat& keypoints)
         {
+            typedef void (*fasthessian_t)(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
+            const fasthessian_t fasthessian =
+                DeviceInfo().supports(COMPUTE_13) ? fasthessian_gpu : fasthessian_gpu_old;
+
             dim3 threads = calcBlockSize(nIntervals);
             for(int octave = 0; octave < nOctaves; ++octave)
             {
@@ -192,7 +198,7 @@ namespace
                 uploadConstant("cv::gpu::surf::c_border", border);
                 uploadConstant("cv::gpu::surf::c_step", step);
-                fasthessian_gpu(hessianBuffer, x_size, y_size, threads);
+                fasthessian(hessianBuffer, x_size, y_size, threads);
                 // Reset the candidate count.
                 maxCounter = 0;
@@ -201,10 +207,13 @@ namespace
                 maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));
-                fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,
-                    featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);
-                featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
+                if (maxCounter > 0)
+                {
+                    fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,
+                        featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);
+                    featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
+                }
             }
             if (featureCounter > 0)
@@ -221,10 +230,16 @@ namespace
         void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
         {
+            typedef void (*compute_descriptors_t)(const DevMem2Df& descriptors,
+                const KeyPoint_GPU* features, int nFeatures);
+            const compute_descriptors_t compute_descriptors =
+                DeviceInfo().supports(COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;
+
             if (keypoints.cols > 0)
             {
                 descriptors.create(keypoints.cols, descriptorSize, CV_32F);
-                compute_descriptors_gpu(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
+                compute_descriptors(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
             }
         }
...
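Both SURF_GPU entry points now pick a kernel variant per device instead of assuming CC 1.3: a function pointer is bound once from the declarations added above, then every call goes through it, and the new maxCounter > 0 guard avoids launching the interpolation kernel with an empty candidate list. The dispatch idiom condensed:

    // Runtime selection between the CC >= 1.3 kernel and the *_old fallback,
    // using the declarations this commit adds.
    typedef void (*fasthessian_t)(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);

    const fasthessian_t fasthessian = DeviceInfo().supports(COMPUTE_13)
        ? fasthessian_gpu       // fast path for newer hardware
        : fasthessian_gpu_old;  // pre-1.3 video cards

    fasthessian(hessianBuffer, x_size, y_size, threads);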
@@ -384,6 +384,14 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMa
 void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const GpuMat& train )
 {
+    bool atomics_ok = TargetArchs::builtWith(ATOMICS) && DeviceInfo().supports(ATOMICS);
+    if (!atomics_ok)
+    {
+        ts->printf(CvTS::CONSOLE, "\nCode and device atomics support is required for radiusMatch (CC >= 1.1)");
+        ts->set_failed_test_info(CvTS::FAIL_GENERIC);
+        return;
+    }
+
     dmatcher.clear();
     // test const version of match()
     {
@@ -501,15 +509,24 @@ void CV_GpuBruteForceMatcherTest::dataTest(int dim)
 void CV_GpuBruteForceMatcherTest::run(int)
 {
-    emptyDataTest();
-    dataTest(50);
-    dataTest(64);
-    dataTest(100);
-    dataTest(128);
-    dataTest(200);
-    dataTest(256);
-    dataTest(300);
+    try
+    {
+        emptyDataTest();
+        dataTest(50);
+        dataTest(64);
+        dataTest(100);
+        dataTest(128);
+        dataTest(200);
+        dataTest(256);
+        dataTest(300);
+    }
+    catch(cv::Exception& e)
+    {
+        if (!check_and_treat_gpu_exception(e, ts))
+            throw;
+        return;
+    }
 }
 CV_GpuBruteForceMatcherTest CV_GpuBruteForceMatcher_test;
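The try/catch wrapper makes the suite tolerant of older hardware: if check_and_treat_gpu_exception recognizes the error as an expected GPU-capability failure it records the result and the test returns quietly; anything else is rethrown. A plausible shape for such a helper, offered only as a hypothetical sketch (the real one lives in the shared test harness):

    // Hypothetical sketch of the helper's contract; not the actual implementation.
    bool check_and_treat_gpu_exception(const cv::Exception& e, CvTS* ts)
    {
        if (e.code == CV_GpuNotSupported)   // device lacks a required capability
        {
            ts->printf(CvTS::CONSOLE, "\n%s\n", e.what());
            ts->set_failed_test_info(CvTS::FAIL_GENERIC);
            return true;                    // handled: the caller just returns
        }
        return false;                       // unrelated failure: the caller rethrows
    }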
@@ -154,7 +154,7 @@ void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints
         return;
     }
-    if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)
+    if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.5f)
     {
         ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");
         ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
@@ -221,10 +221,19 @@ void CV_GPU_SURFTest::regressionTest(SURF_GPU& fdetector)
 void CV_GPU_SURFTest::run( int /*start_from*/ )
 {
-    SURF_GPU fdetector;
-    emptyDataTest(fdetector);
-    regressionTest(fdetector);
+    try
+    {
+        SURF_GPU fdetector;
+        emptyDataTest(fdetector);
+        regressionTest(fdetector);
+    }
+    catch(cv::Exception& e)
+    {
+        if (!check_and_treat_gpu_exception(e, ts))
+            throw;
+        return;
+    }
 }
 CV_GPU_SURFTest CV_GPU_SURF_test;
@@ -43,15 +43,15 @@
 CvTS test_system("gpu");
-const char* blacklist[] =
-{
-    "GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR
-    0
-};
+//const char* blacklist[] =
+//{
+//    "GPU-NVidia",
+//    0
+//};
 int main( int argc, char** argv )
 {
-    return test_system.run( argc, argv, blacklist );
+    return test_system.run( argc, argv );
 }
 /* End of file. */
@@ -43,6 +43,9 @@
 #include <iostream>
 #include <string>
+
+using namespace cv;
+using namespace cv::gpu;
 struct CV_GpuMeanShiftTest : public CvTest
 {
@@ -50,6 +53,14 @@ struct CV_GpuMeanShiftTest : public CvTest
     void run(int)
     {
+        bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+        if (!cc12_ok)
+        {
+            ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
+            ts->set_failed_test_info(CvTS::FAIL_GENERIC);
+            return;
+        }
+
         int spatialRad = 30;
         int colorRad = 30;
@@ -134,6 +145,14 @@ struct CV_GpuMeanShiftProcTest : public CvTest
     void run(int)
     {
+        bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+        if (!cc12_ok)
+        {
+            ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
+            ts->set_failed_test_info(CvTS::FAIL_GENERIC);
+            return;
+        }
+
         int spatialRad = 30;
         int colorRad = 30;
...
@@ -54,6 +54,14 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest {
     {
         try
         {
+            bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+            if (!cc12_ok)
+            {
+                ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
+                ts->set_failed_test_info(CvTS::FAIL_GENERIC);
+                return;
+            }
+
             Mat img_rgb = imread(string(ts->get_data_path()) + "meanshift/cones.png");
             if (img_rgb.empty())
             {
...
@@ -91,14 +91,14 @@ void CV_GpuMatOpConvertToTest::run(int /* start_from */)
             Mat cpumatdst;
             GpuMat gpumatdst;
-            cpumatsrc.convertTo(cpumatdst, dst_type);
-            gpumatsrc.convertTo(gpumatdst, dst_type);
+            cpumatsrc.convertTo(cpumatdst, dst_type, 0.5, 3.0);
+            gpumatsrc.convertTo(gpumatdst, dst_type, 0.5, 3.0);
             double r = norm(cpumatdst, gpumatdst, NORM_INF);
             if (r > 1)
             {
                 ts->printf(CvTS::LOG,
-                    "\nFAILED: SRC_TYPE=%sC%d DST_TYPE=%s NORM = %d\n",
+                    "\nFAILED: SRC_TYPE=%sC%d DST_TYPE=%s NORM = %f\n",
                     types_str[i], c, types_str[j], r);
                 passed = false;
             }
...
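With the extra arguments, convertTo computes dst = saturate_cast<dst_type>(src * 0.5 + 3.0), so the test now also exercises the scale/shift path of the GPU conversion kernel; the %d to %f fix matters because r is a double. The semantics on a concrete row:

    // convertTo with alpha/beta: dst(i) = saturate_cast<DstT>(src(i) * alpha + beta)
    cv::Mat src = (cv::Mat_<float>(1, 3) << 10.f, -20.f, 600.f);
    cv::Mat dst;
    src.convertTo(dst, CV_8U, 0.5, 3.0);
    // dst == [8, 0, 255]: 10*0.5+3 = 8;  -20*0.5+3 = -7 -> 0;  600*0.5+3 = 303 -> 255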