Commit 2777ebb8 authored by Marina Kolpakova

merged GPU scan

parent 6cca6a45
if(${CMAKE_VERSION} VERSION_LESS "2.8.3")
message(STATUS "WITH_CUDA flag requires CMake 2.8.3. CUDA support is disabled.")
return()
endif()
find_package(CUDA 4.1)
if(CUDA_FOUND)
......@@ -23,7 +23,7 @@ if(CUDA_FOUND)
else()
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
endif()
set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
......@@ -89,8 +89,8 @@ if(CUDA_FOUND)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
endif()
# we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
# we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
string(REPLACE "-ggdb3" "" CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
CUDA_COMPILE(${VAR} ${ARGN})
set(CMAKE_CXX_DEBUG_FLAGS ${CMAKE_CXX_FLAGS_DEBUG_})
......
......@@ -90,6 +90,40 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
Interpolation(cv::INTER_CUBIC), Interpolation(cv::INTER_AREA)),
testing::Values(Scale(0.5), Scale(0.3), Scale(2.0))));
GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int interpolation = cv::INTER_AREA;
double f = GET_PARAM(3);
cv::Mat src_host(size, type);
fill(src_host, 0, 255);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);
declare.time(1.0);
TEST_CYCLE()
{
cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
ALL_DEVICES,
testing::Values(perf::sz1080p/*, cv::Size(4096, 2048)*/),
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(Scale(0.2),Scale(0.1),Scale(0.05))));
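For reference, a minimal host-side sketch of the INTER_AREA downscale path that the ResizeArea perf test above exercises; the input file name and the 10x scale factor are illustrative assumptions, not part of this commit.
#include <opencv2/gpu/gpu.hpp>
#include <opencv2/highgui/highgui.hpp>

int main()
{
    cv::Mat host = cv::imread("input.png");          // hypothetical test image
    cv::gpu::GpuMat src(host), dst;
    cv::gpu::resize(src, dst, cv::Size(), 0.1, 0.1, cv::INTER_AREA); // 10x area downscale
    cv::Mat result(dst);                             // download the result for inspection
    return 0;
}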
//////////////////////////////////////////////////////////////////////
// WarpAffine
......
......@@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device
struct Mask8U
{
explicit Mask8U(PtrStepb mask): mask(mask) {}
explicit Mask8U(PtrStepb mask_): mask(mask_) {}
__device__ __forceinline__ bool operator()(int y, int x) const
{
......
......@@ -46,7 +46,8 @@
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
# include <cfloat>
#include <cfloat>
#include <opencv2/gpu/device/scan.hpp>
namespace cv { namespace gpu { namespace device
{
......
......@@ -228,9 +228,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC2_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
dst.rows, dst.cols, dst.data, dst.step);
......@@ -244,9 +244,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC3_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
......@@ -261,9 +261,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC4_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
......@@ -437,9 +437,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC2_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step);
......@@ -453,9 +453,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC3_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
......@@ -470,9 +470,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC4_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
......
......@@ -252,7 +252,7 @@ NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
//===================================================================
NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment_)
:
currentSize(0),
_maxSize(0),
......@@ -260,23 +260,23 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
begin(NULL),
end(NULL),
_memType(NCVMemoryTypeNone),
_alignment(alignment),
_alignment(alignment_),
bReusesMemory(false)
{
NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
}
NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)
NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment_, void *reusePtr)
:
currentSize(0),
_maxSize(0),
allocBegin(NULL),
_memType(memT),
_alignment(alignment)
_alignment(alignment_)
{
NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type");
......@@ -425,12 +425,12 @@ size_t NCVMemStackAllocator::maxSize(void) const
//===================================================================
NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment_)
:
currentSize(0),
_maxSize(0),
_memType(memT),
_alignment(alignment)
_alignment(alignment_)
{
ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
}
......
......@@ -64,7 +64,7 @@
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#endif
namespace cv { namespace gpu
namespace cv { namespace gpu
{
void error(const char *error_string, const char *file, const int line, const char *func);
......@@ -87,14 +87,14 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int
#ifdef __CUDACC__
namespace cv { namespace gpu
{
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
namespace cv { namespace gpu
{
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
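As a quick aside on the helper just above: divUp rounds integer division up so that a kernel launch grid covers every pixel. A small worked example of the usual launch arithmetic follows; the 1000x600 image size is only illustrative.
// divUp(total, grain) == ceil(total / grain) for positive ints, e.g. divUp(1000, 32) == 32.
dim3 block(32, 8);
dim3 grid(divUp(1000, block.x), divUp(600, block.y));   // grid = (32, 75) for a 1000x600 image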
namespace device
namespace device
{
typedef unsigned char uchar;
typedef unsigned short ushort;
......
......@@ -45,7 +45,7 @@
#include "common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
......@@ -54,13 +54,13 @@ namespace cv { namespace gpu { namespace device
{
__device__ __forceinline__ static void Load(const T* ptr, int offset, T& val) { val = ptr[offset]; }
};
#else // __CUDA_ARCH__ >= 200
#if defined(_WIN64) || defined(__LP64__)
#else // __CUDA_ARCH__ >= 200
#if defined(_WIN64) || defined(__LP64__)
// 64-bit register modifier for inlined asm
#define OPENCV_GPU_ASM_PTR "l"
#else
#else
// 32-bit register modifier for inlined asm
#define OPENCV_GPU_ASM_PTR "r"
#endif
......@@ -84,21 +84,21 @@ namespace cv { namespace gpu { namespace device
asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
} \
};
OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar, u8)
OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar, s8)
OPENCV_GPU_DEFINE_FORCE_GLOB_B(char, b8)
OPENCV_GPU_DEFINE_FORCE_GLOB (ushort, u16, h)
OPENCV_GPU_DEFINE_FORCE_GLOB (short, s16, h)
OPENCV_GPU_DEFINE_FORCE_GLOB (uint, u32, r)
OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r)
OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f)
OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d)
OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r)
OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f)
OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d)
#undef OPENCV_GPU_DEFINE_FORCE_GLOB
#undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
#undef OPENCV_GPU_ASM_PTR
#endif // __CUDA_ARCH__ >= 200
}}} // namespace cv { namespace gpu { namespace device
......
......@@ -44,7 +44,7 @@
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__
namespace cv { namespace gpu { namespace device
{
{
template<class T> struct DynamicSharedMem
{
__device__ __forceinline__ operator T*()
......
......@@ -45,21 +45,21 @@
#include "warp_reduce.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
struct Emulation
{
static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
{
static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
{
#if __CUDA_ARCH__ >= 200
(void)cta_buffer;
return __ballot(predicate);
(void)cta_buffer;
return __ballot(predicate);
#else
int tid = threadIdx.x;
cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
return warp_reduce(cta_buffer);
int tid = threadIdx.x;
cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
return warp_reduce(cta_buffer);
#endif
}
}
};
}}} // namespace cv { namespace gpu { namespace device
......
......@@ -46,14 +46,14 @@
#include <cstdio>
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
template<class Func>
template<class Func>
void printFuncAttrib(Func& func)
{
cudaFuncAttributes attrs;
cudaFuncGetAttributes(&attrs, func);
cudaFuncGetAttributes(&attrs, func);
printf("=== Function stats ===\n");
printf("Name: \n");
......@@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace device
printf("ptxVersion = %d\n", attrs.ptxVersion);
printf("binaryVersion = %d\n", attrs.binaryVersion);
printf("\n");
fflush(stdout);
fflush(stdout);
}
}}} // namespace cv { namespace gpu { namespace device
......
......@@ -48,7 +48,7 @@
#include "vec_traits.hpp"
#include "type_traits.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
// Function Objects
......@@ -257,7 +257,7 @@ namespace cv { namespace gpu { namespace device
template <typename T> struct bit_not : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
{
return ~v;
}
......@@ -268,7 +268,7 @@ namespace cv { namespace gpu { namespace device
// Generalized Identity Operations
template <typename T> struct identity : unary_function<T, T>
{
__device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
__device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
{
return x;
}
......@@ -278,7 +278,7 @@ namespace cv { namespace gpu { namespace device
template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
{
__device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
__device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
{
return lhs;
}
......@@ -288,7 +288,7 @@ namespace cv { namespace gpu { namespace device
template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
{
__device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
__device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
{
return rhs;
}
......@@ -308,7 +308,7 @@ namespace cv { namespace gpu { namespace device
template <typename T> struct maximum : binary_function<T, T, T>
{
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{
return lhs < rhs ? rhs : lhs;
}
......@@ -328,7 +328,7 @@ namespace cv { namespace gpu { namespace device
template <typename T> struct minimum : binary_function<T, T, T>
{
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{
return lhs < rhs ? lhs : rhs;
}
......@@ -410,12 +410,14 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
#undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR
template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
{
return src1 * src1 + src2 * src2;
}
__device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}
__device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}
};
// Saturate Cast Functor
......@@ -438,6 +440,7 @@ namespace cv { namespace gpu { namespace device
{
return (src > thresh) * maxVal;
}
__device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
: unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
......@@ -455,6 +458,7 @@ namespace cv { namespace gpu { namespace device
{
return (src <= thresh) * maxVal;
}
__device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
: unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
......@@ -519,12 +523,16 @@ namespace cv { namespace gpu { namespace device
explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
{
return !pred(x);
{
return !pred(x);
}
__device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}
__device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}
const Predicate pred;
};
template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
{
return unary_negate<Predicate>(pred);
......@@ -534,19 +542,26 @@ namespace cv { namespace gpu { namespace device
{
explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
{
return !pred(x,y);
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x,
typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
{
return !pred(x,y);
}
__device__ __forceinline__ binary_negate(const binary_negate& other)
: binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
__device__ __forceinline__ binary_negate() :
binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
const Predicate pred;
};
template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
{
return binary_negate<BinaryPredicate>(pred);
}
template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
{
__host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
......@@ -555,15 +570,19 @@ namespace cv { namespace gpu { namespace device
return op(arg1, a);
}
__device__ __forceinline__ binder1st(const binder1st& other) :
unary_function<typename Op::second_argument_type, typename Op::result_type>(){}
const Op op;
const typename Op::first_argument_type arg1;
};
template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
{
return binder1st<Op>(op, typename Op::first_argument_type(x));
}
template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
{
__host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
......@@ -572,16 +591,19 @@ namespace cv { namespace gpu { namespace device
return op(a, arg2);
}
__device__ __forceinline__ binder2nd(const binder2nd& other) :
unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}
const Op op;
const typename Op::second_argument_type arg2;
};
template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
{
return binder2nd<Op>(op, typename Op::second_argument_type(x));
}
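A hedged usage sketch of the binders declared above: bind2nd fixes the second argument of a binary device functor so it can be used where a unary operation is expected. The kernel below is an illustrative assumption, not code from this commit; it only relies on the minimum<> functor defined earlier in this header.
// Clamp every element to at most 100 by binding the second argument of minimum<int>.
__global__ void clampKernel(const int* in, int* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = cv::gpu::device::bind2nd(cv::gpu::device::minimum<int>(), 100)(in[i]);
}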
// Functor Traits
template <typename F> struct IsUnaryFunction
{
typedef char Yes;
......@@ -618,7 +640,7 @@ namespace cv { namespace gpu { namespace device
{
enum { shift = UnOpShift<sizeof(T), sizeof(D)>::shift };
};
template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
......
......@@ -46,7 +46,7 @@
#include <limits>
#include "common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
template<class T> struct numeric_limits
{
......
......@@ -57,35 +57,35 @@ namespace cv { namespace gpu { namespace device
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
{
return (uchar) ::max((int)v, 0);
{
return (uchar) ::max((int)v, 0);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
{
return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
{
return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
{
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
{
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
{
return (uchar) ::min(v, (uint)UCHAR_MAX);
{
return (uchar) ::min(v, (uint)UCHAR_MAX);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
return saturate_cast<uchar>((uint)v);
{
return saturate_cast<uchar>((uint)v);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<uchar>(iv);
{
int iv = __float2int_rn(v);
return saturate_cast<uchar>(iv);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
int iv = __double2int_rn(v);
return saturate_cast<uchar>(iv);
#else
return saturate_cast<uchar>((float)v);
......@@ -93,35 +93,35 @@ namespace cv { namespace gpu { namespace device
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
{
return (schar) ::min((int)v, SCHAR_MAX);
{
return (schar) ::min((int)v, SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
{
return (schar) ::min((uint)v, (uint)SCHAR_MAX);
{
return (schar) ::min((uint)v, (uint)SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
{
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
{
return saturate_cast<schar>((int)v);
{
return saturate_cast<schar>((int)v);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
{
return (schar) ::min(v, (uint)SCHAR_MAX);
{
return (schar) ::min(v, (uint)SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<schar>(iv);
{
int iv = __float2int_rn(v);
return saturate_cast<schar>(iv);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
{
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
int iv = __double2int_rn(v);
return saturate_cast<schar>(iv);
#else
return saturate_cast<schar>((float)v);
......@@ -129,30 +129,30 @@ namespace cv { namespace gpu { namespace device
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
{
return (ushort) ::max((int)v, 0);
{
return (ushort) ::max((int)v, 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
{
return (ushort) ::max((int)v, 0);
{
return (ushort) ::max((int)v, 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
{
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
{
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
{
return (ushort) ::min(v, (uint)USHRT_MAX);
{
return (ushort) ::min(v, (uint)USHRT_MAX);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<ushort>(iv);
int iv = __float2int_rn(v);
return saturate_cast<ushort>(iv);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
{
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
int iv = __double2int_rn(v);
return saturate_cast<ushort>(iv);
#else
return saturate_cast<ushort>((float)v);
......@@ -160,37 +160,37 @@ namespace cv { namespace gpu { namespace device
}
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
{
return (short) ::min((int)v, SHRT_MAX);
{
return (short) ::min((int)v, SHRT_MAX);
}
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
{
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
}
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
{
return (short) ::min(v, (uint)SHRT_MAX);
{
return (short) ::min(v, (uint)SHRT_MAX);
}
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<short>(iv);
{
int iv = __float2int_rn(v);
return saturate_cast<short>(iv);
}
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
{
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
int iv = __double2int_rn(v);
return saturate_cast<short>(iv);
#else
return saturate_cast<short>((float)v);
#endif
}
template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{
return __float2int_rn(v);
template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{
return __float2int_rn(v);
}
template<> __device__ __forceinline__ int saturate_cast<int>(double v)
template<> __device__ __forceinline__ int saturate_cast<int>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
return __double2int_rn(v);
......@@ -200,11 +200,11 @@ namespace cv { namespace gpu { namespace device
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
{
return __float2uint_rn(v);
{
return __float2uint_rn(v);
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
{
template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
return __double2uint_rn(v);
#else
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_SCAN_HPP__
#define __OPENCV_GPU_SCAN_HPP__
enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
template <ScanKind Kind, typename T, typename F> struct WarpScan
{
__device__ __forceinline__ WarpScan() {}
__device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }
__device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
{
const unsigned int lane = idx & 31;
F op;
if ( lane >= 1) ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
if ( lane >= 2) ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
if ( lane >= 4) ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
if ( lane >= 8) ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
if( Kind == INCLUSIVE )
return ptr [idx];
else
return (lane > 0) ? ptr [idx - 1] : 0;
}
__device__ __forceinline__ unsigned int index(const unsigned int tid)
{
return tid;
}
__device__ __forceinline__ void init(volatile T *ptr){}
static const int warp_offset = 0;
typedef WarpScan<INCLUSIVE, T, F> merge;
};
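To make the shift pattern above concrete, a worked example of what WarpScan computes, assuming F is an addition functor such as the plus<> defined in functional.hpp (the numbers are illustrative, not part of the commit):
// Each of the 32 lanes writes 1 into shared memory:
//     ptr = { 1, 1, 1, ..., 1 }
// After WarpScan<INCLUSIVE, int, plus<int> >()(ptr, idx) the buffer holds running sums
//     ptr = { 1, 2, 3, ..., 32 }
// and each lane's return value is its inclusive sum; with EXCLUSIVE the return value
// for lane k is instead the sum of lanes 0..k-1 (0 for lane 0).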
template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
{
__device__ __forceinline__ WarpScanNoComp() {}
__device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }
__device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
{
const unsigned int lane = threadIdx.x & 31;
F op;
ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
if( Kind == INCLUSIVE )
return ptr [idx];
else
return (lane > 0) ? ptr [idx - 1] : 0;
}
__device__ __forceinline__ unsigned int index(const unsigned int tid)
{
return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
}
__device__ __forceinline__ void init(volatile T *ptr)
{
ptr[threadIdx.x] = 0;
}
static const int warp_smem_stride = 32 + 16 + 1;
static const int warp_offset = 16;
static const int warp_log = 5;
static const int warp_mask = 31;
typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
};
template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
{
__device__ __forceinline__ BlockScan() {}
__device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }
__device__ __forceinline__ T operator()(volatile T *ptr)
{
const unsigned int tid = threadIdx.x;
const unsigned int lane = tid & warp_mask;
const unsigned int warp = tid >> warp_log;
Sc scan;
typename Sc::merge merge_scan;
const unsigned int idx = scan.index(tid);
T val = scan(ptr, idx);
__syncthreads ();
if( warp == 0)
scan.init(ptr);
__syncthreads ();
if( lane == 31 )
ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
__syncthreads ();
if( warp == 0 )
merge_scan(ptr, idx);
__syncthreads();
if ( warp > 0)
val = ptr [scan.warp_offset + warp - 1] + val;
__syncthreads ();
ptr[idx] = val;
__syncthreads ();
return val ;
}
static const int warp_log = 5;
static const int warp_mask = 31;
};
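Finally, a hedged sketch of how BlockScan might be driven from a kernel. The 256-thread block, the plus<> functor from functional.hpp and the kernel itself are assumptions for illustration; the commit itself only adds the primitives above.
// One 256-thread block (8 warps) computes an inclusive prefix sum in shared memory.
__global__ void prefixSum256(const int* in, int* out)
{
    __shared__ int smem[256];
    const int tid = threadIdx.x;
    smem[tid] = in[tid];
    __syncthreads();

    BlockScan<INCLUSIVE, int,
              WarpScan<INCLUSIVE, int, cv::gpu::device::plus<int> >,
              cv::gpu::device::plus<int> > scan;
    out[tid] = scan(smem);      // each thread gets its inclusive running sum
}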
#endif
\ No newline at end of file
......@@ -43,27 +43,27 @@
#ifndef __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
#define __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
#if defined(__CUDACC__)
#define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#if defined(__CUDACC__)
#define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
#define __OPENCV_GPU_HOST_DEVICE__
#endif
#endif
namespace cv { namespace gpu
{
namespace cv { namespace gpu
{
namespace device
{
template<bool expr> struct Static {};
template<> struct Static<true>
{
__OPENCV_GPU_HOST_DEVICE__ static void check() {};
template<> struct Static<true>
{
__OPENCV_GPU_HOST_DEVICE__ static void check() {};
};
}
}
using ::cv::gpu::device::Static;
}}
#undef __OPENCV_GPU_HOST_DEVICE__
#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
\ No newline at end of file
#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
\ No newline at end of file
......@@ -47,7 +47,7 @@
#include "utility.hpp"
#include "detail/transform_detail.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
template <typename T, typename D, typename UnOp, typename Mask>
static inline void transform(DevMem2D_<T> src, DevMem2D_<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
......
......@@ -45,11 +45,11 @@
#include "detail/type_traits_detail.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
template <typename T> struct IsSimpleParameter
{
enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value};
};
......@@ -65,16 +65,16 @@ namespace cv { namespace gpu { namespace device
enum { isVolatile = type_traits_detail::UnVolatile<T>::value };
enum { isReference = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };
enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };
enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };
enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };
enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };
enum { isArith = isIntegral || isFloat };
enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };
typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };
enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };
enum { isArith = isIntegral || isFloat };
enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };
typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;
};
}}}
......
......@@ -47,17 +47,17 @@
#include "datamov_utils.hpp"
#include "detail/utility_detail.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
#define OPENCV_GPU_LOG_WARP_SIZE (5)
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
#define OPENCV_GPU_LOG_WARP_SIZE (5)
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)
///////////////////////////////////////////////////////////////////////////////
// swap
template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
{
const T temp = a;
a = b;
......@@ -71,9 +71,9 @@ namespace cv { namespace gpu { namespace device
{
explicit __host__ __device__ __forceinline__ SingleMask(PtrStepb mask_) : mask(mask_) {}
__host__ __device__ __forceinline__ SingleMask(const SingleMask& mask_): mask(mask_.mask){}
__device__ __forceinline__ bool operator()(int y, int x) const
{
{
return mask.ptr(y)[x] != 0;
}
......@@ -82,13 +82,13 @@ namespace cv { namespace gpu { namespace device
struct SingleMaskChannels
{
__host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
__host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
: mask(mask_), channels(channels_) {}
__host__ __device__ __forceinline__ SingleMaskChannels(const SingleMaskChannels& mask_)
:mask(mask_.mask), channels(mask_.channels){}
__device__ __forceinline__ bool operator()(int y, int x) const
{
{
return mask.ptr(y)[x / channels] != 0;
}
......@@ -112,7 +112,7 @@ namespace cv { namespace gpu { namespace device
{
curMask = maskCollection[z];
}
__device__ __forceinline__ bool operator()(int y, int x) const
{
uchar val;
......@@ -165,20 +165,20 @@ namespace cv { namespace gpu { namespace device
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
}
template <int n, typename T, typename V, typename Pred>
template <int n, typename T, typename V, typename Pred>
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
}
template <int n, typename T, typename V1, typename V2, typename Pred>
template <int n, typename T, typename V1, typename V2, typename Pred>
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
}
///////////////////////////////////////////////////////////////////////////////
// Solve linear system
......@@ -212,17 +212,17 @@ namespace cv { namespace gpu { namespace device
{
double invdet = 1.0 / det;
x[0] = saturate_cast<T>(invdet *
x[0] = saturate_cast<T>(invdet *
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )));
x[1] = saturate_cast<T>(invdet *
x[1] = saturate_cast<T>(invdet *
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])));
x[2] = saturate_cast<T>(invdet *
x[2] = saturate_cast<T>(invdet *
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
......
......@@ -47,7 +47,7 @@
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
template <typename T> struct L1Dist
{
......@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
};
// calc distance between two vectors in global memory
template <int THREAD_DIM, typename Dist, typename T1, typename T2>
template <int THREAD_DIM, typename Dist, typename T1, typename T2>
__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
for (int i = tid; i < len; i += THREAD_DIM)
......@@ -170,9 +170,9 @@ namespace cv { namespace gpu { namespace device
// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
{
vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
dist.reduceAll<THREAD_DIM>(smem, tid);
}
......
......@@ -47,7 +47,7 @@
#include "vec_traits.hpp"
#include "functional.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
namespace vec_math_detail
{
......@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
}
namespace vec_math_detail
{
{
template <typename T1, typename T2> struct BinOpTraits
{
typedef int argument_type;
......@@ -326,5 +326,5 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_VEC_OP
#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_VECMATH_HPP__
\ No newline at end of file
......@@ -45,7 +45,7 @@
#include "common.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
template<typename T, int N> struct TypeVec;
......@@ -219,18 +219,18 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
template<> struct VecTraits<char>
{
template<> struct VecTraits<char>
{
typedef char elem_type;
enum {cn=1};
enum {cn=1};
static __device__ __host__ __forceinline__ char all(char v) {return v;}
static __device__ __host__ __forceinline__ char make(char x) {return x;}
static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
};
template<> struct VecTraits<schar>
{
template<> struct VecTraits<schar>
{
typedef schar elem_type;
enum {cn=1};
enum {cn=1};
static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
......
......@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
#define __OPENCV_GPU_DEVICE_WARP_HPP__
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
struct Warp
{
......@@ -64,18 +64,18 @@ namespace cv { namespace gpu { namespace device
template<typename It, typename T>
static __device__ __forceinline__ void fill(It beg, It end, const T& value)
{
{
for(It t = beg + laneId(); t < end; t += STRIDE)
*t = value;
}
}
template<typename InIt, typename OutIt>
static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
{
{
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
*out = *t;
return out;
}
}
template<typename InIt, typename OutIt, class UnOp>
static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
......@@ -90,7 +90,7 @@ namespace cv { namespace gpu { namespace device
{
unsigned int lane = laneId();
InIt1 t1 = beg1 + lane;
InIt1 t1 = beg1 + lane;
InIt2 t2 = beg2 + lane;
for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
*out = op(*t1, *t2);
......@@ -100,7 +100,7 @@ namespace cv { namespace gpu { namespace device
template<typename OutIt, typename T>
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
{
unsigned int lane = laneId();
unsigned int lane = laneId();
value += lane;
for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
......
......@@ -44,7 +44,32 @@
#ifndef HAVE_CUDA
void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); }
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
(void)src;
(void)dst;
(void)dsize;
(void)fx;
(void)fy;
(void)interpolation;
(void)s;
throw_nogpu();
}
void cv::gpu::resize(const GpuMat& src, GpuMat& dst,GpuMat& buffer, Size dsize,
double fx, double fy, int interpolation, Stream& s)
{
(void)src;
(void)dst;
(void)dsize;
(void)fx;
(void)fy;
(void)interpolation;
(void)buffer;
(void)s;
throw_nogpu();
}
#else // HAVE_CUDA
......