Commit 3ab2728d authored by Vladislav Vinogradov

gpu device layer code refactoring

parent fa0daa48
......@@ -23,7 +23,9 @@ source_group("Include" FILES ${lib_hdrs})
#file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.h*")
file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
file(GLOB lib_device_hdrs_detail "src/opencv2/gpu/device/detail/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})
if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
......@@ -83,7 +85,7 @@ foreach(d ${DEPS})
endif()
endforeach()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${lib_device_hdrs_detail} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
# For dynamic link numbering conventions
set_target_properties(${the_target} PROPERTIES
......
This diff is collapsed.
......@@ -41,7 +41,7 @@
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
using namespace cv::gpu;
......@@ -565,7 +565,7 @@ namespace cv { namespace gpu { namespace bfmatcher
int myBestTrainIdx = -1;
int myBestImgIdx = -1;
typename Dist::ResultType myMin = numeric_limits_gpu<typename Dist::ResultType>::max();
typename Dist::ResultType myMin = numeric_limits<typename Dist::ResultType>::max();
{
typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
......@@ -821,7 +821,7 @@ namespace cv { namespace gpu { namespace bfmatcher
{
const T* trainDescs = trainDescs_.ptr(trainIdx);
typename Dist::ResultType myDist = numeric_limits_gpu<typename Dist::ResultType>::max();
typename Dist::ResultType myDist = numeric_limits<typename Dist::ResultType>::max();
if (mask(queryIdx, trainIdx))
{
......@@ -932,7 +932,7 @@ namespace cv { namespace gpu { namespace bfmatcher
{
const int tid = threadIdx.x;
T myMin = numeric_limits_gpu<T>::max();
T myMin = numeric_limits<T>::max();
int myMinIdx = -1;
for (int i = tid; i < n; i += BLOCK_SIZE)
......@@ -1007,10 +1007,10 @@ namespace cv { namespace gpu { namespace bfmatcher
if (threadIdx.x == 0)
{
float dist = sdist[0];
if (dist < numeric_limits_gpu<float>::max())
if (dist < numeric_limits<float>::max())
{
int bestIdx = strainIdx[0];
allDist[bestIdx] = numeric_limits_gpu<float>::max();
allDist[bestIdx] = numeric_limits<float>::max();
trainIdx[i] = bestIdx;
distance[i] = dist;
}
......
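For reference, the limits_gpu -> limits rename above changes only the header name and the class template name (numeric_limits_gpu becomes numeric_limits); the device-side interface stays the same. A minimal sketch of how the renamed trait is used from device code — the kernel name initBestDistance is hypothetical, only numeric_limits<T>::max() comes from the hunks above:

#include "opencv2/gpu/device/limits.hpp"

using namespace cv::gpu::device;

// Hypothetical helper kernel: seed per-element "best distance" slots with the
// largest representable value before a min-reduction, the same pattern the
// matcher kernels above rely on.
template <typename T>
__global__ void initBestDistance(T* bestDist, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        bestDist[i] = numeric_limits<T>::max();   // device-side numeric_limits after the rename
}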
This diff is collapsed.
......@@ -40,9 +40,10 @@
//
//M*/
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "internal_shared.hpp"
......@@ -355,113 +356,10 @@ namespace cv { namespace gpu { namespace mathfunc
//////////////////////////////////////////////////////////////////////////
// min/max
struct MinOp
{
template <typename T>
__device__ __forceinline__ T operator()(T a, T b)
{
return min(a, b);
}
__device__ __forceinline__ float operator()(float a, float b)
{
return fmin(a, b);
}
__device__ __forceinline__ double operator()(double a, double b)
{
return fmin(a, b);
}
};
struct MaxOp
{
template <typename T>
__device__ __forceinline__ T operator()(T a, T b)
{
return max(a, b);
}
__device__ __forceinline__ float operator()(float a, float b)
{
return fmax(a, b);
}
__device__ __forceinline__ double operator()(double a, double b)
{
return fmax(a, b);
}
};
template <typename T> struct ScalarMinOp
{
T s;
explicit ScalarMinOp(T s_) : s(s_) {}
__device__ __forceinline__ T operator()(T a)
{
return min(a, s);
}
};
template <> struct ScalarMinOp<float>
{
float s;
explicit ScalarMinOp(float s_) : s(s_) {}
__device__ __forceinline__ float operator()(float a)
{
return fmin(a, s);
}
};
template <> struct ScalarMinOp<double>
{
double s;
explicit ScalarMinOp(double s_) : s(s_) {}
__device__ __forceinline__ double operator()(double a)
{
return fmin(a, s);
}
};
template <typename T> struct ScalarMaxOp
{
T s;
explicit ScalarMaxOp(T s_) : s(s_) {}
__device__ __forceinline__ T operator()(T a)
{
return max(a, s);
}
};
template <> struct ScalarMaxOp<float>
{
float s;
explicit ScalarMaxOp(float s_) : s(s_) {}
__device__ __forceinline__ float operator()(float a)
{
return fmax(a, s);
}
};
template <> struct ScalarMaxOp<double>
{
double s;
explicit ScalarMaxOp(double s_) : s(s_) {}
__device__ __forceinline__ double operator()(double a)
{
return fmax(a, s);
}
};
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MinOp op;
transform(src1, src2, dst, op, stream);
transform(src1, src2, dst, minimum<T>(), stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
......@@ -475,8 +373,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MaxOp op;
transform(src1, src2, dst, op, stream);
transform(src1, src2, dst, maximum<T>(), stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
......@@ -490,8 +387,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMinOp<T> op(src2);
transform(src1, dst, op, stream);
transform(src1, dst, device::bind2nd(minimum<T>(), src2), stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
......@@ -505,8 +401,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMaxOp<T> op(src2);
transform(src1, dst, op, stream);
transform(src1, dst, device::bind2nd(maximum<T>(), src2), stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
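The deleted MinOp/MaxOp/ScalarMinOp/ScalarMaxOp functors (including their float/double specialisations) are replaced by the generic minimum<T>/maximum<T> operators and device::bind2nd from the new functional.hpp. A hedged sketch of what those pieces plausibly look like — the real definitions live in opencv2/gpu/device/functional.hpp and may differ in detail, so the sketch is kept in its own namespace:

namespace sketch
{
    // "smaller of two" as a functor, so transform() can apply it element-wise
    template <typename T> struct minimum
    {
        __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? a : b; }
    };

    // binder2nd/bind2nd: fix the second argument of a binary functor, turning it
    // into a unary functor usable with the single-source transform() overload
    template <typename Op, typename T> struct binder2nd
    {
        __host__ __device__ binder2nd(const Op& op_, const T& arg2_) : op(op_), arg2(arg2_) {}

        __device__ __forceinline__ T operator()(T a) const { return op(a, arg2); }

        Op op;
        T arg2;
    };

    template <typename Op, typename T>
    __host__ __device__ binder2nd<Op, T> bind2nd(const Op& op, const T& arg2)
    {
        return binder2nd<Op, T>(op, arg2);
    }
}

With operators of this shape, transform(src1, dst, bind2nd(minimum<T>(), s), stream) expresses the former ScalarMinOp in a single line, and any float/double fmin/fmax handling can live once inside minimum/maximum rather than at every call site.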
......@@ -521,99 +416,6 @@ namespace cv { namespace gpu { namespace mathfunc
//////////////////////////////////////////////////////////////////////////
// threshold
template <typename T> struct ThreshBinary
{
ThreshBinary(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
__device__ __forceinline__ T operator()(const T& src) const
{
return src > thresh ? maxVal : 0;
}
private:
T thresh;
T maxVal;
};
template <typename T> struct ThreshBinaryInv
{
ThreshBinaryInv(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
__device__ __forceinline__ T operator()(const T& src) const
{
return src > thresh ? 0 : maxVal;
}
private:
T thresh;
T maxVal;
};
template <typename T> struct ThreshTrunc
{
ThreshTrunc(T thresh_, T) : thresh(thresh_) {}
__device__ __forceinline__ T operator()(const T& src) const
{
return min(src, thresh);
}
private:
T thresh;
};
template <> struct ThreshTrunc<float>
{
ThreshTrunc(float thresh_, float) : thresh(thresh_) {}
__device__ __forceinline__ float operator()(const float& src) const
{
return fmin(src, thresh);
}
private:
float thresh;
};
template <> struct ThreshTrunc<double>
{
ThreshTrunc(double thresh_, double) : thresh(thresh_) {}
__device__ __forceinline__ double operator()(const double& src) const
{
return fmin(src, thresh);
}
private:
double thresh;
};
template <typename T> struct ThreshToZero
{
public:
ThreshToZero(T thresh_, T) : thresh(thresh_) {}
__device__ __forceinline__ T operator()(const T& src) const
{
return src > thresh ? src : 0;
}
private:
T thresh;
};
template <typename T> struct ThreshToZeroInv
{
public:
ThreshToZeroInv(T thresh_, T) : thresh(thresh_) {}
__device__ __forceinline__ T operator()(const T& src) const
{
return src > thresh ? 0 : src;
}
private:
T thresh;
};
template <template <typename> class Op, typename T>
void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal,
cudaStream_t stream)
......@@ -631,11 +433,11 @@ namespace cv { namespace gpu { namespace mathfunc
static const caller_t callers[] =
{
threshold_caller<ThreshBinary, T>,
threshold_caller<ThreshBinaryInv, T>,
threshold_caller<ThreshTrunc, T>,
threshold_caller<ThreshToZero, T>,
threshold_caller<ThreshToZeroInv, T>
threshold_caller<thresh_binary_func, T>,
threshold_caller<thresh_binary_inv_func, T>,
threshold_caller<thresh_trunc_func, T>,
threshold_caller<thresh_to_zero_func, T>,
threshold_caller<thresh_to_zero_inv_func, T>
};
callers[type]((DevMem2D_<T>)src, (DevMem2D_<T>)dst, thresh, maxVal, stream);
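The five hand-written ThreshBinary/ThreshBinaryInv/ThreshTrunc/ThreshToZero/ThreshToZeroInv functors (plus the float/double specialisations of ThreshTrunc) collapse into the thresh_*_func family from functional.hpp. To illustrate the shape these functors need in order to satisfy threshold_caller (a two-argument thresh/maxVal constructor and a unary call operator), here is a hedged sketch modelled on the deleted ThreshBinary; the actual definitions in functional.hpp may add typedefs or reuse minimum<T> for the truncating variant:

// Sketch only: mirrors the removed ThreshBinary under the naming scheme of the new table above.
template <typename T> struct thresh_binary_sketch
{
    __host__ __device__ thresh_binary_sketch(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}

    __device__ __forceinline__ T operator()(const T& src) const
    {
        return src > thresh ? maxVal : 0;   // binary threshold: maxVal above thresh, 0 otherwise
    }

    T thresh;
    T maxVal;
};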
......@@ -653,20 +455,10 @@ namespace cv { namespace gpu { namespace mathfunc
//////////////////////////////////////////////////////////////////////////
// subtract
template <typename T>
class SubtractOp
{
public:
__device__ __forceinline__ T operator()(const T& l, const T& r) const
{
return l - r;
}
};
template <typename T>
void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
{
transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, SubtractOp<T>(), stream);
transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, minus<T>(), stream);
}
template void subtractCaller<short>(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream);
......@@ -675,7 +467,7 @@ namespace cv { namespace gpu { namespace mathfunc
//////////////////////////////////////////////////////////////////////////
// pow
template<typename T, bool Signed = device::numeric_limits_gpu<T>::is_signed>
template<typename T, bool Signed = device::numeric_limits<T>::is_signed>
struct PowOp
{
float power;
......
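The only change to PowOp is the source of the is_signed trait, which now comes from the renamed device::numeric_limits. The default boolean template argument selects a different specialisation for signed and unsigned element types at compile time; a hedged, self-contained sketch of that dispatch pattern (AbsBeforePow is an illustrative name, not part of the commit):

#include "opencv2/gpu/device/limits.hpp"

// Primary template: chosen when numeric_limits<T>::is_signed is false.
template <typename T, bool Signed = cv::gpu::device::numeric_limits<T>::is_signed>
struct AbsBeforePow
{
    __device__ __forceinline__ T operator()(T v) const { return v; }                 // unsigned: nothing to do
};

// Specialisation chosen automatically for signed element types.
template <typename T>
struct AbsBeforePow<T, true>
{
    __device__ __forceinline__ T operator()(T v) const { return v < 0 ? -v : v; }    // take the magnitude first
};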
......@@ -42,8 +42,8 @@
#include "opencv2/gpu/devmem2d.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "safe_call.hpp"
......@@ -76,7 +76,7 @@ namespace filter_krnls
{
template <typename T, size_t size> struct SmemType_
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t smem_t;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
};
template <typename T> struct SmemType_<T, 4>
{
......@@ -111,7 +111,7 @@ namespace filter_krnls
if (x < src.cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
sum_t sum = VecTraits<sum_t>::all(0);
sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;
......@@ -253,7 +253,7 @@ namespace filter_krnls
if (y < src.rows)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
sum_t sum = VecTraits<sum_t>::all(0);
sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;
......@@ -475,7 +475,7 @@ namespace bf_krnls
}
}
float minimum = numeric_limits_gpu<float>::max();
float minimum = numeric_limits<float>::max();
int id = 0;
if (cost[0] < minimum)
......
......@@ -42,6 +42,7 @@
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
using namespace cv::gpu;
......@@ -50,14 +51,11 @@ using namespace cv::gpu::device;
#define UINT_BITS 32U
#define LOG2_WARP_SIZE 5U
#define WARP_SIZE (1U << LOG2_WARP_SIZE)
//Warps == subhistograms per threadblock
#define WARP_COUNT 6
//Threadblock size
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * WARP_SIZE)
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
#define HISTOGRAM256_BIN_COUNT 256
//Shared memory per threadblock
......@@ -73,7 +71,7 @@ namespace cv { namespace gpu { namespace histograms
{
#if (!USE_SMEM_ATOMICS)
#define TAG_MASK ( (1U << (UINT_BITS - LOG2_WARP_SIZE)) - 1U )
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
{
......@@ -111,7 +109,7 @@ namespace cv { namespace gpu { namespace histograms
{
//Per-warp subhistogram storage
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
uint* s_WarpHist= s_Hist + (threadIdx.x >> LOG2_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
//Clear shared memory storage for current threadblock before processing
#pragma unroll
......@@ -119,7 +117,7 @@ namespace cv { namespace gpu { namespace histograms
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
//Cycle through the entire data set, update subhistograms for each warp
const uint tag = threadIdx.x << (UINT_BITS - LOG2_WARP_SIZE);
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
__syncthreads();
const uint colsui = d_Data.step / sizeof(uint);
......
......@@ -41,7 +41,7 @@
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
......@@ -84,8 +84,8 @@ __global__ void matchTemplateNaiveKernel_CCORR(
int w, int h, const PtrStep image, const PtrStep templ,
DevMem2Df result)
{
typedef typename TypeVec<T, cn>::vec_t Type;
typedef typename TypeVec<float, cn>::vec_t Typef;
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -174,8 +174,8 @@ __global__ void matchTemplateNaiveKernel_SQDIFF(
int w, int h, const PtrStep image, const PtrStep templ,
DevMem2Df result)
{
typedef typename TypeVec<T, cn>::vec_t Type;
typedef typename TypeVec<float, cn>::vec_t Typef;
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -884,7 +884,7 @@ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
template <int cn>
__global__ void extractFirstChannel_32F(const PtrStep image, DevMem2Df result)
{
typedef typename TypeVec<float, cn>::vec_t Typef;
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
......
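Throughout this file the vecmath.hpp header becomes vec_math.hpp and the nested typedef vec_t becomes vec_type; behaviour is unchanged. A minimal sketch of the idiom, assuming TypeVec and VecTraits now live in the new vec_traits.hpp (as the border_interpolate include further below suggests); the kernel and its name are hypothetical:

#include "opencv2/gpu/device/vec_traits.hpp"

using namespace cv::gpu::device;

// zeroAccumulators: pick a float vector type with the same channel count as T
// (e.g. uchar3 -> float3) and zero-initialise it, the accumulator pattern used
// by the template-matching and filter kernels above.
template <typename T>
__global__ void zeroAccumulators(typename TypeVec<float, VecTraits<T>::cn>::vec_type* acc, int n)
{
    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        acc[i] = VecTraits<sum_t>::all(0);
}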
......@@ -40,9 +40,9 @@
//
//M*/
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "internal_shared.hpp"
......
This diff is collapsed.
......@@ -42,7 +42,7 @@
#include "opencv2/gpu/devmem2d.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "safe_call.hpp"
using namespace cv::gpu;
......@@ -381,7 +381,7 @@ namespace cv { namespace gpu { namespace bp
template <typename T>
__device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
{
float minimum = numeric_limits_gpu<float>::max();
float minimum = numeric_limits<float>::max();
for(int i = 0; i < cndisp; ++i)
{
......@@ -486,7 +486,7 @@ namespace cv { namespace gpu { namespace bp
size_t disp_step = disp.rows * u.step;
int best = 0;
float best_val = numeric_limits_gpu<float>::max();
float best_val = numeric_limits<float>::max();
for (int d = 0; d < cndisp; ++d)
{
float val = us[d * disp_step];
......
......@@ -42,7 +42,7 @@
#include "opencv2/gpu/devmem2d.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "safe_call.hpp"
using namespace cv::gpu;
......@@ -147,7 +147,7 @@ namespace cv { namespace gpu { namespace csbp
for(int i = 0; i < nr_plane; i++)
{
T minimum = numeric_limits_gpu<T>::max();
T minimum = numeric_limits<T>::max();
int id = 0;
for(int d = 0; d < cndisp; d++)
{
......@@ -161,7 +161,7 @@ namespace cv { namespace gpu { namespace csbp
data_cost_selected[i * cdisp_step1] = minimum;
selected_disparity[i * cdisp_step1] = id;
data_cost [id * cdisp_step1] = numeric_limits_gpu<T>::max();
data_cost [id * cdisp_step1] = numeric_limits<T>::max();
}
}
}
......@@ -192,7 +192,7 @@ namespace cv { namespace gpu { namespace csbp
data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
selected_disparity[nr_local_minimum * cdisp_step1] = d;
data_cost[d * cdisp_step1] = numeric_limits_gpu<T>::max();
data_cost[d * cdisp_step1] = numeric_limits<T>::max();
nr_local_minimum++;
}
......@@ -203,7 +203,7 @@ namespace cv { namespace gpu { namespace csbp
for (int i = nr_local_minimum; i < nr_plane; i++)
{
T minimum = numeric_limits_gpu<T>::max();
T minimum = numeric_limits<T>::max();
int id = 0;
for (int d = 0; d < cndisp; d++)
......@@ -218,7 +218,7 @@ namespace cv { namespace gpu { namespace csbp
data_cost_selected[i * cdisp_step1] = minimum;
selected_disparity[i * cdisp_step1] = id;
data_cost[id * cdisp_step1] = numeric_limits_gpu<T>::max();
data_cost[id * cdisp_step1] = numeric_limits<T>::max();
}
}
}
......@@ -610,7 +610,7 @@ namespace cv { namespace gpu { namespace csbp
{
for(int i = 0; i < nr_plane; i++)
{
T minimum = numeric_limits_gpu<T>::max();
T minimum = numeric_limits<T>::max();
int id = 0;
for(int j = 0; j < nr_plane2; j++)
{
......@@ -630,7 +630,7 @@ namespace cv { namespace gpu { namespace csbp
l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
data_cost_new[id * cdisp_step1] = numeric_limits_gpu<T>::max();
data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();
}
}
......@@ -737,7 +737,7 @@ namespace cv { namespace gpu { namespace csbp
__device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
const T* dst_disp, const T* src_disp, int nr_plane, T* temp)
{
T minimum = numeric_limits_gpu<T>::max();
T minimum = numeric_limits<T>::max();
for(int d = 0; d < nr_plane; d++)
{
......@@ -850,7 +850,7 @@ namespace cv { namespace gpu { namespace csbp
const T* r = r_ + (y+0) * cmsg_step1 + (x-1);
int best = 0;
T best_val = numeric_limits_gpu<T>::max();
T best_val = numeric_limits<T>::max();
for (int i = 0; i < nr_plane; ++i)
{
int idx = i * cdisp_step1;
......
......@@ -46,8 +46,10 @@
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
......@@ -393,31 +395,10 @@ namespace cv { namespace gpu { namespace surf
//dss
H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
float det = H[0][0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1])
- H[0][1] * (H[1][0] * H[2][2] - H[1][2] * H[2][0])
+ H[0][2] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]);
if (det != 0.0f)
{
float invdet = 1.0f / det;
__shared__ float x[3];
x[0] = invdet *
(dD[0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1]) -
H[0][1] * (dD[1] * H[2][2] - H[1][2] * dD[2]) +
H[0][2] * (dD[1] * H[2][1] - H[1][1] * dD[2]));
x[1] = invdet *
(H[0][0] * (dD[1] * H[2][2] - H[1][2] * dD[2]) -
dD[0] * (H[1][0] * H[2][2] - H[1][2] * H[2][0]) +
H[0][2] * (H[1][0] * dD[2] - dD[1] * H[2][0]));
x[2] = invdet *
(H[0][0] * (H[1][1] * dD[2] - dD[1] * H[2][1]) -
H[0][1] * (H[1][0] * dD[2] - dD[1] * H[2][0]) +
dD[0] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]));
if (solve3x3(H, dD, x))
{
if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)
{
// if the step is within the interpolation region, perform it
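The removed block solved the 3x3 system H * x = dD by hand with Cramer's rule; it is now delegated to solve3x3() from the device utility header included above. A hedged sketch of what such a helper plausibly does, reconstructed from the deleted code (the real utility.hpp implementation may differ; returning false on a singular matrix mirrors the old det != 0.0f guard):

// Sketch of a Cramer's-rule 3x3 solve equivalent to the code removed above.
__device__ __forceinline__ bool solve3x3_sketch(const float A[3][3], const float b[3], float x[3])
{
    const float det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
                    - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
                    + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);

    if (det == 0.0f)
        return false;   // singular system: caller skips the interpolation step

    const float invdet = 1.0f / det;

    x[0] = invdet * (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
                     A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2])    +
                     A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]));

    x[1] = invdet * (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2])    -
                     b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
                     A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0]));

    x[2] = invdet * (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -
                     A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
                     b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));

    return true;
}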
......@@ -500,20 +481,6 @@ namespace cv { namespace gpu { namespace surf
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
__device__ void reduceSum32(volatile float* v_sum, float& sum)
{
v_sum[threadIdx.x] = sum;
if (threadIdx.x < 16)
{
v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 16];
v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 8];
v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 4];
v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 2];
v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 1];
}
}
__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
......@@ -599,8 +566,11 @@ namespace cv { namespace gpu { namespace surf
float* s_sum_row = s_sum + threadIdx.y * 32;
reduceSum32(s_sum_row, sumx);
reduceSum32(s_sum_row, sumy);
//reduceSum32(s_sum_row, sumx);
//reduceSum32(s_sum_row, sumy);
warpReduce32(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
warpReduce32(s_sum_row, sumy, threadIdx.x, plus<volatile float>());
const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod)
......
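The removed reduceSum32 performed a fixed 32-lane shared-memory sum; the calls above now go through warpReduce32 from the device utility header, parameterised by a binary functor (plus<volatile float>() here). A hedged sketch that generalises the deleted helper in the same way — the real warpReduce32 may differ, but the call signature matches the call sites above:

// Sketch: 32-lane shared-memory reduction over an arbitrary binary operation,
// modelled on the removed reduceSum32 (which hard-coded addition).
template <typename T, typename Op>
__device__ __forceinline__ void warpReduce32_sketch(volatile T* smem, T& val, unsigned int tid, Op op)
{
    smem[tid] = val;

    if (tid < 16)
    {
        smem[tid] = val = op(val, smem[tid + 16]);
        smem[tid] = val = op(val, smem[tid +  8]);
        smem[tid] = val = op(val, smem[tid +  4]);
        smem[tid] = val = op(val, smem[tid +  2]);
        smem[tid] = val = op(val, smem[tid +  1]);
    }
}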
......@@ -43,8 +43,8 @@
#ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
#define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
namespace cv { namespace gpu { namespace device
{
......@@ -72,64 +72,53 @@ namespace cv { namespace gpu { namespace device
return -last <= mini && maxi <= 2 * last;
}
private:
int last;
};
template <typename D>
struct BrdRowReflect101: BrdReflect101
template <typename D> struct BrdRowReflect101 : BrdReflect101
{
explicit BrdRowReflect101(int len): BrdReflect101(len) {}
template <typename T>
__device__ __forceinline__ D at_low(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i)]);
}
template <typename T>
__device__ __forceinline__ D at_high(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i)]);
}
};
template <typename D>
struct BrdColReflect101: BrdReflect101
template <typename D> struct BrdColReflect101 : BrdReflect101
{
BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}
template <typename T>
__device__ __forceinline__ D at_low(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_low(i)*step));
}
template <typename T>
__device__ __forceinline__ D at_high(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
}
private:
int step;
};
struct BrdReplicate
{
explicit BrdReplicate(int len): last(len - 1) {}
__device__ __forceinline__ int idx_low(int i) const
{
return max(i, 0);
return ::max(i, 0);
}
__device__ __forceinline__ int idx_high(int i) const
{
return min(i, last);
return ::min(i, last);
}
__device__ __forceinline__ int idx(int i) const
......@@ -142,64 +131,52 @@ namespace cv { namespace gpu { namespace device
return true;
}
private:
int last;
};
template <typename D>
struct BrdRowReplicate: BrdReplicate
template <typename D> struct BrdRowReplicate : BrdReplicate
{
explicit BrdRowReplicate(int len): BrdReplicate(len) {}
template <typename T>
__device__ __forceinline__ D at_low(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i)]);
}
template <typename T>
__device__ __forceinline__ D at_high(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i)]);
}
};
template <typename D>
struct BrdColReplicate: BrdReplicate
template <typename D> struct BrdColReplicate : BrdReplicate
{
BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}
template <typename T>
__device__ __forceinline__ D at_low(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_low(i)*step));
}
template <typename T>
__device__ __forceinline__ D at_high(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
}
private:
int step;
};
template <typename D>
struct BrdRowConstant
template <typename D> struct BrdRowConstant
{
explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}
template <typename T>
__device__ __forceinline__ D at_low(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
{
return i >= 0 ? saturate_cast<D>(data[i]) : val;
}
template <typename T>
__device__ __forceinline__ D at_high(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const
{
return i < len ? saturate_cast<D>(data[i]) : val;
}
......@@ -209,24 +186,20 @@ namespace cv { namespace gpu { namespace device
return true;
}
private:
int len;
D val;
};
template <typename D>
struct BrdColConstant
template <typename D> struct BrdColConstant
{
BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
template <typename T>
__device__ __forceinline__ D at_low(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
{
return i >= 0 ? saturate_cast<D>(*(const D*)((const char*)data + i*step)) : val;
}
template <typename T>
__device__ __forceinline__ D at_high(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const
{
return i < len ? saturate_cast<D>(*(const D*)((const char*)data + i*step)) : val;
}
......@@ -236,15 +209,12 @@ namespace cv { namespace gpu { namespace device
return true;
}
private:
int len;
int step;
D val;
};
template <typename OutT>
struct BrdConstant
template <typename OutT> struct BrdConstant
{
BrdConstant(int w, int h, const OutT &val = VecTraits<OutT>::all(0)) : w(w), h(h), val(val) {}
......@@ -255,11 +225,9 @@ namespace cv { namespace gpu { namespace device
return val;
}
private:
int w, h;
OutT val;
};
}}}
#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
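The border policies above keep their interface (idx_low/idx_high plus at_low/at_high per destination type D); the refactoring mainly flattens the formatting, switches to the split saturate_cast.hpp/vec_traits.hpp includes, and qualifies min/max with ::. A small illustrative kernel using BrdRowReplicate — the kernel, its name, and the padding scheme are hypothetical; only the policy class and its members come from the header above:

#include "opencv2/gpu/device/border_interpolate.hpp"

using namespace cv::gpu::device;

// Read a row of `cols` floats with `pad` extra samples on each side, replicating
// the edge values. dst must hold cols + 2 * pad floats.
__global__ void sampleWithReplicateBorder(const float* row, int cols, float* dst, int pad)
{
    const BrdRowReplicate<float> brd(cols);

    const int x = blockIdx.x * blockDim.x + threadIdx.x - pad;   // may be negative or >= cols

    if (x < cols + pad)
        dst[x + pad] = (x < 0) ? brd.at_low(x, row)    // clamps to row[0]
                               : brd.at_high(x, row);  // clamps to row[cols - 1]
}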
This diff is collapsed.
......@@ -44,6 +44,7 @@
#define __OPENCV_GPU_DATAMOV_UTILS_HPP__
#include "internal_shared.hpp"
#include "utility.hpp"
namespace cv { namespace gpu { namespace device
{
......@@ -57,47 +58,38 @@ namespace cv { namespace gpu { namespace device
#else // __CUDA_ARCH__ >= 200
#if defined(_WIN64) || defined(__LP64__)
// 64-bit register modifier for inlined asm
#define _OPENCV_ASM_PTR_ "l"
#else
// 32-bit register modifier for inlined asm
#define _OPENCV_ASM_PTR_ "r"
#endif
template<class T> struct ForceGlob;
#define DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \
#define OPENCV_GPU_DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \
template <> struct ForceGlob<base_type> \
{ \
__device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
{ \
asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : _OPENCV_ASM_PTR_(ptr + offset)); \
asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
} \
};
#define DEFINE_FORCE_GLOB_B(base_type, ptx_type) \
#define OPENCV_GPU_DEFINE_FORCE_GLOB_B(base_type, ptx_type) \
template <> struct ForceGlob<base_type> \
{ \
__device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
{ \
asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : _OPENCV_ASM_PTR_(ptr + offset)); \
asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
} \
};
DEFINE_FORCE_GLOB_B(uchar, u8)
DEFINE_FORCE_GLOB_B(schar, s8)
DEFINE_FORCE_GLOB_B(char, b8)
DEFINE_FORCE_GLOB (ushort, u16, h)
DEFINE_FORCE_GLOB (short, s16, h)
DEFINE_FORCE_GLOB (uint, u32, r)
DEFINE_FORCE_GLOB (int, s32, r)
DEFINE_FORCE_GLOB (float, f32, f)
DEFINE_FORCE_GLOB (double, f64, d)
OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar, u8)
OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar, s8)
OPENCV_GPU_DEFINE_FORCE_GLOB_B(char, b8)
OPENCV_GPU_DEFINE_FORCE_GLOB (ushort, u16, h)
OPENCV_GPU_DEFINE_FORCE_GLOB (short, s16, h)
OPENCV_GPU_DEFINE_FORCE_GLOB (uint, u32, r)
OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r)
OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f)
OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d)
#undef DEFINE_FORCE_GLOB
#undef DEFINE_FORCE_GLOB_B
#undef _OPENCV_ASM_PTR_
#undef OPENCV_GPU_DEFINE_FORCE_GLOB
#undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
#endif // __CUDA_ARCH__ >= 200
}}}
......
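The datamov_utils changes rename the internal macros (the register-modifier macro now comes from utility.hpp as OPENCV_GPU_ASM_PTR) and the DEFINE_FORCE_GLOB generators; ForceGlob itself keeps its interface. An illustrative use of that interface — the kernel is hypothetical, and the exact code the Load call compiles to depends on the #if branch above (the other branch is collapsed in this view):

#include "opencv2/gpu/device/datamov_utils.hpp"

using namespace cv::gpu::device;

// Copy through ForceGlob<float>::Load, which reads src[i] into v
// (via inline PTX ld.global.f32 in the branch shown above).
__global__ void copyThroughForceGlob(const float* src, float* dst, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        float v;
        ForceGlob<float>::Load(src, i, v);
        dst[i] = v;
    }
}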
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -45,12 +45,8 @@
#include "internal_shared.hpp"
namespace cv
namespace cv { namespace gpu { namespace device
{
namespace gpu
{
namespace device
{
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
......@@ -165,8 +161,6 @@ namespace cv
return saturate_cast<uint>((float)v);
#endif
}
}
}
}
}}}
#endif /* __OPENCV_GPU_SATURATE_CAST_HPP__ */
\ No newline at end of file
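The saturate_cast change is purely structural: the nested namespace cv { namespace gpu { namespace device blocks collapse into the single-line form used elsewhere in the commit, with the templates themselves untouched. An illustrative caller, assuming it is compiled alongside the module's other CUDA sources so the header's internal includes resolve (the kernel name is hypothetical; the semantics, rounding and clamping to the destination range, are those of the existing header):

#include "opencv2/gpu/device/saturate_cast.hpp"

using namespace cv::gpu::device;

// Convert float pixels to 8-bit with saturation: values are rounded and clamped to [0, 255].
__global__ void convertToU8(const float* src, unsigned char* dst, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = saturate_cast<unsigned char>(src[i]);
}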
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -95,9 +95,7 @@ namespace
img_cols(img.cols), img_rows(img.rows),
use_mask(!mask.empty()),
upright(surf.upright)
use_mask(!mask.empty())
{
CV_Assert(!img.empty() && img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
......@@ -224,8 +222,6 @@ namespace
bool use_mask;
bool upright;
int maxCandidates;
int maxFeatures;
......
This diff is collapsed.