Commit e69c6fde authored by Marina Kolpakova

minor formatting changes

parent 7c160cdc
@@ -44,9 +44,9 @@
#include <algorithm>
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
namespace canny
{
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{
@@ -99,7 +99,7 @@ namespace cv { namespace gpu { namespace device
}
};
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
__shared__ int sdx[18][16];
@@ -175,7 +175,7 @@ namespace cv { namespace gpu { namespace device
}
//////////////////////////////////////////////////////////////////////////////////////////
#define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
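
Note: TG22 is tan(22.5°) ≈ 0.4142 encoded in Q15 fixed point (CANNY_SHIFT = 15 fractional bits), so gradient directions can be quantized with integer multiplies instead of floating-point trigonometry. A minimal sketch of the comparison this enables (illustrative, not taken from the diff):

    // Is |dy|/|dx| below tan(22.5 deg)? Compare in fixed point to avoid a division:
    // dy / dx < TG22 / 2^CANNY_SHIFT  <=>  dy << CANNY_SHIFT < dx * TG22
    __device__ bool nearlyHorizontal(int dx, int dy) // assumes dx, dy >= 0
    {
        return ((long long)dy << CANNY_SHIFT) < (long long)dx * TG22;
    }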
@@ -236,7 +236,7 @@ namespace cv { namespace gpu { namespace device
edge_type = 1 + (int)(m > high_thresh);
}
}
map.ptr(i + 1)[j + 1] = edge_type;
}
}
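
The edge map uses a three-value encoding: 0 for suppressed pixels, 1 for weak candidates between the two thresholds, and 2 for strong edges above high_thresh, which is what `edge_type = 1 + (int)(m > high_thresh)` produces once the magnitude has passed the low threshold. The hysteresis kernels below then promote weak pixels that touch a strong one. Hypothetical labels for these values (not named in the source):

    enum EdgeLabel { SUPPRESSED = 0, WEAK = 1, STRONG = 2 };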
@@ -270,7 +270,7 @@ namespace cv { namespace gpu { namespace device
const int tid = threadIdx.y * 16 + threadIdx.x;
const int lx = tid % 18;
const int ly = tid / 18;
if (ly < 14)
smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
@@ -294,10 +294,10 @@ namespace cv { namespace gpu { namespace device
n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
@@ -318,10 +318,10 @@ namespace cv { namespace gpu { namespace device
n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
@@ -361,7 +361,7 @@ namespace cv { namespace gpu { namespace device
#if __CUDA_ARCH__ >= 120
const int stack_size = 512;
__shared__ unsigned int s_counter;
__shared__ unsigned int s_ind;
__shared__ ushort2 s_st[stack_size];
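
The `__CUDA_ARCH__ >= 120` guard exists because the block-local stack is managed with atomic operations on shared memory (e.g., atomicAdd on s_counter), which require compute capability 1.2 or higher; on older devices the kernel body compiles away. A sketch of the push pattern, assuming the s_counter/s_st names from the hunk above:

    // each thread reserves one slot on the shared stack
    if (is_new_strong_edge) // hypothetical condition
    {
        const unsigned int ind = atomicAdd(&s_counter, 1);
        if (ind < stack_size)
            s_st[ind] = make_ushort2(pos.x, pos.y);
    }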
@@ -404,11 +404,11 @@ namespace cv { namespace gpu { namespace device
if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];
__syncthreads();
if (threadIdx.x == 0)
s_counter -= portion;
__syncthreads();
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
{
pos.x += c_dx[threadIdx.x & 7];
@@ -452,7 +452,7 @@ namespace cv { namespace gpu { namespace device
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
unsigned int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
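
This readback pattern recurs throughout these files: a `__device__` counter is incremented by the kernel, its address is fetched with cudaGetSymbolAddress, and the final value is copied back to the host to learn how many items were produced. A minimal self-contained sketch of the same idiom (illustrative names):

    __device__ unsigned int counter = 0;

    __global__ void produce() { atomicAdd(&counter, 1); }

    unsigned int readCount()
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
        unsigned int count;
        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
        return count;
    }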
...
@@ -45,7 +45,7 @@
#include <opencv2/gpu/device/color.hpp>
#include <cvt_colot_internal.h>
namespace cv { namespace gpu { namespace device
{
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{
@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace device
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{
...
@@ -48,9 +48,9 @@
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/static_check.hpp"
namespace cv { namespace gpu { namespace device
{
namespace column_filter
{
#define MAX_KERNEL_SIZE 32
@@ -146,7 +146,7 @@ namespace cv { namespace gpu { namespace device
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));
B<T> brd(src.rows);
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
@@ -162,7 +162,7 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);
static const caller_t callers[5][33] =
{
{
0,
@@ -338,9 +338,9 @@ namespace cv { namespace gpu { namespace device
linearColumnFilter_caller<30, T, D, BrdColWrap>,
linearColumnFilter_caller<31, T, D, BrdColWrap>,
linearColumnFilter_caller<32, T, D, BrdColWrap>
}
};
loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);
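
callers is a 5 x 33 table of template instantiations: the first index selects one of the five border modes, the second the kernel aperture, so a runtime (brd_type, ksize) pair is turned into a call of a fully specialized kernel whose KSIZE is a compile-time constant. A reduced sketch of this dispatch pattern, with hypothetical names:

    typedef void (*caller_t)(int width);

    template <int KSIZE> void filter_caller(int width) { /* launch a kernel specialized on KSIZE */ }

    static const caller_t callers_sketch[4] = { 0, filter_caller<1>, filter_caller<2>, filter_caller<3> };

    void dispatch(int ksize, int width) { callers_sketch[ksize](width); }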
...
@@ -43,9 +43,9 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)
{
@@ -58,9 +58,9 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
{
static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
@@ -75,20 +75,20 @@ namespace cv { namespace gpu { namespace device
}
};
template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type vec_type;
typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
static const caller_t callers[5] =
{
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call,
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
};
callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);
...
@@ -40,7 +40,7 @@
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/
@@ -48,9 +48,9 @@
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
namespace cv { namespace gpu { namespace device
{
namespace fast
{
__device__ unsigned int g_counter = 0;
@@ -78,14 +78,14 @@ namespace cv { namespace gpu { namespace device
d1 = diffType(v, C[0] & 0xff, th);
d2 = diffType(v, C[2] & 0xff, th);
if ((d1 | d2) == 0)
return;
mask1 |= (d1 & 1) << 0;
mask2 |= ((d1 & 2) >> 1) << 0;
mask1 |= (d2 & 1) << 8;
mask2 |= ((d2 & 2) >> 1) << 8;
@@ -141,7 +141,7 @@ namespace cv { namespace gpu { namespace device
return;*/
mask1 |= (d1 & 1) << 1;
mask2 |= ((d1 & 2) >> 1) << 1;
mask1 |= (d2 & 1) << 9;
mask2 |= ((d2 & 2) >> 1) << 9;
@@ -169,7 +169,7 @@ namespace cv { namespace gpu { namespace device
return;*/
mask1 |= (d1 & 1) << 5;
mask2 |= ((d1 & 2) >> 1) << 5;
mask1 |= (d2 & 1) << 13;
mask2 |= ((d2 & 2) >> 1) << 13;
@@ -191,7 +191,7 @@ namespace cv { namespace gpu { namespace device
// 0 -> not a keypoint
__device__ __forceinline__ bool isKeyPoint(int mask1, int mask2)
{
return (__popc(mask1) > 8 && (c_table[(mask1 >> 3) - 63] & (1 << (mask1 & 7)))) ||
       (__popc(mask2) > 8 && (c_table[(mask2 >> 3) - 63] & (1 << (mask2 & 7))));
}
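
mask1 and mask2 each hold 16 bits, one per circle pixel, marking "darker" and "brighter" pixels respectively. `__popc(mask) > 8` cheaply rejects masks that cannot contain a 9-long contiguous arc, and c_table is a precomputed bitmap (indexed by the high bits, tested with the low 3) answering whether the set bits form at least 9 contiguous positions on the 16-pixel ring. A direct, slower equivalent of that table lookup, shown only for clarity:

    __device__ bool hasArc9(unsigned int mask16)
    {
        const unsigned int m = mask16 | (mask16 << 16);   // wrap the ring
        for (int start = 0; start < 16; ++start)
            if (((m >> start) & 0x1FFu) == 0x1FFu)        // 9 consecutive bits set
                return true;
        return false;
    }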
@@ -212,14 +212,14 @@ namespace cv { namespace gpu { namespace device
calcMask(C, v, mid, mask1, mask2);
int isKp = static_cast<int>(isKeyPoint(mask1, mask2));
min = isKp * (mid + 1) + (isKp ^ 1) * min;
max = (isKp ^ 1) * (mid - 1) + isKp * max;
}
return min - 1;
}
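
This loop is a branch-free bisection for the corner score: the threshold interval [min, max] is halved each iteration, and the arithmetic with isKp (0 or 1) replaces an if/else, moving min up past mid while the pixel still passes the FAST test at threshold mid and moving max below mid otherwise; `return min - 1` yields the largest passing threshold. The same bisection written with an explicit branch, for readability only (not from the source):

    while (min <= max)
    {
        const int mid = (min + max) >> 1;
        calcMask(C, v, mid, mask1, mask2);
        if (isKeyPoint(mask1, mask2)) min = mid + 1;
        else                          max = mid - 1;
    }
    return min - 1;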
template <bool calcScore, class Mask>
__global__ void calcKeypoints(const DevMem2Db img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
{
@@ -243,7 +243,7 @@ namespace cv { namespace gpu { namespace device
C[2] |= static_cast<uint>(img(i - 1, j - 3)) << (3 * 8);
C[1] |= static_cast<uint>(img(i - 1, j + 3)) << 8;
C[3] |= static_cast<uint>(img(i, j - 3));
v = static_cast<int>(img(i, j));
C[1] |= static_cast<uint>(img(i, j + 3));
@@ -313,7 +313,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
@@ -335,14 +335,14 @@ namespace cv { namespace gpu { namespace device
int score = scoreMat(loc.y, loc.x);
bool ismax =
score > scoreMat(loc.y - 1, loc.x - 1) &&
score > scoreMat(loc.y - 1, loc.x    ) &&
score > scoreMat(loc.y - 1, loc.x + 1) &&
score > scoreMat(loc.y    , loc.x - 1) &&
score > scoreMat(loc.y    , loc.x + 1) &&
score > scoreMat(loc.y + 1, loc.x - 1) &&
score > scoreMat(loc.y + 1, loc.x    ) &&
score > scoreMat(loc.y + 1, loc.x + 1);
@@ -375,7 +375,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int new_count;
cudaSafeCall( cudaMemcpy(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
...
@@ -40,7 +40,7 @@
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/
@@ -50,9 +50,9 @@
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
namespace cv { namespace gpu { namespace device
{
namespace gfft
{
texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);
@@ -117,7 +117,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
uint count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(uint), cudaMemcpyDeviceToHost) );
@@ -126,9 +126,9 @@ namespace cv { namespace gpu { namespace device
class EigGreater
{
public:
__device__ __forceinline__ bool operator()(float2 a, float2 b) const
{
return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y);
}
};
...
@@ -45,7 +45,7 @@
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
{
#define UINT_BITS 32U
@@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace device
#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
namespace hist
{
#if (!USE_SMEM_ATOMICS)
@@ -173,7 +173,7 @@ namespace cv { namespace gpu { namespace device
{
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
DevMem2D_<uint>(src),
buf,
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);
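
This launch fills buf with PARTIAL_HISTOGRAM256_COUNT block-private 256-bin histograms (built with shared-memory atomics where USE_SMEM_ATOMICS holds); a second, smaller kernel then has to sum the partials into the final histogram. A sketch of that merge step, with hypothetical names since the merge kernel is outside this hunk:

    // hypothetical merge: one block, 256 threads, one bin per thread
    __global__ void mergeHistogram256_sketch(const unsigned int* partial, unsigned int* hist, int partialCount)
    {
        unsigned int sum = 0;
        for (int i = 0; i < partialCount; ++i)
            sum += partial[i * 256 + threadIdx.x];
        hist[threadIdx.x] = sum;
    }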
...
@@ -42,7 +42,7 @@
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
// Other values are not supported
#define CELL_WIDTH 8
@@ -50,7 +50,7 @@ namespace cv { namespace gpu { namespace device
#define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2
namespace hog
{
__constant__ int cnbins;
__constant__ int cblock_stride_x;
@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
__constant__ int cdescr_width;
/* Returns the nearest upper power of two; works only for
   the typical GPU thread count (per block) values */
int power_2up(unsigned int n)
{
@@ -82,19 +82,19 @@ namespace cv { namespace gpu { namespace device
}
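
power_2up's body is collapsed in this hunk; per its comment it rounds a thread count up to the next power of two. A portable sketch of such a helper (an assumption, not the elided body, which the comment says only handles typical per-block thread counts):

    int power_2up_sketch(unsigned int n)
    {
        unsigned int p = 1;
        while (p < n)
            p <<= 1;        // double until p >= n
        return (int)p;
    }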
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y)
{
cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );
int block_hist_size_2up = power_2up(block_hist_size);
cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );
int descr_width = nblocks_win_x * block_hist_size;
@@ -110,7 +110,7 @@ namespace cv { namespace gpu { namespace device
template <int nblocks> // Number of histogram blocks processed by a single GPU thread block
__global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrElemStepf grad,
const PtrElemStep qangle, float scale, float* block_hists)
{
const int block_x = threadIdx.z;
@@ -125,7 +125,7 @@ namespace cv { namespace gpu { namespace device
float* hists = smem;
float* final_hist = smem + cnbins * 48 * nblocks;
const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x +
                     4 * cell_x + cell_thread_x;
const int offset_y = blockIdx.y * cblock_stride_y + 4 * cell_y;
@@ -135,8 +135,8 @@ namespace cv { namespace gpu { namespace device
// 12 means that 12 pixels affect the block's cell (in one row)
if (cell_thread_x < 12)
{
float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y +
                            cell_x + block_x * CELLS_PER_BLOCK_X) +
                      cell_thread_x;
for (int bin_id = 0; bin_id < cnbins; ++bin_id)
hist[bin_id * 48 * nblocks] = 0.f;
@@ -155,9 +155,9 @@ namespace cv { namespace gpu { namespace device
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
float gaussian = ::expf(-(dist_center_y * dist_center_y +
                          dist_center_x * dist_center_x) * scale);
float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) *
                      (8.f - ::fabs(dist_x + 0.5f)) / 64.f;
hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;
@@ -169,41 +169,41 @@ namespace cv { namespace gpu { namespace device
{
if (cell_thread_x < 6) hist_[0] += hist_[6];
if (cell_thread_x < 3) hist_[0] += hist_[3];
if (cell_thread_x == 0)
final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id]
    = hist_[0] + hist_[1] + hist_[2];
}
}
__syncthreads();
float* block_hist = block_hists + (blockIdx.y * img_block_width +
                                   blockIdx.x * blockDim.z + block_x) *
                                  cblock_hist_size;
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;
if (tid < cblock_hist_size)
block_hist[tid] = final_hist[block_x * cblock_hist_size + tid];
}
void compute_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, const DevMem2Df& grad,
const DevMem2Db& qangle, float sigma, float* block_hists)
{
const int nblocks = 1;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
                      block_stride_x;
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) /
                       block_stride_y;
dim3 grid(divUp(img_block_width, nblocks), img_block_height);
dim3 threads(32, 2, nblocks);
cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>,
                                    cudaFuncCachePreferL1));
// Precompute gaussian spatial window parameter
float scale = 1.f / (2.f * sigma * sigma);
@@ -223,18 +223,18 @@ namespace cv { namespace gpu { namespace device
//
template<int size>
__device__ float reduce_smem(volatile float* smem)
{
unsigned int tid = threadIdx.x;
float sum = smem[tid];
if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }
if (tid < 32)
{
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
@@ -245,54 +245,54 @@ namespace cv { namespace gpu { namespace device
__syncthreads();
sum = smem[0];
return sum;
}
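
reduce_smem is a classic shared-memory tree reduction: the upper half of the active range is folded into the lower half with __syncthreads() between steps, and once only 32 threads remain the remaining folds run without barriers, relying on implicit warp-synchronous execution of this hardware generation (hence the volatile pointer, which keeps the compiler from caching smem values in registers). On later toolkits the warp-level tail is usually written with shuffles instead; a sketch, assuming CUDA 9+ for __shfl_down_sync:

    __device__ float warpReduceSum(float v)
    {
        for (int offset = 16; offset > 0; offset >>= 1)
            v += __shfl_down_sync(0xffffffffu, v, offset);
        return v; // lane 0 holds the warp's sum
    }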
template <int nthreads, // Number of threads which process one block histogram
          int nblocks>  // Number of block histograms processed by one GPU thread block
__global__ void normalize_hists_kernel_many_blocks(const int block_hist_size,
const int img_block_width,
float* block_hists, float threshold)
{
if (blockIdx.x * blockDim.z + threadIdx.z >= img_block_width)
return;
float* hist = block_hists + (blockIdx.y * img_block_width +
                             blockIdx.x * blockDim.z + threadIdx.z) *
                            block_hist_size + threadIdx.x;
__shared__ float sh_squares[nthreads * nblocks];
float* squares = sh_squares + threadIdx.z * nthreads;
float elem = 0.f;
if (threadIdx.x < block_hist_size)
elem = hist[0];
squares[threadIdx.x] = elem * elem;
__syncthreads();
float sum = reduce_smem<nthreads>(squares);
float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
elem = ::min(elem * scale, threshold);
__syncthreads();
squares[threadIdx.x] = elem * elem;
__syncthreads();
sum = reduce_smem<nthreads>(squares);
scale = 1.0f / (::sqrtf(sum) + 1e-3f);
if (threadIdx.x < block_hist_size)
hist[0] = elem * scale;
}
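
The two reductions implement L2-Hys block normalization: the histogram is first L2-normalized (with the 0.1f * block_hist_size bias in the denominator), each component is clipped at threshold, and the clipped vector is L2-normalized again with a small epsilon. A CPU restatement of the same steps (illustrative only):

    #include <math.h>

    void l2hys_sketch(float* h, int n, float threshold)
    {
        float sum = 0.f;
        for (int i = 0; i < n; ++i) sum += h[i] * h[i];
        float scale = 1.f / (sqrtf(sum) + 0.1f * n);          // biased L2 norm
        for (int i = 0; i < n; ++i) h[i] = fminf(h[i] * scale, threshold); // clip
        sum = 0.f;
        for (int i = 0; i < n; ++i) sum += h[i] * h[i];
        scale = 1.f / (sqrtf(sum) + 1e-3f);                   // renormalize
        for (int i = 0; i < n; ++i) h[i] *= scale;
    }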
void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, float* block_hists, float threshold)
{
const int nblocks = 1;
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
@@ -327,19 +327,19 @@ namespace cv { namespace gpu { namespace device
//
template <int nthreads, // Number of threads per one histogram block
          int nblocks>  // Number of histogram blocks processed by a single GPU thread block
__global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, const float* coefs,
float free_coef, float threshold, unsigned char* labels)
{
const int win_x = threadIdx.z;
if (blockIdx.x * blockDim.z + win_x >= img_win_width)
return;
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
                                   blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
                                  cblock_hist_size;
float product = 0.f;
@@ -357,24 +357,24 @@ namespace cv { namespace gpu { namespace device
__syncthreads();
if (nthreads >= 512)
{
if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
__syncthreads();
}
if (nthreads >= 256)
{
if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
__syncthreads();
}
if (nthreads >= 128)
{
if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
__syncthreads();
}
if (threadIdx.x < 32)
{
volatile float* smem = products;
if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
@@ -389,10 +389,10 @@ namespace cv { namespace gpu { namespace device
}
void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
float* coefs, float free_coef, float threshold, unsigned char* labels)
{
const int nthreads = 256;
const int nblocks = 1;
@@ -408,7 +408,7 @@ namespace cv { namespace gpu { namespace device
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, labels);
cudaSafeCall( cudaGetLastError() );
@@ -420,11 +420,11 @@ namespace cv { namespace gpu { namespace device
template <int nthreads>
__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, PtrElemStepf descriptors)
{
// Get left top corner of the window in src
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
                                   blockIdx.x * win_block_stride_x) * cblock_hist_size;
// Get left top corner of the window in dst
@@ -440,7 +440,7 @@ namespace cv { namespace gpu { namespace device
}
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
int height, int width, float* block_hists, DevMem2Df descriptors)
{
const int nthreads = 256;
@@ -462,12 +462,12 @@ namespace cv { namespace gpu { namespace device
template <int nthreads>
__global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x,
const int win_block_stride_y, const float* block_hists,
PtrElemStepf descriptors)
{
// Get left top corner of the window in src
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
                                   blockIdx.x * win_block_stride_x) * cblock_hist_size;
// Get left top corner of the window in dst
@@ -482,14 +482,14 @@ namespace cv { namespace gpu { namespace device
int y = block_idx / cnblocks_win_x;
int x = block_idx - y * cnblocks_win_x;
descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block]
    = hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
}
}
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
DevMem2Df descriptors)
{
const int nthreads = 256;
@@ -514,7 +514,7 @@ namespace cv { namespace gpu { namespace device
template <int nthreads, int correct_gamma>
__global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrElemStep img,
float angle_scale, PtrElemStepf grad, PtrElemStep qangle)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -524,9 +524,9 @@ namespace cv { namespace gpu { namespace device
__shared__ float sh_row[(nthreads + 2) * 3];
uchar4 val;
if (x < width)
val = row[x];
else
val = row[width - 2];
sh_row[threadIdx.x + 1] = val.x;
@@ -563,9 +563,9 @@ namespace cv { namespace gpu { namespace device
float3 dx;
if (correct_gamma)
dx = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
else
dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
float3 dy = make_float3(0.f, 0.f, 0.f);
@@ -588,7 +588,7 @@ namespace cv { namespace gpu { namespace device
float mag0 = dx.x * dx.x + dy.x * dy.x;
float mag1 = dx.y * dx.y + dy.y * dy.y;
if (mag0 < mag1)
{
best_dx = dx.y;
best_dy = dy.y;
@@ -616,7 +616,7 @@ namespace cv { namespace gpu { namespace device
}
void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2Db& img,
float angle_scale, DevMem2Df grad, DevMem2Db qangle, bool correct_gamma)
{
const int nthreads = 256;
@@ -635,7 +635,7 @@ namespace cv { namespace gpu { namespace device
}
template <int nthreads, int correct_gamma>
__global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrElemStep img,
float angle_scale, PtrElemStepf grad, PtrElemStep qangle)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -644,9 +644,9 @@ namespace cv { namespace gpu { namespace device
__shared__ float sh_row[nthreads + 2];
if (x < width)
sh_row[threadIdx.x + 1] = row[x];
else
sh_row[threadIdx.x + 1] = row[width - 2];
if (threadIdx.x == 0)
@@ -688,7 +688,7 @@ namespace cv { namespace gpu { namespace device
}
void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2Db& img,
float angle_scale, DevMem2Df grad, DevMem2Db qangle, bool correct_gamma)
{
const int nthreads = 256;
@@ -729,13 +729,13 @@ namespace cv { namespace gpu { namespace device
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);
dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
}
}
template<class T, class TEX>
static void resize_for_hog(const DevMem2Db& src, DevMem2Db dst, TEX& tex)
{
tex.filterMode = cudaFilterModeLinear;
@@ -743,19 +743,19 @@ namespace cv { namespace gpu { namespace device
size_t texOfs = 0;
int colOfs = 0;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
if (texOfs != 0)
{
colOfs = static_cast<int>( texOfs/sizeof(T) );
cudaSafeCall( cudaUnbindTexture(tex) );
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
}
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
float sx = static_cast<float>(src.cols) / dst.cols;
float sy = static_cast<float>(src.rows) / dst.rows;
@@ -769,5 +769,5 @@ namespace cv { namespace gpu { namespace device
void resize_8UC1(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
void resize_8UC4(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
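
The * 255 in the resize kernel only makes sense if tex2D returns channel values normalized to [0, 1], i.e. if the textures are declared with cudaReadModeNormalizedFloat; presumably resize8UC1_tex and resize8UC4_tex (declared outside this hunk) are set up that way. The assumed form of the elided declaration:

    // assumed, not shown in this diff:
    texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> resize8UC4_tex;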
} // namespace hog
}}} // namespace cv { namespace gpu { namespace device
@@ -970,12 +970,12 @@ namespace cv { namespace gpu { namespace device
#undef IMPLEMENT_FILTER2D_TEX_READER
template <typename T, typename D>
void filter2D_gpu(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
int borderMode, const float* borderValue, cudaStream_t stream)
{
typedef void (*func_t)(const DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
static const func_t funcs[] =
{
Filter2DCaller<T, D, BrdReflect101>::call,
Filter2DCaller<T, D, BrdReplicate>::call,
...
@@ -50,9 +50,9 @@
#include "safe_call.hpp"
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu
{
enum
{
BORDER_REFLECT101_GPU = 0,
BORDER_REPLICATE_GPU,
@@ -60,7 +60,7 @@ namespace cv { namespace gpu
BORDER_REFLECT_GPU,
BORDER_WRAP_GPU
};
// Converts a CPU border extrapolation mode into its GPU internal analogue.
// Returns true if the GPU analogue exists, false otherwise.
bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);
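
A plausible shape for tryConvertToGpuBorderType, mapping the public cv::BORDER_* constants onto the internal enum above and reporting unsupported modes (a sketch; the real definition lives in the corresponding .cpp, and BORDER_CONSTANT_GPU is assumed from the elided enum lines):

    bool tryConvertToGpuBorderType_sketch(int cpuBorderType, int& gpuBorderType)
    {
        switch (cpuBorderType)
        {
        case cv::BORDER_REFLECT_101: gpuBorderType = cv::gpu::BORDER_REFLECT101_GPU; return true;
        case cv::BORDER_REPLICATE:   gpuBorderType = cv::gpu::BORDER_REPLICATE_GPU;  return true;
        case cv::BORDER_CONSTANT:    gpuBorderType = cv::gpu::BORDER_CONSTANT_GPU;   return true;
        case cv::BORDER_REFLECT:     gpuBorderType = cv::gpu::BORDER_REFLECT_GPU;    return true;
        case cv::BORDER_WRAP:        gpuBorderType = cv::gpu::BORDER_WRAP_GPU;       return true;
        default:                     return false;
        }
    }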
......
...@@ -43,9 +43,9 @@ ...@@ -43,9 +43,9 @@
#include "internal_shared.hpp" #include "internal_shared.hpp"
#include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/vec_math.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
namespace match_template namespace match_template
{ {
__device__ __forceinline__ float sum(float v) { return v; } __device__ __forceinline__ float sum(float v) { return v; }
__device__ __forceinline__ float sum(float2 v) { return v.x + v.y; } __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
...@@ -80,7 +80,7 @@ namespace cv { namespace gpu { namespace device ...@@ -80,7 +80,7 @@ namespace cv { namespace gpu { namespace device
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Naive_CCORR // Naive_CCORR
template <typename T, int cn> template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result) __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)
{ {
typedef typename TypeVec<T, cn>::vec_type Type; typedef typename TypeVec<T, cn>::vec_type Type;
...@@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace device ...@@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace device
{ {
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream); typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] = static const caller_t callers[] =
{ {
0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4> 0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
}; };
@@ -135,7 +135,7 @@ namespace cv { namespace gpu { namespace device
    {
        typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
        static const caller_t callers[] =
        {
            0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
        };
@@ -192,7 +192,7 @@ namespace cv { namespace gpu { namespace device
    {
        typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
        static const caller_t callers[] =
        {
            0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
        };
@@ -204,7 +204,7 @@ namespace cv { namespace gpu { namespace device
    {
        typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);
        static const caller_t callers[] =
        {
            0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
        };
@@ -249,7 +249,7 @@ namespace cv { namespace gpu { namespace device
    {
        typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result, cudaStream_t stream);
        static const caller_t callers[] =
        {
            0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
        };
@@ -321,7 +321,7 @@ namespace cv { namespace gpu { namespace device
        DevMem2Df result, int cn, cudaStream_t stream)
    {
        typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result, cudaStream_t stream);
        static const caller_t callers[] =
        {
            0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
        };
@@ -379,16 +379,16 @@ namespace cv { namespace gpu { namespace device
                (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
            float ccorr = result.ptr(y)[x];
            result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                     - image_sum_g_ * templ_sum_scale_g;
        }
    }
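The image_sum_* planes are integral images, which is why each windowed channel sum above costs only four reads. The identity, as a standalone sketch (the helper name is illustrative):

    // Sum of the source over [x, x+w) x [y, y+h), given integral image S where
    // S.ptr(r)[c] holds the sum of all pixels above and to the left of (r, c).
    __device__ float rectSum(const PtrStep<unsigned int> S, int x, int y, int w, int h)
    {
        return (float)(S.ptr(y + h)[x + w] - S.ptr(y)[x + w]
                     - S.ptr(y + h)[x]     + S.ptr(y)[x]);
    }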
    void matchTemplatePrepared_CCOFF_8UC2(
        int w, int h,
        const DevMem2D_<unsigned int> image_sum_r,
        const DevMem2D_<unsigned int> image_sum_g,
        unsigned int templ_sum_r, unsigned int templ_sum_g,
        DevMem2Df result, cudaStream_t stream)
    {
        dim3 threads(32, 8);
@@ -406,7 +406,7 @@ namespace cv { namespace gpu { namespace device
    __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
        int w, int h,
        float templ_sum_scale_r,
        float templ_sum_scale_g,
        float templ_sum_scale_b,
@@ -437,20 +437,20 @@ namespace cv { namespace gpu { namespace device
    }
    void matchTemplatePrepared_CCOFF_8UC3(
        int w, int h,
        const DevMem2D_<unsigned int> image_sum_r,
        const DevMem2D_<unsigned int> image_sum_g,
        const DevMem2D_<unsigned int> image_sum_b,
        unsigned int templ_sum_r,
        unsigned int templ_sum_g,
        unsigned int templ_sum_b,
        DevMem2Df result, cudaStream_t stream)
    {
        dim3 threads(32, 8);
        dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
        matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
            w, h,
            (float)templ_sum_r / (w * h),
            (float)templ_sum_g / (w * h),
            (float)templ_sum_b / (w * h),
@@ -464,8 +464,8 @@ namespace cv { namespace gpu { namespace device
    __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
        int w, int h,
        float templ_sum_scale_r,
        float templ_sum_scale_g,
        float templ_sum_scale_b,
        float templ_sum_scale_a,
@@ -493,7 +493,7 @@ namespace cv { namespace gpu { namespace device
                (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
            float ccorr = result.ptr(y)[x];
            result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                     - image_sum_g_ * templ_sum_scale_g
                                     - image_sum_b_ * templ_sum_scale_b
                                     - image_sum_a_ * templ_sum_scale_a;
@@ -501,24 +501,24 @@ namespace cv { namespace gpu { namespace device
    }
    void matchTemplatePrepared_CCOFF_8UC4(
        int w, int h,
        const DevMem2D_<unsigned int> image_sum_r,
        const DevMem2D_<unsigned int> image_sum_g,
        const DevMem2D_<unsigned int> image_sum_b,
        const DevMem2D_<unsigned int> image_sum_a,
        unsigned int templ_sum_r,
        unsigned int templ_sum_g,
        unsigned int templ_sum_b,
        unsigned int templ_sum_a,
        DevMem2Df result, cudaStream_t stream)
    {
        dim3 threads(32, 8);
        dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
        matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
            w, h,
            (float)templ_sum_r / (w * h),
            (float)templ_sum_g / (w * h),
            (float)templ_sum_b / (w * h),
            (float)templ_sum_a / (w * h),
            image_sum_r, image_sum_g, image_sum_b, image_sum_a,
@@ -533,9 +533,9 @@ namespace cv { namespace gpu { namespace device
    // Prepared_CCOFF_NORMED
    __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
        int w, int h, float weight,
        float templ_sum_scale, float templ_sqsum_scale,
        const PtrStep<unsigned int> image_sum,
        const PtrStep<unsigned long long> image_sqsum,
        DevMem2Df result)
    {
@@ -557,7 +557,7 @@ namespace cv { namespace gpu { namespace device
    }
    void matchTemplatePrepared_CCOFF_NORMED_8U(
        int w, int h, const DevMem2D_<unsigned int> image_sum,
        const DevMem2D_<unsigned long long> image_sqsum,
        unsigned int templ_sum, unsigned long long templ_sqsum,
        DevMem2Df result, cudaStream_t stream)
@@ -570,7 +570,7 @@ namespace cv { namespace gpu { namespace device
        float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;
        matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
            w, h, weight, templ_sum_scale, templ_sqsum_scale,
            image_sum, image_sqsum, result);
        cudaSafeCall( cudaGetLastError() );
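Here weight = 1/(w*h), templ_sum_scale is the template mean, and templ_sqsum_scale is the template's centered sum of squares. The kernel body sits outside this hunk; a sketch of the per-pixel normalization it is expected to apply, assuming the standard CCOEFF_NORMED definition (all names below are illustrative):

    // num   = raw cross-correlation minus the mean term;
    // denom = sqrt(template variance term * image variance term over the window).
    __device__ float ccoeffNormed(float ccorr, float image_sum, float image_sqsum,
                                  float weight, float templ_sum_scale, float templ_sqsum_scale)
    {
        float num   = ccorr - image_sum * templ_sum_scale;
        float denom = ::sqrtf(templ_sqsum_scale * (image_sqsum - weight * image_sum * image_sum));
        return num / denom;
    }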
@@ -581,8 +581,8 @@ namespace cv { namespace gpu { namespace device
    __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
        int w, int h, float weight,
        float templ_sum_scale_r, float templ_sum_scale_g,
        float templ_sqsum_scale,
        const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
        const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
@@ -615,7 +615,7 @@ namespace cv { namespace gpu { namespace device
    }
    void matchTemplatePrepared_CCOFF_NORMED_8UC2(
        int w, int h,
        const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
        const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
        unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
@@ -628,15 +628,15 @@ namespace cv { namespace gpu { namespace device
        float weight = 1.f / (w * h);
        float templ_sum_scale_r = templ_sum_r * weight;
        float templ_sum_scale_g = templ_sum_g * weight;
        float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;
        matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
            w, h, weight,
            templ_sum_scale_r, templ_sum_scale_g,
            templ_sqsum_scale,
            image_sum_r, image_sqsum_r,
            image_sum_g, image_sqsum_g,
            result);
        cudaSafeCall( cudaGetLastError() );
@@ -647,8 +647,8 @@ namespace cv { namespace gpu { namespace device
    __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
        int w, int h, float weight,
        float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
        float templ_sqsum_scale,
        const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
        const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
@@ -690,7 +690,7 @@ namespace cv { namespace gpu { namespace device
    }
    void matchTemplatePrepared_CCOFF_NORMED_8UC3(
        int w, int h,
        const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
        const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
        const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
@@ -706,17 +706,17 @@ namespace cv { namespace gpu { namespace device
        float templ_sum_scale_r = templ_sum_r * weight;
        float templ_sum_scale_g = templ_sum_g * weight;
        float templ_sum_scale_b = templ_sum_b * weight;
        float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;
        matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
            w, h, weight,
            templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
            templ_sqsum_scale,
            image_sum_r, image_sqsum_r,
            image_sum_g, image_sqsum_g,
            image_sum_b, image_sqsum_b,
            result);
        cudaSafeCall( cudaGetLastError() );
@@ -727,8 +727,8 @@ namespace cv { namespace gpu { namespace device
    __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
        int w, int h, float weight,
        float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
        float templ_sum_scale_a, float templ_sqsum_scale,
        const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
        const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
@@ -777,7 +777,7 @@ namespace cv { namespace gpu { namespace device
    }
    void matchTemplatePrepared_CCOFF_NORMED_8UC4(
        int w, int h,
        const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
        const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
        const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
@@ -802,13 +802,13 @@ namespace cv { namespace gpu { namespace device
                                + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;
        matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
            w, h, weight,
            templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
            templ_sqsum_scale,
            image_sum_r, image_sqsum_r,
            image_sum_g, image_sqsum_g,
            image_sum_b, image_sqsum_b,
            image_sum_a, image_sqsum_a,
            result);
        cudaSafeCall( cudaGetLastError() );
@@ -821,7 +821,7 @@ namespace cv { namespace gpu { namespace device
    template <int cn>
    __global__ void normalizeKernel_8U(
        int w, int h, const PtrStep<unsigned long long> image_sqsum,
        unsigned long long templ_sqsum, DevMem2Df result)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -836,7 +836,7 @@ namespace cv { namespace gpu { namespace device
        }
    }
    void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
        unsigned long long templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)
    {
        dim3 threads(32, 8);
...
@@ -42,9 +42,9 @@
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
    namespace mathfunc
    {
        //////////////////////////////////////////////////////////////////////////////////////
        // Cart <-> Polar
@@ -79,7 +79,7 @@ namespace cv { namespace gpu { namespace device
            }
        };
        template <typename Mag, typename Angle>
        __global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
            float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -137,11 +137,11 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(x.cols, threads.x);
            grid.y = divUp(x.rows, threads.y);
            const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
            cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
                x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
                mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
            cudaSafeCall( cudaGetLastError() );
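The Mag and Angle functors supply the per-element math; their definitions fall outside this hunk. A sketch of the full-magnitude case, with scale = 180/CV_PI converting to degrees as set up above (the real Angle functor may additionally normalize the angle range):

    __device__ void cartToPolarOne(float x, float y, float scale, float& mag, float& angle)
    {
        mag   = ::sqrtf(x * x + y * y);
        angle = ::atan2f(y, x) * scale;
    }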
@@ -152,7 +152,7 @@ namespace cv { namespace gpu { namespace device
        void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
        {
            typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
            static const caller_t callers[2][2][2] =
            {
                {
                    {
@@ -187,10 +187,10 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(mag.cols, threads.x);
            grid.y = divUp(mag.rows, threads.y);
            const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
            polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
                angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
            cudaSafeCall( cudaGetLastError() );
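The inverse mapping is simpler; with scale = CV_PI/180 the degree input is converted back to radians before the trig calls (a sketch, mirroring the launch above):

    __device__ void polarToCartOne(float mag, float angle, float scale, float& x, float& y)
    {
        float a = angle * scale;
        x = mag * ::cosf(a);
        y = mag * ::sinf(a);
    }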
@@ -201,7 +201,7 @@ namespace cv { namespace gpu { namespace device
        void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
        {
            typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
            static const caller_t callers[2] =
            {
                polarToCart_caller<NonEmptyMag>,
                polarToCart_caller<EmptyMag>
...
@@ -45,9 +45,9 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
namespace cv { namespace gpu { namespace device
{
    namespace matrix_reductions
    {
        // Performs reduction in shared memory
        template <int size, typename T>
@@ -74,19 +74,19 @@ namespace cv { namespace gpu { namespace device
        {
            explicit Mask8U(PtrStepb mask): mask(mask) {}
            __device__ __forceinline__ bool operator()(int y, int x) const
            {
                return mask.ptr(y)[x];
            }
            PtrStepb mask;
        };
        struct MaskTrue
        {
            __device__ __forceinline__ bool operator()(int y, int x) const
            {
                return true;
            }
            __device__ __forceinline__ MaskTrue(){}
            __device__ __forceinline__ MaskTrue(const MaskTrue& mask_){}
@@ -95,7 +95,7 @@ namespace cv { namespace gpu { namespace device
        //////////////////////////////////////////////////////////////////////////////
        // Min max
        // To avoid shared bank conflicts we convert each value into value of
        // appropriate type (32 bits minimum)
        template <typename T> struct MinMaxTypeTraits {};
        template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
@@ -106,7 +106,7 @@ namespace cv { namespace gpu { namespace device
        template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
        template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
        namespace minmax
        {
            __constant__ int ctwidth;
            __constant__ int ctheight;
@@ -131,19 +131,19 @@ namespace cv { namespace gpu { namespace device
            {
                dim3 threads, grid;
                estimateThreadCfg(cols, rows, threads, grid);
                bufcols = grid.x * grid.y * elem_size;
                bufrows = 2;
            }
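So the buffer holds one elem_size slot per block, in two rows: row 0 for per-block minima, row 1 for per-block maxima. The callers below carve it up accordingly; in outline:

    // How the two rows requested above are used (illustration):
    T* minval_buf = (T*)buf.ptr(0);   // grid.x * grid.y partial minima
    T* maxval_buf = (T*)buf.ptr(1);   // grid.x * grid.y partial maxima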
            // Estimates device constants which are used in the kernels using specified thread configuration
            void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
            {
                int twidth = divUp(divUp(cols, grid.x), threads.x);
                int theight = divUp(divUp(rows, grid.y), threads.y);
                cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
                cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
            }
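ctwidth and ctheight are therefore the number of columns and rows each thread must walk in its strided tile. divUp is OpenCV's ceiling division helper from the shared GPU headers:

    // divUp(total, grain) == ceil(total / grain) for positive integers.
    static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }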
            // Does min and max in shared memory
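The body of findMinMaxInSmem falls outside this hunk; a minimal sketch of the tree reduction it performs over the per-thread partials, assuming nthreads is a power of two (the real version is unrolled, like findMinMaxLocInSmem further below):

    template <int nthreads, typename T>
    __device__ void findMinMaxInSmemSketch(volatile T* minval, volatile T* maxval, uint tid)
    {
        for (uint s = nthreads / 2; s > 0; s >>= 1)
        {
            if (tid < s)
            {
                minval[tid] = ::min((T)minval[tid], (T)minval[tid + s]);
                maxval[tid] = ::max((T)maxval[tid], (T)maxval[tid + s]);
            }
            __syncthreads();
        }
    }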
@@ -195,10 +195,10 @@ namespace cv { namespace gpu { namespace device
                for (uint x = x0; x < x_end; x += blockDim.x)
                {
                    T val = src_row[x];
                    if (mask(y, x))
                    {
                        mymin = ::min(mymin, val);
                        mymax = ::max(mymax, val);
                    }
                }
            }
@@ -209,7 +209,7 @@ namespace cv { namespace gpu { namespace device
            findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
            if (tid == 0)
            {
                minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
                maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
@@ -240,7 +240,7 @@ namespace cv { namespace gpu { namespace device
            findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
            if (tid == 0)
            {
                minval[0] = (T)sminval[0];
                maxval[0] = (T)smaxval[0];
@@ -248,7 +248,7 @@ namespace cv { namespace gpu { namespace device
                }
            }
        #else
            if (tid == 0)
            {
                minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
                maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
@@ -256,7 +256,7 @@ namespace cv { namespace gpu { namespace device
        #endif
        }
        template <typename T>
        void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
        {
@@ -277,7 +277,7 @@ namespace cv { namespace gpu { namespace device
            cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
            *minval = minval_;
            *maxval = maxval_;
        }
        template void minMaxMaskCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
        template void minMaxMaskCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
@@ -308,7 +308,7 @@ namespace cv { namespace gpu { namespace device
            cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
            *minval = minval_;
            *maxval = maxval_;
        }
        template void minMaxCaller<uchar>(const DevMem2Db, double*, double*, PtrStepb);
        template void minMaxCaller<char>(const DevMem2Db, double*, double*, PtrStepb);
@@ -325,7 +325,7 @@ namespace cv { namespace gpu { namespace device
            typedef typename MinMaxTypeTraits<T>::best_type best_type;
            __shared__ best_type sminval[nthreads];
            __shared__ best_type smaxval[nthreads];
            uint tid = threadIdx.y * blockDim.x + threadIdx.x;
            uint idx = ::min(tid, size - 1);
@@ -335,7 +335,7 @@ namespace cv { namespace gpu { namespace device
            findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
            if (tid == 0)
            {
                minval[0] = (T)sminval[0];
                maxval[0] = (T)smaxval[0];
@@ -410,7 +410,7 @@ namespace cv { namespace gpu { namespace device
        ///////////////////////////////////////////////////////////////////////////////
        // minMaxLoc
        namespace minmaxloc
        {
            __constant__ int ctwidth;
            __constant__ int ctheight;
@@ -431,7 +431,7 @@ namespace cv { namespace gpu { namespace device
            // Returns required buffer sizes
            void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
                int& b1rows, int& b2cols, int& b2rows)
            {
                dim3 threads, grid;
@@ -445,16 +445,16 @@ namespace cv { namespace gpu { namespace device
            // Estimates device constants which are used in the kernels using specified thread configuration
            void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
            {
                int twidth = divUp(divUp(cols, grid.x), threads.x);
                int theight = divUp(divUp(rows, grid.y), threads.y);
                cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
                cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
            }
            template <typename T>
            __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval,
                volatile uint* minloc, volatile uint* maxloc)
            {
                T val = minval[tid + offset];
@@ -473,7 +473,7 @@ namespace cv { namespace gpu { namespace device
            template <int size, typename T>
            __device__ void findMinMaxLocInSmem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
                volatile uint* maxloc, const uint tid)
            {
                if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
@@ -493,7 +493,7 @@ namespace cv { namespace gpu { namespace device
            template <int nthreads, typename T, typename Mask>
            __global__ void minMaxLocKernel(const DevMem2Db src, Mask mask, T* minval, T* maxval,
                uint* minloc, uint* maxloc)
            {
                typedef typename MinMaxTypeTraits<T>::best_type best_type;
@@ -507,7 +507,7 @@ namespace cv { namespace gpu { namespace device
                uint tid = threadIdx.y * blockDim.x + threadIdx.x;
                T mymin = numeric_limits<T>::max();
                T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();
                uint myminloc = 0;
                uint mymaxloc = 0;
                uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
@@ -527,7 +527,7 @@ namespace cv { namespace gpu { namespace device
                    }
                }
                sminval[tid] = mymin;
                smaxval[tid] = mymax;
                sminloc[tid] = myminloc;
                smaxloc[tid] = mymaxloc;
@@ -564,7 +564,7 @@ namespace cv { namespace gpu { namespace device
                findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
                if (tid == 0)
                {
                    minval[0] = (T)sminval[0];
                    maxval[0] = (T)smaxval[0];
@@ -574,7 +574,7 @@ namespace cv { namespace gpu { namespace device
                    }
                }
            #else
                if (tid == 0)
                {
                    minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
                    maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
@@ -586,7 +586,7 @@ namespace cv { namespace gpu { namespace device
            template <typename T>
            void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
                int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
            {
                dim3 threads, grid;
@@ -598,7 +598,7 @@ namespace cv { namespace gpu { namespace device
                uint* minloc_buf = (uint*)locbuf.ptr(0);
                uint* maxloc_buf = (uint*)locbuf.ptr(1);
                minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
                    minloc_buf, maxloc_buf);
                cudaSafeCall( cudaGetLastError() );
@@ -627,7 +627,7 @@ namespace cv { namespace gpu { namespace device
            template <typename T>
            void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval,
                int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
            {
                dim3 threads, grid;
@@ -639,7 +639,7 @@ namespace cv { namespace gpu { namespace device
                uint* minloc_buf = (uint*)locbuf.ptr(0);
                uint* maxloc_buf = (uint*)locbuf.ptr(1);
                minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
                    minloc_buf, maxloc_buf);
                cudaSafeCall( cudaGetLastError() );
@@ -688,7 +688,7 @@ namespace cv { namespace gpu { namespace device
                findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
                if (tid == 0)
                {
                    minval[0] = (T)sminval[0];
                    maxval[0] = (T)smaxval[0];
@@ -699,7 +699,7 @@ namespace cv { namespace gpu { namespace device
            template <typename T>
            void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
                int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
            {
                dim3 threads, grid;
@@ -711,7 +711,7 @@ namespace cv { namespace gpu { namespace device
                uint* minloc_buf = (uint*)locbuf.ptr(0);
                uint* maxloc_buf = (uint*)locbuf.ptr(1);
                minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
                    minloc_buf, maxloc_buf);
                cudaSafeCall( cudaGetLastError() );
                minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
@@ -741,7 +741,7 @@ namespace cv { namespace gpu { namespace device
            template <typename T>
            void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval,
                int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
            {
                dim3 threads, grid;
@@ -753,7 +753,7 @@ namespace cv { namespace gpu { namespace device
                uint* minloc_buf = (uint*)locbuf.ptr(0);
                uint* maxloc_buf = (uint*)locbuf.ptr(1);
                minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
                    minloc_buf, maxloc_buf);
                cudaSafeCall( cudaGetLastError() );
                minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
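Pass 1 leaves one (value, linear index) candidate per block; the second launch reduces those grid.x * grid.y candidates in a single 256-thread block, after which the host copies element 0 of each buffer back and presumably splits the linear index into (x, y). In outline:

    // Two-pass min/max-with-location, as launched above (outline only):
    //   pass 1: minMaxLocKernel                 -> per-block partials in the val/loc buffers
    //   pass 2: minMaxLocPass2Kernel<<<1,256>>> -> partials reduced into element 0
    //   host:   copy element 0 back; loc = (idx % cols, idx / cols)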
@@ -785,7 +785,7 @@ namespace cv { namespace gpu { namespace device
        //////////////////////////////////////////////////////////////////////////////////////////////////////////
        // countNonZero
        namespace countnonzero
        {
            __constant__ int ctwidth;
            __constant__ int ctheight;
@@ -811,11 +811,11 @@ namespace cv { namespace gpu { namespace device
            void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
            {
                int twidth = divUp(divUp(cols, grid.x), threads.x);
                int theight = divUp(divUp(rows, grid.y), threads.y);
                cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
                cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
            }
@@ -862,7 +862,7 @@ namespace cv { namespace gpu { namespace device
            sumInSmem<nthreads, uint>(scount, tid);
            if (tid == 0)
            {
                count[0] = scount[0];
                blocks_finished = 0;
@@ -873,7 +873,7 @@ namespace cv { namespace gpu { namespace device
        #endif
        }
        template <typename T>
        int countNonZeroCaller(const DevMem2Db src, PtrStepb buf)
        {
@@ -890,9 +890,9 @@ namespace cv { namespace gpu { namespace device
            uint count;
            cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
            return count;
        }
        template int countNonZeroCaller<uchar>(const DevMem2Db, PtrStepb);
        template int countNonZeroCaller<char>(const DevMem2Db, PtrStepb);
@@ -914,7 +914,7 @@ namespace cv { namespace gpu { namespace device
            sumInSmem<nthreads, uint>(scount, tid);
            if (tid == 0)
                count[0] = scount[0];
        }
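The per-thread accumulation feeding these reductions is elided by the diff context; a sketch of the predicate sum each thread computes over its strided tile, following the loop pattern of minMaxKernel above (the bounds names are assumptions):

    uint mycount = 0;
    for (uint y = y0; y < y_end; y += blockDim.y)
    {
        const T* src_row = (const T*)src.ptr(y);
        for (uint x = x0; x < x_end; x += blockDim.x)
            mycount += (src_row[x] != 0);
    }
    scount[tid] = mycount;   // then reduced with sumInSmem as shown above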
@@ -937,9 +937,9 @@ namespace cv { namespace gpu { namespace device
            uint count;
            cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
            return count;
        }
        template int countNonZeroMultipassCaller<uchar>(const DevMem2Db, PtrStepb);
        template int countNonZeroMultipassCaller<char>(const DevMem2Db, PtrStepb);
@@ -965,16 +965,16 @@ namespace cv { namespace gpu { namespace device
        template <> struct SumType<float> { typedef float R; };
        template <> struct SumType<double> { typedef double R; };
        template <typename R>
        struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } };
        template <typename R>
        struct AbsOp { static __device__ __forceinline__ R call(R x) { return ::abs(x); } };
        template <>
        struct AbsOp<uint> { static __device__ __forceinline__ uint call(uint x) { return x; } };
        template <typename R>
        struct SqrOp { static __device__ __forceinline__ R call(R x) { return x * x; } };
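These functors are what turn one accumulation kernel into sum, absSum, and sqrSum; AbsOp<uint> is specialized to the identity since unsigned values need no absolute value. Sketched use inside the (elided) accumulation loop:

    // R comes from SumType<T>; Op is IdentityOp, AbsOp, or SqrOp.
    template <typename T, typename R, typename Op>
    __device__ R accumulateRow(const T* row, int begin, int end, int stride)
    {
        R sum = 0;
        for (int x = begin; x < end; x += stride)
            sum += Op::call((R)row[x]);
        return sum;
    }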
        __constant__ int ctwidth;
@@ -987,7 +987,7 @@ namespace cv { namespace gpu { namespace device
        void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
        {
            threads = dim3(threads_x, threads_y);
            grid = dim3(divUp(cols, threads.x * threads.y),
                        divUp(rows, threads.y * threads.x));
            grid.x = std::min(grid.x, threads.x);
            grid.y = std::min(grid.y, threads.y);
@@ -1004,11 +1004,11 @@ namespace cv { namespace gpu { namespace device
        void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
        {
            int twidth = divUp(divUp(cols, grid.x), threads.x);
            int theight = divUp(divUp(rows, grid.y), threads.y);
            cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
            cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
        }
        template <typename T, typename R, typename Op, int nthreads>
@@ -1055,7 +1055,7 @@ namespace cv { namespace gpu { namespace device
            sumInSmem<nthreads, R>(smem, tid);
            if (tid == 0)
            {
                result[0] = smem[0];
                blocks_finished = 0;
@@ -1078,7 +1078,7 @@ namespace cv { namespace gpu { namespace device
            sumInSmem<nthreads, R>(smem, tid);
            if (tid == 0)
                result[0] = smem[0];
        }
@@ -1142,7 +1142,7 @@ namespace cv { namespace gpu { namespace device
            sumInSmem<nthreads, R>(smem, tid);
            sumInSmem<nthreads, R>(smem + nthreads, tid);
            if (tid == 0)
            {
                res.x = smem[0];
                res.y = smem[nthreads];
@@ -1151,7 +1151,7 @@ namespace cv { namespace gpu { namespace device
                }
            }
        #else
            if (tid == 0)
            {
                DstType res;
                res.x = smem[0];
@@ -1179,7 +1179,7 @@ namespace cv { namespace gpu { namespace device
            sumInSmem<nthreads, R>(smem, tid);
            sumInSmem<nthreads, R>(smem + nthreads, tid);
            if (tid == 0)
            {
                res.x = smem[0];
                res.y = smem[nthreads];
@@ -1252,7 +1252,7 @@ namespace cv { namespace gpu { namespace device
            sumInSmem<nthreads, R>(smem + nthreads, tid);
            sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
            if (tid == 0)
            {
                res.x = smem[0];
                res.y = smem[nthreads];
@@ -1262,7 +1262,7 @@ namespace cv { namespace gpu { namespace device
                }
            }
        #else
            if (tid == 0)
            {
                DstType res;
                res.x = smem[0];
@@ -1293,7 +1293,7 @@ namespace cv { namespace gpu { namespace device
            sumInSmem<nthreads, R>(smem + nthreads, tid);
            sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
            if (tid == 0)
            {
                res.x = smem[0];
                res.y = smem[nthreads];
@@ -1323,7 +1323,7 @@ namespace cv { namespace gpu { namespace device
            for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
            {
                val = ptr[x0 + x * blockDim.x];
                sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y),
                    Op::call(val.z), Op::call(val.w));
            }
        }
@@ -1372,7 +1372,7 @@ namespace cv { namespace gpu { namespace device
            sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
            sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
            if (tid == 0)
            {
                res.x = smem[0];
                res.y = smem[nthreads];
@@ -1383,7 +1383,7 @@ namespace cv { namespace gpu { namespace device
                }
            }
        #else
            if (tid == 0)
            {
                DstType res;
                res.x = smem[0];
@@ -1417,7 +1417,7 @@ namespace cv { namespace gpu { namespace device
            sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
            sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
            if (tid == 0)
            {
                res.x = smem[0];
                res.y = smem[nthreads];
@@ -1488,7 +1488,7 @@ namespace cv { namespace gpu { namespace device
            sum[1] = result[1];
            sum[2] = result[2];
            sum[3] = result[3];
        }
        template void sumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
        template void sumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);
@@ -1537,7 +1537,7 @@ namespace cv { namespace gpu { namespace device
            sum[1] = result[1];
            sum[2] = result[2];
            sum[3] = result[3];
        }
        template void sumCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
        template void sumCaller<char>(const DevMem2Db, PtrStepb, double*, int);
@@ -1608,7 +1608,7 @@ namespace cv { namespace gpu { namespace device
            sum[1] = result[1];
            sum[2] = result[2];
            sum[3] = result[3];
        }
        template void absSumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
        template void absSumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);
@@ -1728,7 +1728,7 @@ namespace cv { namespace gpu { namespace device
            sum[1] = result[1];
            sum[2] = result[2];
            sum[3] = result[3];
        }
        template void sqrSumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
        template void sqrSumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);
@@ -1894,7 +1894,7 @@ namespace cv { namespace gpu { namespace device
        {
            for (int y = threadIdx.y; y < src.rows; y += 16)
                myVal = op(myVal, src.ptr(y)[x]);
        }
        smem[threadIdx.x * 16 + threadIdx.y] = myVal;
        __syncthreads();
@@ -1931,11 +1931,11 @@ namespace cv { namespace gpu { namespace device
        {
            typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
            static const caller_t callers[] =
            {
                reduceRows_caller<SumReductor, T, S, D>,
                reduceRows_caller<AvgReductor, T, S, D>,
                reduceRows_caller<MaxReductor, T, S, D>,
                reduceRows_caller<MinReductor, T, S, D>
            };
...@@ -1944,15 +1944,15 @@ namespace cv { namespace gpu { namespace device ...@@ -1944,15 +1944,15 @@ namespace cv { namespace gpu { namespace device
template void reduceRows_gpu<uchar, int, uchar>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<uchar, int, uchar>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<uchar, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<uchar, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<ushort, int, ushort>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<ushort, int, ushort>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<ushort, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<ushort, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<short, int, short>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<short, int, short>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<short, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<short, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<int, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<int, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<int, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<int, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
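The callers table maps the runtime reduceOp code onto a pre-instantiated launcher, so dispatch is a single indexed call. A hedged usage sketch, assuming reduceOp follows the cv::reduce convention (SUM = 0, AVG = 1, MAX = 2, MIN = 3), which matches the table order above:

    callers[reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);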
...@@ -2068,7 +2068,7 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
static const caller_t callers[4][4] =
{
{reduceCols_caller<1, SumReductor, T, S, D>, reduceCols_caller<1, AvgReductor, T, S, D>, reduceCols_caller<1, MaxReductor, T, S, D>, reduceCols_caller<1, MinReductor, T, S, D>},
{reduceCols_caller<2, SumReductor, T, S, D>, reduceCols_caller<2, AvgReductor, T, S, D>, reduceCols_caller<2, MaxReductor, T, S, D>, reduceCols_caller<2, MinReductor, T, S, D>},
...@@ -2083,15 +2083,15 @@ namespace cv { namespace gpu { namespace device
template void reduceCols_gpu<uchar, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<uchar, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<ushort, int, ushort>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<ushort, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<ushort, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<short, int, short>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<short, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<short, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<int, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<int, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<float, float, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
...
...@@ -42,7 +42,7 @@
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu { namespace device
{
namespace optical_flow
{
...@@ -50,7 +50,7 @@ namespace cv { namespace gpu { namespace device
#define NUM_VERTS_PER_ARROW 6
__global__ void NeedleMapAverageKernel(const DevMem2Df u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
{
__shared__ float smem[2 * NEEDLE_MAP_SCALE];
volatile float* u_col_sum = smem;
...@@ -70,7 +70,7 @@ namespace cv { namespace gpu { namespace device
}
if (threadIdx.x < 8)
{
// now add the column sums
const uint X = threadIdx.x;
...@@ -80,8 +80,8 @@ namespace cv { namespace gpu { namespace device
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1];
}
if ((X & 3) == 0) // bits 0 & 1 == 0
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];
}
...@@ -110,7 +110,7 @@ namespace cv { namespace gpu { namespace device
v_avg(blockIdx.y, blockIdx.x) = v_col_sum[0];
}
}
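Written out for all strides, the lane-masked column reduction in this kernel takes the following shape (a sketch under assumptions not fully visible in the hunks: eight active lanes, X = threadIdx.x, and a pairwise sum already done at stride 1):

    if ((X & 1) == 0) { u_col_sum[X] += u_col_sum[X + 1]; v_col_sum[X] += v_col_sum[X + 1]; } // stride 1
    if ((X & 3) == 0) { u_col_sum[X] += u_col_sum[X + 2]; v_col_sum[X] += v_col_sum[X + 2]; } // stride 2
    if ((X & 7) == 0) { u_col_sum[X] += u_col_sum[X + 4]; v_col_sum[X] += v_col_sum[X + 4]; } // stride 4

After the last step lane 0 holds the block total, which is what the u_avg/v_avg writes above read out.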
void NeedleMapAverage_gpu(DevMem2Df u, DevMem2Df v, DevMem2Df u_avg, DevMem2Df v_avg)
{
const dim3 block(NEEDLE_MAP_SCALE);
...
...@@ -40,7 +40,7 @@
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/
...@@ -51,7 +51,7 @@
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
namespace cv { namespace gpu { namespace device
{
namespace orb
{
...@@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
// cull
int cull_gpu(int* loc, float* response, int size, int n_points)
{
thrust::device_ptr<int> loc_ptr(loc);
thrust::device_ptr<float> response_ptr(response);
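The hunk ends before the body completes. A plausible completion, and the usual pattern for a GPU keypoint cull, sorts the locations by descending response with Thrust and keeps the strongest n_points (a sketch, assuming <thrust/sort.h> and <thrust/functional.h> are included):

    thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
    return n_points;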
...@@ -83,10 +83,10 @@ namespace cv { namespace gpu { namespace device
{
const short2 loc = loc_[ptidx];
const int r = blockSize / 2;
const int x0 = loc.x - r;
const int y0 = loc.y - r;
int a = 0, b = 0, c = 0;
for (int ind = threadIdx.x; ind < blockSize * blockSize; ind += blockDim.x)
...@@ -94,12 +94,12 @@ namespace cv { namespace gpu { namespace device
const int i = ind / blockSize;
const int j = ind % blockSize;
int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 +
(img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) +
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i + 1, x0 + j - 1));
int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 +
(img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) +
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i - 1, x0 + j + 1));
a += Ix * Ix;
...@@ -160,7 +160,7 @@ namespace cv { namespace gpu { namespace device
int m_01 = 0, m_10 = 0;
const short2 loc = loc_[ptidx];
// Treat the center line differently, v=0
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u);
...@@ -173,7 +173,7 @@ namespace cv { namespace gpu { namespace device
int v_sum = 0;
int m_sum = 0;
const int d = c_u_max[v];
for (int u = threadIdx.x - d; u <= d; u += blockDim.x)
{
int val_plus = image(loc.y + v, loc.x + u);
...@@ -229,7 +229,7 @@ namespace cv { namespace gpu { namespace device
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 16 * i;
pattern_y += 16 * i;
int t0, t1, val;
...@@ -257,7 +257,7 @@ namespace cv { namespace gpu { namespace device
t0 = GET_VALUE(14); t1 = GET_VALUE(15);
val |= (t0 < t1) << 7;
return val;
}
};
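GET_VALUE is defined outside these hunks. Judging from the calc parameters, it samples the image at pattern point idx rotated by the keypoint orientation (sina, cosa); a hedged reconstruction of such a macro:

    // Assumed definition: rotate the test point by the keypoint angle, then fetch the pixel.
    #define GET_VALUE(idx) \
        img(loc.y + __float2int_rn(pattern_x[idx] * sina + pattern_y[idx] * cosa), \
            loc.x + __float2int_rn(pattern_x[idx] * cosa - pattern_y[idx] * sina))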
...@@ -266,23 +266,23 @@ namespace cv { namespace gpu { namespace device
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 12 * i;
pattern_y += 12 * i;
int t0, t1, t2, val;
t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);
t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;
t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;
return val;
}
};
...@@ -291,9 +291,9 @@ namespace cv { namespace gpu { namespace device
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 16 * i;
pattern_y += 16 * i;
int t0, t1, t2, t3, k, val;
int a, b;
...@@ -304,7 +304,7 @@ namespace cv { namespace gpu { namespace device
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val = k;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
t2 = GET_VALUE(6); t3 = GET_VALUE(7);
a = 0, b = 2;
...@@ -312,7 +312,7 @@ namespace cv { namespace gpu { namespace device
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 2;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
t2 = GET_VALUE(10); t3 = GET_VALUE(11);
a = 0, b = 2;
...@@ -320,7 +320,7 @@ namespace cv { namespace gpu { namespace device
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 4;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
t2 = GET_VALUE(14); t3 = GET_VALUE(15);
a = 0, b = 2;
...@@ -328,7 +328,7 @@ namespace cv { namespace gpu { namespace device
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 6;
return val;
}
};
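Each calc specialization produces one descriptor byte per call: with WTA_K == 2, 16 pattern points give 8 pairwise comparisons of 1 bit each; with WTA_K == 3, 12 points give 4 three-way maxima of 2 bits each; with WTA_K == 4, 16 points give 4 four-way maxima of 2 bits each. That is why the pattern pointer strides above are 16, 12, and 16 respectively.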
...@@ -399,7 +399,7 @@ namespace cv { namespace gpu { namespace device
y[ptidx] = loc.y * scale;
}
}
void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream)
{
dim3 block(256);
...
...@@ -69,7 +69,7 @@ namespace cv { namespace gpu { namespace device
{
static void call(DevMem2D_<T> src, DevMem2Df mapx, DevMem2Df mapy, DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
...@@ -159,7 +159,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
...@@ -188,7 +188,7 @@ namespace cv { namespace gpu { namespace device
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2Df mapx, DevMem2Df mapy,
DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc)
{
if (stream == 0)
...@@ -198,13 +198,13 @@ namespace cv { namespace gpu { namespace device
}
};
template <typename T> void remap_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap,
DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap,
DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc);
static const caller_t callers[3][5] =
{
{
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
...@@ -229,7 +229,7 @@ namespace cv { namespace gpu { namespace device
}
};
callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(srcWhole), xoff, yoff, xmap, ymap,
static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);
}
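A hedged call-site sketch: the first index picks the filter (the table opens with PointFilter, so nearest/linear/cubic are presumably 0/1/2) and the second picks the border mode (BrdReflect101 comes first). The index values below are assumptions, not confirmed by these hunks:

    // e.g. bilinear remap of an 8-bit image with reflect-101 borders:
    remap_gpu<uchar>(src, srcWhole, 0, 0, xmap, ymap, dst, 1 /* linear? */, 0 /* reflect101? */, 0, stream, cc);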
...
...@@ -228,7 +228,7 @@ namespace cv { namespace gpu { namespace device
}
};
template <typename T> void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy,
DevMem2Db dst, int interpolation, cudaStream_t stream)
{
typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, float fx, float fy, DevMem2D_<T> dst, cudaStream_t stream);
...@@ -244,7 +244,7 @@ namespace cv { namespace gpu { namespace device
if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
interpolation = 1;
callers[interpolation](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(srcWhole), xoff, yoff, fx, fy,
static_cast< DevMem2D_<T> >(dst), stream);
}
...
...@@ -43,7 +43,7 @@
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
namespace cv { namespace gpu { namespace device
{
namespace video_encoding
{
...@@ -159,12 +159,12 @@ namespace cv { namespace gpu { namespace device
void YV12_gpu(const DevMem2Db src, int cn, DevMem2Db dst)
{
typedef void (*func_t)(const DevMem2Db src, PtrStepb dst);
static const func_t funcs[] =
{
0, Gray_to_YV12_caller, 0, BGR_to_YV12_caller<3>, BGR_to_YV12_caller<4>
};
funcs[cn](src, dst);
}
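For reference, YV12 is a planar 4:2:0 layout: a full-resolution Y plane followed by quarter-resolution V and U planes, which is why a single func_t can write the whole destination surface. A sketch of the plane arithmetic, assuming a tightly packed buffer of width cols (dst_data is illustrative; the real kernels index through the PtrStepb step instead):

    // rows x cols luma, then two (rows/2) x (cols/2) chroma planes:
    uchar* y_plane = dst_data;
    uchar* v_plane = y_plane + rows * cols;
    uchar* u_plane = v_plane + (rows / 2) * (cols / 2);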
}
...
...@@ -48,9 +48,9 @@
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/static_check.hpp"
namespace cv { namespace gpu { namespace device
{
namespace row_filter
{
#define MAX_KERNEL_SIZE 32
...@@ -79,7 +79,7 @@ namespace cv { namespace gpu { namespace device
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
__shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
if (y >= src.rows)
...@@ -161,7 +161,7 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);
static const caller_t callers[5][33] =
{
{
0,
...@@ -337,9 +337,9 @@ namespace cv { namespace gpu { namespace device
linearRowFilter_caller<30, T, D, BrdRowWrap>,
linearRowFilter_caller<31, T, D, BrdRowWrap>,
linearRowFilter_caller<32, T, D, BrdRowWrap>
}
};
loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);
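The callers table is indexed [border type][kernel size]: five border modes by kernel sizes up to MAX_KERNEL_SIZE (32), with index 0 left as a null placeholder since a 0-tap kernel is meaningless. loadKernel presumably copies the filter taps into constant memory before the indexed launch; only the ksize values instantiated in the table are reachable.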
...
...@@ -60,7 +60,7 @@
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__)
#endif
namespace cv { namespace gpu
{
void nppError(int err, const char *file, const int line, const char *func = "");
void ncvError(int err, const char *file, const int line, const char *func = "");
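The cudaSafeCall/cublasSafeCall wrappers used throughout these files share one shape: evaluate the expression, and on failure report the error with file/line context. A minimal sketch of such a checker (the real helpers raise a cv::Exception through cv::gpu::error; the name below is illustrative):

    #include <cstdio>
    #include <cuda_runtime.h>

    static inline void cudaSafeCallSketch(cudaError_t err, const char* file, int line)
    {
        if (err != cudaSuccess)
            fprintf(stderr, "CUDA error: %s at %s:%d\n", cudaGetErrorString(err), file, line);
    }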
...
...@@ -42,12 +42,12 @@
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
namespace split_merge
{
template <typename T, size_t elem_size = sizeof(T)>
struct TypeTraits
{
typedef T type;
typedef T type2;
...@@ -74,7 +74,7 @@ namespace cv { namespace gpu { namespace device
};
template <typename T>
struct TypeTraits<T, 4>
{
typedef int type;
typedef int2 type2;
...@@ -83,7 +83,7 @@ namespace cv { namespace gpu { namespace device
};
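These traits exist so the merge/split kernels can move whole pixels with a single built-in vector load or store of the right width, regardless of the element type: any 4-byte T is shuffled through int/int2/..., any 8-byte T through double/double2. For example, merging two float channels goes through TypeTraits<float>::type2 (int2), so one 8-byte store writes both channels at once instead of two 4-byte stores.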
template <typename T>
struct TypeTraits<T, 8>
{
typedef double type;
typedef double2 type2;
...@@ -95,11 +95,11 @@ namespace cv { namespace gpu { namespace device
typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
//------------------------------------------------------------
// Merge
template <typename T>
__global__ void mergeC2_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type2 dst_type;
...@@ -111,8 +111,8 @@ namespace cv { namespace gpu { namespace device
const T* src1_y = (const T*)(src1 + y * src1_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
...@@ -122,9 +122,9 @@ namespace cv { namespace gpu { namespace device
template <typename T>
__global__ void mergeC3_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type3 dst_type;
...@@ -137,8 +137,8 @@ namespace cv { namespace gpu { namespace device
const T* src2_y = (const T*)(src2 + y * src2_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
...@@ -149,9 +149,9 @@ namespace cv { namespace gpu { namespace device
template <>
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
...@@ -162,8 +162,8 @@ namespace cv { namespace gpu { namespace device
const double* src2_y = (const double*)(src2 + y * src2_step);
double* dst_y = (double*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[3 * x] = src0_y[x];
dst_y[3 * x + 1] = src1_y[x];
dst_y[3 * x + 2] = src2_y[x];
...@@ -172,10 +172,10 @@ namespace cv { namespace gpu { namespace device
template <typename T>
__global__ void mergeC4_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type4 dst_type;
...@@ -189,8 +189,8 @@ namespace cv { namespace gpu { namespace device
const T* src3_y = (const T*)(src3 + y * src3_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
...@@ -202,10 +202,10 @@ namespace cv { namespace gpu { namespace device
template <>
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
...@@ -217,8 +217,8 @@ namespace cv { namespace gpu { namespace device
const double* src3_y = (const double*)(src3 + y * src3_step);
double2* dst_y = (double2*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
}
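Note the double specialization writes the four doubles as two double2 stores rather than one double4, presumably because CUDA's built-in double4 only guarantees 16-byte alignment, so a pair of naturally aligned 16-byte double2 writes is the widest safe vector store here.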
...@@ -303,7 +303,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
__global__ void splitC2_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step)
...@@ -317,7 +317,7 @@ namespace cv { namespace gpu { namespace device
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
...@@ -327,7 +327,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
__global__ void splitC3_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
...@@ -343,7 +343,7 @@ namespace cv { namespace gpu { namespace device
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
...@@ -368,7 +368,7 @@ namespace cv { namespace gpu { namespace device
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
if (x < cols && y < rows)
{
dst0_y[x] = src_y[3 * x];
dst1_y[x] = src_y[3 * x + 1];
...@@ -395,7 +395,7 @@ namespace cv { namespace gpu { namespace device
T* dst2_y = (T*)(dst2 + y * dst2_step);
T* dst3_y = (T*)(dst3 + y * dst3_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
...@@ -423,7 +423,7 @@ namespace cv { namespace gpu { namespace device
double* dst2_y = (double*)(dst2 + y * dst2_step);
double* dst3_y = (double*)(dst3 + y * dst3_step);
if (x < cols && y < rows)
{
double2 src_elem1 = src_y[2 * x];
double2 src_elem2 = src_y[2 * x + 1];
...
...@@ -42,9 +42,9 @@
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
namespace stereobm
{
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
...@@ -70,7 +70,7 @@ namespace cv { namespace gpu { namespace device
template<int RADIUS>
__device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
{
unsigned int cache = 0;
unsigned int cache2 = 0;
...@@ -401,8 +401,8 @@ namespace cv { namespace gpu { namespace device
prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForSobel ) );
}
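This launch pattern repeats across the module: check cudaGetLastError right after the asynchronous kernel launch, then call cudaDeviceSynchronize only when the caller passed the default stream (stream == 0). With a user-supplied stream the call stays asynchronous and synchronization is the caller's responsibility.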
...
...@@ -44,9 +44,9 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits.hpp"
namespace cv { namespace gpu { namespace device
{
namespace stereobp
{
///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
...
...@@ -44,9 +44,9 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits.hpp"
namespace cv { namespace gpu { namespace device
{
namespace stereocsbp
{
///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
...@@ -62,7 +62,7 @@ namespace cv { namespace gpu { namespace device
__constant__ int cth;
__constant__ size_t cimg_step;
__constant__ size_t cmsg_step;
__constant__ size_t cdisp_step1;
__constant__ size_t cdisp_step2;
...@@ -392,7 +392,7 @@ namespace cv { namespace gpu { namespace device
get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
else
get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
...@@ -575,7 +575,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );
callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
cudaSafeCall( cudaGetLastError() );
...@@ -588,13 +588,13 @@ namespace cv { namespace gpu { namespace device
template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
///////////////////////////////////////////////////////////////
//////////////////////// init message /////////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
...@@ -691,7 +691,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
...@@ -720,7 +720,7 @@ namespace cv { namespace gpu { namespace device
const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,
float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,
float* data_cost_selected, const float* data_cost, size_t msg_step,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
...@@ -805,7 +805,7 @@ namespace cv { namespace gpu { namespace device
for(int t = 0; t < iters; ++t)
{
compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
...@@ -814,7 +814,7 @@ namespace cv { namespace gpu { namespace device
template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,
int h, int w, int nr_plane, int iters, cudaStream_t stream);
template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step,
int h, int w, int nr_plane, int iters, cudaStream_t stream);
...@@ -879,7 +879,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
...
...@@ -98,7 +98,7 @@ namespace cv { namespace gpu { namespace device
{
dim3 block(32, 8);
dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));
buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);
cudaSafeCall( cudaGetLastError() );
...@@ -158,7 +158,7 @@ namespace cv { namespace gpu { namespace device
{
static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<T> dst, const float* borderValue, int)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
...@@ -256,7 +256,7 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_WARP_TEX
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{
static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc)
{
if (stream == 0)
...@@ -266,7 +266,7 @@ namespace cv { namespace gpu { namespace device
}
};
template <class Transform, typename T>
void warp_caller(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Db dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
...