Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
d661b8e3
Commit
d661b8e3
authored
Oct 31, 2010
by
Anatoly Baksheev
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added PtrStep PtrElemStep structures. Refactored name spaces,
parent
501e81eb
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
385 additions
and
574 deletions
+385
-574
devmem2d.hpp
modules/gpu/include/opencv2/gpu/devmem2d.hpp
+56
-33
gpu.hpp
modules/gpu/include/opencv2/gpu/gpu.hpp
+1
-0
matrix_operations.hpp
modules/gpu/include/opencv2/gpu/matrix_operations.hpp
+1
-0
beliefpropagation.cu
modules/gpu/src/cuda/beliefpropagation.cu
+19
-43
color.cu
modules/gpu/src/cuda/color.cu
+56
-104
constantspacebp.cu
modules/gpu/src/cuda/constantspacebp.cu
+47
-79
cuda_shared.hpp
modules/gpu/src/cuda/cuda_shared.hpp
+2
-2
filters.cu
modules/gpu/src/cuda/filters.cu
+13
-11
imgproc.cu
modules/gpu/src/cuda/imgproc.cu
+18
-39
mathfunc.cu
modules/gpu/src/cuda/mathfunc.cu
+19
-30
matrix_operations.cu
modules/gpu/src/cuda/matrix_operations.cu
+13
-61
split_merge.cu
modules/gpu/src/cuda/split_merge.cu
+24
-24
stereobm.cu
modules/gpu/src/cuda/stereobm.cu
+84
-121
transform.hpp
modules/gpu/src/cuda/transform.hpp
+14
-20
vecmath.hpp
modules/gpu/src/cuda/vecmath.hpp
+8
-1
filtering.cpp
modules/gpu/src/filtering.cpp
+6
-2
stereobm_gpu.cpp
modules/gpu/src/stereobm_gpu.cpp
+4
-4
No files found.
modules/gpu/include/opencv2/gpu/devmem2d.hpp
View file @
d661b8e3
...
...
@@ -50,56 +50,79 @@ namespace cv
// Simple lightweight structures that encapsulates information about an image on device.
// It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
template
<
typename
T
>
struct
PtrStep_
{
T
*
ptr
;
size_t
step
;
typedef
T
elem_type
;
enum
{
elem_size
=
sizeof
(
elem_type
)
};
#if defined(__CUDACC__)
__host__
__device__
#define __CV_GPU_HOST_DEVICE__ __host__ __device__
#else
#define __CV_GPU_HOST_DEVICE__
#endif
size_t
elemSize
()
const
{
return
elem_size
;
}
};
template
<
typename
T
>
struct
DevMem2D_
{
int
cols
;
int
rows
;
T
*
ptr
;
T
*
data
;
size_t
step
;
size_t
elem_step
;
/*__host__*/
DevMem2D_
()
:
cols
(
0
),
rows
(
0
),
ptr
(
0
),
step
(
0
),
elem_step
(
0
)
{}
/*__host__*/
DevMem2D_
(
int
rows_
,
int
cols_
,
T
*
ptr_
,
size_t
step_
)
:
cols
(
cols_
),
rows
(
rows_
),
ptr
(
ptr_
),
step
(
step_
),
elem_step
(
step_
/
sizeof
(
T
))
{}
template
<
typename
U
>
/*__host__*/
DevMem2D_
()
:
cols
(
0
),
rows
(
0
),
data
(
0
),
step
(
0
)
{}
DevMem2D_
(
int
rows_
,
int
cols_
,
T
*
data_
,
size_t
step_
)
:
cols
(
cols_
),
rows
(
rows_
),
data
(
data_
),
step
(
step_
)
{}
template
<
typename
U
>
explicit
DevMem2D_
(
const
DevMem2D_
<
U
>&
d
)
:
cols
(
d
.
cols
),
rows
(
d
.
rows
),
ptr
((
T
*
)
d
.
ptr
),
step
(
d
.
step
),
elem_step
(
d
.
step
/
sizeof
(
T
))
{}
:
cols
(
d
.
cols
),
rows
(
d
.
rows
),
data
((
T
*
)
d
.
data
),
step
(
d
.
step
)
{}
typedef
T
elem_type
;
enum
{
elem_size
=
sizeof
(
elem_type
)
};
template
<
typename
U
>
/*__host__*/
operator
PtrStep_
<
U
>
()
const
{
PtrStep_
<
U
>
dt
;
dt
.
ptr
=
ptr
;
dt
.
step
=
step
;
return
dt
;
}
__CV_GPU_HOST_DEVICE__
size_t
elemSize
()
const
{
return
elem_size
;
}
__CV_GPU_HOST_DEVICE__
T
*
ptr
(
int
y
=
0
)
{
return
(
T
*
)(
(
char
*
)
data
+
y
*
step
);
}
__CV_GPU_HOST_DEVICE__
const
T
*
ptr
(
int
y
=
0
)
const
{
return
(
const
T
*
)(
(
const
char
*
)
data
+
y
*
step
);
}
};
template
<
typename
T
>
struct
PtrStep_
{
T
*
data
;
size_t
step
;
typedef
typename
PtrStep_
<
T
>::
elem_type
elem_type
;
enum
{
elem_size
=
PtrStep_
<
T
>::
elem_size
};
#if defined(__CUDACC__)
__host__
__device__
#endif
size_t
elemSize
()
const
{
return
elem_size
;
}
PtrStep_
()
:
data
(
0
),
step
(
0
)
{}
PtrStep_
(
const
DevMem2D_
<
T
>&
mem
)
:
data
(
mem
.
data
),
step
(
mem
.
step
)
{}
typedef
T
elem_type
;
enum
{
elem_size
=
sizeof
(
elem_type
)
};
__CV_GPU_HOST_DEVICE__
size_t
elemSize
()
const
{
return
elem_size
;
}
__CV_GPU_HOST_DEVICE__
T
*
ptr
(
int
y
=
0
)
{
return
(
T
*
)(
(
char
*
)
data
+
y
*
step
);
}
__CV_GPU_HOST_DEVICE__
const
T
*
ptr
(
int
y
=
0
)
const
{
return
(
const
T
*
)(
(
const
char
*
)
data
+
y
*
step
);
}
};
template
<
typename
T
>
struct
PtrElemStep_
:
public
PtrStep_
<
T
>
{
PtrElemStep_
(
const
DevMem2D_
<
T
>&
mem
)
:
PtrStep_
<
T
>
(
mem
)
{
step
/=
elem_size
;
}
private
:
template
<
bool
>
struct
StaticCheck
;
template
<>
struct
StaticCheck
<
true
>
{};
StaticCheck
<
256
%
sizeof
(
T
)
==
0
>
ElemStepTypeCheck
;
};
typedef
DevMem2D_
<
unsigned
char
>
DevMem2D
;
typedef
DevMem2D_
<
float
>
DevMem2Df
;
typedef
DevMem2D_
<
int
>
DevMem2Di
;
}
typedef
PtrStep_
<
unsigned
char
>
PtrStep
;
typedef
PtrStep_
<
float
>
PtrStepf
;
typedef
PtrStep_
<
int
>
PtrStepi
;
typedef
PtrElemStep_
<
unsigned
char
>
PtrElemStep
;
typedef
PtrElemStep_
<
float
>
PtrElemStepf
;
typedef
PtrElemStep_
<
int
>
PtrElemStepi
;
#undef __CV_GPU_HOST_DEVICE__
}
}
#endif
/* __OPENCV_GPU_DEVMEM2D_HPP__ */
modules/gpu/include/opencv2/gpu/gpu.hpp
View file @
d661b8e3
...
...
@@ -109,6 +109,7 @@ namespace cv
//! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.
// Contains just image size, data ptr and step.
template
<
class
T
>
operator
DevMem2D_
<
T
>
()
const
;
template
<
class
T
>
operator
PtrStep_
<
T
>
()
const
;
//! pefroms blocking upload data to GpuMat. .
void
upload
(
const
cv
::
Mat
&
m
);
...
...
modules/gpu/include/opencv2/gpu/matrix_operations.hpp
View file @
d661b8e3
...
...
@@ -207,6 +207,7 @@ inline GpuMat& GpuMat::operator = (const GpuMat& m)
inline
GpuMat
&
GpuMat
::
operator
=
(
const
Mat
&
m
)
{
upload
(
m
);
return
*
this
;
}
template
<
class
T
>
inline
GpuMat
::
operator
DevMem2D_
<
T
>
()
const
{
return
DevMem2D_
<
T
>
(
rows
,
cols
,
(
T
*
)
data
,
step
);
}
template
<
class
T
>
inline
GpuMat
::
operator
PtrStep_
<
T
>
()
const
{
return
PtrStep_
<
T
>
(
*
this
);
}
//CPP: void GpuMat::upload(const Mat& m);
...
...
modules/gpu/src/cuda/beliefpropagation.cu
View file @
d661b8e3
...
...
@@ -50,36 +50,32 @@ using namespace cv::gpu;
#define FLT_MAX 3.402823466e+38F
#endif
namespace cv { namespace gpu { namespace bp {
///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
///////////////////////////////////////////////////////////////
namespace bp_kernels
{
__constant__ int cndisp;
__constant__ float cmax_data_term;
__constant__ float cdata_weight;
__constant__ float cmax_disc_term;
__constant__ float cdisc_single_jump;
};
namespace cv { namespace gpu { namespace bp {
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
{
cudaSafeCall( cudaMemcpyToSymbol(
bp_kernels::
cndisp, &ndisp, sizeof(int )) );
cudaSafeCall( cudaMemcpyToSymbol(
bp_kernels::
cmax_data_term, &max_data_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(
bp_kernels::
cdata_weight, &data_weight, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(
bp_kernels::
cmax_disc_term, &max_disc_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(
bp_kernels::
cdisc_single_jump, &disc_single_jump, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
}
}}}
///////////////////////////////////////////////////////////////
////////////////////////// comp data //////////////////////////
///////////////////////////////////////////////////////////////
namespace bp_kernels
{
template <typename T>
__global__ void comp_data_gray(const uchar* l, const uchar* r, size_t step, T* data, size_t data_step, int cols, int rows)
{
...
...
@@ -145,9 +141,7 @@ namespace bp_kernels
}
}
}
}
namespace cv { namespace gpu { namespace bp {
typedef void (*CompDataFunc)(const DevMem2D& l, const DevMem2D& r, int channels, DevMem2D mdata, const cudaStream_t& stream);
template<typename T>
...
...
@@ -160,9 +154,9 @@ namespace cv { namespace gpu { namespace bp {
grid.y = divUp(l.rows, threads.y);
if (channels == 1)
bp_kernels::comp_data_gray<T><<<grid, threads, 0, stream>>>(l.ptr, r.ptr, l.step, (T*)mdata.ptr
, mdata.step/sizeof(T), l.cols, l.rows);
comp_data_gray<T><<<grid, threads, 0, stream>>>(l.data, r.data, l.step, (T*)mdata.data
, mdata.step/sizeof(T), l.cols, l.rows);
else
bp_kernels::comp_data_bgr<T><<<grid, threads, 0, stream>>>(l.ptr, r.ptr, l.step, (T*)mdata.ptr
, mdata.step/sizeof(T), l.cols, l.rows);
comp_data_bgr<T><<<grid, threads, 0, stream>>>(l.data, r.data, l.step, (T*)mdata.data
, mdata.step/sizeof(T), l.cols, l.rows);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -187,14 +181,11 @@ namespace cv { namespace gpu { namespace bp {
cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
func(l, r, channels, mdata, stream);
}
}}}
///////////////////////////////////////////////////////////////
//////////////////////// data step down ///////////////////////
///////////////////////////////////////////////////////////////
namespace bp_kernels
{
template <typename T>
__global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const T* src, size_t src_step, T* dst, size_t dst_step)
{
...
...
@@ -217,9 +208,7 @@ namespace bp_kernels
}
}
}
}
namespace cv { namespace gpu { namespace bp {
typedef void (*DataStepDownFunc)(int dst_cols, int dst_rows, int src_rows, const DevMem2D& src, DevMem2D dst, const cudaStream_t& stream);
template<typename T>
...
...
@@ -231,7 +220,7 @@ namespace cv { namespace gpu { namespace bp {
grid.x = divUp(dst_cols, threads.x);
grid.y = divUp(dst_rows, threads.y);
bp_kernels::data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)src.ptr, src.step/sizeof(T), (T*)dst.ptr
, dst.step/sizeof(T));
data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)src.data, src.step/sizeof(T), (T*)dst.data
, dst.step/sizeof(T));
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -256,14 +245,11 @@ namespace cv { namespace gpu { namespace bp {
cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
func(dst_cols, dst_rows, src_rows, src, dst, stream);
}
}}}
///////////////////////////////////////////////////////////////
/////////////////// level up messages ////////////////////////
///////////////////////////////////////////////////////////////
namespace bp_kernels
{
template <typename T>
__global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const T* src, size_t src_step, T* dst, size_t dst_step)
{
...
...
@@ -282,9 +268,7 @@ namespace bp_kernels
dstr[d * dst_disp_step] = srcr[d * src_disp_step];
}
}
}
namespace cv { namespace gpu { namespace bp {
typedef void (*LevelUpMessagesFunc)(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, const cudaStream_t& stream);
template<typename T>
...
...
@@ -298,10 +282,10 @@ namespace cv { namespace gpu { namespace bp {
int src_idx = (dst_idx + 1) & 1;
bp_kernels::level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mus[src_idx].ptr, mus[src_idx].step/sizeof(T), (T*)mus[dst_idx].ptr
, mus[dst_idx].step/sizeof(T));
bp_kernels::level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mds[src_idx].ptr, mds[src_idx].step/sizeof(T), (T*)mds[dst_idx].ptr
, mds[dst_idx].step/sizeof(T));
bp_kernels::level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mls[src_idx].ptr, mls[src_idx].step/sizeof(T), (T*)mls[dst_idx].ptr
, mls[dst_idx].step/sizeof(T));
bp_kernels::level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mrs[src_idx].ptr, mrs[src_idx].step/sizeof(T), (T*)mrs[dst_idx].ptr
, mrs[dst_idx].step/sizeof(T));
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mus[src_idx].data, mus[src_idx].step/sizeof(T), (T*)mus[dst_idx].data
, mus[dst_idx].step/sizeof(T));
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mds[src_idx].data, mds[src_idx].step/sizeof(T), (T*)mds[dst_idx].data
, mds[dst_idx].step/sizeof(T));
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mls[src_idx].data, mls[src_idx].step/sizeof(T), (T*)mls[dst_idx].data
, mls[dst_idx].step/sizeof(T));
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mrs[src_idx].data, mrs[src_idx].step/sizeof(T), (T*)mrs[dst_idx].data
, mrs[dst_idx].step/sizeof(T));
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -326,14 +310,11 @@ namespace cv { namespace gpu { namespace bp {
cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
func(dst_idx, dst_cols, dst_rows, src_rows, mus, mds, mls, mrs, stream);
}
}}}
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
///////////////////////////////////////////////////////////////
namespace bp_kernels
{
template <typename T>
__device__ void calc_min_linear_penalty(T* dst, size_t step)
{
...
...
@@ -427,9 +408,7 @@ namespace bp_kernels
message(us + msg_step, ds - msg_step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
}
}
}
namespace cv { namespace gpu { namespace bp {
typedef void (*CalcAllIterationFunc)(int cols, int rows, int iters, DevMem2D& u, DevMem2D& d, DevMem2D& l, DevMem2D& r, const DevMem2D& data, const cudaStream_t& stream);
template<typename T>
...
...
@@ -443,7 +422,7 @@ namespace cv { namespace gpu { namespace bp {
for(int t = 0; t < iters; ++t)
{
bp_kernels::one_iteration<T><<<grid, threads, 0, stream>>>(t, (T*)u.ptr, (T*)d.ptr, (T*)l.ptr, (T*)r.ptr, u.step/sizeof(T), (const T*)data.ptr
, data.step/sizeof(T), cols, rows);
one_iteration<T><<<grid, threads, 0, stream>>>(t, (T*)u.data, (T*)d.data, (T*)l.data, (T*)r.data, u.step/sizeof(T), (const T*)data.data
, data.step/sizeof(T), cols, rows);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -469,14 +448,11 @@ namespace cv { namespace gpu { namespace bp {
cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
func(cols, rows, iters, u, d, l, r, data, stream);
}
}}}
///////////////////////////////////////////////////////////////
/////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////
namespace bp_kernels
{
template <typename T>
__global__ void output(int cols, int rows, const T* u, const T* d, const T* l, const T* r, const T* data, size_t step, short* disp, size_t res_step)
{
...
...
@@ -513,9 +489,7 @@ namespace bp_kernels
disp[res_step * y + x] = saturate_cast<short>(best);
}
}
}
namespace cv { namespace gpu { namespace bp {
typedef void (*OutputFunc)(const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, DevMem2D disp, const cudaStream_t& stream);
template<typename T>
...
...
@@ -527,7 +501,7 @@ namespace cv { namespace gpu { namespace bp {
grid.x = divUp(disp.cols, threads.x);
grid.y = divUp(disp.rows, threads.y);
bp_kernels::output<T><<<grid, threads, 0, stream>>>(disp.cols, disp.rows, (const T*)u.ptr, (const T*)d.ptr, (const T*)l.ptr, (const T*)r.ptr, (const T*)data.ptr, u.step/sizeof(T), (short*)disp.ptr
, disp.step/sizeof(short));
output<T><<<grid, threads, 0, stream>>>(disp.cols, disp.rows, (const T*)u.data, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, u.step/sizeof(T), (short*)disp.data
, disp.step/sizeof(short));
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -552,4 +526,5 @@ namespace cv { namespace gpu { namespace bp {
cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
func(u, d, l, r, data, disp, stream);
}
}}}
\ No newline at end of file
modules/gpu/src/cuda/color.cu
View file @
d661b8e3
...
...
@@ -54,7 +54,7 @@ using namespace cv::gpu;
#define FLT_EPSILON 1.192092896e-07F
#endif
namespace c
olor_krnls
namespace c
v { namespace gpu { namespace color
{
template<typename T> struct ColorChannel {};
template<> struct ColorChannel<uchar>
...
...
@@ -95,12 +95,9 @@ namespace color_krnls
{
return vec.w;
}
}
////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
namespace color_krnls
{
template <int SRCCN, int DSTCN, typename T>
__global__ void RGB2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
{
...
...
@@ -123,10 +120,7 @@ namespace color_krnls
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
}
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int SRCCN, int DSTCN>
void RGB2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{
...
...
@@ -136,8 +130,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
color_krnls::RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -178,12 +172,9 @@ namespace cv { namespace gpu { namespace color
RGB2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
}
}}}
/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
namespace color_krnls
{
template <int GREEN_BITS, int DSTCN> struct RGB5x52RGBConverter {};
template <int DSTCN> struct RGB5x52RGBConverter<5, DSTCN>
{
...
...
@@ -272,10 +263,7 @@ namespace color_krnls
*(ushort*)(dst_ + y * dst_step + (x << 1)) = RGB2RGB5x5Converter<SRCCN, GREEN_BITS>::cvt(&src.x, bidx);
}
}
}
namespace cv { namespace gpu { namespace color
{
template <int GREEN_BITS, int DSTCN>
void RGB5x52RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{
...
...
@@ -285,8 +273,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
color_krnls::RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -313,8 +301,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
color_krnls::RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -331,12 +319,9 @@ namespace cv { namespace gpu { namespace color
RGB2RGB5x5_callers[srccn - 3][green_bits - 5](src, dst, bidx, stream);
}
}}}
///////////////////////////////// Grayscale to Color ////////////////////////////////
namespace color_krnls
{
template <int DSTCN, typename T>
__global__ void Gray2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
{
...
...
@@ -387,10 +372,7 @@ namespace color_krnls
*(ushort*)(dst_ + y * dst_step + (x << 1)) = Gray2RGB5x5Converter<GREEN_BITS>::cvt(src);
}
}
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int DSTCN>
void Gray2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
{
...
...
@@ -400,8 +382,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
color_krnls::Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols);
Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -440,8 +422,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
color_krnls::Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols);
Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -457,12 +439,9 @@ namespace cv { namespace gpu { namespace color
Gray2RGB5x5_callers[green_bits - 5](src, dst, stream);
}
}}}
///////////////////////////////// Color to Grayscale ////////////////////////////////
namespace color_krnls
{
#undef R2Y
#undef G2Y
#undef B2Y
...
...
@@ -541,10 +520,7 @@ namespace color_krnls
*(T*)(dst_ + y * dst_step + x * sizeof(T)) = RGB2GrayConvertor<T>::cvt(&src.x, bidx);
}
}
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int SRCCN>
void RGB2Gray_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{
...
...
@@ -554,8 +530,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
color_krnls::RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -594,8 +570,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
color_krnls::RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols);
RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -611,12 +587,9 @@ namespace cv { namespace gpu { namespace color
RGB5x52Gray_callers[green_bits - 5](src, dst, stream);
}
}}}
///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
namespace color_krnls
{
__constant__ float cYCrCbCoeffs_f[5];
__constant__ int cYCrCbCoeffs_i[5];
...
...
@@ -712,10 +685,7 @@ namespace color_krnls
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
}
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int SRCCN, int DSTCN>
void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{
...
...
@@ -725,8 +695,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
color_krnls::RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -741,7 +711,7 @@ namespace cv { namespace gpu { namespace color
{RGB2YCrCb_caller<uchar, 4, 3>, RGB2YCrCb_caller<uchar, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
YCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
}
...
...
@@ -755,7 +725,7 @@ namespace cv { namespace gpu { namespace color
{RGB2YCrCb_caller<ushort, 4, 3>, RGB2YCrCb_caller<ushort, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
YCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
}
...
...
@@ -769,7 +739,7 @@ namespace cv { namespace gpu { namespace color
{RGB2YCrCb_caller<float, 4, 3>, RGB2YCrCb_caller<float, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
YCrCbCoeffs_f, coeffs, 5 * sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) );
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
}
...
...
@@ -783,8 +753,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
color_krnls::YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -799,7 +769,7 @@ namespace cv { namespace gpu { namespace color
{YCrCb2RGB_caller<uchar, 4, 3>, YCrCb2RGB_caller<uchar, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
YCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
}
...
...
@@ -813,7 +783,7 @@ namespace cv { namespace gpu { namespace color
{YCrCb2RGB_caller<ushort, 4, 3>, YCrCb2RGB_caller<ushort, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
YCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
}
...
...
@@ -827,16 +797,13 @@ namespace cv { namespace gpu { namespace color
{YCrCb2RGB_caller<float, 4, 3>, YCrCb2RGB_caller<float, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
YCrCbCoeffs_f, coeffs, 4 * sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) );
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
}
}}}
////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
namespace color_krnls
{
__constant__ float cXYZ_D65f[9];
__constant__ int cXYZ_D65i[9];
...
...
@@ -922,10 +889,7 @@ namespace color_krnls
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
}
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int SRCCN, int DSTCN>
void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
{
...
...
@@ -935,8 +899,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
color_krnls::RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols);
RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -951,7 +915,7 @@ namespace cv { namespace gpu { namespace color
{RGB2XYZ_caller<uchar, 4, 3>, RGB2XYZ_caller<uchar, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
XYZ_D65i, coeffs, 9 * sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
}
...
...
@@ -965,7 +929,7 @@ namespace cv { namespace gpu { namespace color
{RGB2XYZ_caller<ushort, 4, 3>, RGB2XYZ_caller<ushort, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
XYZ_D65i, coeffs, 9 * sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
}
...
...
@@ -979,7 +943,7 @@ namespace cv { namespace gpu { namespace color
{RGB2XYZ_caller<float, 4, 3>, RGB2XYZ_caller<float, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
XYZ_D65f, coeffs, 9 * sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
}
...
...
@@ -993,8 +957,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
color_krnls::XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols);
XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -1009,7 +973,7 @@ namespace cv { namespace gpu { namespace color
{XYZ2RGB_caller<uchar, 4, 3>, XYZ2RGB_caller<uchar, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
XYZ_D65i, coeffs, 9 * sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
}
...
...
@@ -1023,7 +987,7 @@ namespace cv { namespace gpu { namespace color
{XYZ2RGB_caller<ushort, 4, 3>, XYZ2RGB_caller<ushort, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
XYZ_D65i, coeffs, 9 * sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
}
...
...
@@ -1037,16 +1001,13 @@ namespace cv { namespace gpu { namespace color
{XYZ2RGB_caller<float, 4, 3>, XYZ2RGB_caller<float, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
XYZ_D65f, coeffs, 9 * sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
}
}}}
////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
namespace color_krnls
{
__constant__ int cHsvDivTable[256];
template<typename T, int HR> struct RGB2HSVConvertor;
...
...
@@ -1220,10 +1181,7 @@ namespace color_krnls
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
}
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int SRCCN, int DSTCN>
void RGB2HSV_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
{
...
...
@@ -1234,11 +1192,11 @@ namespace cv { namespace gpu { namespace color
grid.y = divUp(src.rows, threads.y);
if (hrange == 180)
color_krnls::RGB2HSV<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
RGB2HSV<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
else
color_krnls::RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -1288,7 +1246,7 @@ namespace cv { namespace gpu { namespace color
4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229,
4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096
};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
HsvDivTable, div_table, sizeof(div_table)) );
cudaSafeCall( cudaMemcpyToSymbol(cHsvDivTable, div_table, sizeof(div_table)) );
RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
...
...
@@ -1316,11 +1274,11 @@ namespace cv { namespace gpu { namespace color
grid.y = divUp(src.rows, threads.y);
if (hrange == 180)
color_krnls::HSV2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
HSV2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
else
color_krnls::HSV2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
HSV2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -1338,7 +1296,7 @@ namespace cv { namespace gpu { namespace color
static const int sector_data[][3] =
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
HsvSectorData, sector_data, sizeof(sector_data)) );
cudaSafeCall( cudaMemcpyToSymbol(cHsvSectorData, sector_data, sizeof(sector_data)) );
HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
...
...
@@ -1355,16 +1313,13 @@ namespace cv { namespace gpu { namespace color
static const int sector_data[][3] =
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
HsvSectorData, sector_data, sizeof(sector_data)) );
cudaSafeCall( cudaMemcpyToSymbol(cHsvSectorData, sector_data, sizeof(sector_data)) );
HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
}}}
/////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
namespace color_krnls
{
template<typename T, int HR> struct RGB2HLSConvertor;
template<int HR> struct RGB2HLSConvertor<float, HR>
{
...
...
@@ -1532,10 +1487,7 @@ namespace color_krnls
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
}
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int SRCCN, int DSTCN>
void RGB2HLS_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
{
...
...
@@ -1546,11 +1498,11 @@ namespace cv { namespace gpu { namespace color
grid.y = divUp(src.rows, threads.y);
if (hrange == 180)
color_krnls::RGB2HLS<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
RGB2HLS<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
else
color_krnls::RGB2HLS<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
RGB2HLS<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -1591,11 +1543,11 @@ namespace cv { namespace gpu { namespace color
grid.y = divUp(src.rows, threads.y);
if (hrange == 180)
color_krnls::HLS2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
HLS2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
else
color_krnls::HLS2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr
, src.step,
dst.
ptr
, dst.step, src.rows, src.cols, bidx);
HLS2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data
, src.step,
dst.
data
, dst.step, src.rows, src.cols, bidx);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -1613,7 +1565,7 @@ namespace cv { namespace gpu { namespace color
static const int sector_data[][3]=
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
HlsSectorData, sector_data, sizeof(sector_data)) );
cudaSafeCall( cudaMemcpyToSymbol(cHlsSectorData, sector_data, sizeof(sector_data)) );
HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
...
...
@@ -1630,7 +1582,7 @@ namespace cv { namespace gpu { namespace color
static const int sector_data[][3]=
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(c
olor_krnls::c
HlsSectorData, sector_data, sizeof(sector_data)) );
cudaSafeCall( cudaMemcpyToSymbol(cHlsSectorData, sector_data, sizeof(sector_data)) );
HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
...
...
modules/gpu/src/cuda/constantspacebp.cu
View file @
d661b8e3
...
...
@@ -54,8 +54,9 @@ using namespace cv::gpu;
#define SHRT_MAX 32767
#endif
namespace c
sbp_krnls
namespace c
v { namespace gpu { namespace csbp
{
template <typename T> struct TypeLimits;
template <> struct TypeLimits<short>
{
...
...
@@ -65,14 +66,11 @@ namespace csbp_krnls
{
static __device__ float max() {return FLT_MAX;}
};
}
///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
///////////////////////////////////////////////////////////////
namespace csbp_krnls
{
__constant__ int cndisp;
__constant__ float cmax_data_term;
...
...
@@ -91,36 +89,30 @@ namespace csbp_krnls
__constant__ uchar* cleft;
__constant__ uchar* cright;
__constant__ uchar* ctemp;
}
namespace cv { namespace gpu { namespace csbp
{
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
const DevMem2D& left, const DevMem2D& right, const DevMem2D& temp)
{
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
ndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmax_data_term, &max_data_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdata_weight, &data_weight, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmax_disc_term, &max_disc_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisc_single_jump, &disc_single_jump, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
th, &min_disp_th, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
img_step, &left.step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::cleft, &left.ptr, sizeof(left.ptr
)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::cright, &right.ptr, sizeof(right.ptr
)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::ctemp, &temp.ptr, sizeof(temp.ptr
)) );
cudaSafeCall( cudaMemcpyToSymbol(c
left, &left.data, sizeof(left.data
)) );
cudaSafeCall( cudaMemcpyToSymbol(c
right, &right.data, sizeof(right.data
)) );
cudaSafeCall( cudaMemcpyToSymbol(c
temp, &temp.data, sizeof(temp.data
)) );
}
}}}
///////////////////////////////////////////////////////////////
/////////////////////// init data cost ////////////////////////
///////////////////////////////////////////////////////////////
namespace csbp_krnls
{
template <int channels>
struct DataCostPerPixel
{
...
...
@@ -334,10 +326,8 @@ namespace csbp_krnls
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
}
}
}
namespace cv { namespace gpu { namespace csbp
{
template <typename T>
void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)
{
...
...
@@ -349,8 +339,8 @@ namespace cv { namespace gpu { namespace csbp
switch (channels)
{
case 1:
csbp_krnls::
init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;
case 3:
csbp_krnls::
init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;
case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;
case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;
default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
}
}
...
...
@@ -367,8 +357,8 @@ namespace cv { namespace gpu { namespace csbp
switch (channels)
{
case 1:
csbp_krnls::
init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
case 3:
csbp_krnls::
init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
}
}
...
...
@@ -388,8 +378,8 @@ namespace cv { namespace gpu { namespace csbp
};
size_t disp_step = msg_step * h;
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
disp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
msg_step1, &msg_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );
init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
if (stream == 0)
...
...
@@ -402,9 +392,9 @@ namespace cv { namespace gpu { namespace csbp
grid.y = divUp(h, threads.y);
if (use_local_init_data_cost == true)
csbp_krnls::
get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
else
csbp_krnls::
get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -421,14 +411,10 @@ namespace cv { namespace gpu { namespace csbp
init_data_cost_tmpl(rows, cols, disp_selected_pyr, data_cost_selected, msg_step, h, w, level, nr_plane, ndisp, channels, use_local_init_data_cost, stream);
}
}}}
///////////////////////////////////////////////////////////////
////////////////////// compute data cost //////////////////////
///////////////////////////////////////////////////////////////
namespace csbp_krnls
{
template <typename T, int channels>
__global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)
{
...
...
@@ -536,10 +522,7 @@ namespace csbp_krnls
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
}
}
}
namespace cv { namespace gpu { namespace csbp
{
template <typename T>
void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,
int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
...
...
@@ -552,8 +535,8 @@ namespace cv { namespace gpu { namespace csbp
switch(channels)
{
case 1: c
sbp_krnls::c
ompute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
case 3: c
sbp_krnls::c
ompute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
}
}
...
...
@@ -571,13 +554,12 @@ namespace cv { namespace gpu { namespace csbp
switch (channels)
{
case 1: c
sbp_krnls::c
ompute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
case 3: c
sbp_krnls::c
ompute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
}
}
template<class T>
void compute_data_cost_tmpl(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
...
...
@@ -594,10 +576,10 @@ namespace cv { namespace gpu { namespace csbp
size_t disp_step1 = msg_step1 * h;
size_t disp_step2 = msg_step2 * h2;
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
disp_step1, &disp_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
disp_step2, &disp_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
msg_step1, &msg_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
msg_step2, &msg_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );
callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
...
...
@@ -616,15 +598,12 @@ namespace cv { namespace gpu { namespace csbp
compute_data_cost_tmpl(disp_selected_pyr, data_cost, msg_step1, msg_step2, rows, cols, h, w, h2, level, nr_plane, channels, stream);
}
}}}
///////////////////////////////////////////////////////////////
//////////////////////// init message /////////////////////////
///////////////////////////////////////////////////////////////
namespace csbp_krnls
{
template <typename T>
template <typename T>
__device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,
...
...
@@ -705,10 +684,8 @@ namespace csbp_krnls
data_cost, disparity_selected_cur, nr_plane, nr_plane2);
}
}
}
namespace cv { namespace gpu { namespace csbp
{
template<class T>
void init_message_tmpl(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
...
...
@@ -719,10 +696,10 @@ namespace cv { namespace gpu { namespace csbp
size_t disp_step1 = msg_step1 * h;
size_t disp_step2 = msg_step2 * h2;
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
disp_step1, &disp_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
disp_step2, &disp_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
msg_step1, &msg_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
msg_step2, &msg_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
...
...
@@ -730,7 +707,7 @@ namespace cv { namespace gpu { namespace csbp
grid.x = divUp(w, threads.x);
grid.y = divUp(h, threads.y);
csbp_krnls::
init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
u_cur, d_cur, l_cur, r_cur,
selected_disp_pyr_new, selected_disp_pyr_cur,
data_cost_selected, data_cost,
...
...
@@ -761,14 +738,11 @@ namespace cv { namespace gpu { namespace csbp
selected_disp_pyr_new, selected_disp_pyr_cur, data_cost_selected, data_cost, msg_step1, msg_step2,
h, w, nr_plane, h2, w2, nr_plane2, stream);
}
}}}
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
///////////////////////////////////////////////////////////////
namespace csbp_krnls
{
template <typename T>
__device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
const T* dst_disp, const T* src_disp, int nr_plane, T* temp)
...
...
@@ -829,17 +803,15 @@ namespace csbp_krnls
message_per_pixel(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp);
}
}
}
namespace cv { namespace gpu { namespace csbp
{
template<class T>
void calc_all_iterations_tmpl(T* u, T* d, T* l, T* r, const T* data_cost_selected,
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
{
size_t disp_step = msg_step * h;
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
disp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
msg_step1, &msg_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
...
...
@@ -849,7 +821,7 @@ namespace cv { namespace gpu { namespace csbp
for(int t = 0; t < iters; ++t)
{
c
sbp_krnls::c
ompute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -868,14 +840,12 @@ namespace cv { namespace gpu { namespace csbp
calc_all_iterations_tmpl(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, msg_step, h, w, nr_plane, iters, stream);
}
}}}
///////////////////////////////////////////////////////////////
/////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////
namespace csbp_krnls
{
template <typename T>
__global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,
const T* data_cost_selected, const T* disp_selected_pyr,
...
...
@@ -910,17 +880,15 @@ namespace csbp_krnls
disp[res_step * y + x] = best;
}
}
}
namespace cv { namespace gpu { namespace csbp
{
template<class T>
void compute_disp_tmpl(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream)
{
size_t disp_step = disp.rows * msg_step;
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
disp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(c
sbp_krnls::c
msg_step1, &msg_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
...
...
@@ -928,8 +896,8 @@ namespace cv { namespace gpu { namespace csbp
grid.x = divUp(disp.cols, threads.x);
grid.y = divUp(disp.rows, threads.y);
c
sbp_krnls::c
ompute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,
disp.
ptr
, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);
compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,
disp.
data
, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
modules/gpu/src/cuda/cuda_shared.hpp
View file @
d661b8e3
...
...
@@ -54,9 +54,9 @@ namespace cv
typedef
unsigned
char
uchar
;
typedef
signed
char
schar
;
typedef
unsigned
short
ushort
;
typedef
unsigned
int
uint
;
typedef
unsigned
int
uint
;
static
inline
int
divUp
(
int
a
,
int
b
)
{
return
(
a
%
b
==
0
)
?
a
/
b
:
a
/
b
+
1
;
}
static
inline
int
divUp
(
int
total
,
int
grain
)
{
return
(
total
+
grain
-
1
)
/
grain
;
}
template
<
class
T
>
static
inline
void
uploadConstant
(
const
char
*
name
,
const
T
&
value
)
{
cudaSafeCall
(
cudaMemcpyToSymbol
(
name
,
&
value
,
sizeof
(
T
))
);
}
...
...
modules/gpu/src/cuda/filters.cu
View file @
d661b8e3
...
...
@@ -128,8 +128,8 @@ namespace cv { namespace gpu { namespace filters
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.
ptr, src.elem_step
,
dst.
ptr, dst.elem_step
, anchor, src.cols, src.rows);
filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.
data, src.step/src.elemSize()
,
dst.
data, dst.step/dst.elemSize()
, anchor, src.cols, src.rows);
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -152,10 +152,12 @@ namespace cv { namespace gpu { namespace filters
callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
}
void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
template void linearRowFilter_gpu<4, uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
/* void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{
linearRowFilter_gpu<4, uchar4, uchar4>(src, dst, kernel, ksize, anchor);
}
}
*/
void linearRowFilter_gpu_8u_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{
linearRowFilter_gpu<4, uchar4, char4>(src, dst, kernel, ksize, anchor);
...
...
@@ -262,8 +264,8 @@ namespace cv { namespace gpu { namespace filters
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.
ptr, src.elem_step
,
dst.
ptr, dst.elem_step
, anchor, src.cols, src.rows);
filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.
data, src.step/src.elemSize()
,
dst.
data, dst.step/dst.elemSize()
, anchor, src.cols, src.rows);
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -357,7 +359,7 @@ namespace cv { namespace gpu { namespace bf
void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.
ptr, sizeof(table_space.ptr
)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.
data, sizeof(table_space.data
)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );
...
...
@@ -491,15 +493,15 @@ namespace cv { namespace gpu { namespace bf
case 1:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.
ptr, disp.step/sizeof(T), img.ptr
, img.step, disp.rows, disp.cols);
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.
ptr, disp.step/sizeof(T), img.ptr
, img.step, disp.rows, disp.cols);
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.
data, disp.step/sizeof(T), img.data
, img.step, disp.rows, disp.cols);
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.
data, disp.step/sizeof(T), img.data
, img.step, disp.rows, disp.cols);
}
break;
case 3:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.
ptr, disp.step/sizeof(T), img.ptr
, img.step, disp.rows, disp.cols);
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.
ptr, disp.step/sizeof(T), img.ptr
, img.step, disp.rows, disp.cols);
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.
data, disp.step/sizeof(T), img.data
, img.step, disp.rows, disp.cols);
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.
data, disp.step/sizeof(T), img.data
, img.step, disp.rows, disp.cols);
}
break;
default:
...
...
modules/gpu/src/cuda/imgproc.cu
View file @
d661b8e3
...
...
@@ -45,7 +45,7 @@
using namespace cv::gpu;
/////////////////////////////////// Remap ///////////////////////////////////////////////
namespace
imgproc_krnls
namespace
cv { namespace gpu { namespace imgproc
{
texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap;
...
...
@@ -121,10 +121,7 @@ namespace imgproc_krnls
*(dst + y * dst_step + 3 * x + 2) = out.z;
}
}
}
namespace cv { namespace gpu { namespace imgproc
{
void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
{
dim3 threads(16, 16, 1);
...
...
@@ -132,15 +129,15 @@ namespace cv { namespace gpu { namespace imgproc
grid.x = divUp(dst.cols, threads.x);
grid.y = divUp(dst.rows, threads.y);
imgproc_krnls::
tex_remap.filterMode = cudaFilterModeLinear;
imgproc_krnls::tex_remap.addressMode[0] = imgproc_krnls::
tex_remap.addressMode[1] = cudaAddressModeWrap;
tex_remap.filterMode = cudaFilterModeLinear;
tex_remap.addressMode[0] =
tex_remap.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D(0,
imgproc_krnls::tex_remap, src.ptr
, desc, src.cols, src.rows, src.step) );
cudaSafeCall( cudaBindTexture2D(0,
tex_remap, src.data
, desc, src.cols, src.rows, src.step) );
imgproc_krnls::remap_1c<<<grid, threads>>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr
, dst.step, dst.cols, dst.rows);
remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data
, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture(
imgproc_krnls::
tex_remap) );
cudaSafeCall( cudaUnbindTexture(tex_remap) );
}
void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
...
...
@@ -150,17 +147,13 @@ namespace cv { namespace gpu { namespace imgproc
grid.x = divUp(dst.cols, threads.x);
grid.y = divUp(dst.rows, threads.y);
imgproc_krnls::remap_3c<<<grid, threads>>>(src.ptr, src.step, xmap.ptr, ymap.ptr, xmap.step, dst.ptr
, dst.step, dst.cols, dst.rows);
remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data
, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaThreadSynchronize() );
}
}}}
/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
namespace imgproc_krnls
{
texture<uchar4, 2> tex_meanshift;
__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
...
...
@@ -252,10 +245,7 @@ namespace imgproc_krnls
*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
}
}
}
namespace cv { namespace gpu { namespace imgproc
{
extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps)
{
dim3 grid(1, 1, 1);
...
...
@@ -264,11 +254,11 @@ namespace cv { namespace gpu { namespace imgproc
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0,
imgproc_krnls::tex_meanshift, src.ptr
, desc, src.cols, src.rows, src.step ) );
cudaSafeCall( cudaBindTexture2D( 0,
tex_meanshift, src.data
, desc, src.cols, src.rows, src.step ) );
imgproc_krnls::meanshift_kernel<<< grid, threads >>>( dst.ptr
, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
meanshift_kernel<<< grid, threads >>>( dst.data
, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture(
imgproc_krnls::
tex_meanshift ) );
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps)
{
...
...
@@ -278,18 +268,15 @@ namespace cv { namespace gpu { namespace imgproc
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0,
imgproc_krnls::tex_meanshift, src.ptr
, desc, src.cols, src.rows, src.step ) );
cudaSafeCall( cudaBindTexture2D( 0,
tex_meanshift, src.data
, desc, src.cols, src.rows, src.step ) );
imgproc_krnls::meanshiftproc_kernel<<< grid, threads >>>( dstr.ptr, dstr.step, dstsp.ptr
, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data
, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture(
imgproc_krnls::
tex_meanshift ) );
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
}}}
/////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
namespace imgproc_krnls
{
template <typename T>
__device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
{
...
...
@@ -389,10 +376,8 @@ namespace imgproc_krnls
line[x >> 1] = res;
}
}
}
namespace cv { namespace gpu { namespace imgproc
{
void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)
{
dim3 threads(16, 16, 1);
...
...
@@ -400,7 +385,7 @@ namespace cv { namespace gpu { namespace imgproc
grid.x = divUp(src.cols, threads.x << 2);
grid.y = divUp(src.rows, threads.y);
imgproc_krnls::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step, dst.ptr
, dst.step, src.cols, src.rows, ndisp);
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data
, dst.step, src.cols, src.rows, ndisp);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -413,17 +398,14 @@ namespace cv { namespace gpu { namespace imgproc
grid.x = divUp(src.cols, threads.x << 1);
grid.y = divUp(src.rows, threads.y);
imgproc_krnls::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(short), dst.ptr
, dst.step, src.cols, src.rows, ndisp);
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data
, dst.step, src.cols, src.rows, ndisp);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
}}}
/////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
namespace imgproc_krnls
{
__constant__ float cq[16];
template <typename T>
...
...
@@ -455,10 +437,7 @@ namespace imgproc_krnls
*(float4*)(xyzw + xyzw_step * y + (x * 4)) = v;
}
}
}
namespace cv { namespace gpu { namespace imgproc
{
template <typename T>
inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
{
...
...
@@ -467,9 +446,9 @@ namespace cv { namespace gpu { namespace imgproc
grid.x = divUp(disp.cols, threads.x);
grid.y = divUp(disp.rows, threads.y);
cudaSafeCall( cudaMemcpyToSymbol(
imgproc_krnls::
cq, q, 16 * sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );
imgproc_krnls::reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.ptr, disp.step / sizeof(T), xyzw.ptr
, xyzw.step / sizeof(float), disp.rows, disp.cols);
reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data
, xyzw.step / sizeof(float), disp.rows, disp.cols);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
modules/gpu/src/cuda/mathfunc.cu
View file @
d661b8e3
...
...
@@ -41,9 +41,7 @@
//M*/
#include "cuda_shared.hpp"
#include "saturate_cast.hpp"
#include "transform.hpp"
#include "vecmath.hpp"
using namespace cv::gpu;
...
...
@@ -54,7 +52,7 @@ using namespace cv::gpu;
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
namespace
mathfunc_krnls
namespace
cv { namespace gpu { namespace mathfunc
{
struct Nothing
{
...
...
@@ -133,10 +131,7 @@ namespace mathfunc_krnls
yptr[y * y_step + x] = mag_data * sin_a;
}
}
}
namespace cv { namespace gpu { namespace mathfunc
{
template <typename Mag, typename Angle>
void cartToPolar_caller(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)
{
...
...
@@ -148,9 +143,9 @@ namespace cv { namespace gpu { namespace mathfunc
const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
mathfunc_krnls::
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.
ptr, x.elem_step, y.ptr, y.elem_step
,
mag.
ptr, mag.elem_step, angle.ptr, angle.elem_step
, scale, x.cols, x.rows);
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.
data, x.step/x.elemSize(), y.data, y.step/y.elemSize()
,
mag.
data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize()
, scale, x.cols, x.rows);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -163,27 +158,27 @@ namespace cv { namespace gpu { namespace mathfunc
{
{
{
cartToPolar_caller<
mathfunc_krnls::Magnitude, mathfunc_krnls::
Atan2>,
cartToPolar_caller<
mathfunc_krnls::Magnitude, mathfunc_krnls::
Nothing>
cartToPolar_caller<
Magnitude,
Atan2>,
cartToPolar_caller<
Magnitude,
Nothing>
},
{
cartToPolar_caller<
mathfunc_krnls::MagnitudeSqr, mathfunc_krnls::
Atan2>,
cartToPolar_caller<
mathfunc_krnls::MagnitudeSqr, mathfunc_krnls::
Nothing>,
cartToPolar_caller<
MagnitudeSqr,
Atan2>,
cartToPolar_caller<
MagnitudeSqr,
Nothing>,
}
},
{
{
cartToPolar_caller<
mathfunc_krnls::Nothing, mathfunc_krnls::
Atan2>,
cartToPolar_caller<
mathfunc_krnls::Nothing, mathfunc_krnls::
Nothing>
cartToPolar_caller<
Nothing,
Atan2>,
cartToPolar_caller<
Nothing,
Nothing>
},
{
cartToPolar_caller<
mathfunc_krnls::Nothing, mathfunc_krnls::
Atan2>,
cartToPolar_caller<
mathfunc_krnls::Nothing, mathfunc_krnls::
Nothing>,
cartToPolar_caller<
Nothing,
Atan2>,
cartToPolar_caller<
Nothing,
Nothing>,
}
}
};
callers[mag.
ptr == 0][magSqr][angle.ptr
== 0](x, y, mag, angle, angleInDegrees, stream);
callers[mag.
data == 0][magSqr][angle.data
== 0](x, y, mag, angle, angleInDegrees, stream);
}
template <typename Mag>
...
...
@@ -197,8 +192,8 @@ namespace cv { namespace gpu { namespace mathfunc
const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
mathfunc_krnls::polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.ptr, mag.elem_step
,
angle.
ptr, angle.elem_step, scale, x.ptr, x.elem_step, y.ptr, y.elem_step
, mag.cols, mag.rows);
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize()
,
angle.
data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize()
, mag.cols, mag.rows);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -209,19 +204,16 @@ namespace cv { namespace gpu { namespace mathfunc
typedef void (*caller_t)(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
polarToCart_caller<
mathfunc_krnls::
NonEmptyMag>,
polarToCart_caller<
mathfunc_krnls::
EmptyMag>
polarToCart_caller<NonEmptyMag>,
polarToCart_caller<EmptyMag>
};
callers[mag.
ptr
== 0](mag, angle, x, y, angleInDegrees, stream);
callers[mag.
data
== 0](mag, angle, x, y, angleInDegrees, stream);
}
}}}
//////////////////////////////////////////////////////////////////////////////////////
// Compare
namespace mathfunc_krnls
{
template <typename T1, typename T2>
struct NotEqual
{
...
...
@@ -230,14 +222,11 @@ namespace mathfunc_krnls
return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
}
};
}
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T1, typename T2>
inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
{
mathfunc_krnls::
NotEqual<T1, T2> op;
NotEqual<T1, T2> op;
transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
}
...
...
modules/gpu/src/cuda/matrix_operations.cu
View file @
d661b8e3
...
...
@@ -40,16 +40,11 @@
//
//M*/
#include <stddef.h>
#include <stdio.h>
#include "cuda_shared.hpp"
#include "cuda_runtime.h"
#include "saturate_cast.hpp"
using namespace cv::gpu;
namespace cv { namespace gpu { namespace matrix_operations {
namespace matop_krnls
{
template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
...
...
@@ -115,14 +110,11 @@ namespace matop_krnls
typedef int2 read_type;
typedef short2 write_type;
};
}
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////
namespace matop_krnls
{
template<typename T>
__global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
{
...
...
@@ -136,10 +128,6 @@ namespace matop_krnls
mat_dst[idx] = mat_src[idx];
}
}
}
namespace cv { namespace gpu { namespace matrix_operations
{
typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);
template<typename T>
...
...
@@ -147,17 +135,12 @@ namespace cv { namespace gpu { namespace matrix_operations
{
dim3 threadsPerBlock(16,16, 1);
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
if (stream == 0)
{
::matop_krnls::copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
::matop_krnls::copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
}
cudaSafeCall ( cudaThreadSynchronize() );
}
void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
...
...
@@ -180,14 +163,11 @@ namespace cv { namespace gpu { namespace matrix_operations
func(mat_src, mat_dst, mask, channels, stream);
}
}}}
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// SetTo //////////////////////////////////
///////////////////////////////////////////////////////////////////////////
namespace matop_krnls
{
__constant__ double scalar_d[4];
template<typename T>
...
...
@@ -216,10 +196,6 @@ namespace matop_krnls
mat[idx] = scalar_d[ x % channels ];
}
}
}
namespace cv { namespace gpu { namespace matrix_operations
{
typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);
...
...
@@ -229,16 +205,9 @@ namespace cv { namespace gpu { namespace matrix_operations
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.data, (unsigned char *)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
if (stream == 0)
{
::matop_krnls::set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
::matop_krnls::set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
}
}
template <typename T>
...
...
@@ -247,20 +216,15 @@ namespace cv { namespace gpu { namespace matrix_operations
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
if (stream == 0)
{
matop_krnls::set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
matop_krnls::set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
}
}
void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream)
{
cudaSafeCall( cudaMemcpyToSymbol(
matop_krnls::
scalar_d, scalar, sizeof(double) * 4));
cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));
static SetToFunc_without_mask tab[8] =
{
...
...
@@ -284,7 +248,7 @@ namespace cv { namespace gpu { namespace matrix_operations
void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
cudaSafeCall( cudaMemcpyToSymbol(
matop_krnls::
scalar_d, scalar, sizeof(double) * 4));
cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));
static SetToFunc_with_mask tab[8] =
{
...
...
@@ -305,14 +269,11 @@ namespace cv { namespace gpu { namespace matrix_operations
func(mat, mask, channels, stream);
}
}}}
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
namespace matop_krnls
{
template <typename T, typename DT>
__global__ static void convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
{
...
...
@@ -348,29 +309,20 @@ namespace matop_krnls
}
}
}
}
namespace cv { namespace gpu { namespace matrix_operations
{
typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream);
template<typename T, typename DT>
void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
{
const int shift =
::matop_krnls::
ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;
const int shift = ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;
dim3 block(32, 8);
dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));
convert_to<T, DT><<<grid, block, 0, stream>>>(src.data, src.step, dst.data, dst.step, width, height, alpha, beta);
if (stream == 0)
{
matop_krnls::convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
matop_krnls::convert_to<T, DT><<<grid, block, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
}
}
void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream)
...
...
modules/gpu/src/cuda/split_merge.cu
View file @
d661b8e3
...
...
@@ -230,9 +230,9 @@ namespace cv { namespace gpu { namespace split_merge {
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
src[0].
ptr
, src[0].step,
src[1].
ptr
, src[1].step,
dst.rows, dst.cols, dst.
ptr
, dst.step);
src[0].
data
, src[0].step,
src[1].
data
, src[1].step,
dst.rows, dst.cols, dst.
data
, dst.step);
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -244,10 +244,10 @@ namespace cv { namespace gpu { namespace split_merge {
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
src[0].
ptr
, src[0].step,
src[1].
ptr
, src[1].step,
src[2].
ptr
, src[2].step,
dst.rows, dst.cols, dst.
ptr
, dst.step);
src[0].
data
, src[0].step,
src[1].
data
, src[1].step,
src[2].
data
, src[2].step,
dst.rows, dst.cols, dst.
data
, dst.step);
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -259,11 +259,11 @@ namespace cv { namespace gpu { namespace split_merge {
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
src[0].
ptr
, src[0].step,
src[1].
ptr
, src[1].step,
src[2].
ptr
, src[2].step,
src[3].
ptr
, src[3].step,
dst.rows, dst.cols, dst.
ptr
, dst.step);
src[0].
data
, src[0].step,
src[1].
data
, src[1].step,
src[2].
data
, src[2].step,
src[3].
data
, src[3].step,
dst.rows, dst.cols, dst.
data
, dst.step);
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -433,9 +433,9 @@ namespace cv { namespace gpu { namespace split_merge {
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
src.
ptr
, src.step, src.rows, src.cols,
dst[0].
ptr
, dst[0].step,
dst[1].
ptr
, dst[1].step);
src.
data
, src.step, src.rows, src.cols,
dst[0].
data
, dst[0].step,
dst[1].
data
, dst[1].step);
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -447,10 +447,10 @@ namespace cv { namespace gpu { namespace split_merge {
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
src.
ptr
, src.step, src.rows, src.cols,
dst[0].
ptr
, dst[0].step,
dst[1].
ptr
, dst[1].step,
dst[2].
ptr
, dst[2].step);
src.
data
, src.step, src.rows, src.cols,
dst[0].
data
, dst[0].step,
dst[1].
data
, dst[1].step,
dst[2].
data
, dst[2].step);
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -462,11 +462,11 @@ namespace cv { namespace gpu { namespace split_merge {
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
src.
ptr
, src.step, src.rows, src.cols,
dst[0].
ptr
, dst[0].step,
dst[1].
ptr
, dst[1].step,
dst[2].
ptr
, dst[2].step,
dst[3].
ptr
, dst[3].step);
src.
data
, src.step, src.rows, src.cols,
dst[0].
data
, dst[0].step,
dst[1].
data
, dst[1].step,
dst[2].
data
, dst[2].step,
dst[3].
data
, dst[3].step);
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
...
...
modules/gpu/src/cuda/stereobm.cu
View file @
d661b8e3
...
...
@@ -43,8 +43,7 @@
//#include "cuda_shared.hpp"
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
using namespace cv::gpu;
...
...
@@ -55,7 +54,7 @@ using namespace cv::gpu;
#define ROWSperTHREAD 21 // the number of rows a thread will process
namespace
stereobm_gpu
namespace
cv { namespace gpu { namespace bm
{
#define BLOCK_W 128 // the thread block width (464)
...
...
@@ -233,7 +232,7 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
}
template<int RADIUS>
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step,
unsigned char* disp, size_t disp_pitch
, int maxdisp)
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step,
PtrStep disp
, int maxdisp)
{
extern __shared__ unsigned int col_ssd_cache[];
volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
...
...
@@ -246,7 +245,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
//int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
unsigned char* disparImage = disp
+ X + Y * disp_pitch
;
unsigned char* disparImage = disp
.data + X + Y * disp.step
;
/* if (X < cwidth)
{
unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
...
...
@@ -305,7 +304,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
if (minSSD.x < minSSDImage[idx])
{
disparImage[disp
_pitch
* row] = (unsigned char)(d + minSSD.y);
disparImage[disp
.step
* row] = (unsigned char)(d + minSSD.y);
minSSDImage[idx] = minSSD.x;
}
}
...
...
@@ -313,88 +312,73 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
} // for d loop
}
}
namespace cv { namespace gpu { namespace bm
template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, cudaStream_t & stream)
{
template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, const cudaStream_t & stream)
{
dim3 grid(1,1,1);
dim3 threads(BLOCK_W, 1, 1);
dim3 grid(1,1,1);
dim3 threads(BLOCK_W, 1, 1);
grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
if (stream == 0)
{
stereobm_gpu::stereoKernel<RADIUS><<<grid, threads, smem_size>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
stereobm_gpu::stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
}
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
};
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
};
typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, const
cudaStream_t & stream);
typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp,
cudaStream_t & stream);
const static kernel_caller_t callers[] =
{
0,
kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
//0,0,0, 0,0,0, 0,0,kernel_caller<9>
};
const int calles_num = sizeof(callers)/sizeof(callers[0]);
extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, const cudaStream_t
& stream)
{
int winsz2 = winsz >> 1;
const static kernel_caller_t callers[] =
{
0,
kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
//0,0,0, 0,0,0, 0,0,kernel_caller<9>
};
const int calles_num = sizeof(callers)/sizeof(callers[0]);
extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t
& stream)
{
int winsz2 = winsz >> 1;
if (winsz2 == 0 || winsz2 >= calles_num)
cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
if (winsz2 == 0 || winsz2 >= calles_num)
cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
cudaSafeCall( cudaMemset2D(disp.ptr
, disp.step, 0, disp.cols, disp.rows) );
cudaSafeCall( cudaMemset2D(minSSD_buf.ptr
, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
cudaSafeCall( cudaMemset2D(disp.data
, disp.step, 0, disp.cols, disp.rows) );
cudaSafeCall( cudaMemset2D(minSSD_buf.data
, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::
cwidth, &left.cols, sizeof(left.cols) ) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::
cheight, &left.rows, sizeof(left.rows) ) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cminSSDImage, &minSSD_buf.ptr, sizeof(minSSD_buf.ptr
) ) );
cudaSafeCall( cudaMemcpyToSymbol(
cwidth, &left.cols, sizeof(left.cols) ) );
cudaSafeCall( cudaMemcpyToSymbol(
cheight, &left.rows, sizeof(left.rows) ) );
cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data
) ) );
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::
cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
cudaSafeCall( cudaMemcpyToSymbol(
cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
callers[winsz2](left, right, disp, maxdisp, stream);
}
}}}
callers[winsz2](left, right, disp, maxdisp, stream);
}
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
namespace stereobm_gpu
{
texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
extern "C" __global__ void prefilter_kernel(
unsigned char *output, size_t step, int width, int heigh
t, int prefilterCap)
extern "C" __global__ void prefilter_kernel(
DevMem2D outpu
t, int prefilterCap)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x <
width && y < height
)
if (x <
output.cols && y < output.rows
)
{
int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
(int)tex2D(texForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel, x + 1, y ) * (2) +
...
...
@@ -402,48 +386,35 @@ extern "C" __global__ void prefilter_kernel(unsigned char *output, size_t step,
conv = min(min(max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
output
[y * step +
x] = conv & 0xFF;
output
.ptr(y)[
x] = conv & 0xFF;
}
}
}
namespace cv { namespace gpu { namespace bm
extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap, cudaStream_t & stream)
{
extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap, const cudaStream_t & stream)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForSobel, input.ptr, desc, input.cols, input.rows, input.step ) );
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, threads.y);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, threads.y);
if (stream == 0)
{
stereobm_gpu::prefilter_kernel<<<grid, threads>>>(output.ptr, output.step, output.cols, output.rows, prefilterCap);
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
stereobm_gpu::prefilter_kernel<<<grid, threads, 0, stream>>>(output.ptr, output.step, output.cols, output.rows, prefilterCap);
}
prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
cudaSafeCall( cudaUnbindTexture (stereobm_gpu::texForSobel ) );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
cudaSafeCall( cudaUnbindTexture (texForSobel ) );
}
}}}
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
namespace stereobm_gpu
{
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
__device__ float sobel(int x, int y)
...
...
@@ -478,7 +449,7 @@ __device__ float CalcSums(float *cols, float *cols_cache, int winsz)
#define RpT (2 * ROWSperTHREAD) // got experimentally
extern "C" __global__ void textureness_kernel(
unsigned char *disp, size_t disp_step, int winsz, float threshold, int width, int height
)
extern "C" __global__ void textureness_kernel(
DevMem2D disp, int winsz, float threshold
)
{
int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2;
...
...
@@ -489,9 +460,9 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s
int x = blockIdx.x * blockDim.x + threadIdx.x;
int beg_row = blockIdx.y * RpT;
int end_row = min(beg_row + RpT,
height
);
int end_row = min(beg_row + RpT,
disp.rows
);
if (x <
width
)
if (x <
disp.cols
)
{
int y = beg_row;
...
...
@@ -512,7 +483,7 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold)
disp
[y * disp_
step + x] = 0;
disp
.data[y * disp.
step + x] = 0;
__syncthreads();
...
...
@@ -530,45 +501,37 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s
__syncthreads();
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold)
disp
[y * disp_
step + x] = 0;
disp
.data[y * disp.
step + x] = 0;
__syncthreads();
}
}
}
}
namespace cv { namespace gpu { namespace bm
extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp, cudaStream_t & stream)
{
extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp, const cudaStream_t & stream)
{
avgTexturenessThreshold *= winsz * winsz;
avgTexturenessThreshold *= winsz * winsz;
stereobm_gpu::
texForTF.filterMode = cudaFilterModeLinear;
stereobm_gpu::
texForTF.addressMode[0] = cudaAddressModeWrap;
stereobm_gpu::
texForTF.addressMode[1] = cudaAddressModeWrap;
texForTF.filterMode = cudaFilterModeLinear;
texForTF.addressMode[0] = cudaAddressModeWrap;
texForTF.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForTF, input.ptr
, desc, input.cols, input.rows, input.step ) );
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data
, desc, input.cols, input.rows, input.step ) );
dim3 threads(128, 1, 1);
dim3 grid(1, 1, 1);
dim3 threads(128, 1, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, RpT);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, RpT);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
if (stream == 0)
{
stereobm_gpu::textureness_kernel<<<grid, threads, smem_size>>>(disp.ptr, disp.step, winsz, avgTexturenessThreshold, disp.cols, disp.rows);
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
stereobm_gpu::textureness_kernel<<<grid, threads, smem_size, stream>>>(disp.ptr, disp.step, winsz, avgTexturenessThreshold, disp.cols, disp.rows);
}
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForTF) );
}
cudaSafeCall( cudaUnbindTexture (stereobm_gpu::texForTF) );
}
}}}
modules/gpu/src/cuda/transform.hpp
View file @
d661b8e3
...
...
@@ -44,36 +44,32 @@
#define __OPENCV_GPU_TRANSFORM_HPP__
#include "cuda_shared.hpp"
#include "saturate_cast.hpp"
#include "vecmath.hpp"
namespace
cv
{
namespace
gpu
{
namespace
algo_krnls
namespace
cv
{
namespace
gpu
{
namespace
device
{
template
<
typename
T
,
typename
D
,
typename
UnOp
>
static
__global__
void
transform
(
const
T
*
src
,
size_t
src_step
,
D
*
dst
,
size_t
dst_step
,
int
width
,
int
height
,
UnOp
op
)
static
__global__
void
transform
(
const
DevMem2D_
<
T
>
src
,
PtrStep_
<
D
>
dst
,
UnOp
op
)
{
const
int
x
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
const
int
y
=
blockDim
.
y
*
blockIdx
.
y
+
threadIdx
.
y
;
if
(
x
<
width
&&
y
<
height
)
if
(
x
<
src
.
cols
&&
y
<
src
.
rows
)
{
T
src_data
=
src
[
y
*
src_step
+
x
];
dst
[
y
*
dst_step
+
x
]
=
op
(
src_data
,
x
,
y
);
T
src_data
=
src
.
ptr
(
y
)[
x
];
dst
.
ptr
(
y
)[
x
]
=
op
(
src_data
,
x
,
y
);
}
}
template
<
typename
T1
,
typename
T2
,
typename
D
,
typename
BinOp
>
static
__global__
void
transform
(
const
T1
*
src1
,
size_t
src1_step
,
const
T2
*
src2
,
size_t
src2_step
,
D
*
dst
,
size_t
dst_step
,
int
width
,
int
height
,
BinOp
op
)
static
__global__
void
transform
(
const
DevMem2D_
<
T1
>
src1
,
const
PtrStep_
<
T2
>
src2
,
PtrStep_
<
D
>
dst
,
BinOp
op
)
{
const
int
x
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
const
int
y
=
blockDim
.
y
*
blockIdx
.
y
+
threadIdx
.
y
;
if
(
x
<
width
&&
y
<
height
)
if
(
x
<
src1
.
cols
&&
y
<
src1
.
rows
)
{
T1
src1_data
=
src1
[
y
*
src1_step
+
x
];
T2
src2_data
=
src2
[
y
*
src2_step
+
x
];
dst
[
y
*
dst_step
+
x
]
=
op
(
src1_data
,
src2_data
,
x
,
y
);
T1
src1_data
=
src1
.
ptr
(
y
)[
x
];
T2
src2_data
=
src2
.
ptr
(
y
)[
x
];
dst
.
ptr
(
y
)[
x
]
=
op
(
src1_data
,
src2_data
,
x
,
y
);
}
}
}}}
...
...
@@ -83,7 +79,7 @@ namespace cv
namespace
gpu
{
template
<
typename
T
,
typename
D
,
typename
UnOp
>
static
void
transform
(
const
DevMem2D_
<
T
>&
src
,
const
DevMem2D_
<
D
>&
dst
,
UnOp
op
,
cudaStream_t
stream
)
static
void
transform
2
(
const
DevMem2D_
<
T
>&
src
,
const
DevMem2D_
<
D
>&
dst
,
UnOp
op
,
cudaStream_t
stream
)
{
dim3
threads
(
16
,
16
,
1
);
dim3
grid
(
1
,
1
,
1
);
...
...
@@ -91,8 +87,7 @@ namespace cv
grid
.
x
=
divUp
(
src
.
cols
,
threads
.
x
);
grid
.
y
=
divUp
(
src
.
rows
,
threads
.
y
);
algo_krnls
::
transform
<<<
grid
,
threads
,
0
,
stream
>>>
(
src
.
ptr
,
src
.
elem_step
,
dst
.
ptr
,
dst
.
elem_step
,
src
.
cols
,
src
.
rows
,
op
);
device
::
transform
<
T
,
D
,
UnOp
><<<
grid
,
threads
,
0
,
stream
>>>
(
src
,
dst
,
op
);
if
(
stream
==
0
)
cudaSafeCall
(
cudaThreadSynchronize
()
);
...
...
@@ -106,11 +101,10 @@ namespace cv
grid
.
x
=
divUp
(
src1
.
cols
,
threads
.
x
);
grid
.
y
=
divUp
(
src1
.
rows
,
threads
.
y
);
algo_krnls
::
transform
<<<
grid
,
threads
,
0
,
stream
>>>
(
src1
.
ptr
,
src1
.
elem_step
,
src2
.
ptr
,
src2
.
elem_step
,
dst
.
ptr
,
dst
.
elem_step
,
src1
.
cols
,
src1
.
rows
,
op
);
device
::
transform
<
T1
,
T2
,
D
,
BinOp
><<<
grid
,
threads
,
0
,
stream
>>>
(
src1
,
src2
,
dst
,
op
);
if
(
stream
==
0
)
cudaSafeCall
(
cudaThreadSynchronize
()
);
cudaSafeCall
(
cudaThreadSynchronize
()
);
}
}
}
...
...
modules/gpu/src/cuda/vecmath.hpp
View file @
d661b8e3
...
...
@@ -384,7 +384,14 @@ namespace cv
template
<
typename
VecD
,
typename
VecS
>
static
__device__
VecD
saturate_cast_caller
(
const
VecS
&
v
)
{
SatCast
<
VecTraits
<
VecD
>::
cn
,
VecD
>
cast
;
SatCast
<
VecTraits
<
VecD
>::
cn
,
VecD
>
cast
;
return
cast
(
v
);
}
...
...
modules/gpu/src/filtering.cpp
View file @
d661b8e3
...
...
@@ -577,7 +577,10 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
namespace
cv
{
namespace
gpu
{
namespace
filters
{
void
linearRowFilter_gpu_8u_8u_c4
(
const
DevMem2D
&
src
,
const
DevMem2D
&
dst
,
const
float
kernel
[],
int
ksize
,
int
anchor
);
template
<
int
CN
,
typename
T
,
typename
D
>
void
linearRowFilter_gpu
(
const
DevMem2D
&
src
,
const
DevMem2D
&
dst
,
const
float
kernel
[],
int
ksize
,
int
anchor
);
//void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void
linearRowFilter_gpu_8u_8s_c4
(
const
DevMem2D
&
src
,
const
DevMem2D
&
dst
,
const
float
kernel
[],
int
ksize
,
int
anchor
);
void
linearRowFilter_gpu_8s_8u_c4
(
const
DevMem2D
&
src
,
const
DevMem2D
&
dst
,
const
float
kernel
[],
int
ksize
,
int
anchor
);
void
linearRowFilter_gpu_8s_8s_c4
(
const
DevMem2D
&
src
,
const
DevMem2D
&
dst
,
const
float
kernel
[],
int
ksize
,
int
anchor
);
...
...
@@ -653,7 +656,8 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
static
const
nppFilter1D_t
nppFilter1D_callers
[]
=
{
0
,
nppiFilterRow_8u_C1R
,
0
,
0
,
nppiFilterRow_8u_C4R
};
static
const
gpuFilter1D_t
gpuFilter1D_callers
[
6
][
6
]
=
{
{
linearRowFilter_gpu_8u_8u_c4
,
linearRowFilter_gpu_8u_8s_c4
,
0
,
0
,
0
,
0
},
{
linearRowFilter_gpu
<
4
,
uchar4
,
uchar4
>
/*linearRowFilter_gpu_8u_8u_c4*/
,
linearRowFilter_gpu_8u_8s_c4
,
0
,
0
,
0
,
0
},
{
linearRowFilter_gpu_8s_8u_c4
,
linearRowFilter_gpu_8s_8s_c4
,
0
,
0
,
0
,
0
},
{
0
,
0
,
linearRowFilter_gpu_16u_16u_c2
,
linearRowFilter_gpu_16u_16s_c2
,
0
,
0
},
{
0
,
0
,
linearRowFilter_gpu_16s_16u_c2
,
linearRowFilter_gpu_16s_16s_c2
,
0
,
0
},
...
...
modules/gpu/src/stereobm_gpu.cpp
View file @
d661b8e3
...
...
@@ -61,9 +61,9 @@ namespace cv { namespace gpu
namespace
bm
{
//extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf);
extern
"C"
void
stereoBM_GPU
(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
ndisp
,
int
winsz
,
const
DevMem2D_
<
uint
>&
minSSD_buf
,
c
onst
c
udaStream_t
&
stream
);
extern
"C"
void
prefilter_xsobel
(
const
DevMem2D
&
input
,
const
DevMem2D
&
output
,
int
prefilterCap
/*= 31*/
,
const
cudaStream_t
&
stream
);
extern
"C"
void
postfilter_textureness
(
const
DevMem2D
&
input
,
int
winsz
,
float
avgTexturenessThreshold
,
const
DevMem2D
&
disp
,
c
onst
c
udaStream_t
&
stream
);
extern
"C"
void
stereoBM_GPU
(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
ndisp
,
int
winsz
,
const
DevMem2D_
<
uint
>&
minSSD_buf
,
cudaStream_t
&
stream
);
extern
"C"
void
prefilter_xsobel
(
const
DevMem2D
&
input
,
const
DevMem2D
output
,
int
prefilterCap
/*= 31*/
,
cudaStream_t
&
stream
);
extern
"C"
void
postfilter_textureness
(
const
DevMem2D
&
input
,
int
winsz
,
float
avgTexturenessThreshold
,
const
DevMem2D
&
disp
,
cudaStream_t
&
stream
);
}
}}
...
...
@@ -98,7 +98,7 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
return
false
;
}
static
void
stereo_bm_gpu_operator
(
GpuMat
&
minSSD
,
GpuMat
&
leBuf
,
GpuMat
&
riBuf
,
int
preset
,
int
ndisp
,
int
winSize
,
float
avergeTexThreshold
,
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
c
onst
cudaStream_t
&
stream
)
static
void
stereo_bm_gpu_operator
(
GpuMat
&
minSSD
,
GpuMat
&
leBuf
,
GpuMat
&
riBuf
,
int
preset
,
int
ndisp
,
int
winSize
,
float
avergeTexThreshold
,
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
c
udaStream_t
stream
)
{
CV_DbgAssert
(
left
.
rows
==
right
.
rows
&&
left
.
cols
==
right
.
cols
);
CV_DbgAssert
(
left
.
type
()
==
CV_8UC1
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment