Commit e7f6c4b7 authored Jun 20, 2012 by Marina Kolpakova
scan operations are moved in separate header
parent 8748cbc2

Showing 28 changed files with 230 additions and 487 deletions
cmake/OpenCVDetectCUDA.cmake                           +0    -0
modules/gpu/include/opencv2/gpu/gpu.hpp                +0    -4
modules/gpu/perf/perf_imgproc.cpp                      +3    -3
modules/gpu/src/cuda/element_operations.cu             +4    -4
modules/gpu/src/cuda/matrix_reductions.cu              +1    -1
modules/gpu/src/cuda/resize.cu                         +2    -363
modules/gpu/src/cuda/split_merge.cu                    +18   -18
modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu           +3    -3
modules/gpu/src/nvidia/core/NCV.cu                     +8    -8
modules/gpu/src/opencv2/gpu/device/common.hpp          +0    -0
modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp   +0    -0
modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp    +0    -0
modules/gpu/src/opencv2/gpu/device/emulation.hpp       +0    -0
modules/gpu/src/opencv2/gpu/device/funcattrib.hpp      +0    -0
modules/gpu/src/opencv2/gpu/device/functional.hpp      +24   -2
modules/gpu/src/opencv2/gpu/device/limits.hpp          +0    -0
modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp   +0    -0
modules/gpu/src/opencv2/gpu/device/scan.hpp            +167  -0
modules/gpu/src/opencv2/gpu/device/static_check.hpp    +0    -0
modules/gpu/src/opencv2/gpu/device/transform.hpp       +0    -0
modules/gpu/src/opencv2/gpu/device/type_traits.hpp     +0    -0
modules/gpu/src/opencv2/gpu/device/utility.hpp         +0    -0
modules/gpu/src/opencv2/gpu/device/vec_distance.hpp    +0    -0
modules/gpu/src/opencv2/gpu/device/vec_math.hpp        +0    -0
modules/gpu/src/opencv2/gpu/device/vec_traits.hpp      +0    -0
modules/gpu/src/opencv2/gpu/device/warp.hpp            +0    -0
modules/gpu/src/resize.cpp                             +0    -42
modules/gpu/test/test_resize.cpp                       +0    -39
cmake/OpenCVDetectCUDA.cmake
modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -629,10 +629,6 @@ CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, doubl
     //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA
     CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());

-    //! resizes the image
-    //! Supports INTER_AREA
-    CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize, double fx=0, double fy=0, int interpolation = INTER_AREA, Stream& stream = Stream::Null());
-
     //! warps the image using affine transformation
     //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
     CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR,
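After this hunk only the buffer-less overload remains in the public header. For orientation, a minimal call against it might look like the sketch below (our illustration against the 2.4-era cv::gpu API; file names and scale factors are invented, and a CUDA-enabled OpenCV build is assumed):

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat host = cv::imread("input.png");            // illustrative input image
        cv::gpu::GpuMat src(host), dst;                    // upload to the device

        // 4x INTER_AREA downscale through the surviving overload
        cv::gpu::resize(src, dst, cv::Size(), 0.25, 0.25, cv::INTER_AREA);

        cv::Mat out;
        dst.download(out);                                 // copy the result back to the host
        cv::imwrite("output.png", out);
        return 0;
    }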
modules/gpu/perf/perf_imgproc.cpp
@@ -118,10 +118,10 @@ GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
 INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
     ALL_DEVICES,
-    testing::Values(perf::sz1080p, cv::Size(4096, 2048)),
-    testing::Values(MatType(CV_8UC1)/*, MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
+    testing::Values(perf::sz1080p/*, cv::Size(4096, 2048)*/),
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
+    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
     testing::Values(Scale(0.2), Scale(0.1), Scale(0.05))));

 //////////////////////////////////////////////////////////////////////
modules/gpu/src/cuda/element_operations.cu
@@ -1253,7 +1253,7 @@ namespace cv { namespace gpu { namespace device
         {
             const T val;
-            __host__ explicit CompareScalar(T val) : val(val) {}
+            __host__ explicit CompareScalar(T val_) : val(val_) {}
             __device__ __forceinline__ uchar operator()(T src) const
             {
@@ -1266,7 +1266,7 @@ namespace cv { namespace gpu { namespace device
         {
             const TYPE_VEC(T, 2) val;
-            __host__ explicit CompareScalar(TYPE_VEC(T, 2) val) : val(val) {}
+            __host__ explicit CompareScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
             __device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
             {
@@ -1281,7 +1281,7 @@ namespace cv { namespace gpu { namespace device
         {
             const TYPE_VEC(T, 3) val;
-            __host__ explicit CompareScalar(TYPE_VEC(T, 3) val) : val(val) {}
+            __host__ explicit CompareScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
             __device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
             {
@@ -1297,7 +1297,7 @@ namespace cv { namespace gpu { namespace device
         {
             const TYPE_VEC(T, 4) val;
-            __host__ explicit CompareScalar(TYPE_VEC(T, 4) val) : val(val) {}
+            __host__ explicit CompareScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
             __device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
             {
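These four hunks are the same mechanical rename, and the old code was already well-formed C++: in a constructor's mem-initializer list, the initializer expression in `val(val)` is looked up in the parameter scope, so the member was correctly initialized even while shadowed. A minimal illustration (hypothetical type, not from the diff):

    struct Wrapped
    {
        int val;
        explicit Wrapped(int val) : val(val) {}   // legal: member val initialized from parameter val
    };

The `val_` spelling simply keeps parameter and member lexically distinct and silences shadowing warnings such as gcc's -Wshadow.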
modules/gpu/src/cuda/matrix_reductions.cu
@@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device
     struct Mask8U
     {
-        explicit Mask8U(PtrStepb mask): mask(mask) {}
+        explicit Mask8U(PtrStepb mask_): mask(mask_) {}
         __device__ __forceinline__ bool operator()(int y, int x) const
         {
modules/gpu/src/cuda/resize.cu
@@ -46,7 +46,8 @@
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/filters.hpp"
-# include <cfloat>
+#include <cfloat>
+#include <opencv2/gpu/device/scan.hpp>

 namespace cv { namespace gpu { namespace device
 {
@@ -285,367 +286,5 @@ namespace cv { namespace gpu { namespace device
typedef float scan_line_type;
};
// template <typename T>
// __global__ void resize_area_scan(const DevMem2D_<T> src, DevMem2D_<T> dst, int fx, int fy, DevMem2D_<T> buffer)
// {
// typedef typename scan_traits<T>::scan_line_type W;
// extern __shared__ W line[];
// const int x = threadIdx.x;
// const int y = blockIdx.x;
// if (y >= src.rows) return;
// int offset = 1;
// line[2 * x + 0] = src(y, 2 * x + 0);
// line[2 * x + 1] = src(y, 2 * x + 1);
// __syncthreads();//???
// // reduction
// for (int d = blockDim.x; d > 0; d >>= 1)
// {
// __syncthreads();
// if (x < d)
// {
// int ai = 2 * x * offset -1 + 1 * offset;
// int bi = 2 * x * offset -1 + 2 * offset;
// line[bi] += line[ai];
// }
// offset *= 2;
// }
// __syncthreads();
// // convolution
// if (x == 0) { line[(blockDim.x << 1) - 1] = 0; printf("offset: %d!!!!!!!!!!!!!\n", fx);}
// for (int d = 1; d < (blockDim.x << 1); d *= 2)
// {
// offset >>= 1;
// __syncthreads();
// if (x < d)
// {
// int ai = offset * 2 * x + 1 * offset - 1;
// int bi = offset * 2 * x + 2 * offset - 1;
// W t = line[ai];
// line[ai] = line[bi];
// line[bi] += t;
// }
// }
// __syncthreads();
// // calculate sum
// int start = 0;
// int out_idx = 0;
// int end = start + fx;
// while (start < (blockDim.x << 1) && end < (blockDim.x << 1))
// {
// buffer(y, out_idx) = saturate_cast<T>((line[end] - line[start]) / fx);
// start = end;
// end = start + fx;
// out_idx++;
// }
// }
template <typename T>
__device__ void scan_y(DevMem2D_<typename scan_traits<T>::scan_line_type> buffer,int fx, int fy, DevMem2D_<T> dst,
typename scan_traits<T>::scan_line_type* line, int g_base)
{
typedef typename scan_traits<T>::scan_line_type W;
const int y = threadIdx.x;
const int x = blockIdx.x;
float scale = 1.f / (fx * fy);
if (x >= buffer.cols) return;
int offset = 1;
line[2 * y + 0] = buffer((g_base * fy) + 2 * y + 1, x);
if (y != (blockDim.x -1) )
line[2 * y + 1] = buffer((g_base * fy) + 2 * y + 2, x);
else
line[2 * y + 1] = 0;
__syncthreads();
// reduction
for (int d = blockDim.x; d > 0; d >>= 1)
{
__syncthreads();
if (y < d)
{
int ai = 2 * y * offset -1 + 1 * offset;
int bi = 2 * y * offset -1 + 2 * offset;
line[bi] += line[ai];
}
offset *= 2;
}
__syncthreads();
// convolution
if (y == 0) line[(blockDim.x << 1) - 1] = (W)buffer(0, x);
for (int d = 1; d < (blockDim.x << 1); d *= 2)
{
offset >>= 1;
__syncthreads();
if (y < d)
{
int ai = offset * 2 * y + 1 * offset - 1;
int bi = offset * 2 * y + 2 * offset - 1;
W t = line[ai];
line[ai] = line[bi];
line[bi] += t;
}
}
__syncthreads();
if (y < dst.rows)
{
W start = (y == 0)? (W)0:line[y * fy -1];
W end = line[y * fy + fy - 1];
dst(g_base + y ,x) = saturate_cast<T>((end - start) * scale);
}
}
template <typename T>
__device__ void scan_x(const DevMem2D_<T> src, int fx, int fy, DevMem2D_<typename scan_traits<T>::scan_line_type> buffer,
typename scan_traits<T>::scan_line_type* line, int g_base)
{
typedef typename scan_traits<T>::scan_line_type W;
const int x = threadIdx.x;
const int y = blockIdx.x;
float scale = 1.f / (fx * fy);
if (y >= src.rows) return;
int offset = 1;
line[2 * x + 0] = (W)src(y, (g_base * fx) + 2 * x + 1);
if (x != (blockDim.x -1) )
line[2 * x + 1] = (W)src(y, (g_base * fx) + 2 * x + 2);
else
line[2 * x + 1] = 0;
__syncthreads();
// reduction
for (int d = blockDim.x; d > 0; d >>= 1)
{
__syncthreads();
if (x < d)
{
int ai = 2 * x * offset -1 + 1 * offset;
int bi = 2 * x * offset -1 + 2 * offset;
line[bi] += line[ai];
}
offset *= 2;
}
__syncthreads();
// convolution
if (x == 0) line[(blockDim.x << 1) - 1] = (W)src(y, 0);
for (int d = 1; d < (blockDim.x << 1); d *= 2)
{
offset >>= 1;
__syncthreads();
if (x < d)
{
int ai = offset * 2 * x + 1 * offset - 1;
int bi = offset * 2 * x + 2 * offset - 1;
W t = line[ai];
line[ai] = line[bi];
line[bi] += t;
}
}
__syncthreads();
if (x < buffer.cols)
{
W start = (x == 0)? (W)0:line[x * fx -1];
W end = line[x * fx + fx - 1];
buffer(y, g_base + x) =(end - start);
}
}
enum ScanKind { exclusive, inclusive } ;
template <ScanKind Kind , class T>
__device__ __forceinline__ T scan_warp ( volatile T *ptr , const unsigned int idx = threadIdx.x )
{
const unsigned int lane = idx & 31;
if ( lane >= 1) ptr [idx ] = ptr [idx - 1] + ptr [idx];
if ( lane >= 2) ptr [idx ] = ptr [idx - 2] + ptr [idx];
if ( lane >= 4) ptr [idx ] = ptr [idx - 4] + ptr [idx];
if ( lane >= 8) ptr [idx ] = ptr [idx - 8] + ptr [idx];
if ( lane >= 16) ptr [idx ] = ptr [idx - 16] + ptr [idx];
if( Kind == inclusive )
return ptr [idx ];
else
return (lane > 0) ? ptr [idx - 1] : 0;
}
template <ScanKind Kind , class T>
__device__ __forceinline__ T scan_block( volatile T *ptr)
{
const unsigned int idx = threadIdx.x;
const unsigned int lane = idx & 31;
const unsigned int warp = idx >> 5;
T val = scan_warp <Kind>( ptr , idx );
__syncthreads ();
if( lane == 31 )
ptr [ warp ] = ptr [idx ];
__syncthreads ();
if( warp == 0 )
scan_warp<inclusive>( ptr , idx );
__syncthreads ();
if ( warp > 0)
val = ptr [warp -1] + val;
__syncthreads ();
ptr[idx] = val;
__syncthreads ();
return val ;
}
template<typename T, typename W>
__global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines, int stride)
{
extern __shared__ W sbuf[];
const unsigned int tid = threadIdx.x;
// load line-block on shared memory
int y = blockIdx.x / thred_lines;
int input_stride = (blockIdx.x % thred_lines) * stride;
int x = input_stride + tid;
// store global data in shared memory
if (x < src.cols && y < src.rows)
sbuf[tid] = src(y, x);
else
sbuf[tid] = 0;
__syncthreads();
scan_block<inclusive, W>(sbuf);
float scale = __fdividef(1.f, fx);
int out_stride = input_stride / fx;
int count = blockDim.x / fx;
if (tid < count)
{
int start_idx = (tid == 0)? 0 : tid * fx - 1;
int end_idx = tid * fx + fx - 1;
W start = (tid == 0)? (W)0:sbuf[start_idx];
W end = sbuf[end_idx];
dst(y, out_stride + tid) = (end - start);
}
}
template<typename T, typename W>
__global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines, int stride)
{
extern __shared__ W sbuf[];
const unsigned int tid = threadIdx.x;
// load line-block on shared memory
int x = blockIdx.x / thred_lines;
int global_stride = (blockIdx.x % thred_lines) * stride;
int y = global_stride + tid;
// store global data in shared memory
if (x < src.cols && y < src.rows)
sbuf[tid] = src(y, x);
else
sbuf[tid] = 0;
__syncthreads();
scan_block<inclusive, W>(sbuf);
float scale = __fdividef(1.f, fx * fy);
int out_stride = global_stride / fx;
int count = blockDim.x / fx;
if (tid < count)
{
int start_idx = (tid == 0)? 0 : tid * fx - 1;
int end_idx = tid * fx + fx - 1;
W start = (tid == 0)? (W)0:sbuf[start_idx];
W end = sbuf[end_idx];
dst(out_stride + tid, x) = saturate_cast<T>((end - start) * scale);
}
}
template <typename T>
void resize_area_gpu(const DevMem2Db src, DevMem2Db dst,float fx, float fy,
int interpolation, DevMem2Df buffer, cudaStream_t stream)
{
(void)interpolation;
int iscale_x = round(fx);
int iscale_y = round(fy);
int warps = 4;
const int threads = 32 * warps;
int input_stride = threads / iscale_x;
int thred_lines = divUp(src.cols, input_stride * iscale_x);
int blocks = src.rows * thred_lines;
typedef typename scan_traits<T>::scan_line_type smem_type;
resise_scan_fast_x<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
(src, buffer, iscale_x, iscale_y, thred_lines, input_stride * iscale_x);
input_stride = threads / iscale_y;
thred_lines = divUp(src.rows, input_stride * iscale_y);
blocks = dst.cols * thred_lines;
resise_scan_fast_y<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
(buffer, dst, iscale_x, iscale_y, thred_lines, input_stride * iscale_y);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void resize_area_gpu<uchar>(DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Df buffer, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
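All the machinery deleted above (scan_x/scan_y, the scan_warp/scan_block helpers, and the resise_scan_fast_* kernels) exploits one identity: given an inclusive prefix sum S of a line, the sum over any window [a, b] is S[b] - S[a-1], so an integer-factor INTER_AREA reduction costs two reads per output pixel. A host-side sketch of that identity (plain C++, data invented for the demo):

    #include <vector>
    #include <cassert>

    int main()
    {
        const int fx = 4;                             // integer downscale factor
        std::vector<float> row(16, 1.0f);             // one image row, constant for the demo

        // inclusive prefix sum of the row
        std::vector<float> S(row.size());
        float acc = 0.f;
        for (size_t i = 0; i < row.size(); ++i)
            S[i] = (acc += row[i]);

        // each output sample is a window sum divided by the window length
        std::vector<float> out(row.size() / fx);
        for (size_t o = 0; o < out.size(); ++o)
        {
            float start = (o == 0) ? 0.f : S[o * fx - 1];
            float end   = S[o * fx + fx - 1];
            out[o] = (end - start) / fx;              // box average over fx input pixels
        }

        assert(out[0] == 1.0f);                       // averaging a constant row is a no-op
        return 0;
    }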
modules/gpu/src/cuda/split_merge.cu
@@ -228,9 +228,9 @@ namespace cv { namespace gpu { namespace device
     template <typename T>
     static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
     {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-        mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
+        dim3 block(32, 8);
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        mergeC2_<T><<<grid, block, 0, stream>>>(
             src[0].data, src[0].step,
             src[1].data, src[1].step,
             dst.rows, dst.cols, dst.data, dst.step);
@@ -244,9 +244,9 @@ namespace cv { namespace gpu { namespace device
     template <typename T>
     static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
     {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-        mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
+        dim3 block(32, 8);
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        mergeC3_<T><<<grid, block, 0, stream>>>(
             src[0].data, src[0].step,
             src[1].data, src[1].step,
             src[2].data, src[2].step,
@@ -261,9 +261,9 @@ namespace cv { namespace gpu { namespace device
     template <typename T>
     static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
     {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-        mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
+        dim3 block(32, 8);
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        mergeC4_<T><<<grid, block, 0, stream>>>(
             src[0].data, src[0].step,
             src[1].data, src[1].step,
             src[2].data, src[2].step,
@@ -437,9 +437,9 @@ namespace cv { namespace gpu { namespace device
     template <typename T>
     static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
     {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-        splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
+        dim3 block(32, 8);
+        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+        splitC2_<T><<<grid, block, 0, stream>>>(
             src.data, src.step, src.rows, src.cols,
             dst[0].data, dst[0].step,
             dst[1].data, dst[1].step);
@@ -453,9 +453,9 @@ namespace cv { namespace gpu { namespace device
     template <typename T>
     static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
     {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-        splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
+        dim3 block(32, 8);
+        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+        splitC3_<T><<<grid, block, 0, stream>>>(
             src.data, src.step, src.rows, src.cols,
             dst[0].data, dst[0].step,
             dst[1].data, dst[1].step,
@@ -470,9 +470,9 @@ namespace cv { namespace gpu { namespace device
     template <typename T>
     static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
     {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-        splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
+        dim3 block(32, 8);
+        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+        splitC4_<T><<<grid, block, 0, stream>>>(
             src.data, src.step, src.rows, src.cols,
             dst[0].data, dst[0].step,
             dst[1].data, dst[1].step,
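The only substantive change in these six launchers is the rename of the local blockDim/gridDim variables to block/grid, which avoids reusing the names of CUDA's built-in device variables blockDim and gridDim. The launch geometry itself is the usual ceiling division; a standalone sketch of the pattern (divUp mirrors the helper used in the hunks above):

    #include <cuda_runtime.h>

    // ceiling division: how many tiles of size grain cover total elements
    static int divUp(int total, int grain) { return (total + grain - 1) / grain; }

    void launchGeometry(int rows, int cols, cudaStream_t stream)
    {
        dim3 block(32, 8);                                      // 256 threads per block
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y));  // enough blocks to cover the image
        // kernel<<<grid, block, 0, stream>>>(...);             // launch site; kernel omitted here
        (void)stream;
    }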
modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu
@@ -1121,18 +1121,18 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
 dim3 p_threads(32, 8);

-NcvSize32u srcSize (kLevelWidth, kLevelHeight);
+NcvSize32u inner_srcSize (kLevelWidth, kLevelHeight);
 NcvSize32u dstSize (nw, nh);
 NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
 NcvRect32u dstROI (0, 0, nw, nh);

-ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
+ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
     ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );

 ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

-ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
+ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
     ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );

 ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
modules/gpu/src/nvidia/core/NCV.cu
@@ -252,7 +252,7 @@ NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
 //===================================================================

-NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
+NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment_)
     :
     currentSize(0),
     _maxSize(0),
@@ -260,23 +260,23 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
     begin(NULL),
     end(NULL),
     _memType(NCVMemoryTypeNone),
-    _alignment(alignment),
+    _alignment(alignment_),
     bReusesMemory(false)
 {
-    NcvBool bProperAlignment = (alignment & (alignment - 1)) == 0;
+    NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
     ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
 }

-NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)
+NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment_, void *reusePtr)
     :
     currentSize(0),
     _maxSize(0),
     allocBegin(NULL),
     _memType(memT),
-    _alignment(alignment)
+    _alignment(alignment_)
 {
-    NcvBool bProperAlignment = (alignment & (alignment - 1)) == 0;
+    NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
     ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
     ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type");
@@ -425,12 +425,12 @@ size_t NCVMemStackAllocator::maxSize(void) const
 //===================================================================

-NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
+NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment_)
     :
     currentSize(0),
     _maxSize(0),
     _memType(memT),
-    _alignment(alignment)
+    _alignment(alignment_)
 {
     ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
 }
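The alignment check touched by these hunks uses the standard power-of-two bit trick: a power of two has exactly one set bit, so x & (x - 1) clears it and yields 0 precisely for powers of two (16 & 15 == 0, while 12 & 11 == 8). As a standalone predicate (our sketch, not NCV code):

    // true for 1, 2, 4, 8, ...; like the NCV check, it also accepts 0
    static bool isPowerOfTwo(unsigned int x) { return (x & (x - 1)) == 0; }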
modules/gpu/src/opencv2/gpu/device/common.hpp

modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp

modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp

modules/gpu/src/opencv2/gpu/device/emulation.hpp

modules/gpu/src/opencv2/gpu/device/funcattrib.hpp

modules/gpu/src/opencv2/gpu/device/functional.hpp
@@ -416,6 +416,8 @@ namespace cv { namespace gpu { namespace device
     {
         return src1 * src1 + src2 * src2;
     }
+    __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}
+    __device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}
 };

 // Saturate Cast Functor
@@ -438,6 +440,7 @@ namespace cv { namespace gpu { namespace device
     {
         return (src > thresh) * maxVal;
     }
+    __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other) : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
@@ -455,6 +458,7 @@ namespace cv { namespace gpu { namespace device
     {
         return (src <= thresh) * maxVal;
     }
+    __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other) : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
@@ -523,8 +527,12 @@ namespace cv { namespace gpu { namespace device
     {
         return !pred(x);
     }
+    __device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}
+    __device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}

     const Predicate pred;
 };

 template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
 {
     return unary_negate<Predicate>(pred);
@@ -534,13 +542,20 @@ namespace cv { namespace gpu { namespace device
 {
     explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}

     __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x,
                                                typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
     {
         return !pred(x, y);
     }
+    __device__ __forceinline__ binary_negate(const binary_negate& other) : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
+    __device__ __forceinline__ binary_negate() : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}

     const Predicate pred;
 };

 template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
 {
     return binary_negate<BinaryPredicate>(pred);
@@ -555,9 +570,13 @@ namespace cv { namespace gpu { namespace device
     {
         return op(arg1, a);
     }
+    __device__ __forceinline__ binder1st(const binder1st& other) : unary_function<typename Op::second_argument_type, typename Op::result_type>(){}

     const Op op;
     const typename Op::first_argument_type arg1;
 };

 template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
 {
     return binder1st<Op>(op, typename Op::first_argument_type(x));
@@ -572,16 +591,19 @@ namespace cv { namespace gpu { namespace device
     {
         return op(a, arg2);
     }
+    __device__ __forceinline__ binder2nd(const binder2nd& other) : unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}

     const Op op;
     const typename Op::second_argument_type arg2;
 };

 template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
 {
     return binder2nd<Op>(op, typename Op::second_argument_type(x));
 }

 // Functor Traits
 template <typename F> struct IsUnaryFunction
 {
     typedef char Yes;
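Every addition in functional.hpp follows one pattern: the functors gain default and copy constructors that are explicitly __device__-qualified, so instances can be created and copied inside kernel code rather than only on the host. A stripped-down illustration of why the qualifier matters (hypothetical functor, not from the header):

    template <typename T> struct scale_func
    {
        T s;
        __device__ __forceinline__ explicit scale_func(T s_) : s(s_) {}
        // device-visible copy constructor, mirroring what this commit adds
        __device__ __forceinline__ scale_func(const scale_func& other) : s(other.s) {}
        __device__ __forceinline__ T operator()(T x) const { return s * x; }
    };

    __global__ void apply(const float* in, float* out, int n)
    {
        scale_func<float> op(2.0f);       // constructed on the device
        scale_func<float> local(op);      // copied on the device: needs the __device__ copy ctor
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            out[i] = local(in[i]);
    }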
modules/gpu/src/opencv2/gpu/device/limits.hpp

modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp

modules/gpu/src/opencv2/gpu/device/scan.hpp (new file, 0 → 100644)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_SCAN_HPP__
#define __OPENCV_GPU_SCAN_HPP__
enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };

template <ScanKind Kind, typename T, typename F> struct WarpScan
{
    __device__ __forceinline__ WarpScan() {}
    __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }

    __device__ __forceinline__ T operator()(volatile T* ptr, const unsigned int idx)
    {
        const unsigned int lane = idx & 31;
        F op;

        if (lane >= 1)  ptr[idx] = op(ptr[idx - 1],  ptr[idx]);
        if (lane >= 2)  ptr[idx] = op(ptr[idx - 2],  ptr[idx]);
        if (lane >= 4)  ptr[idx] = op(ptr[idx - 4],  ptr[idx]);
        if (lane >= 8)  ptr[idx] = op(ptr[idx - 8],  ptr[idx]);
        if (lane >= 16) ptr[idx] = op(ptr[idx - 16], ptr[idx]);

        if (Kind == INCLUSIVE)
            return ptr[idx];
        else
            return (lane > 0) ? ptr[idx - 1] : 0;
    }

    __device__ __forceinline__ unsigned int index(const unsigned int tid)
    {
        return tid;
    }

    __device__ __forceinline__ void init(volatile T* ptr){}

    static const int warp_offset = 0;

    typedef WarpScan<INCLUSIVE, T, F> merge;
};

template <ScanKind Kind, typename T, typename F> struct WarpScanNoComp
{
    __device__ __forceinline__ WarpScanNoComp() {}
    __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }

    __device__ __forceinline__ T operator()(volatile T* ptr, const unsigned int idx)
    {
        const unsigned int lane = threadIdx.x & 31;
        F op;

        ptr[idx] = op(ptr[idx - 1],  ptr[idx]);
        ptr[idx] = op(ptr[idx - 2],  ptr[idx]);
        ptr[idx] = op(ptr[idx - 4],  ptr[idx]);
        ptr[idx] = op(ptr[idx - 8],  ptr[idx]);
        ptr[idx] = op(ptr[idx - 16], ptr[idx]);

        if (Kind == INCLUSIVE)
            return ptr[idx];
        else
            return (lane > 0) ? ptr[idx - 1] : 0;
    }

    __device__ __forceinline__ unsigned int index(const unsigned int tid)
    {
        return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
    }

    __device__ __forceinline__ void init(volatile T* ptr)
    {
        ptr[threadIdx.x] = 0;
    }

    static const int warp_smem_stride = 32 + 16 + 1;
    static const int warp_offset      = 16;
    static const int warp_log         = 5;
    static const int warp_mask        = 31;

    typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
};

template <ScanKind Kind, typename T, typename Sc, typename F> struct BlockScan
{
    __device__ __forceinline__ BlockScan() {}
    __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }

    __device__ __forceinline__ T operator()(volatile T* ptr)
    {
        const unsigned int tid  = threadIdx.x;
        const unsigned int lane = tid & warp_mask;
        const unsigned int warp = tid >> warp_log;

        Sc scan;
        typename Sc::merge merge_scan;
        const unsigned int idx = scan.index(tid);

        T val = scan(ptr, idx);
        __syncthreads();

        if (warp == 0)
            scan.init(ptr);
        __syncthreads();

        if (lane == 31)
            ptr[scan.warp_offset + warp] = (Kind == INCLUSIVE) ? val : ptr[idx];
        __syncthreads();

        if (warp == 0)
            merge_scan(ptr, idx);
        __syncthreads();

        if (warp > 0)
            val = ptr[scan.warp_offset + warp - 1] + val;
        __syncthreads();

        ptr[idx] = val;
        __syncthreads();

        return val;
    }

    static const int warp_log  = 5;
    static const int warp_mask = 31;
};
#endif
\ No newline at end of file
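A minimal kernel driving the new header might look like the sketch below. This is our illustration, not part of the commit: the Add functor, buffer size, and 256-thread launch are assumptions, and the shared buffer must hold exactly one element per thread with the block size a multiple of the 32-lane warp:

    #include "opencv2/gpu/device/scan.hpp"

    struct Add
    {
        __device__ __forceinline__ int operator()(int a, int b) const { return a + b; }
    };

    // inclusive prefix sum over one 256-element tile per block
    __global__ void scanTile(const int* in, int* out)
    {
        __shared__ int buf[256];
        const int tid = threadIdx.x;
        const int gid = blockIdx.x * blockDim.x + tid;

        buf[tid] = in[gid];
        __syncthreads();

        BlockScan<INCLUSIVE, int, WarpScan<INCLUSIVE, int, Add>, Add> scan;
        out[gid] = scan(buf);    // each thread receives its inclusive prefix
    }

    // launch: scanTile<<<numTiles, 256>>>(d_in, d_out);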
modules/gpu/src/opencv2/gpu/device/static_check.hpp

modules/gpu/src/opencv2/gpu/device/transform.hpp

modules/gpu/src/opencv2/gpu/device/type_traits.hpp

modules/gpu/src/opencv2/gpu/device/utility.hpp

modules/gpu/src/opencv2/gpu/device/vec_distance.hpp

modules/gpu/src/opencv2/gpu/device/vec_math.hpp

modules/gpu/src/opencv2/gpu/device/vec_traits.hpp

modules/gpu/src/opencv2/gpu/device/warp.hpp
modules/gpu/src/resize.cpp
@@ -80,51 +80,9 @@ namespace cv { namespace gpu { namespace device
         template <typename T> void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy,
             DevMem2Db dst, int interpolation, cudaStream_t stream);

-        template <typename T> void resize_area_gpu(const DevMem2Db src, DevMem2Db dst, float fx, float fy,
-            int interpolation, DevMem2Df buffer, cudaStream_t stream);
     }
 }}}

-void cv::gpu::resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize, double fx, double fy, int interpolation, Stream& s)
-{
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-    CV_Assert(interpolation == INTER_AREA);
-    CV_Assert((fx < 1.0) && (fy < 1.0));
-    CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
-    CV_Assert(src.cols >= 128 && src.rows >= 128);
-    CV_Assert((fx - 128.0) <= 0 && (fy - 128.0) <= 0);
-
-    if (dsize == Size())
-        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
-    else
-    {
-        fx = static_cast<double>(dsize.width) / src.cols;
-        fy = static_cast<double>(dsize.height) / src.rows;
-    }
-
-    fx = static_cast<float>(1.0 / fx);
-    fy = static_cast<float>(1.0 / fy);
-
-    dst.create(dsize, src.type());
-    buffer.create(cv::Size(dsize.width, src.rows), CV_32FC1);
-
-    if (dsize == src.size())
-    {
-        if (s)
-            s.enqueueCopy(src, dst);
-        else
-            src.copyTo(dst);
-        return;
-    }
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    cv::gpu::device::imgproc::resize_area_gpu<uchar>(src, dst, fx, fy, interpolation, buffer, stream);
-}
-
 void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
 {
     CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
modules/gpu/test/test_resize.cpp
@@ -182,45 +182,6 @@ PARAM_TEST_CASE(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, double, Inte
     }
 };

-TEST_P(ResizeArea, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    cv::gpu::GpuMat dst = createMat(cv::Size(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff)), type, useRoi);
-    cv::gpu::GpuMat buffer = createMat(cv::Size(dst.cols, src.rows), CV_32FC1);
-
-    cv::gpu::resize(loadMat(src, useRoi), dst, buffer, cv::Size(), coeff, coeff, interpolation);
-
-    cv::Mat dst_cpu;
-    cv::resize(src, dst_cpu, cv::Size(), coeff, coeff, interpolation);
-
-    cv::Mat gpu_buff;
-    buffer.download(gpu_buff);
-
-    cv::Mat gpu;
-    dst.download(gpu);
-
-//    std::cout // << src
-//              // << std::endl << std::endl
-//              // << gpu_buff
-//              // << std::endl << std::endl
-//              << gpu
-//              << std::endl << std::endl
-//              << dst_cpu << std::endl;
-
-    EXPECT_MAT_NEAR(dst_cpu, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeArea, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(cv::Size(640, 480)), //DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1)/*MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
-    testing::Values(0.05, 0.1),
-    testing::Values(Interpolation(cv::INTER_AREA)),
-    WHOLE_SUBMAT));

 ///////////////////////////////////////////////////////////////////
 // Test NPP