Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
0c325cac
Commit
0c325cac
authored
Apr 24, 2011
by
Anton Obukhov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[~] Minor refactoring, clean-up
[+] Added 128-bit transpose
parent
e2caf4a3
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
661 additions
and
720 deletions
+661
-720
NCVHaarObjectDetection.cu
modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+4
-302
NCVHaarObjectDetection.hpp
modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
+92
-138
NPP_staging.cu
modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+62
-177
NPP_staging.hpp
modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp
+70
-52
NCV.cu
modules/gpu/src/nvidia/core/NCV.cu
+337
-8
NCV.hpp
modules/gpu/src/nvidia/core/NCV.hpp
+79
-26
NCVRuntimeTemplates.hpp
modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
+2
-2
TestHypothesesFilter.cpp
modules/gpu/test/nvidia/TestHypothesesFilter.cpp
+1
-1
TestResize.cpp
modules/gpu/test/nvidia/TestResize.cpp
+14
-14
No files found.
modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
View file @
0c325cac
...
...
@@ -63,8 +63,6 @@
#include "NCVRuntimeTemplates.hpp"
#include "NCVHaarObjectDetection.hpp"
void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);
//==============================================================================
//
...
...
@@ -785,7 +783,6 @@ void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
::call( &functor,
0xC001C0DE, //this is dummy int for the va_args C compatibility
tbInitMaskPositively,
tbCacheTextureIImg,
tbCacheTextureCascade,
...
...
@@ -890,7 +887,6 @@ void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
::call( &functor,
0xC001C0DE, //this is dummy int for the va_args C compatibility
tbCacheTextureIImg,
tbCacheTextureCascade,
tbDoAtomicCompaction);
...
...
@@ -957,7 +953,6 @@ void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
::call( &functor,
0xC001C0DE, //this is dummy int for the va_args C compatibility
tbMaskByInmask,
tbDoAtomicCompaction);
}
...
...
@@ -1554,172 +1549,6 @@ NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
}
//==============================================================================
//
// Visualize file
//
//==============================================================================
// Number of threads per block used when launching the drawRects kernel; the
// kernel also advances along a rectangle edge in chunks of this many pixels.
const Ncv32u NUMTHREADS_DRAWRECTS = 32;
// log2(NUMTHREADS_DRAWRECTS); the kernel uses it as a shift count when
// computing the number of chunks per edge. Keep in sync with the value above.
const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;
/**
 * Draws the one-pixel outline of each rectangle in d_rects into d_dst.
 *
 * Work decomposition: every rectangle is served by four consecutive thread
 * blocks, one per edge. For a linear block index `blockId`:
 *   - blockId >> 2   selects the rectangle,
 *   - bit 0          selects a vertical (1) or horizontal (0) edge,
 *   - bit 1          selects the left/top (1) or right/bottom (0) edge.
 * The threads of a block walk along their edge in chunks of
 * NUMTHREADS_DRAWRECTS pixels, so the kernel expects to be launched with
 * NUMTHREADS_DRAWRECTS threads per block.
 *
 * \param d_dst      Destination image, addressed as row * dstStride + column
 * \param dstStride  Row stride of d_dst, in elements of T
 * \param dstWidth   Image width in pixels; columns >= dstWidth are never written
 * \param dstHeight  Image height in pixels; rows >= dstHeight are never written
 * \param d_rects    Array of numRects rectangles
 * \param numRects   Number of rectangles in d_rects
 * \param color      Value written to every outline pixel
 */
template <class T>
__global__ void drawRects(T *d_dst,
                          Ncv32u dstStride,
                          Ncv32u dstWidth,
                          Ncv32u dstHeight,
                          NcvRect32u *d_rects,
                          Ncv32u numRects,
                          T color)
{
    // Flatten the (possibly two-dimensional) grid into a linear block index.
    Ncv32u blockId = blockIdx.y * gridDim.x + blockIdx.x;

    // Valid indices are [0, numRects * 4). The grid may be rounded up, so the
    // surplus blocks - including the one with blockId == numRects * 4 - must
    // exit before touching d_rects.
    if (blockId >= numRects * 4)
    {
        return;
    }

    NcvRect32u curRect = d_rects[blockId >> 2];
    NcvBool bVertical = blockId & 0x1;
    NcvBool bTopLeft = blockId & 0x2;

    Ncv32u pt0x, pt0y;
    if (bVertical)
    {
        // Left or right edge: fixed column, iterate over rows.
        Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;

        pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
        pt0y = curRect.y;

        if (pt0x < dstWidth)
        {
            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
            {
                Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
                // Clip against both the rectangle extent and the image height.
                if (ptY < pt0y + curRect.height && ptY < dstHeight)
                {
                    d_dst[ptY * dstStride + pt0x] = color;
                }
            }
        }
    }
    else
    {
        // Top or bottom edge: fixed row, iterate over columns.
        Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;

        pt0x = curRect.x;
        pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;

        if (pt0y < dstHeight)
        {
            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
            {
                Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
                // Clip against both the rectangle extent and the image width.
                if (ptX < pt0x + curRect.width && ptX < dstWidth)
                {
                    d_dst[pt0y * dstStride + ptX] = color;
                }
            }
        }
    }
}
/**
 * Host-side launcher for the drawRects kernel.
 *
 * Validates the arguments, launches four thread blocks per rectangle (one per
 * edge) with NUMTHREADS_DRAWRECTS threads each, and reports launch errors.
 * The launch is issued on cuStream and is not synchronized here (except in
 * _SELF_TEST_ builds).
 *
 * \param d_dst      Device image to draw into (row * dstStride + column addressing)
 * \param dstStride  Row stride of d_dst in elements; must be >= dstWidth
 * \param dstWidth   Image width in pixels; must be > 0
 * \param dstHeight  Image height in pixels; must be > 0
 * \param d_rects    Device array of numRects rectangles
 * \param numRects   Number of rectangles; 0 is a successful no-op
 * \param color      Value written to outline pixels
 * \param cuStream   CUDA stream the kernel is launched on
 *
 * \return NCV_SUCCESS, or NCV_NULL_PTR / NCV_DIMENSIONS_INVALID /
 *         NCV_INVALID_STEP / NCV_CUDA_ERROR on failure
 */
template <class T>
static NCVStatus drawRectsWrapperDevice(T *d_dst,
                                        Ncv32u dstStride,
                                        Ncv32u dstWidth,
                                        Ncv32u dstHeight,
                                        NcvRect32u *d_rects,
                                        Ncv32u numRects,
                                        T color,
                                        cudaStream_t cuStream)
{
    ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);

    if (numRects == 0)
    {
        return NCV_SUCCESS;
    }

#if defined _SELF_TEST_
    // Build the CPU reference: copy the untouched image and the rectangles to
    // the host and draw them with the host implementation.
    // NOTE(review): drawRectsWrapperHost must be declared before this point
    // for the self-test to compile - confirm the declaration order in the file.
    T *h_dst;
    ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);
    ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
    // The rectangles are NcvRect32u on the device and in the host routine.
    NcvRect32u *h_rects;
    ncvAssertCUDAReturn(cudaMallocHost(&h_rects, numRects * sizeof(NcvRect32u)), NCV_CUDA_ERROR);
    ncvAssertCUDAReturn(cudaMemcpy(h_rects, d_rects, numRects * sizeof(NcvRect32u), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
    ncvAssertReturnNcvStat(drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color));
#endif

    // One block per rectangle edge. A single grid dimension is capped at
    // 65535 blocks here, so larger launches are folded into a 2D grid; the
    // kernel discards the surplus blocks created by rounding up.
    dim3 grid(numRects * 4);
    dim3 block(NUMTHREADS_DRAWRECTS);
    if (grid.x > 65535)
    {
        grid.y = (grid.x + 65534) / 65535;
        grid.x = 65535;
    }

    // Launch on the caller's stream rather than the default stream.
    drawRects<T><<<grid, block, 0, cuStream>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);

    // Catches launch-configuration errors; execution errors surface at the
    // next synchronizing call.
    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

#if defined _SELF_TEST_
    // Make sure the kernel has finished before reading its output back.
    ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

    T *h_dst_after;
    ncvAssertCUDAReturn(cudaMallocHost(&h_dst_after, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);
    ncvAssertCUDAReturn(cudaMemcpy(h_dst_after, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);

    // Compare the GPU result against the CPU reference, stopping at the first
    // mismatch.
    bool bPass = true;
    for (Ncv32u i=0; i<dstHeight && bPass; i++)
    {
        for (Ncv32u j=0; j<dstWidth && bPass; j++)
        {
            if (h_dst[i*dstStride+j] != h_dst_after[i*dstStride+j])
            {
                printf("::drawRectsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_dst[i*dstStride+j], h_dst_after[i*dstStride+j]);
                bPass = false;
            }
        }
    }
    ncvAssertCUDAReturn(cudaFreeHost(h_dst_after), NCV_CUDA_ERROR);
    ncvAssertCUDAReturn(cudaFreeHost(h_dst), NCV_CUDA_ERROR);
    ncvAssertCUDAReturn(cudaFreeHost(h_rects), NCV_CUDA_ERROR);
    printf("::drawRectsWrapperDevice %s\n", bPass?"PASSED":"FAILED");
#endif

    return NCV_SUCCESS;
}
// Device-side rectangle drawing for 8-bit images. Every argument is forwarded
// unchanged to the drawRectsWrapperDevice template, whose status is returned.
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
                                 Ncv32u dstStride,
                                 Ncv32u dstWidth,
                                 Ncv32u dstHeight,
                                 NcvRect32u *d_rects,
                                 Ncv32u numRects,
                                 Ncv8u color,
                                 cudaStream_t cuStream)
{
    NCVStatus drawStatus = drawRectsWrapperDevice<Ncv8u>(d_dst, dstStride, dstWidth, dstHeight,
                                                         d_rects, numRects, color, cuStream);
    return drawStatus;
}
// Device-side rectangle drawing for 32-bit images. Every argument is forwarded
// unchanged to the drawRectsWrapperDevice template, whose status is returned.
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
                                  Ncv32u dstStride,
                                  Ncv32u dstWidth,
                                  Ncv32u dstHeight,
                                  NcvRect32u *d_rects,
                                  Ncv32u numRects,
                                  Ncv32u color,
                                  cudaStream_t cuStream)
{
    NCVStatus drawStatus = drawRectsWrapperDevice<Ncv32u>(d_dst, dstStride, dstWidth, dstHeight,
                                                          d_rects, numRects, color, cuStream);
    return drawStatus;
}
//==============================================================================
//
// Pipeline file
...
...
@@ -1901,13 +1730,13 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
NCV_SKIP_COND_BEGIN
nppStat = nppiStD
ownsampleNearest
_32u_C1R(
nppStat = nppiStD
ecimate
_32u_C1R(
d_integralImage.ptr(), d_integralImage.pitch(),
d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
srcIIRoi, scale, true);
ncvAssertReturnNcvStat(nppStat);
nppStat = nppiStD
ownsampleNearest
_64u_C1R(
nppStat = nppiStD
ecimate
_64u_C1R(
d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
srcIIRoi, scale, true);
...
...
@@ -1969,7 +1798,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
}
Ncv32u numStrongHypothesesNow = dstNumRects;
ncvStat = ncv
FilterHypothes
es_host(
ncvStat = ncv
GroupRectangl
es_host(
h_hypothesesIntermediate,
numStrongHypothesesNow,
minNeighbors,
...
...
@@ -2031,7 +1860,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
}
ncvStat = ncv
FilterHypothes
es_host(
ncvStat = ncv
GroupRectangl
es_host(
h_hypothesesIntermediate,
dstNumRects,
minNeighbors,
...
...
@@ -2285,133 +2114,6 @@ NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
}
/**
 * Groups the detection hypotheses in place by delegating to groupRectangles().
 *
 * The first numHypotheses rectangles are copied into a temporary std::vector,
 * grouped, and the surviving rectangles are written back to the start of
 * `hypotheses`. numHypotheses is updated to the number of survivors.
 *
 * \param hypotheses         Host-resident (pageable or pinned) vector of rectangles; updated in place
 * \param numHypotheses      In: number of valid rectangles. Out: number remaining after grouping
 * \param minNeighbors       Forwarded to groupRectangles() as its group threshold
 * \param intersectEps       Forwarded to groupRectangles() as its eps argument
 * \param hypothesesWeights  Optional host-resident vector that receives one weight per
 *                           surviving rectangle; may be NULL
 *
 * \return NCV_SUCCESS, or NCV_MEM_RESIDENCE_ERROR if a vector is not in host memory
 */
NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
                                   Ncv32u &numHypotheses,
                                   Ncv32u minNeighbors,
                                   Ncv32f intersectEps,
                                   NCVVector<Ncv32u> *hypothesesWeights)
{
    ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||
                    hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
    if (hypothesesWeights != NULL)
    {
        ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||
                        hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
    }

    // Nothing to group; this also keeps &rects[0] below well-defined.
    if (numHypotheses == 0)
    {
        return NCV_SUCCESS;
    }

    std::vector<NcvRect32u> rects(numHypotheses);
    memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));

    // Only ask groupRectangles() for weights when the caller wants them.
    std::vector<Ncv32u> weights;
    std::vector<Ncv32u> *pWeights = (hypothesesWeights != NULL) ? &weights : NULL;
    groupRectangles(rects, minNeighbors, intersectEps, pWeights);

    numHypotheses = (Ncv32u)rects.size();

    // Grouping may discard every rectangle. In that case the vectors can be
    // empty, and taking &v[0] of an empty std::vector is undefined behaviour,
    // so both copies are skipped.
    if (numHypotheses > 0)
    {
        memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));

        if (hypothesesWeights != NULL)
        {
            // NOTE(review): assumes groupRectangles() fills `weights` with one
            // entry per output rectangle and that hypothesesWeights has room
            // for numHypotheses entries - confirm against both definitions.
            memcpy(hypothesesWeights->ptr(), &weights[0], numHypotheses * sizeof(Ncv32u));
        }
    }

    return NCV_SUCCESS;
}
/**
 * CPU implementation of rectangle outline drawing.
 *
 * For each rectangle the left column, right column, top row and bottom row
 * are filled with `color`, in that order. Pixels outside
 * [0, dstWidth) x [0, dstHeight) are skipped.
 *
 * \param h_dst      Host image, addressed as row * dstStride + column
 * \param dstStride  Row stride in elements; must be >= dstWidth
 * \param dstWidth   Image width in pixels; must be > 0
 * \param dstHeight  Image height in pixels; must be > 0
 * \param h_rects    Host array of numRects rectangles
 * \param numRects   Number of rectangles
 * \param color      Value written to outline pixels
 */
template <class T>
static NCVStatus drawRectsWrapperHost(T *h_dst,
                                      Ncv32u dstStride,
                                      Ncv32u dstWidth,
                                      Ncv32u dstHeight,
                                      NcvRect32u *h_rects,
                                      Ncv32u numRects,
                                      T color)
{
    ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
    ncvAssertReturn(numRects != 0, NCV_SUCCESS);
    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);

    for (Ncv32u rectIdx = 0; rectIdx < numRects; ++rectIdx)
    {
        const NcvRect32u rect = h_rects[rectIdx];

        // One-past-the-end and last coordinates of the rectangle, computed in
        // unsigned 32-bit arithmetic.
        const Ncv32u xEnd  = rect.x + rect.width;
        const Ncv32u yEnd  = rect.y + rect.height;
        const Ncv32u xLast = xEnd - 1;
        const Ncv32u yLast = yEnd - 1;

        // Left edge
        if (rect.x < dstWidth)
        {
            for (Ncv32u row = rect.y; row < yEnd && row < dstHeight; ++row)
            {
                h_dst[row * dstStride + rect.x] = color;
            }
        }

        // Right edge
        if (xLast < dstWidth)
        {
            for (Ncv32u row = rect.y; row < yEnd && row < dstHeight; ++row)
            {
                h_dst[row * dstStride + xLast] = color;
            }
        }

        // Top edge
        if (rect.y < dstHeight)
        {
            for (Ncv32u col = rect.x; col < xEnd && col < dstWidth; ++col)
            {
                h_dst[rect.y * dstStride + col] = color;
            }
        }

        // Bottom edge
        if (yLast < dstHeight)
        {
            for (Ncv32u col = rect.x; col < xEnd && col < dstWidth; ++col)
            {
                h_dst[yLast * dstStride + col] = color;
            }
        }
    }

    return NCV_SUCCESS;
}
// Host-side rectangle drawing for 8-bit images. Every argument is forwarded
// unchanged to the drawRectsWrapperHost template, whose status is returned.
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
                               Ncv32u dstStride,
                               Ncv32u dstWidth,
                               Ncv32u dstHeight,
                               NcvRect32u *h_rects,
                               Ncv32u numRects,
                               Ncv8u color)
{
    NCVStatus drawStatus = drawRectsWrapperHost<Ncv8u>(h_dst, dstStride, dstWidth, dstHeight,
                                                       h_rects, numRects, color);
    return drawStatus;
}
// Host-side rectangle drawing for 32-bit images. Every argument is forwarded
// unchanged to the drawRectsWrapperHost template, whose status is returned.
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
                                Ncv32u dstStride,
                                Ncv32u dstWidth,
                                Ncv32u dstHeight,
                                NcvRect32u *h_rects,
                                Ncv32u numRects,
                                Ncv32u color)
{
    NCVStatus drawStatus = drawRectsWrapperHost<Ncv32u>(h_dst, dstStride, dstWidth, dstHeight,
                                                        h_rects, numRects, color);
    return drawStatus;
}
NCVStatus loadFromXML(const std::string &filename,
HaarClassifierCascadeDescriptor &haar,
std::vector<HaarStage64> &haarStages,
...
...
modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
View file @
0c325cac
...
...
@@ -346,153 +346,107 @@ enum
NCVPipeObjDet_VisualizeInPlace
=
0x004
,
};
NCV_EXPORTS
NCVStatus
ncvDetectObjectsMultiScale_device
(
NCVMatrix
<
Ncv8u
>
&
d_srcImg
,
NcvSize32u
srcRoi
,
NCVVector
<
NcvRect32u
>
&
d_dstRects
,
Ncv32u
&
dstNumRects
,
HaarClassifierCascadeDescriptor
&
haar
,
NCVVector
<
HaarStage64
>
&
h_HaarStages
,
NCVVector
<
HaarStage64
>
&
d_HaarStages
,
NCVVector
<
HaarClassifierNode128
>
&
d_HaarNodes
,
NCVVector
<
HaarFeature64
>
&
d_HaarFeatures
,
NcvSize32u
minObjSize
,
Ncv32u
minNeighbors
,
//default 4
Ncv32f
scaleStep
,
//default 1.2f
Ncv32u
pixelStep
,
//default 1
Ncv32u
flags
,
//default NCVPipeObjDet_Default
INCVMemAllocator
&
gpuAllocator
,
INCVMemAllocator
&
cpuAllocator
,
cudaDeviceProp
&
devProp
,
cudaStream_t
cuStream
);
NCV
_EXPORTS
NCV
Status
ncvDetectObjectsMultiScale_device
(
NCVMatrix
<
Ncv8u
>
&
d_srcImg
,
NcvSize32u
srcRoi
,
NCVVector
<
NcvRect32u
>
&
d_dstRects
,
Ncv32u
&
dstNumRects
,
HaarClassifierCascadeDescriptor
&
haar
,
NCVVector
<
HaarStage64
>
&
h_HaarStages
,
NCVVector
<
HaarStage64
>
&
d_HaarStages
,
NCVVector
<
HaarClassifierNode128
>
&
d_HaarNodes
,
NCVVector
<
HaarFeature64
>
&
d_HaarFeatures
,
NcvSize32u
minObjSize
,
Ncv32u
minNeighbors
,
//default 4
Ncv32f
scaleStep
,
//default 1.2f
Ncv32u
pixelStep
,
//default 1
Ncv32u
flags
,
//default NCVPipeObjDet_Default
INCVMemAllocator
&
gpuAllocator
,
INCVMemAllocator
&
cpuAllocator
,
cudaDeviceProp
&
devProp
,
cudaStream_t
cuStream
);
#define OBJDET_MASK_ELEMENT_INVALID_32U 0xFFFFFFFF
#define HAAR_STDDEV_BORDER 1
NCV_EXPORTS
NCVStatus
ncvApplyHaarClassifierCascade_device
(
NCVMatrix
<
Ncv32u
>
&
d_integralImage
,
NCVMatrix
<
Ncv32f
>
&
d_weights
,
NCVMatrixAlloc
<
Ncv32u
>
&
d_pixelMask
,
Ncv32u
&
numDetections
,
HaarClassifierCascadeDescriptor
&
haar
,
NCVVector
<
HaarStage64
>
&
h_HaarStages
,
NCVVector
<
HaarStage64
>
&
d_HaarStages
,
NCVVector
<
HaarClassifierNode128
>
&
d_HaarNodes
,
NCVVector
<
HaarFeature64
>
&
d_HaarFeatures
,
NcvBool
bMaskElements
,
NcvSize32u
anchorsRoi
,
Ncv32u
pixelStep
,
Ncv32f
scaleArea
,
INCVMemAllocator
&
gpuAllocator
,
INCVMemAllocator
&
cpuAllocator
,
cudaDeviceProp
&
devProp
,
cudaStream_t
cuStream
);
NCV_EXPORTS
NCVStatus
ncvApplyHaarClassifierCascade_host
(
NCVMatrix
<
Ncv32u
>
&
h_integralImage
,
NCVMatrix
<
Ncv32f
>
&
h_weights
,
NCVMatrixAlloc
<
Ncv32u
>
&
h_pixelMask
,
Ncv32u
&
numDetections
,
HaarClassifierCascadeDescriptor
&
haar
,
NCVVector
<
HaarStage64
>
&
h_HaarStages
,
NCVVector
<
HaarClassifierNode128
>
&
h_HaarNodes
,
NCVVector
<
HaarFeature64
>
&
h_HaarFeatures
,
NcvBool
bMaskElements
,
NcvSize32u
anchorsRoi
,
Ncv32u
pixelStep
,
Ncv32f
scaleArea
);
NCV_EXPORTS
NCVStatus
ncvDrawRects_8u_device
(
Ncv8u
*
d_dst
,
Ncv32u
dstStride
,
Ncv32u
dstWidth
,
Ncv32u
dstHeight
,
NcvRect32u
*
d_rects
,
Ncv32u
numRects
,
Ncv8u
color
,
cudaStream_t
cuStream
);
NCV_EXPORTS
NCVStatus
ncvDrawRects_32u_device
(
Ncv32u
*
d_dst
,
Ncv32u
dstStride
,
Ncv32u
dstWidth
,
Ncv32u
dstHeight
,
NcvRect32u
*
d_rects
,
Ncv32u
numRects
,
Ncv32u
color
,
cudaStream_t
cuStream
);
NCV_EXPORTS
NCVStatus
ncvDrawRects_8u_host
(
Ncv8u
*
h_dst
,
Ncv32u
dstStride
,
Ncv32u
dstWidth
,
Ncv32u
dstHeight
,
NcvRect32u
*
h_rects
,
Ncv32u
numRects
,
Ncv8u
color
);
NCV_EXPORTS
NCVStatus
ncvDrawRects_32u_host
(
Ncv32u
*
h_dst
,
Ncv32u
dstStride
,
Ncv32u
dstWidth
,
Ncv32u
dstHeight
,
NcvRect32u
*
h_rects
,
Ncv32u
numRects
,
Ncv32u
color
);
NCV_EXPORTS
NCVStatus
ncvApplyHaarClassifierCascade_device
(
NCVMatrix
<
Ncv32u
>
&
d_integralImage
,
NCVMatrix
<
Ncv32f
>
&
d_weights
,
NCVMatrixAlloc
<
Ncv32u
>
&
d_pixelMask
,
Ncv32u
&
numDetections
,
HaarClassifierCascadeDescriptor
&
haar
,
NCVVector
<
HaarStage64
>
&
h_HaarStages
,
NCVVector
<
HaarStage64
>
&
d_HaarStages
,
NCVVector
<
HaarClassifierNode128
>
&
d_HaarNodes
,
NCVVector
<
HaarFeature64
>
&
d_HaarFeatures
,
NcvBool
bMaskElements
,
NcvSize32u
anchorsRoi
,
Ncv32u
pixelStep
,
Ncv32f
scaleArea
,
INCVMemAllocator
&
gpuAllocator
,
INCVMemAllocator
&
cpuAllocator
,
cudaDeviceProp
&
devProp
,
cudaStream_t
cuStream
);
NCV_EXPORTS
NCVStatus
ncvApplyHaarClassifierCascade_host
(
NCVMatrix
<
Ncv32u
>
&
h_integralImage
,
NCVMatrix
<
Ncv32f
>
&
h_weights
,
NCVMatrixAlloc
<
Ncv32u
>
&
h_pixelMask
,
Ncv32u
&
numDetections
,
HaarClassifierCascadeDescriptor
&
haar
,
NCVVector
<
HaarStage64
>
&
h_HaarStages
,
NCVVector
<
HaarClassifierNode128
>
&
h_HaarNodes
,
NCVVector
<
HaarFeature64
>
&
h_HaarFeatures
,
NcvBool
bMaskElements
,
NcvSize32u
anchorsRoi
,
Ncv32u
pixelStep
,
Ncv32f
scaleArea
);
#define RECT_SIMILARITY_PROPORTION 0.2f
NCV_EXPORTS
NCVStatus
ncvGrowDetectionsVector_device
(
NCVVector
<
Ncv32u
>
&
pixelMask
,
Ncv32u
numPixelMaskDetections
,
NCVVector
<
NcvRect32u
>
&
hypotheses
,
Ncv32u
&
totalDetections
,
Ncv32u
totalMaxDetections
,
Ncv32u
rectWidth
,
Ncv32u
rectHeight
,
Ncv32f
curScale
,
cudaStream_t
cuStream
);
NCV_EXPORTS
NCVStatus
ncvGrowDetectionsVector_host
(
NCVVector
<
Ncv32u
>
&
pixelMask
,
Ncv32u
numPixelMaskDetections
,
NCVVector
<
NcvRect32u
>
&
hypotheses
,
Ncv32u
&
totalDetections
,
Ncv32u
totalMaxDetections
,
Ncv32u
rectWidth
,
Ncv32u
rectHeight
,
Ncv32f
curScale
);
NCV_EXPORTS
NCVStatus
ncvFilterHypotheses_host
(
NCVVector
<
NcvRect32u
>
&
hypotheses
,
Ncv32u
&
numHypotheses
,
Ncv32u
minNeighbors
,
Ncv32f
intersectEps
,
NCVVector
<
Ncv32u
>
*
hypothesesWeights
);
NCV_EXPORTS
NCVStatus
ncvHaarGetClassifierSize
(
const
std
::
string
&
filename
,
Ncv32u
&
numStages
,
Ncv32u
&
numNodes
,
Ncv32u
&
numFeatures
);
NCV_EXPORTS
NCVStatus
ncvHaarLoadFromFile_host
(
const
std
::
string
&
filename
,
HaarClassifierCascadeDescriptor
&
haar
,
NCVVector
<
HaarStage64
>
&
h_HaarStages
,
NCVVector
<
HaarClassifierNode128
>
&
h_HaarNodes
,
NCVVector
<
HaarFeature64
>
&
h_HaarFeatures
);
NCV_EXPORTS
NCVStatus
ncvHaarStoreNVBIN_host
(
const
std
::
string
&
filename
,
HaarClassifierCascadeDescriptor
haar
,
NCVVector
<
HaarStage64
>
&
h_HaarStages
,
NCVVector
<
HaarClassifierNode128
>
&
h_HaarNodes
,
NCVVector
<
HaarFeature64
>
&
h_HaarFeatures
);
NCV_EXPORTS
NCVStatus
ncvGrowDetectionsVector_device
(
NCVVector
<
Ncv32u
>
&
pixelMask
,
Ncv32u
numPixelMaskDetections
,
NCVVector
<
NcvRect32u
>
&
hypotheses
,
Ncv32u
&
totalDetections
,
Ncv32u
totalMaxDetections
,
Ncv32u
rectWidth
,
Ncv32u
rectHeight
,
Ncv32f
curScale
,
cudaStream_t
cuStream
);
NCV_EXPORTS
NCVStatus
ncvGrowDetectionsVector_host
(
NCVVector
<
Ncv32u
>
&
pixelMask
,
Ncv32u
numPixelMaskDetections
,
NCVVector
<
NcvRect32u
>
&
hypotheses
,
Ncv32u
&
totalDetections
,
Ncv32u
totalMaxDetections
,
Ncv32u
rectWidth
,
Ncv32u
rectHeight
,
Ncv32f
curScale
);
NCV_EXPORTS
NCVStatus
ncvHaarGetClassifierSize
(
const
std
::
string
&
filename
,
Ncv32u
&
numStages
,
Ncv32u
&
numNodes
,
Ncv32u
&
numFeatures
);
NCV_EXPORTS
NCVStatus
ncvHaarLoadFromFile_host
(
const
std
::
string
&
filename
,
HaarClassifierCascadeDescriptor
&
haar
,
NCVVector
<
HaarStage64
>
&
h_HaarStages
,
NCVVector
<
HaarClassifierNode128
>
&
h_HaarNodes
,
NCVVector
<
HaarFeature64
>
&
h_HaarFeatures
);
NCV_EXPORTS
NCVStatus
ncvHaarStoreNVBIN_host
(
const
std
::
string
&
filename
,
HaarClassifierCascadeDescriptor
haar
,
NCVVector
<
HaarStage64
>
&
h_HaarStages
,
NCVVector
<
HaarClassifierNode128
>
&
h_HaarNodes
,
NCVVector
<
HaarFeature64
>
&
h_HaarFeatures
);
...
...
modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
View file @
0c325cac
...
...
@@ -44,10 +44,6 @@
#include <cuda_runtime.h>
#include "NPP_staging.hpp"
#if defined _SELF_TEST_
#include <stdio.h>
#endif
texture<Ncv8u, 1, cudaReadModeElementType> tex8u;
texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
...
...
@@ -161,12 +157,6 @@ const Ncv32u NUM_SCAN_THREADS = 256;
const Ncv32u LOG2_NUM_SCAN_THREADS = 8;
struct T_true {};
struct T_false {};
template <typename T, typename U> struct is_same : T_false {};
template <typename T> struct is_same<T, T> : T_true {};
template<class T_in, class T_out>
struct _scanElemOp
{
...
...
@@ -175,13 +165,16 @@ struct _scanElemOp
{
return scanElemOp( elem, Int2Type<(int)tbDoSqr>() );
}
private:
template <int v> struct Int2Type { enum { value = v }; };
static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<0>)
{
return (T_out)elem;
}
static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<1>)
{
return (T_out)(elem*elem);
...
...
@@ -190,25 +183,25 @@ private:
template<class T>
inline __device__ T readElem(T *d_src, Ncv32u srcStride, Ncv32u curElemOffs);
inline __device__ T readElem(T *d_src, Ncv32u
texOffs, Ncv32u
srcStride, Ncv32u curElemOffs);
template<>
inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u
texOffs, Ncv32u
srcStride, Ncv32u curElemOffs)
{
return tex1Dfetch(tex8u, srcStride * blockIdx.x + curElemOffs);
return tex1Dfetch(tex8u,
texOffs +
srcStride * blockIdx.x + curElemOffs);
}
template<>
inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u
texOffs, Ncv32u
srcStride, Ncv32u curElemOffs)
{
return d_src[curElemOffs];
}
template<>
inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u
texOffs, Ncv32u
srcStride, Ncv32u curElemOffs)
{
return d_src[curElemOffs];
}
...
...
@@ -233,7 +226,7 @@ inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u srcStride, Ncv32
* \return None
*/
template <class T_in, class T_out, bool tbDoSqr>
__global__ void scanRows(T_in *d_src, Ncv32u srcWidth, Ncv32u srcStride,
__global__ void scanRows(T_in *d_src, Ncv32u
texOffs, Ncv32u
srcWidth, Ncv32u srcStride,
T_out *d_II, Ncv32u IIstride)
{
//advance pointers to the current line
...
...
@@ -263,7 +256,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u srcWidth, Ncv32u srcStride,
if (curElemOffs < srcWidth)
{
//load elements
curElem = readElem<T_in>(d_src, srcStride, curElemOffs);
curElem = readElem<T_in>(d_src,
texOffs,
srcStride, curElemOffs);
}
curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
...
...
@@ -298,55 +291,28 @@ NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
{
cudaChannelFormatDesc cfdTex;
size_t alignmentOffset = 0;
if (sizeof(T_in) == 1)
{
cfdTex = cudaCreateChannelDesc<Ncv8u>();
size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
if (alignmentOffset > 0)
{
ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
}
}
scanRows
<T_in, T_out, tbDoSqr>
<<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
(d_src, roi.width, srcStride, d_dst, dstStride);
(d_src,
(Ncv32u)alignmentOffset,
roi.width, srcStride, d_dst, dstStride);
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
#if defined _SELF_TEST_
T_in *h_src;
T_out *h_dst;
ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * roi.height * sizeof(T_in)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * roi.height * sizeof(T_out)), NPPST_MEM_ALLOC_ERR);
memset(h_src, 0, srcStride * roi.height * sizeof(T_in));
memset(h_dst, 0, dstStride * roi.height * sizeof(T_out));
ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * roi.height * sizeof(T_in), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * roi.height * sizeof(T_out), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
NcvBool bPass = true;
for (Ncv32u i=0; i<roi.height && bPass; i++)
{
T_out curElem = 0;
for (Ncv32u j=0; j<roi.width+1 && bPass; j++)
{
if (curElem != h_dst[i * dstStride + j])
{
printf("CIntegralImage::scanRowsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, curElem, h_dst[i * dstStride + j]);
bPass = false;
}
if (j < roi.width)
{
curElem += scanElemOp<T_op>(h_src[i*srcStride+j]);
}
}
}
ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
printf("CIntegralImage::scanRowsWrapperDevice %s\n", bPass?"PASSED":"FAILED");
#endif
return NPPST_SUCCESS;
}
Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
static
Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
{
Ncv32u alignMask = allocatorAlignment-1;
Ncv32u inverseAlignMask = ~alignMask;
...
...
@@ -676,7 +642,7 @@ NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
//==============================================================================
//
// D
ownsampleNearest
.cu
// D
ecimate
.cu
//
//==============================================================================
...
...
@@ -686,25 +652,25 @@ const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
template<class T, NcvBool tbCacheTexture>
__device__ T getElem_D
ownsampleNearest
(Ncv32u x, T *d_src);
__device__ T getElem_D
ecimate
(Ncv32u x, T *d_src);
template<>
__device__ Ncv32u getElem_D
ownsampleNearest
<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
__device__ Ncv32u getElem_D
ecimate
<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
{
return tex1Dfetch(tex32u, x);
}
template<>
__device__ Ncv32u getElem_D
ownsampleNearest
<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
__device__ Ncv32u getElem_D
ecimate
<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
{
return d_src[x];
}
template<>
__device__ Ncv64u getElem_D
ownsampleNearest
<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
__device__ Ncv64u getElem_D
ecimate
<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
{
uint2 tmp = tex1Dfetch(tex64u, x);
Ncv64u res = (Ncv64u)tmp.y;
...
...
@@ -715,14 +681,14 @@ __device__ Ncv64u getElem_DownsampleNearest<Ncv64u, true>(Ncv32u x, Ncv64u *d_sr
template<>
__device__ Ncv64u getElem_D
ownsampleNearest
<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
__device__ Ncv64u getElem_D
ecimate
<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
{
return d_src[x];
}
template <class T, NcvBool tbCacheTexture>
__global__ void d
ownsampleNearest
_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
__global__ void d
ecimate
_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
NcvSize32u dstRoi, Ncv32u scale)
{
int curX = blockIdx.x * blockDim.x + threadIdx.x;
...
...
@@ -733,12 +699,12 @@ __global__ void downsampleNearest_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u
return;
}
d_dst[curY * dstStep + curX] = getElem_D
ownsampleNearest
<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
d_dst[curY * dstStep + curX] = getElem_D
ecimate
<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
}
template <class T>
static NCVStatus d
ownsampleNearest
WrapperDevice(T *d_src, Ncv32u srcStep,
static NCVStatus d
ecimate
WrapperDevice(T *d_src, Ncv32u srcStep,
T *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture)
...
...
@@ -761,7 +727,7 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
if (!readThruTexture)
{
d
ownsampleNearest
_C1R
d
ecimate
_C1R
<T, false>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
...
...
@@ -787,7 +753,7 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
}
d
ownsampleNearest
_C1R
d
ecimate
_C1R
<T, true>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
...
...
@@ -795,39 +761,12 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
#if defined _SELF_TEST_
T *h_src;
T *h_dst;
ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStep * srcRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStep * dstRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStep * srcRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStep * dstRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
bool bPass = true;
for (Ncv32u i=0; i<dstRoi.height && bPass; i++)
{
for (Ncv32u j=0; j<dstRoi.width && bPass; j++)
{
if (h_dst[i*dstStep+j] != h_src[i*scale*srcStep + j*scale])
{
printf("::downsampleNearestWrapperDevice self test failed: i=%d, j=%d, cpu=%ld, gpu=%ld\n", i, j, (long long)h_src[i*scale*srcStep + j*scale], (long long)h_dst[i*dstStep+j]);
bPass = false;
}
}
}
ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
printf("::downsampleNearestWrapperDevice %s\n", bPass?"PASSED":"FAILED");
#endif
return NPPST_SUCCESS;
}
template <class T>
static NCVStatus d
ownsampleNearest
WrapperHost(T *h_src, Ncv32u srcStep,
static NCVStatus d
ecimate
WrapperHost(T *h_src, Ncv32u srcStep,
T *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale)
{
...
...
@@ -856,40 +795,40 @@ static NCVStatus downsampleNearestWrapperHost(T *h_src, Ncv32u srcStep,
}
#define implementNppD
ownsampleNearest
(bit, typ) \
NCVStatus nppiStD
ownsampleNearest
_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
#define implementNppD
ecimate
(bit, typ) \
NCVStatus nppiStD
ecimate
_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
Ncv##bit##typ *d_dst, Ncv32u dstStep, \
NcvSize32u srcRoi, Ncv32u scale, NcvBool readThruTexture) \
{ \
return d
ownsampleNearest
WrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
return d
ecimate
WrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
(Ncv##bit##u *)d_dst, dstStep, \
srcRoi, scale, readThruTexture); \
}
#define implementNppD
ownsampleNearest
Host(bit, typ) \
NCVStatus nppiStD
ownsampleNearest
_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
#define implementNppD
ecimate
Host(bit, typ) \
NCVStatus nppiStD
ecimate
_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
Ncv##bit##typ *h_dst, Ncv32u dstStep, \
NcvSize32u srcRoi, Ncv32u scale) \
{ \
return d
ownsampleNearest
WrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
return d
ecimate
WrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
(Ncv##bit##u *)h_dst, dstStep, \
srcRoi, scale); \
}
implementNppD
ownsampleNearest
(32, u)
implementNppD
ownsampleNearest
(32, s)
implementNppD
ownsampleNearest
(32, f)
implementNppD
ownsampleNearest
(64, u)
implementNppD
ownsampleNearest
(64, s)
implementNppD
ownsampleNearest
(64, f)
implementNppD
ownsampleNearest
Host(32, u)
implementNppD
ownsampleNearest
Host(32, s)
implementNppD
ownsampleNearest
Host(32, f)
implementNppD
ownsampleNearest
Host(64, u)
implementNppD
ownsampleNearest
Host(64, s)
implementNppD
ownsampleNearest
Host(64, f)
implementNppD
ecimate
(32, u)
implementNppD
ecimate
(32, s)
implementNppD
ecimate
(32, f)
implementNppD
ecimate
(64, u)
implementNppD
ecimate
(64, s)
implementNppD
ecimate
(64, f)
implementNppD
ecimate
Host(32, u)
implementNppD
ecimate
Host(32, s)
implementNppD
ecimate
Host(32, f)
implementNppD
ecimate
Host(64, u)
implementNppD
ecimate
Host(64, s)
implementNppD
ecimate
Host(64, f)
//==============================================================================
...
...
@@ -1051,46 +990,6 @@ NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
#if defined _SELF_TEST_
Ncv32u *h_sum;
Ncv64u *h_sqsum;
Ncv32f *h_norm_d;
Ncv32u ExtHeight = roi.height + rect.y + rect.height;
ncvAssertCUDAReturn(cudaMallocHost(&h_sum, sumStep * ExtHeight * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_norm_d, normStep * roi.height * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMemcpy(h_sum, d_sum, sumStep * ExtHeight * sizeof(Ncv32u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_sqsum, d_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_norm_d, d_norm, normStep * roi.height * sizeof(Ncv32f), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
Ncv32f *h_norm_h;
ncvAssertCUDAReturn(cudaMallocHost(&h_norm_h, normStep * roi.height * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
ncvAssertReturnNcvStat(nppRectStdDev_32f_C1R_host(h_sum, sqsumStep, h_sqsum, sqsumStep, h_norm_h, normStep, roi, rect, scaleArea));
const Ncv64f relEPS = 0.005;
bool bPass = true;
for (Ncv32u i=0; i<roi.height && bPass; i++)
{
for (Ncv32u j=0; j<roi.width && bPass; j++)
{
Ncv64f absErr = fabs(h_norm_h[i * normStep + j] - h_norm_d[i * normStep + j]);
Ncv64f relErr = absErr / h_norm_h[i * normStep + j];
if (relErr > relEPS)
{
printf("::ncvRectStdDev_32f_C1R self test failed: i=%d, j=%d, cpu=%f, gpu=%f\n", i, j, h_norm_h[i * normStep + j], h_norm_d[i * normStep + j]);
bPass = false;
}
}
}
ncvAssertCUDAReturn(cudaFreeHost(h_sum), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_sqsum), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_norm_d), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_norm_h), NPPST_MEMFREE_ERR);
printf("::ncvRectStdDev_32f_C1R %s\n", bPass?"PASSED":"FAILED");
#endif
return NPPST_SUCCESS;
}
...
...
@@ -1251,34 +1150,6 @@ NCVStatus transposeWrapperDevice(T *d_src, Ncv32u srcStride,
(d_src, srcStride, d_dst, dstStride, srcRoi);
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
#if defined _SELF_TEST_
Ncv32u widthExt = grid.x * TRANSPOSE_TILE_DIM;
Ncv32u heightExt = grid.y * TRANSPOSE_TILE_DIM;
T *h_src;
T *h_dst;
ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * heightExt * sizeof(T)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * widthExt * sizeof(T)), NPPST_MEM_ALLOC_ERR);
memset(h_src, 0, srcStride * heightExt * sizeof(T));
memset(h_dst, 0, dstStride * widthExt * sizeof(T));
ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * heightExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * widthExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
NcvBool bPass = true;
for (Ncv32u i=0; i<srcRoi.height && bPass; i++)
{
for (Ncv32u j=0; j<srcRoi.width && bPass; j++)
{
if (h_src[i * srcStride + j] != h_dst[j * dstStride + i])
{
printf("CIntegralImage::transposeWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_src[j * srcStride + i], h_dst[i * dstStride + j]);
bPass = false;
}
}
}
ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
printf("CIntegralImage::transposeWrapperDevice %s\n", bPass?"PASSED":"FAILED");
#endif
return NPPST_SUCCESS;
}
...
...
@@ -1341,6 +1212,20 @@ implementNppTransposeHost(64,s)
implementNppTransposeHost(64,f)
/**
 * Transposes a single-channel image made of 128-bit pixels of any type.
 *
 * Both buffers are reinterpreted as arrays of uint4 and the work is forwarded
 * unchanged to transposeWrapperDevice<uint4>.
 *
 * \param d_src   source image buffer
 * \param srcStep source step, passed through to the wrapper as-is
 * \param d_dst   destination image buffer
 * \param dstStep destination step, passed through to the wrapper as-is
 * \param srcRoi  region of the source image to transpose
 * \return status reported by transposeWrapperDevice
 */
NCVStatus nppiStTranspose_128_C1R(void *d_src, Ncv32u srcStep,
                                  void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
{
    uint4 *src128 = (uint4 *)d_src;
    uint4 *dst128 = (uint4 *)d_dst;
    return transposeWrapperDevice<uint4>(src128, srcStep, dst128, dstStep, srcRoi);
}
/**
 * Host counterpart of the 128-bit transpose: reinterprets both buffers as
 * arrays of uint4 and forwards to transposeWrapperHost<uint4>.
 *
 * NOTE(review): the parameters keep the d_ prefix of the device variant, but
 * they are handed to the host wrapper - presumably host pointers; confirm with callers.
 *
 * \param d_src   source image buffer
 * \param srcStep source step, passed through to the wrapper as-is
 * \param d_dst   destination image buffer
 * \param dstStep destination step, passed through to the wrapper as-is
 * \param srcRoi  region of the source image to transpose
 * \return status reported by transposeWrapperHost
 */
NCVStatus nppiStTranspose_128_C1R_host(void *d_src, Ncv32u srcStep,
                                       void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
{
    uint4 *src128 = (uint4 *)d_src;
    uint4 *dst128 = (uint4 *)d_dst;
    return transposeWrapperHost<uint4>(src128, srcStep, dst128, dstStep, srcRoi);
}
//==============================================================================
//
// Compact.cu
...
...
modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp
View file @
0c325cac
...
...
@@ -96,65 +96,65 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream);
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus
nppiStD
ownsampleNearest
_32u_C1R
(
Ncv32u
*
d_src
,
Ncv32u
srcStep
,
Ncv32u
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
,
NcvBool
readThruTexture
);
NCVStatus
nppiStD
ecimate
_32u_C1R
(
Ncv32u
*
d_src
,
Ncv32u
srcStep
,
Ncv32u
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
,
NcvBool
readThruTexture
);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel.
* \see nppiStD
ownsampleNearest
_32u_C1R
* \see nppiStD
ecimate
_32u_C1R
*/
NCV_EXPORTS
NCVStatus
nppiStD
ownsampleNearest
_32s_C1R
(
Ncv32s
*
d_src
,
Ncv32u
srcStep
,
Ncv32s
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
,
NcvBool
readThruTexture
);
NCVStatus
nppiStD
ecimate
_32s_C1R
(
Ncv32s
*
d_src
,
Ncv32u
srcStep
,
Ncv32s
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
,
NcvBool
readThruTexture
);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel.
* \see nppiStD
ownsampleNearest
_32u_C1R
* \see nppiStD
ecimate
_32u_C1R
*/
NCV_EXPORTS
NCVStatus
nppiStD
ownsampleNearest
_32f_C1R
(
Ncv32f
*
d_src
,
Ncv32u
srcStep
,
Ncv32f
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
,
NcvBool
readThruTexture
);
NCVStatus
nppiStD
ecimate
_32f_C1R
(
Ncv32f
*
d_src
,
Ncv32u
srcStep
,
Ncv32f
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
,
NcvBool
readThruTexture
);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel.
* \see nppiStD
ownsampleNearest
_32u_C1R
* \see nppiStD
ecimate
_32u_C1R
*/
NCV_EXPORTS
NCVStatus
nppiStD
ownsampleNearest
_64u_C1R
(
Ncv64u
*
d_src
,
Ncv32u
srcStep
,
Ncv64u
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
,
NcvBool
readThruTexture
);
NCVStatus
nppiStD
ecimate
_64u_C1R
(
Ncv64u
*
d_src
,
Ncv32u
srcStep
,
Ncv64u
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
,
NcvBool
readThruTexture
);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel.
* \see nppiStD
ownsampleNearest
_32u_C1R
* \see nppiStD
ecimate
_32u_C1R
*/
NCV_EXPORTS
NCVStatus
nppiStD
ownsampleNearest
_64s_C1R
(
Ncv64s
*
d_src
,
Ncv32u
srcStep
,
Ncv64s
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
,
NcvBool
readThruTexture
);
NCVStatus
nppiStD
ecimate
_64s_C1R
(
Ncv64s
*
d_src
,
Ncv32u
srcStep
,
Ncv64s
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
,
NcvBool
readThruTexture
);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel.
* \see nppiStD
ownsampleNearest
_32u_C1R
* \see nppiStD
ecimate
_32u_C1R
*/
NCV_EXPORTS
NCVStatus
nppiStD
ownsampleNearest
_64f_C1R
(
Ncv64f
*
d_src
,
Ncv32u
srcStep
,
Ncv64f
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
,
NcvBool
readThruTexture
);
NCVStatus
nppiStD
ecimate
_64f_C1R
(
Ncv64f
*
d_src
,
Ncv32u
srcStep
,
Ncv64f
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
,
NcvBool
readThruTexture
);
/**
...
...
@@ -170,59 +170,59 @@ NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus
nppiStD
ownsampleNearest
_32u_C1R_host
(
Ncv32u
*
h_src
,
Ncv32u
srcStep
,
Ncv32u
*
h_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
);
NCVStatus
nppiStD
ecimate
_32u_C1R_host
(
Ncv32u
*
h_src
,
Ncv32u
srcStep
,
Ncv32u
*
h_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel. Host implementation.
* \see nppiStD
ownsampleNearest
_32u_C1R_host
* \see nppiStD
ecimate
_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus
nppiStD
ownsampleNearest
_32s_C1R_host
(
Ncv32s
*
h_src
,
Ncv32u
srcStep
,
Ncv32s
*
h_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
);
NCVStatus
nppiStD
ecimate
_32s_C1R_host
(
Ncv32s
*
h_src
,
Ncv32u
srcStep
,
Ncv32s
*
h_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel. Host implementation.
* \see nppiStD
ownsampleNearest
_32u_C1R_host
* \see nppiStD
ecimate
_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus
nppiStD
ownsampleNearest
_32f_C1R_host
(
Ncv32f
*
h_src
,
Ncv32u
srcStep
,
Ncv32f
*
h_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
);
NCVStatus
nppiStD
ecimate
_32f_C1R_host
(
Ncv32f
*
h_src
,
Ncv32u
srcStep
,
Ncv32f
*
h_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel. Host implementation.
* \see nppiStD
ownsampleNearest
_32u_C1R_host
* \see nppiStD
ecimate
_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus
nppiStD
ownsampleNearest
_64u_C1R_host
(
Ncv64u
*
h_src
,
Ncv32u
srcStep
,
Ncv64u
*
h_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
);
NCVStatus
nppiStD
ecimate
_64u_C1R_host
(
Ncv64u
*
h_src
,
Ncv32u
srcStep
,
Ncv64u
*
h_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel. Host implementation.
* \see nppiStD
ownsampleNearest
_32u_C1R_host
* \see nppiStD
ecimate
_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus
nppiStD
ownsampleNearest
_64s_C1R_host
(
Ncv64s
*
h_src
,
Ncv32u
srcStep
,
Ncv64s
*
h_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
);
NCVStatus
nppiStD
ecimate
_64s_C1R_host
(
Ncv64s
*
h_src
,
Ncv32u
srcStep
,
Ncv64s
*
h_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel. Host implementation.
* \see nppiStD
ownsampleNearest
_32u_C1R_host
* \see nppiStD
ecimate
_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus
nppiStD
ownsampleNearest
_64f_C1R_host
(
Ncv64f
*
h_src
,
Ncv32u
srcStep
,
Ncv64f
*
h_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
);
NCVStatus
nppiStD
ecimate
_64f_C1R_host
(
Ncv64f
*
h_src
,
Ncv32u
srcStep
,
Ncv64f
*
h_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
,
Ncv32u
scale
);
/**
...
...
@@ -333,6 +333,15 @@ NCVStatus nppiStTranspose_64f_C1R(Ncv64f *d_src, Ncv32u srcStride,
Ncv64f
*
d_dst
,
Ncv32u
dstStride
,
NcvSize32u
srcRoi
);
/**
* Transposes an image. 128-bit pixels of any type, single channel
* \see nppiStTranspose_32u_C1R
*/
NCV_EXPORTS
NCVStatus
nppiStTranspose_128_C1R
(
void
*
d_src
,
Ncv32u
srcStep
,
void
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
);
/**
* Transposes an image. 32-bit unsigned pixels, single channel. Host implementation
*
...
...
@@ -394,6 +403,15 @@ NCVStatus nppiStTranspose_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStride,
Ncv64f
*
h_dst
,
Ncv32u
dstStride
,
NcvSize32u
srcRoi
);
/**
* Transposes an image. 128-bit pixels of any type, single channel. Host implementation
* \see nppiStTranspose_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus
nppiStTranspose_128_C1R_host
(
void
*
d_src
,
Ncv32u
srcStep
,
void
*
d_dst
,
Ncv32u
dstStep
,
NcvSize32u
srcRoi
);
/**
* Calculates the size of the temporary buffer for integral image creation
*
...
...
modules/gpu/src/nvidia/core/NCV.c
pp
→
modules/gpu/src/nvidia/core/NCV.c
u
View file @
0c325cac
...
...
@@ -40,14 +40,9 @@
//M*/
#if !defined (HAVE_CUDA)
#else
/* !defined (HAVE_CUDA) */
#include <ios>
#include <stdarg.h>
#include <vector>
#include "NCV.hpp"
...
...
@@ -182,6 +177,78 @@ NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, const void *src, NC
}
/**
 * Copy dispatcher for pitched (2D) memory regions.
 *
 * Copies `height` rows of `widthbytes` bytes each from `src` to `dst`. The copy
 * primitive is selected from the residence of both sides:
 *   - host -> host: one memcpy per row (cuStream is not used on this path);
 *   - any combination involving device memory: cudaMemcpy2DAsync on `cuStream`
 *     when it is non-zero, blocking cudaMemcpy2D otherwise.
 *
 * \param dst        destination base pointer
 * \param dstPitch   distance in bytes between consecutive destination rows
 * \param dstType    memory residence of the destination
 * \param src        source base pointer
 * \param srcPitch   distance in bytes between consecutive source rows
 * \param srcType    memory residence of the source
 * \param widthbytes number of bytes copied per row
 * \param height     number of rows
 * \param cuStream   stream for the asynchronous copies; 0 selects the blocking calls
 * \return NCV_SUCCESS on success, NCV_MEM_RESIDENCE_ERROR when either memory type
 *         is not one of host-pageable / host-pinned / device. CUDA calls are
 *         checked through ncvAssertCUDAReturn with NCV_CUDA_ERROR.
 */
NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
                             const void *src, Ncv32u srcPitch, NCVMemoryType srcType,
                             Ncv32u widthbytes, Ncv32u height, cudaStream_t cuStream)
{
    NCVStatus ncvStat;
    switch (dstType)
    {
    case NCVMemoryTypeHostPageable:
    case NCVMemoryTypeHostPinned:
        switch (srcType)
        {
        case NCVMemoryTypeHostPageable:
        case NCVMemoryTypeHostPinned:
            for (Ncv32u i=0; i<height; i++)
            {
                // Row offsets are computed in size_t: the 32-bit product i * pitch
                // would wrap around for buffers larger than 4 GiB.
                memcpy((char*)dst + (size_t)i * dstPitch,
                       (const char*)src + (size_t)i * srcPitch,
                       widthbytes);
            }
            ncvStat = NCV_SUCCESS;
            break;
        case NCVMemoryTypeDevice:
            if (cuStream != 0)
            {
                ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            }
            else
            {
                ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
            }
            ncvStat = NCV_SUCCESS;
            break;
        default:
            ncvStat = NCV_MEM_RESIDENCE_ERROR;
        }
        break;
    case NCVMemoryTypeDevice:
        switch (srcType)
        {
        case NCVMemoryTypeHostPageable:
        case NCVMemoryTypeHostPinned:
            if (cuStream != 0)
            {
                ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            }
            else
            {
                ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
            }
            ncvStat = NCV_SUCCESS;
            break;
        case NCVMemoryTypeDevice:
            if (cuStream != 0)
            {
                ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR);
            }
            else
            {
                ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR);
            }
            ncvStat = NCV_SUCCESS;
            break;
        default:
            ncvStat = NCV_MEM_RESIDENCE_ERROR;
        }
        break;
    default:
        ncvStat = NCV_MEM_RESIDENCE_ERROR;
    }
    return ncvStat;
}
//===================================================================
//
// NCVMemStackAllocator class members implementation
...
...
@@ -195,8 +262,10 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
_maxSize(0),
allocBegin(NULL),
begin(NULL),
end(NULL),
_memType(NCVMemoryTypeNone),
_alignment
(
alignment
)
_alignment(alignment),
bReusesMemory(false)
{
NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
...
...
@@ -573,4 +642,264 @@ double ncvEndQueryTimerMs(NcvTimer t)
return res;
}
#endif
/* !defined (HAVE_CUDA) */
//===================================================================
//
// Operations with rectangles
//
//===================================================================
//from OpenCV
void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);
/**
 * Groups the rectangle hypotheses on the host by delegating to groupRectangles().
 *
 * The first `numHypotheses` rectangles are copied out of `hypotheses`, grouped,
 * and the surviving rectangles are written back to the start of `hypotheses`.
 *
 * \param hypotheses        host-resident vector of rectangles (input and output)
 * \param numHypotheses     in: number of valid input rectangles;
 *                          out: number of rectangles left after grouping
 * \param minNeighbors      forwarded to groupRectangles() as its group threshold
 * \param intersectEps      forwarded to groupRectangles() as its eps
 * \param hypothesesWeights optional host-resident vector receiving one weight per
 *                          output rectangle; may be NULL
 * \return NCV_SUCCESS, or NCV_MEM_RESIDENCE_ERROR when a vector is not in host memory
 */
NCVStatus ncvGroupRectangles_host(NCVVector<NcvRect32u> &hypotheses,
                                  Ncv32u &numHypotheses,
                                  Ncv32u minNeighbors,
                                  Ncv32f intersectEps,
                                  NCVVector<Ncv32u> *hypothesesWeights)
{
    ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||
                    hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
    if (hypothesesWeights != NULL)
    {
        ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||
                        hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
    }

    if (numHypotheses == 0)
    {
        return NCV_SUCCESS;
    }

    std::vector<NcvRect32u> rects(numHypotheses);
    memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));

    // Weights are only requested from groupRectangles() when the caller wants them.
    std::vector<Ncv32u> weights;
    std::vector<Ncv32u> *weightsOut = NULL;
    if (hypothesesWeights != NULL)
    {
        weightsOut = &weights;
    }
    groupRectangles(rects, minNeighbors, intersectEps, weightsOut);

    numHypotheses = (Ncv32u)rects.size();
    if (numHypotheses > 0)
    {
        memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));
    }

    // Grouping may discard every rectangle, leaving `weights` empty; taking
    // &weights[0] would then be undefined behaviour. Also never read past the
    // end of `weights` should it be shorter than the rectangle list.
    if (hypothesesWeights != NULL && !weights.empty())
    {
        size_t numWeights = weights.size();
        if (numWeights > numHypotheses)
        {
            numWeights = numHypotheses;
        }
        memcpy(hypothesesWeights->ptr(), &weights[0], numWeights * sizeof(Ncv32u));
    }

    return NCV_SUCCESS;
}
/**
 * Draws the one-pixel outlines of a list of rectangles into a host image.
 *
 * Every edge is clipped against the image: an edge whose fixed coordinate lies
 * outside the image is skipped, and pixels past dstWidth / dstHeight are not written.
 *
 * \param h_dst     destination image
 * \param dstStride distance between rows, in elements of T
 * \param dstWidth  image width in pixels
 * \param dstHeight image height in pixels
 * \param h_rects   rectangles to outline
 * \param numRects  number of rectangles in h_rects
 * \param color     value written to every outline pixel
 * \return NCV_SUCCESS (also when numRects is 0), NCV_NULL_PTR, NCV_DIMENSIONS_INVALID
 *         or NCV_INVALID_STEP when the corresponding argument check fails
 */
template <class T>
static NCVStatus drawRectsWrapperHost(T *h_dst,
                                      Ncv32u dstStride,
                                      Ncv32u dstWidth,
                                      Ncv32u dstHeight,
                                      NcvRect32u *h_rects,
                                      Ncv32u numRects,
                                      T color)
{
    ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
    ncvAssertReturn(numRects != 0, NCV_SUCCESS);
    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);

    for (Ncv32u rectIdx = 0; rectIdx < numRects; ++rectIdx)
    {
        const NcvRect32u r = h_rects[rectIdx];

        // Fixed coordinates of the far edges and one-past-the-end bounds of the rectangle.
        const Ncv32u xRight  = r.x + r.width - 1;
        const Ncv32u yBottom = r.y + r.height - 1;
        const Ncv32u xEnd    = r.x + r.width;
        const Ncv32u yEnd    = r.y + r.height;

        // Loop limits clipped to the image.
        const Ncv32u rowStop = (yEnd < dstHeight) ? yEnd : dstHeight;
        const Ncv32u colStop = (xEnd < dstWidth) ? xEnd : dstWidth;

        // Vertical edges: left and right columns.
        const bool bDrawLeft  = r.x < dstWidth;
        const bool bDrawRight = xRight < dstWidth;
        for (Ncv32u row = r.y; row < rowStop; ++row)
        {
            if (bDrawLeft)
            {
                h_dst[row * dstStride + r.x] = color;
            }
            if (bDrawRight)
            {
                h_dst[row * dstStride + xRight] = color;
            }
        }

        // Horizontal edges: top and bottom rows.
        const bool bDrawTop    = r.y < dstHeight;
        const bool bDrawBottom = yBottom < dstHeight;
        for (Ncv32u col = r.x; col < colStop; ++col)
        {
            if (bDrawTop)
            {
                h_dst[r.y * dstStride + col] = color;
            }
            if (bDrawBottom)
            {
                h_dst[yBottom * dstStride + col] = color;
            }
        }
    }

    return NCV_SUCCESS;
}
/**
 * Outlines rectangles in an 8-bit host image.
 * Thin entry point that instantiates drawRectsWrapperHost for Ncv8u pixels and
 * returns its status.
 */
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
                               Ncv32u dstStride,
                               Ncv32u dstWidth,
                               Ncv32u dstHeight,
                               NcvRect32u *h_rects,
                               Ncv32u numRects,
                               Ncv8u color)
{
    const NCVStatus ncvStat = drawRectsWrapperHost<Ncv8u>(h_dst, dstStride, dstWidth, dstHeight,
                                                          h_rects, numRects, color);
    return ncvStat;
}
/**
 * Outlines rectangles in a 32-bit host image.
 * Thin entry point that instantiates drawRectsWrapperHost for Ncv32u pixels and
 * returns its status.
 */
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
                                Ncv32u dstStride,
                                Ncv32u dstWidth,
                                Ncv32u dstHeight,
                                NcvRect32u *h_rects,
                                Ncv32u numRects,
                                Ncv32u color)
{
    const NCVStatus ncvStat = drawRectsWrapperHost<Ncv32u>(h_dst, dstStride, dstWidth, dstHeight,
                                                           h_rects, numRects, color);
    return ncvStat;
}
// Threads per block used by the drawRects kernel, and the base-2 logarithm of that
// number (the kernel divides by the block size with a right shift).
const Ncv32u NUMTHREADS_DRAWRECTS = 32;
const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;


/**
 * Draws the one-pixel outlines of rectangles into a device image.
 *
 * Launch layout: four blocks per rectangle (one per edge), each a 1D block of
 * NUMTHREADS_DRAWRECTS threads. The linear block index is decoded as
 *   blockId >> 2   - index of the rectangle,
 *   blockId & 0x1  - set: vertical edge, clear: horizontal edge,
 *   blockId & 0x2  - set: left / top edge, clear: right / bottom edge.
 * The grid may be 2D; it is flattened row by row using gridDim.x. Surplus blocks
 * past numRects * 4 exit immediately.
 *
 * Each block walks its edge in chunks of NUMTHREADS_DRAWRECTS pixels; pixels
 * outside the image (dstWidth x dstHeight) are not written.
 *
 * \param d_dst     destination image
 * \param dstStride distance between rows, in elements of T
 * \param dstWidth  image width in pixels
 * \param dstHeight image height in pixels
 * \param d_rects   rectangles to outline
 * \param numRects  number of rectangles in d_rects
 * \param color     value written to every outline pixel
 */
template <class T>
__global__ void drawRects(T *d_dst,
                          Ncv32u dstStride,
                          Ncv32u dstWidth,
                          Ncv32u dstHeight,
                          NcvRect32u *d_rects,
                          Ncv32u numRects,
                          T color)
{
    // Flatten the grid using its actual row length instead of a hard-coded 65535.
    Ncv32u blockId = blockIdx.y * gridDim.x + blockIdx.x;

    // Valid ids are [0, numRects * 4). The comparison must be >=, otherwise the
    // block with blockId == numRects * 4 (present whenever the grid is rounded up
    // to a 2D shape) reads d_rects[numRects], one element past the end.
    if (blockId >= numRects * 4)
    {
        return;
    }

    NcvRect32u curRect = d_rects[blockId >> 2];
    NcvBool bVertical = blockId & 0x1;
    NcvBool bTopLeft = blockId & 0x2;

    Ncv32u pt0x, pt0y;
    if (bVertical)
    {
        // Number of block-sized chunks needed to cover the edge (ceil division).
        Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;

        pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
        pt0y = curRect.y;

        if (pt0x < dstWidth)
        {
            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
            {
                Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
                if (ptY < pt0y + curRect.height && ptY < dstHeight)
                {
                    d_dst[ptY * dstStride + pt0x] = color;
                }
            }
        }
    }
    else
    {
        // Number of block-sized chunks needed to cover the edge (ceil division).
        Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;

        pt0x = curRect.x;
        pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;

        if (pt0y < dstHeight)
        {
            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
            {
                Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
                if (ptX < pt0x + curRect.width && ptX < dstWidth)
                {
                    d_dst[pt0y * dstStride + ptX] = color;
                }
            }
        }
    }
}
/**
 * Validates the arguments and launches the drawRects kernel that outlines
 * rectangles in a device image.
 *
 * Four blocks of NUMTHREADS_DRAWRECTS threads are launched per rectangle. When
 * that exceeds 65535 blocks, the grid is folded into 2D with 65535 blocks per row.
 *
 * \param d_dst     destination image (device memory)
 * \param dstStride distance between rows, in elements of T
 * \param dstWidth  image width in pixels
 * \param dstHeight image height in pixels
 * \param d_rects   rectangles to outline (device memory)
 * \param numRects  number of rectangles in d_rects
 * \param color     value written to every outline pixel
 * \param cuStream  CUDA stream the kernel is enqueued on
 * \return NCV_SUCCESS (also when numRects is 0), NCV_NULL_PTR, NCV_DIMENSIONS_INVALID
 *         or NCV_INVALID_STEP when an argument check fails; the launch is checked
 *         through ncvAssertCUDAReturn with NCV_CUDA_ERROR
 */
template <class T>
static NCVStatus drawRectsWrapperDevice(T *d_dst,
                                        Ncv32u dstStride,
                                        Ncv32u dstWidth,
                                        Ncv32u dstHeight,
                                        NcvRect32u *d_rects,
                                        Ncv32u numRects,
                                        T color,
                                        cudaStream_t cuStream)
{
    ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);

    if (numRects == 0)
    {
        return NCV_SUCCESS;
    }

    dim3 grid(numRects * 4);
    dim3 block(NUMTHREADS_DRAWRECTS);
    if (grid.x > 65535)
    {
        // Too many blocks for one grid row: fold into rows of 65535 blocks (ceil division).
        grid.y = (grid.x + 65534) / 65535;
        grid.x = 65535;
    }

    // Enqueue on the caller's stream; previously cuStream was accepted but ignored
    // and the kernel always ran on the default stream.
    drawRects<T><<<grid, block, 0, cuStream>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);

    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

    return NCV_SUCCESS;
}
/**
 * Outlines rectangles in an 8-bit device image.
 * Thin entry point that instantiates drawRectsWrapperDevice for Ncv8u pixels and
 * returns its status.
 */
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
                                 Ncv32u dstStride,
                                 Ncv32u dstWidth,
                                 Ncv32u dstHeight,
                                 NcvRect32u *d_rects,
                                 Ncv32u numRects,
                                 Ncv8u color,
                                 cudaStream_t cuStream)
{
    const NCVStatus ncvStat = drawRectsWrapperDevice<Ncv8u>(d_dst, dstStride, dstWidth, dstHeight,
                                                            d_rects, numRects, color, cuStream);
    return ncvStat;
}
/**
 * Outlines rectangles in a 32-bit device image.
 * Thin entry point that instantiates drawRectsWrapperDevice for Ncv32u pixels and
 * returns its status.
 */
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
                                  Ncv32u dstStride,
                                  Ncv32u dstWidth,
                                  Ncv32u dstHeight,
                                  NcvRect32u *d_rects,
                                  Ncv32u numRects,
                                  Ncv32u color,
                                  cudaStream_t cuStream)
{
    const NCVStatus ncvStat = drawRectsWrapperDevice<Ncv32u>(d_dst, dstStride, dstWidth, dstHeight,
                                                             d_rects, numRects, color, cuStream);
    return ncvStat;
}
modules/gpu/src/nvidia/core/NCV.hpp
View file @
0c325cac
...
...
@@ -129,8 +129,8 @@ struct NcvRect8u
Ncv8u
y
;
Ncv8u
width
;
Ncv8u
height
;
NcvRect8u
()
:
x
(
0
),
y
(
0
),
width
(
0
),
height
(
0
)
{};
NcvRect8u
(
Ncv8u
x
,
Ncv8u
y
,
Ncv8u
width
,
Ncv8u
height
)
:
x
(
x
),
y
(
y
),
width
(
width
),
height
(
height
)
{}
__host__
__device__
NcvRect8u
()
:
x
(
0
),
y
(
0
),
width
(
0
),
height
(
0
)
{};
__host__
__device__
NcvRect8u
(
Ncv8u
x
,
Ncv8u
y
,
Ncv8u
width
,
Ncv8u
height
)
:
x
(
x
),
y
(
y
),
width
(
width
),
height
(
height
)
{}
};
...
...
@@ -140,8 +140,8 @@ struct NcvRect32s
Ncv32s
y
;
///< y-coordinate of upper left corner.
Ncv32s
width
;
///< Rectangle width.
Ncv32s
height
;
///< Rectangle height.
NcvRect32s
()
:
x
(
0
),
y
(
0
),
width
(
0
),
height
(
0
)
{};
NcvRect32s
(
Ncv32s
x
,
Ncv32s
y
,
Ncv32s
width
,
Ncv32s
height
)
:
x
(
x
),
y
(
y
),
width
(
width
),
height
(
height
)
{}
__host__
__device__
NcvRect32s
()
:
x
(
0
),
y
(
0
),
width
(
0
),
height
(
0
)
{};
__host__
__device__
NcvRect32s
(
Ncv32s
x
,
Ncv32s
y
,
Ncv32s
width
,
Ncv32s
height
)
:
x
(
x
),
y
(
y
),
width
(
width
),
height
(
height
)
{}
};
...
...
@@ -151,8 +151,8 @@ struct NcvRect32u
Ncv32u
y
;
///< y-coordinate of upper left corner.
Ncv32u
width
;
///< Rectangle width.
Ncv32u
height
;
///< Rectangle height.
NcvRect32u
()
:
x
(
0
),
y
(
0
),
width
(
0
),
height
(
0
)
{};
NcvRect32u
(
Ncv32u
x
,
Ncv32u
y
,
Ncv32u
width
,
Ncv32u
height
)
:
x
(
x
),
y
(
y
),
width
(
width
),
height
(
height
)
{}
__host__
__device__
NcvRect32u
()
:
x
(
0
),
y
(
0
),
width
(
0
),
height
(
0
)
{};
__host__
__device__
NcvRect32u
(
Ncv32u
x
,
Ncv32u
y
,
Ncv32u
width
,
Ncv32u
height
)
:
x
(
x
),
y
(
y
),
width
(
width
),
height
(
height
)
{}
};
...
...
@@ -160,8 +160,8 @@ struct NcvSize32s
{
Ncv32s
width
;
///< Rectangle width.
Ncv32s
height
;
///< Rectangle height.
NcvSize32s
()
:
width
(
0
),
height
(
0
)
{};
NcvSize32s
(
Ncv32s
width
,
Ncv32s
height
)
:
width
(
width
),
height
(
height
)
{}
__host__
__device__
NcvSize32s
()
:
width
(
0
),
height
(
0
)
{};
__host__
__device__
NcvSize32s
(
Ncv32s
width
,
Ncv32s
height
)
:
width
(
width
),
height
(
height
)
{}
};
...
...
@@ -169,8 +169,8 @@ struct NcvSize32u
{
Ncv32u
width
;
///< Rectangle width.
Ncv32u
height
;
///< Rectangle height.
NcvSize32u
()
:
width
(
0
),
height
(
0
)
{};
NcvSize32u
(
Ncv32u
width
,
Ncv32u
height
)
:
width
(
width
),
height
(
height
)
{}
__host__
__device__
NcvSize32u
()
:
width
(
0
),
height
(
0
)
{};
__host__
__device__
NcvSize32u
(
Ncv32u
width
,
Ncv32u
height
)
:
width
(
width
),
height
(
height
)
{}
};
...
...
@@ -275,6 +275,7 @@ enum NCVStatus
{
//NCV statuses
NCV_SUCCESS
,
NCV_UNKNOWN_ERROR
,
NCV_CUDA_ERROR
,
NCV_NPP_ERROR
,
...
...
@@ -501,13 +502,18 @@ private:
/**
* Copy dispatcher
* Copy dispatcher
s
*/
NCV_EXPORTS
NCVStatus
memSegCopyHelper
(
void
*
dst
,
NCVMemoryType
dstType
,
const
void
*
src
,
NCVMemoryType
srcType
,
size_t
sz
,
cudaStream_t
cuStream
);
NCV_EXPORTS
NCVStatus
memSegCopyHelper2D
(
void
*
dst
,
Ncv32u
dstPitch
,
NCVMemoryType
dstType
,
const
void
*
src
,
Ncv32u
srcPitch
,
NCVMemoryType
srcType
,
Ncv32u
widthbytes
,
Ncv32u
height
,
cudaStream_t
cuStream
);
/**
* NCVVector (1D)
*/
...
...
@@ -532,7 +538,7 @@ public:
_memtype
=
NCVMemoryTypeNone
;
}
NCVStatus
copySolid
(
NCVVector
<
T
>
&
dst
,
cudaStream_t
cuStream
,
size_t
howMuch
=
0
)
NCVStatus
copySolid
(
NCVVector
<
T
>
&
dst
,
cudaStream_t
cuStream
,
size_t
howMuch
=
0
)
const
{
if
(
howMuch
==
0
)
{
...
...
@@ -600,7 +606,6 @@ public:
this
->
_memtype
=
this
->
allocatedMem
.
begin
.
memtype
;
}
~
NCVVectorAlloc
()
{
NCVStatus
ncvStat
;
...
...
@@ -611,25 +616,22 @@ public:
this
->
clear
();
}
NcvBool
isMemAllocated
()
const
{
return
(
this
->
allocatedMem
.
begin
.
ptr
!=
NULL
)
||
(
this
->
allocator
.
isCounting
());
}
Ncv32u
getAllocatorsAlignment
()
const
{
return
allocator
.
alignment
();
}
NCVMemSegment
getSegment
()
const
{
return
allocatedMem
;
}
private
:
private
:
INCVMemAllocator
&
allocator
;
NCVMemSegment
allocatedMem
;
};
...
...
@@ -658,7 +660,6 @@ public:
this
->
bReused
=
true
;
}
NCVVectorReuse
(
const
NCVMemSegment
&
memSegment
,
Ncv32u
length
)
{
this
->
bReused
=
false
;
...
...
@@ -674,7 +675,6 @@ public:
this
->
bReused
=
true
;
}
NcvBool
isMemReused
()
const
{
return
this
->
bReused
;
...
...
@@ -703,7 +703,6 @@ public:
virtual
~
NCVMatrix
()
{}
void
clear
()
{
_ptr
=
NULL
;
...
...
@@ -713,14 +712,13 @@ public:
_memtype
=
NCVMemoryTypeNone
;
}
Ncv32u
stride
()
const
{
return
_pitch
/
sizeof
(
T
);
}
NCVStatus
copySolid
(
NCVMatrix
<
T
>
&
dst
,
cudaStream_t
cuStream
,
size_t
howMuch
=
0
)
//a side effect of this function is that it copies everything in a single chunk, so the "padding" will be overwritten
NCVStatus
copySolid
(
NCVMatrix
<
T
>
&
dst
,
cudaStream_t
cuStream
,
size_t
howMuch
=
0
)
const
{
if
(
howMuch
==
0
)
{
...
...
@@ -748,6 +746,24 @@ public:
return
ncvStat
;
}
// Copies a roi.width x roi.height region, starting at the first element of this
// matrix, into the start of dst. Rows are copied with each matrix's own pitch via
// memSegCopyHelper2D, on cuStream.
// Returns NCV_MEM_COPY_ERROR when the ROI does not fit into either matrix and
// NCV_NULL_PTR when a matrix with a memory type has no pointer. A source without a
// memory type (NCVMemoryTypeNone) copies nothing and yields NCV_SUCCESS.
NCVStatus copy2D(NCVMatrix<T> &dst, NcvSize32u roi, cudaStream_t cuStream) const
{
    ncvAssertReturn(this->width() >= roi.width && this->height() >= roi.height &&
                    dst.width() >= roi.width && dst.height() >= roi.height, NCV_MEM_COPY_ERROR);
    ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) &&
                    (dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR);

    // Nothing to copy from a matrix that has no memory attached.
    if (this->_memtype == NCVMemoryTypeNone)
    {
        return NCV_SUCCESS;
    }

    return memSegCopyHelper2D(dst._ptr, dst._pitch, dst._memtype,
                              this->_ptr, this->_pitch, this->_memtype,
                              roi.width * sizeof(T), roi.height, cuStream);
}
T
*
ptr
()
const
{
return
this
->
_ptr
;}
Ncv32u
width
()
const
{
return
this
->
_width
;}
Ncv32u
height
()
const
{
return
this
->
_height
;}
...
...
@@ -817,19 +833,16 @@ public:
this
->
clear
();
}
NcvBool
isMemAllocated
()
const
{
return
(
this
->
allocatedMem
.
begin
.
ptr
!=
NULL
)
||
(
this
->
allocator
.
isCounting
());
}
Ncv32u
getAllocatorsAlignment
()
const
{
return
allocator
.
alignment
();
}
NCVMemSegment
getSegment
()
const
{
return
allocatedMem
;
...
...
@@ -888,6 +901,23 @@ public:
this
->
bReused
=
true
;
}
NCVMatrixReuse
(
const
NCVMatrix
<
T
>
&
mat
,
NcvRect32u
roi
)
{
this
->
bReused
=
false
;
this
->
clear
();
ncvAssertPrintReturn
(
roi
.
x
<
mat
.
width
()
&&
roi
.
y
<
mat
.
height
()
&&
\
roi
.
x
+
roi
.
width
<=
mat
.
width
()
&&
roi
.
y
+
roi
.
height
<=
mat
.
height
(),
"NCVMatrixReuse ctor:: memory binding failed due to mismatching ROI and source matrix dims"
,
);
this
->
_width
=
roi
.
width
;
this
->
_height
=
roi
.
height
;
this
->
_pitch
=
mat
.
pitch
();
this
->
_ptr
=
mat
.
ptr
()
+
roi
.
y
*
mat
.
stride
()
+
roi
.
x
;
this
->
_memtype
=
mat
.
memType
();
this
->
bReused
=
true
;
}
NcvBool
isMemReused
()
const
{
...
...
@@ -899,4 +929,27 @@ private:
NcvBool
bReused
;
};
/**
* Operations with rectangles
*/
NCV_EXPORTS
NCVStatus
ncvGroupRectangles_host
(
NCVVector
<
NcvRect32u
>
&
hypotheses
,
Ncv32u
&
numHypotheses
,
Ncv32u
minNeighbors
,
Ncv32f
intersectEps
,
NCVVector
<
Ncv32u
>
*
hypothesesWeights
);
NCV_EXPORTS
NCVStatus
ncvDrawRects_8u_host
(
Ncv8u
*
h_dst
,
Ncv32u
dstStride
,
Ncv32u
dstWidth
,
Ncv32u
dstHeight
,
NcvRect32u
*
h_rects
,
Ncv32u
numRects
,
Ncv8u
color
);
NCV_EXPORTS
NCVStatus
ncvDrawRects_32u_host
(
Ncv32u
*
h_dst
,
Ncv32u
dstStride
,
Ncv32u
dstWidth
,
Ncv32u
dstHeight
,
NcvRect32u
*
h_rects
,
Ncv32u
numRects
,
Ncv32u
color
);
NCV_EXPORTS
NCVStatus
ncvDrawRects_8u_device
(
Ncv8u
*
d_dst
,
Ncv32u
dstStride
,
Ncv32u
dstWidth
,
Ncv32u
dstHeight
,
NcvRect32u
*
d_rects
,
Ncv32u
numRects
,
Ncv8u
color
,
cudaStream_t
cuStream
);
NCV_EXPORTS
NCVStatus
ncvDrawRects_32u_device
(
Ncv32u
*
d_dst
,
Ncv32u
dstStride
,
Ncv32u
dstWidth
,
Ncv32u
dstHeight
,
NcvRect32u
*
d_rects
,
Ncv32u
numRects
,
Ncv32u
color
,
cudaStream_t
cuStream
);
#endif // _ncv_hpp_
modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
View file @
0c325cac
...
...
@@ -150,14 +150,14 @@ namespace NCVRuntimeTemplateBool
{
//Convenience function used by the user
//Takes a variable argument list, transforms it into a list
static
void
call
(
Func
*
functor
,
int
dummy
,
...)
static
void
call
(
Func
*
functor
,
...)
{
//Vector used to collect arguments
std
::
vector
<
int
>
templateParamList
;
//Variable argument list manipulation
va_list
listPointer
;
va_start
(
listPointer
,
dummy
);
va_start
(
listPointer
,
functor
);
//Collect parameters into the list
for
(
int
i
=
0
;
i
<
NumArguments
;
i
++
)
{
...
...
modules/gpu/test/nvidia/TestHypothesesFilter.cpp
View file @
0c325cac
...
...
@@ -134,7 +134,7 @@ bool TestHypothesesFilter::process()
Ncv32u
numHypothesesSrc
=
h_vecSrc
.
length
();
NCV_SKIP_COND_BEGIN
ncvStat
=
ncv
FilterHypothes
es_host
(
h_vecSrc
,
numHypothesesSrc
,
this
->
minNeighbors
,
this
->
eps
,
NULL
);
ncvStat
=
ncv
GroupRectangl
es_host
(
h_vecSrc
,
numHypothesesSrc
,
this
->
minNeighbors
,
this
->
eps
,
NULL
);
ncvAssertReturn
(
ncvStat
==
NCV_SUCCESS
,
false
);
NCV_SKIP_COND_END
...
...
modules/gpu/test/nvidia/TestResize.cpp
View file @
0c325cac
...
...
@@ -83,17 +83,17 @@ bool TestResize<T>::process()
NCV_SKIP_COND_BEGIN
if
(
sizeof
(
T
)
==
sizeof
(
Ncv32u
))
{
ncvStat
=
nppiStD
ownsampleNearest
_32u_C1R
((
Ncv32u
*
)
d_img
.
ptr
(),
d_img
.
pitch
(),
(
Ncv32u
*
)
d_small
.
ptr
(),
d_small
.
pitch
(),
srcSize
,
this
->
scaleFactor
,
this
->
bTextureCache
);
ncvStat
=
nppiStD
ecimate
_32u_C1R
((
Ncv32u
*
)
d_img
.
ptr
(),
d_img
.
pitch
(),
(
Ncv32u
*
)
d_small
.
ptr
(),
d_small
.
pitch
(),
srcSize
,
this
->
scaleFactor
,
this
->
bTextureCache
);
}
else
if
(
sizeof
(
T
)
==
sizeof
(
Ncv64u
))
{
ncvStat
=
nppiStD
ownsampleNearest
_64u_C1R
((
Ncv64u
*
)
d_img
.
ptr
(),
d_img
.
pitch
(),
(
Ncv64u
*
)
d_small
.
ptr
(),
d_small
.
pitch
(),
srcSize
,
this
->
scaleFactor
,
this
->
bTextureCache
);
ncvStat
=
nppiStD
ecimate
_64u_C1R
((
Ncv64u
*
)
d_img
.
ptr
(),
d_img
.
pitch
(),
(
Ncv64u
*
)
d_small
.
ptr
(),
d_small
.
pitch
(),
srcSize
,
this
->
scaleFactor
,
this
->
bTextureCache
);
}
else
{
...
...
@@ -107,15 +107,15 @@ bool TestResize<T>::process()
NCV_SKIP_COND_BEGIN
if
(
sizeof
(
T
)
==
sizeof
(
Ncv32u
))
{
ncvStat
=
nppiStD
ownsampleNearest
_32u_C1R_host
((
Ncv32u
*
)
h_img
.
ptr
(),
h_img
.
pitch
(),
(
Ncv32u
*
)
h_small
.
ptr
(),
h_small
.
pitch
(),
srcSize
,
this
->
scaleFactor
);
ncvStat
=
nppiStD
ecimate
_32u_C1R_host
((
Ncv32u
*
)
h_img
.
ptr
(),
h_img
.
pitch
(),
(
Ncv32u
*
)
h_small
.
ptr
(),
h_small
.
pitch
(),
srcSize
,
this
->
scaleFactor
);
}
else
if
(
sizeof
(
T
)
==
sizeof
(
Ncv64u
))
{
ncvStat
=
nppiStD
ownsampleNearest
_64u_C1R_host
((
Ncv64u
*
)
h_img
.
ptr
(),
h_img
.
pitch
(),
(
Ncv64u
*
)
h_small
.
ptr
(),
h_small
.
pitch
(),
srcSize
,
this
->
scaleFactor
);
ncvStat
=
nppiStD
ecimate
_64u_C1R_host
((
Ncv64u
*
)
h_img
.
ptr
(),
h_img
.
pitch
(),
(
Ncv64u
*
)
h_small
.
ptr
(),
h_small
.
pitch
(),
srcSize
,
this
->
scaleFactor
);
}
else
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment