Commit 0c325cac authored by Anton Obukhov

[~] Minor refactoring, clean-up

[+] Added 128-bit transpose
parent e2caf4a3
@@ -63,8 +63,6 @@
#include "NCVRuntimeTemplates.hpp"
#include "NCVHaarObjectDetection.hpp"
-void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);
//==============================================================================
//
@@ -785,7 +783,6 @@ void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
::call( &functor,
-0xC001C0DE, //this is dummy int for the va_args C compatibility
tbInitMaskPositively,
tbCacheTextureIImg,
tbCacheTextureCascade,
@@ -890,7 +887,6 @@ void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
::call( &functor,
-0xC001C0DE, //this is dummy int for the va_args C compatibility
tbCacheTextureIImg,
tbCacheTextureCascade,
tbDoAtomicCompaction);
@@ -957,7 +953,6 @@ void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
::call( &functor,
-0xC001C0DE, //this is dummy int for the va_args C compatibility
tbMaskByInmask,
tbDoAtomicCompaction);
}
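For readers unfamiliar with the pattern: KernelCaller peels run-time booleans into compile-time template parameters one flag at a time, and the removed 0xC001C0DE literal was only a dummy first argument required by the old varargs-based call(). A minimal standalone sketch of the dispatch idea (simplified; not the actual NCVRuntimeTemplates code):

#include <cstdio>

// Target parameterized on a compile-time flag, like the Haar kernels above.
template <bool tbFlag>
void runKernel() { std::printf("flag=%d\n", (int)tbFlag); }

// Peel one run-time boolean into a template argument; NCVRuntimeTemplateBool
// chains this step recursively for several flags.
inline void dispatch(bool flag)
{
    if (flag) runKernel<true>();
    else      runKernel<false>();
}

int main()
{
    dispatch(true);   // instantiates runKernel<true>
    dispatch(false);  // instantiates runKernel<false>
    return 0;
}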
@@ -1554,172 +1549,6 @@ NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
}
//==============================================================================
//
// Visualize file
//
//==============================================================================
const Ncv32u NUMTHREADS_DRAWRECTS = 32;
const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;
template <class T>
__global__ void drawRects(T *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
T color)
{
Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
if (blockId > numRects * 4)
{
return;
}
NcvRect32u curRect = d_rects[blockId >> 2];
NcvBool bVertical = blockId & 0x1;
NcvBool bTopLeft = blockId & 0x2;
Ncv32u pt0x, pt0y;
if (bVertical)
{
Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
pt0y = curRect.y;
if (pt0x < dstWidth)
{
for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
{
Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
if (ptY < pt0y + curRect.height && ptY < dstHeight)
{
d_dst[ptY * dstStride + pt0x] = color;
}
}
}
}
else
{
Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
pt0x = curRect.x;
pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;
if (pt0y < dstHeight)
{
for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
{
Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
if (ptX < pt0x + curRect.width && ptX < dstWidth)
{
d_dst[pt0y * dstStride + ptX] = color;
}
}
}
}
}
template <class T>
static NCVStatus drawRectsWrapperDevice(T *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
T color,
cudaStream_t cuStream)
{
ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
if (numRects == 0)
{
return NCV_SUCCESS;
}
#if defined _SELF_TEST_
T *h_dst;
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
NcvRect32s *h_rects;
ncvAssertCUDAReturn(cudaMallocHost(&h_rects, numRects * sizeof(NcvRect32s)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_rects, d_rects, numRects * sizeof(NcvRect32s), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
ncvAssertReturnNcvStat(drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color));
#endif
dim3 grid(numRects * 4);
dim3 block(NUMTHREADS_DRAWRECTS);
if (grid.x > 65535)
{
grid.y = (grid.x + 65534) / 65535;
grid.x = 65535;
}
drawRects<T><<<grid, block>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
#if defined _SELF_TEST_
T *h_dst_after;
ncvAssertCUDAReturn(cudaMallocHost(&h_dst_after, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst_after, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
bool bPass = true;
for (Ncv32u i=0; i<dstHeight && bPass; i++)
{
for (Ncv32u j=0; j<dstWidth && bPass; j++)
{
if (h_dst[i*dstStride+j] != h_dst_after[i*dstStride+j])
{
printf("::drawRectsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_dst[i*dstStride+j], h_dst_after[i*dstStride+j]);
bPass = false;
}
}
}
ncvAssertCUDAReturn(cudaFreeHost(h_dst_after), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaFreeHost(h_rects), NCV_CUDA_ERROR);
printf("::drawRectsWrapperDevice %s\n", bPass?"PASSED":"FAILED");
#endif
return NCV_SUCCESS;
}
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv8u color,
cudaStream_t cuStream)
{
return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
}
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv32u color,
cudaStream_t cuStream)
{
return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
}
//==============================================================================
//
// Pipeline file
@@ -1901,13 +1730,13 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
NCV_SKIP_COND_BEGIN
-nppStat = nppiStDownsampleNearest_32u_C1R(
+nppStat = nppiStDecimate_32u_C1R(
d_integralImage.ptr(), d_integralImage.pitch(),
d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
srcIIRoi, scale, true);
ncvAssertReturnNcvStat(nppStat);
-nppStat = nppiStDownsampleNearest_64u_C1R(
+nppStat = nppiStDecimate_64u_C1R(
d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
srcIIRoi, scale, true);
@@ -1969,7 +1798,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
}
Ncv32u numStrongHypothesesNow = dstNumRects;
-ncvStat = ncvFilterHypotheses_host(
+ncvStat = ncvGroupRectangles_host(
h_hypothesesIntermediate,
numStrongHypothesesNow,
minNeighbors,
@@ -2031,7 +1860,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
}
-ncvStat = ncvFilterHypotheses_host(
+ncvStat = ncvGroupRectangles_host(
h_hypothesesIntermediate,
dstNumRects,
minNeighbors,
@@ -2285,133 +2114,6 @@ NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
}
NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
Ncv32u &numHypotheses,
Ncv32u minNeighbors,
Ncv32f intersectEps,
NCVVector<Ncv32u> *hypothesesWeights)
{
ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||
hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
if (hypothesesWeights != NULL)
{
ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||
hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
}
if (numHypotheses == 0)
{
return NCV_SUCCESS;
}
std::vector<NcvRect32u> rects(numHypotheses);
memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));
std::vector<Ncv32u> weights;
if (hypothesesWeights != NULL)
{
groupRectangles(rects, minNeighbors, intersectEps, &weights);
}
else
{
groupRectangles(rects, minNeighbors, intersectEps, NULL);
}
numHypotheses = (Ncv32u)rects.size();
if (numHypotheses > 0)
{
memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));
}
if (hypothesesWeights != NULL)
{
memcpy(hypothesesWeights->ptr(), &weights[0], numHypotheses * sizeof(Ncv32u));
}
return NCV_SUCCESS;
}
template <class T>
static NCVStatus drawRectsWrapperHost(T *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
T color)
{
ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
ncvAssertReturn(numRects != 0, NCV_SUCCESS);
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
for (Ncv32u i=0; i<numRects; i++)
{
NcvRect32u rect = h_rects[i];
if (rect.x < dstWidth)
{
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
{
h_dst[i*dstStride+rect.x] = color;
}
}
if (rect.x+rect.width-1 < dstWidth)
{
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
{
h_dst[i*dstStride+rect.x+rect.width-1] = color;
}
}
if (rect.y < dstHeight)
{
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
{
h_dst[rect.y*dstStride+j] = color;
}
}
if (rect.y + rect.height - 1 < dstHeight)
{
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
{
h_dst[(rect.y+rect.height-1)*dstStride+j] = color;
}
}
}
return NCV_SUCCESS;
}
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv8u color)
{
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
}
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv32u color)
{
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
}
NCVStatus loadFromXML(const std::string &filename,
HaarClassifierCascadeDescriptor &haar,
std::vector<HaarStage64> &haarStages,
......
@@ -346,153 +346,107 @@ enum
NCVPipeObjDet_VisualizeInPlace = 0x004,
};
-NCV_EXPORTS
-NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
+NCV_EXPORTS NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
NcvSize32u srcRoi,
NCVVector<NcvRect32u> &d_dstRects,
Ncv32u &dstNumRects,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarStage64> &d_HaarStages,
NCVVector<HaarClassifierNode128> &d_HaarNodes,
NCVVector<HaarFeature64> &d_HaarFeatures,
NcvSize32u minObjSize,
Ncv32u minNeighbors, //default 4
Ncv32f scaleStep, //default 1.2f
Ncv32u pixelStep, //default 1
Ncv32u flags, //default NCVPipeObjDet_Default
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
cudaDeviceProp &devProp,
cudaStream_t cuStream);
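A hypothetical call shape for this entry point, using the defaults noted in the parameter comments. Everything here is assumed to be prepared by the caller; the 24x24 window size and the wrapper name are illustrative, not from the commit:

NCVStatus runDetection(NCVMatrix<Ncv8u> &d_srcImg, NcvSize32u srcRoi,
                       NCVVector<NcvRect32u> &d_dstRects,
                       HaarClassifierCascadeDescriptor &haar,
                       NCVVector<HaarStage64> &h_stages, NCVVector<HaarStage64> &d_stages,
                       NCVVector<HaarClassifierNode128> &d_nodes,
                       NCVVector<HaarFeature64> &d_features,
                       INCVMemAllocator &gpuAlloc, INCVMemAllocator &cpuAlloc,
                       cudaDeviceProp &devProp, cudaStream_t stream)
{
    Ncv32u numDetections = 0;
    NcvSize32u minObjSize;
    minObjSize.width = 24;   // assumed cascade window size
    minObjSize.height = 24;
    return ncvDetectObjectsMultiScale_device(
        d_srcImg, srcRoi, d_dstRects, numDetections, haar,
        h_stages, d_stages, d_nodes, d_features,
        minObjSize,
        4,                      // minNeighbors (default per the comment)
        1.2f,                   // scaleStep (default)
        1,                      // pixelStep (default)
        NCVPipeObjDet_Default,  // flags (default)
        gpuAlloc, cpuAlloc, devProp, stream);
}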
#define OBJDET_MASK_ELEMENT_INVALID_32U 0xFFFFFFFF
#define HAAR_STDDEV_BORDER 1
-NCV_EXPORTS
-NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
+NCV_EXPORTS NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
NCVMatrix<Ncv32f> &d_weights,
NCVMatrixAlloc<Ncv32u> &d_pixelMask,
Ncv32u &numDetections,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarStage64> &d_HaarStages,
NCVVector<HaarClassifierNode128> &d_HaarNodes,
NCVVector<HaarFeature64> &d_HaarFeatures,
NcvBool bMaskElements,
NcvSize32u anchorsRoi,
Ncv32u pixelStep,
Ncv32f scaleArea,
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
cudaDeviceProp &devProp,
cudaStream_t cuStream);
-NCV_EXPORTS
-NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
+NCV_EXPORTS NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
NCVMatrix<Ncv32f> &h_weights,
NCVMatrixAlloc<Ncv32u> &h_pixelMask,
Ncv32u &numDetections,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures,
NcvBool bMaskElements,
NcvSize32u anchorsRoi,
Ncv32u pixelStep,
Ncv32f scaleArea);
NCV_EXPORTS
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv8u color,
cudaStream_t cuStream);
NCV_EXPORTS
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv32u color,
cudaStream_t cuStream);
NCV_EXPORTS
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv8u color);
NCV_EXPORTS
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv32u color);
#define RECT_SIMILARITY_PROPORTION 0.2f
-NCV_EXPORTS
-NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
+NCV_EXPORTS NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
Ncv32u numPixelMaskDetections,
NCVVector<NcvRect32u> &hypotheses,
Ncv32u &totalDetections,
Ncv32u totalMaxDetections,
Ncv32u rectWidth,
Ncv32u rectHeight,
Ncv32f curScale,
cudaStream_t cuStream);
-NCV_EXPORTS
-NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
+NCV_EXPORTS NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
Ncv32u numPixelMaskDetections,
NCVVector<NcvRect32u> &hypotheses,
Ncv32u &totalDetections,
Ncv32u totalMaxDetections,
Ncv32u rectWidth,
Ncv32u rectHeight,
Ncv32f curScale);
-NCV_EXPORTS
-NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
-Ncv32u &numHypotheses,
-Ncv32u minNeighbors,
-Ncv32f intersectEps,
-NCVVector<Ncv32u> *hypothesesWeights);
-NCV_EXPORTS
-NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
-Ncv32u &numNodes, Ncv32u &numFeatures);
+NCV_EXPORTS NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
+Ncv32u &numNodes, Ncv32u &numFeatures);
-NCV_EXPORTS
-NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
-HaarClassifierCascadeDescriptor &haar,
-NCVVector<HaarStage64> &h_HaarStages,
-NCVVector<HaarClassifierNode128> &h_HaarNodes,
-NCVVector<HaarFeature64> &h_HaarFeatures);
+NCV_EXPORTS NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
+HaarClassifierCascadeDescriptor &haar,
+NCVVector<HaarStage64> &h_HaarStages,
+NCVVector<HaarClassifierNode128> &h_HaarNodes,
+NCVVector<HaarFeature64> &h_HaarFeatures);
-NCV_EXPORTS
-NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
-HaarClassifierCascadeDescriptor haar,
-NCVVector<HaarStage64> &h_HaarStages,
-NCVVector<HaarClassifierNode128> &h_HaarNodes,
-NCVVector<HaarFeature64> &h_HaarFeatures);
+NCV_EXPORTS NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
+HaarClassifierCascadeDescriptor haar,
+NCVVector<HaarStage64> &h_HaarStages,
+NCVVector<HaarClassifierNode128> &h_HaarNodes,
+NCVVector<HaarFeature64> &h_HaarFeatures);
......
@@ -44,10 +44,6 @@
#include <cuda_runtime.h>
#include "NPP_staging.hpp"
-#if defined _SELF_TEST_
-#include <stdio.h>
-#endif
texture<Ncv8u, 1, cudaReadModeElementType> tex8u;
texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
@@ -161,12 +157,6 @@ const Ncv32u NUM_SCAN_THREADS = 256;
const Ncv32u LOG2_NUM_SCAN_THREADS = 8;
-struct T_true {};
-struct T_false {};
-template <typename T, typename U> struct is_same : T_false {};
-template <typename T> struct is_same<T, T> : T_true {};
template<class T_in, class T_out>
struct _scanElemOp
{
@@ -175,13 +165,16 @@ struct _scanElemOp
{
return scanElemOp( elem, Int2Type<(int)tbDoSqr>() );
}
private:
template <int v> struct Int2Type { enum { value = v }; };
static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<0>)
{
return (T_out)elem;
}
static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<1>)
{
return (T_out)(elem*elem);
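The tbDoSqr flag is routed through Int2Type so that overload resolution, not a run-time branch, picks between the plain and the squared scan. A standalone sketch of the same tag-dispatch (simplified from the struct above):

// Standalone illustration of the Int2Type tag-dispatch used by _scanElemOp.
template <int v> struct Int2Type { enum { value = v }; };

inline unsigned op(unsigned elem, Int2Type<0>) { return elem; }          // plain scan
inline unsigned op(unsigned elem, Int2Type<1>) { return elem * elem; }   // squared scan

template <bool tbDoSqr>
inline unsigned scanElem(unsigned elem)
{
    return op(elem, Int2Type<(int)tbDoSqr>());  // overload picked at compile time
}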
@@ -190,25 +183,25 @@ private:
template<class T>
-inline __device__ T readElem(T *d_src, Ncv32u srcStride, Ncv32u curElemOffs);
+inline __device__ T readElem(T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
template<>
-inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
+inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{
-return tex1Dfetch(tex8u, srcStride * blockIdx.x + curElemOffs);
+return tex1Dfetch(tex8u, texOffs + srcStride * blockIdx.x + curElemOffs);
}
template<>
-inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
+inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{
return d_src[curElemOffs];
}
template<>
-inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
+inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{
return d_src[curElemOffs];
}
@@ -233,7 +226,7 @@ inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u srcStride, Ncv32
* \return None
*/
template <class T_in, class T_out, bool tbDoSqr>
-__global__ void scanRows(T_in *d_src, Ncv32u srcWidth, Ncv32u srcStride,
+__global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride,
T_out *d_II, Ncv32u IIstride)
{
//advance pointers to the current line
@@ -263,7 +256,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u srcWidth, Ncv32u srcStride,
if (curElemOffs < srcWidth)
{
//load elements
-curElem = readElem<T_in>(d_src, srcStride, curElemOffs);
+curElem = readElem<T_in>(d_src, texOffs, srcStride, curElemOffs);
}
curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
@@ -298,55 +291,28 @@ NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
{
cudaChannelFormatDesc cfdTex;
+size_t alignmentOffset = 0;
if (sizeof(T_in) == 1)
{
cfdTex = cudaCreateChannelDesc<Ncv8u>();
-size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
-ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
+if (alignmentOffset > 0)
+{
+ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR);
+ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
+}
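Why the rebind: cudaBindTexture reports a nonzero byte offset when d_src does not start at a texture-aligned address, so the code binds again with the range extended by that offset and forwards it to the kernel as texOffs. For the 8-bit path the byte offset equals the element offset, which is why only readElem<Ncv8u> applies it:

// Sketch of the fetch that consumes the offset (cf. readElem<Ncv8u> above):
//   value = tex1Dfetch(tex8u, texOffs + srcStride * row + col);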
}
scanRows
<T_in, T_out, tbDoSqr>
<<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
-(d_src, roi.width, srcStride, d_dst, dstStride);
+(d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
#if defined _SELF_TEST_
T_in *h_src;
T_out *h_dst;
ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * roi.height * sizeof(T_in)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * roi.height * sizeof(T_out)), NPPST_MEM_ALLOC_ERR);
memset(h_src, 0, srcStride * roi.height * sizeof(T_in));
memset(h_dst, 0, dstStride * roi.height * sizeof(T_out));
ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * roi.height * sizeof(T_in), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * roi.height * sizeof(T_out), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
NcvBool bPass = true;
for (Ncv32u i=0; i<roi.height && bPass; i++)
{
T_out curElem = 0;
for (Ncv32u j=0; j<roi.width+1 && bPass; j++)
{
if (curElem != h_dst[i * dstStride + j])
{
printf("CIntegralImage::scanRowsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, curElem, h_dst[i * dstStride + j]);
bPass = false;
}
if (j < roi.width)
{
curElem += scanElemOp<T_op>(h_src[i*srcStride+j]);
}
}
}
ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
printf("CIntegralImage::scanRowsWrapperDevice %s\n", bPass?"PASSED":"FAILED");
#endif
return NPPST_SUCCESS;
}
-Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
+static Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
{
Ncv32u alignMask = allocatorAlignment-1;
Ncv32u inverseAlignMask = ~alignMask;
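Worked example of the padding these masks implement (assuming the usual round-up-to-alignment computation they are used for; the rest of the body is outside this hunk):

// dim = 1000 elements, elemTypeSize = 4 bytes, allocatorAlignment = 512 bytes:
//   alignMask        = 511  (0x1FF)
//   inverseAlignMask = ~511
//   bytes            = 1000 * 4 = 4000
//   padded bytes     = (4000 + 511) & ~511 = 4096
//   padded dim       = 4096 / 4 = 1024 elements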
@@ -676,7 +642,7 @@ NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
//==============================================================================
//
-// DownsampleNearest.cu
+// Decimate.cu
//
//==============================================================================
@@ -686,25 +652,25 @@ const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
template<class T, NcvBool tbCacheTexture>
-__device__ T getElem_DownsampleNearest(Ncv32u x, T *d_src);
+__device__ T getElem_Decimate(Ncv32u x, T *d_src);
template<>
-__device__ Ncv32u getElem_DownsampleNearest<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
+__device__ Ncv32u getElem_Decimate<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
{
return tex1Dfetch(tex32u, x);
}
template<>
-__device__ Ncv32u getElem_DownsampleNearest<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
+__device__ Ncv32u getElem_Decimate<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
{
return d_src[x];
}
template<>
-__device__ Ncv64u getElem_DownsampleNearest<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
+__device__ Ncv64u getElem_Decimate<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
{
uint2 tmp = tex1Dfetch(tex64u, x);
Ncv64u res = (Ncv64u)tmp.y;
@@ -715,14 +681,14 @@ __device__ Ncv64u getElem_DownsampleNearest<Ncv64u, true>(Ncv32u x, Ncv64u *d_sr
template<>
-__device__ Ncv64u getElem_DownsampleNearest<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
+__device__ Ncv64u getElem_Decimate<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
{
return d_src[x];
}
template <class T, NcvBool tbCacheTexture>
-__global__ void downsampleNearest_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
+__global__ void decimate_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
NcvSize32u dstRoi, Ncv32u scale)
{
int curX = blockIdx.x * blockDim.x + threadIdx.x;
@@ -733,12 +699,12 @@ __global__ void downsampleNearest_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u
return;
}
-d_dst[curY * dstStep + curX] = getElem_DownsampleNearest<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
+d_dst[curY * dstStep + curX] = getElem_Decimate<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
}
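The linear index works out because (curY * srcStep + curX) * scale = (curY * scale) * srcStep + (curX * scale), i.e. row curY*scale, column curX*scale of the source. A host reference of the same nearest-neighbor mapping (standalone sketch, not part of the commit):

template <class T>
void decimateRef(const T *src, unsigned srcStep, T *dst, unsigned dstStep,
                 unsigned dstWidth, unsigned dstHeight, unsigned scale)
{
    for (unsigned y = 0; y < dstHeight; y++)
        for (unsigned x = 0; x < dstWidth; x++)
            dst[y * dstStep + x] = src[(y * srcStep + x) * scale];  // == src row y*scale, col x*scale
}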
template <class T>
-static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
+static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
T *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture)
@@ -761,7 +727,7 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
if (!readThruTexture)
{
-downsampleNearest_C1R
+decimate_C1R
<T, false>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
@@ -787,7 +753,7 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
}
-downsampleNearest_C1R
+decimate_C1R
<T, true>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
@@ -795,39 +761,12 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
#if defined _SELF_TEST_
T *h_src;
T *h_dst;
ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStep * srcRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStep * dstRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStep * srcRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStep * dstRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
bool bPass = true;
for (Ncv32u i=0; i<dstRoi.height && bPass; i++)
{
for (Ncv32u j=0; j<dstRoi.width && bPass; j++)
{
if (h_dst[i*dstStep+j] != h_src[i*scale*srcStep + j*scale])
{
printf("::downsampleNearestWrapperDevice self test failed: i=%d, j=%d, cpu=%ld, gpu=%ld\n", i, j, (long long)h_src[i*scale*srcStep + j*scale], (long long)h_dst[i*dstStep+j]);
bPass = false;
}
}
}
ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
printf("::downsampleNearestWrapperDevice %s\n", bPass?"PASSED":"FAILED");
#endif
return NPPST_SUCCESS;
}
template <class T>
-static NCVStatus downsampleNearestWrapperHost(T *h_src, Ncv32u srcStep,
+static NCVStatus decimateWrapperHost(T *h_src, Ncv32u srcStep,
T *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale)
{
@@ -856,40 +795,40 @@ static NCVStatus downsampleNearestWrapperHost(T *h_src, Ncv32u srcStep,
}
-#define implementNppDownsampleNearest(bit, typ) \
+#define implementNppDecimate(bit, typ) \
-NCVStatus nppiStDownsampleNearest_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
+NCVStatus nppiStDecimate_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
Ncv##bit##typ *d_dst, Ncv32u dstStep, \
NcvSize32u srcRoi, Ncv32u scale, NcvBool readThruTexture) \
{ \
-return downsampleNearestWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
+return decimateWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
(Ncv##bit##u *)d_dst, dstStep, \
srcRoi, scale, readThruTexture); \
}
-#define implementNppDownsampleNearestHost(bit, typ) \
+#define implementNppDecimateHost(bit, typ) \
-NCVStatus nppiStDownsampleNearest_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
+NCVStatus nppiStDecimate_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
Ncv##bit##typ *h_dst, Ncv32u dstStep, \
NcvSize32u srcRoi, Ncv32u scale) \
{ \
-return downsampleNearestWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
+return decimateWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
(Ncv##bit##u *)h_dst, dstStep, \
srcRoi, scale); \
}
-implementNppDownsampleNearest(32, u)
-implementNppDownsampleNearest(32, s)
-implementNppDownsampleNearest(32, f)
-implementNppDownsampleNearest(64, u)
-implementNppDownsampleNearest(64, s)
-implementNppDownsampleNearest(64, f)
+implementNppDecimate(32, u)
+implementNppDecimate(32, s)
+implementNppDecimate(32, f)
+implementNppDecimate(64, u)
+implementNppDecimate(64, s)
+implementNppDecimate(64, f)
-implementNppDownsampleNearestHost(32, u)
-implementNppDownsampleNearestHost(32, s)
-implementNppDownsampleNearestHost(32, f)
-implementNppDownsampleNearestHost(64, u)
-implementNppDownsampleNearestHost(64, s)
-implementNppDownsampleNearestHost(64, f)
+implementNppDecimateHost(32, u)
+implementNppDecimateHost(32, s)
+implementNppDecimateHost(32, f)
+implementNppDecimateHost(64, u)
+implementNppDecimateHost(64, s)
+implementNppDecimateHost(64, f)
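For reference, the first instantiation above, implementNppDecimate(32, u), expands mechanically to the exported wrapper:

NCVStatus nppiStDecimate_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
                                 Ncv32u *d_dst, Ncv32u dstStep,
                                 NcvSize32u srcRoi, Ncv32u scale, NcvBool readThruTexture)
{
    return decimateWrapperDevice<Ncv32u>((Ncv32u *)d_src, srcStep,
                                         (Ncv32u *)d_dst, dstStep,
                                         srcRoi, scale, readThruTexture);
}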
//==============================================================================
@@ -1051,46 +990,6 @@ NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
#if defined _SELF_TEST_
Ncv32u *h_sum;
Ncv64u *h_sqsum;
Ncv32f *h_norm_d;
Ncv32u ExtHeight = roi.height + rect.y + rect.height;
ncvAssertCUDAReturn(cudaMallocHost(&h_sum, sumStep * ExtHeight * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_norm_d, normStep * roi.height * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMemcpy(h_sum, d_sum, sumStep * ExtHeight * sizeof(Ncv32u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_sqsum, d_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_norm_d, d_norm, normStep * roi.height * sizeof(Ncv32f), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
Ncv32f *h_norm_h;
ncvAssertCUDAReturn(cudaMallocHost(&h_norm_h, normStep * roi.height * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
ncvAssertReturnNcvStat(nppRectStdDev_32f_C1R_host(h_sum, sqsumStep, h_sqsum, sqsumStep, h_norm_h, normStep, roi, rect, scaleArea));
const Ncv64f relEPS = 0.005;
bool bPass = true;
for (Ncv32u i=0; i<roi.height && bPass; i++)
{
for (Ncv32u j=0; j<roi.width && bPass; j++)
{
Ncv64f absErr = fabs(h_norm_h[i * normStep + j] - h_norm_d[i * normStep + j]);
Ncv64f relErr = absErr / h_norm_h[i * normStep + j];
if (relErr > relEPS)
{
printf("::ncvRectStdDev_32f_C1R self test failed: i=%d, j=%d, cpu=%f, gpu=%f\n", i, j, h_norm_h[i * normStep + j], h_norm_d[i * normStep + j]);
bPass = false;
}
}
}
ncvAssertCUDAReturn(cudaFreeHost(h_sum), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_sqsum), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_norm_d), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_norm_h), NPPST_MEMFREE_ERR);
printf("::ncvRectStdDev_32f_C1R %s\n", bPass?"PASSED":"FAILED");
#endif
return NPPST_SUCCESS;
}
@@ -1251,34 +1150,6 @@ NCVStatus transposeWrapperDevice(T *d_src, Ncv32u srcStride,
(d_src, srcStride, d_dst, dstStride, srcRoi);
ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
#if defined _SELF_TEST_
Ncv32u widthExt = grid.x * TRANSPOSE_TILE_DIM;
Ncv32u heightExt = grid.y * TRANSPOSE_TILE_DIM;
T *h_src;
T *h_dst;
ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * heightExt * sizeof(T)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * widthExt * sizeof(T)), NPPST_MEM_ALLOC_ERR);
memset(h_src, 0, srcStride * heightExt * sizeof(T));
memset(h_dst, 0, dstStride * widthExt * sizeof(T));
ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * heightExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * widthExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
NcvBool bPass = true;
for (Ncv32u i=0; i<srcRoi.height && bPass; i++)
{
for (Ncv32u j=0; j<srcRoi.width && bPass; j++)
{
if (h_src[i * srcStride + j] != h_dst[j * dstStride + i])
{
printf("CIntegralImage::transposeWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_src[j * srcStride + i], h_dst[i * dstStride + j]);
bPass = false;
}
}
}
ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
printf("CIntegralImage::transposeWrapperDevice %s\n", bPass?"PASSED":"FAILED");
#endif
return NPPST_SUCCESS;
}
@@ -1341,6 +1212,20 @@ implementNppTransposeHost(64,s)
implementNppTransposeHost(64,f)
NCVStatus nppiStTranspose_128_C1R(void *d_src, Ncv32u srcStep,
void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
{
return transposeWrapperDevice<uint4>((uint4 *)d_src, srcStep, (uint4 *)d_dst, dstStep, srcRoi);
}
NCVStatus nppiStTranspose_128_C1R_host(void *d_src, Ncv32u srcStep,
void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
{
return transposeWrapperHost<uint4>((uint4 *)d_src, srcStep, (uint4 *)d_dst, dstStep, srcRoi);
}
//==============================================================================
//
// Compact.cu
......
@@ -96,65 +96,65 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream);
* \return NCV status code
*/
NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
Ncv32u *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel.
-* \see nppiStDownsampleNearest_32u_C1R
+* \see nppiStDecimate_32u_C1R
*/
NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32s_C1R(Ncv32s *d_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_32s_C1R(Ncv32s *d_src, Ncv32u srcStep,
Ncv32s *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel.
-* \see nppiStDownsampleNearest_32u_C1R
+* \see nppiStDecimate_32u_C1R
*/
NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
Ncv32f *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel.
-* \see nppiStDownsampleNearest_32u_C1R
+* \see nppiStDecimate_32u_C1R
*/
NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64u_C1R(Ncv64u *d_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_64u_C1R(Ncv64u *d_src, Ncv32u srcStep,
Ncv64u *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel.
-* \see nppiStDownsampleNearest_32u_C1R
+* \see nppiStDecimate_32u_C1R
*/
NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64s_C1R(Ncv64s *d_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_64s_C1R(Ncv64s *d_src, Ncv32u srcStep,
Ncv64s *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel.
-* \see nppiStDownsampleNearest_32u_C1R
+* \see nppiStDecimate_32u_C1R
*/
NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
Ncv64f *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
/**
@@ -170,59 +170,59 @@ NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
* \return NCV status code
*/
NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStep,
Ncv32u *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel. Host implementation.
-* \see nppiStDownsampleNearest_32u_C1R_host
+* \see nppiStDecimate_32u_C1R_host
*/
NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStep,
Ncv32s *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel. Host implementation.
-* \see nppiStDownsampleNearest_32u_C1R_host
+* \see nppiStDecimate_32u_C1R_host
*/
NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
Ncv32f *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel. Host implementation.
-* \see nppiStDownsampleNearest_32u_C1R_host
+* \see nppiStDecimate_32u_C1R_host
*/
NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStep,
Ncv64u *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel. Host implementation.
-* \see nppiStDownsampleNearest_32u_C1R_host
+* \see nppiStDecimate_32u_C1R_host
*/
NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStep,
Ncv64s *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel. Host implementation.
-* \see nppiStDownsampleNearest_32u_C1R_host
+* \see nppiStDecimate_32u_C1R_host
*/
NCV_EXPORTS
-NCVStatus nppiStDownsampleNearest_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStep,
+NCVStatus nppiStDecimate_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStep,
Ncv64f *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
/**
@@ -333,6 +333,15 @@ NCVStatus nppiStTranspose_64f_C1R(Ncv64f *d_src, Ncv32u srcStride,
Ncv64f *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 128-bit pixels of any type, single channel
* \see nppiStTranspose_32u_C1R
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_128_C1R(void *d_src, Ncv32u srcStep,
void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi);
/**
* Transposes an image. 32-bit unsigned pixels, single channel. Host implementation
*
@@ -394,6 +403,15 @@ NCVStatus nppiStTranspose_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStride,
Ncv64f *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 128-bit pixels of any type, single channel. Host implementation
* \see nppiStTranspose_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_128_C1R_host(void *d_src, Ncv32u srcStep,
void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi);
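The new 128-bit entry points simply reuse the templated transpose with uint4 elements, so any 16-byte pixel type can go through them. A hypothetical host-side wrapper (the wrapper name is illustrative, and strides are assumed to be in the same units as the other nppiStTranspose variants):

NCVStatus transpose128(void *src, Ncv32u srcStride,
                       void *dst, Ncv32u dstStride,
                       Ncv32u widthElems, Ncv32u heightElems)
{
    NcvSize32u roi;
    roi.width  = widthElems;   // ROI measured in 16-byte (uint4) elements
    roi.height = heightElems;
    return nppiStTranspose_128_C1R_host(src, srcStride, dst, dstStride, roi);
}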
/**
* Calculates the size of the temporary buffer for integral image creation
*
......
@@ -40,14 +40,9 @@
//M*/
-#if !defined (HAVE_CUDA)
-#else /* !defined (HAVE_CUDA) */
#include <ios>
#include <stdarg.h>
#include <vector>
#include "NCV.hpp"
@@ -182,6 +177,78 @@ NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, const void *src, NC
}
NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
const void *src, Ncv32u srcPitch, NCVMemoryType srcType,
Ncv32u widthbytes, Ncv32u height, cudaStream_t cuStream)
{
NCVStatus ncvStat;
switch (dstType)
{
case NCVMemoryTypeHostPageable:
case NCVMemoryTypeHostPinned:
switch (srcType)
{
case NCVMemoryTypeHostPageable:
case NCVMemoryTypeHostPinned:
for (Ncv32u i=0; i<height; i++)
{
memcpy((char*)dst + i * dstPitch, (char*)src + i * srcPitch, widthbytes);
}
ncvStat = NCV_SUCCESS;
break;
case NCVMemoryTypeDevice:
if (cuStream != 0)
{
ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
}
else
{
ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
}
ncvStat = NCV_SUCCESS;
break;
default:
ncvStat = NCV_MEM_RESIDENCE_ERROR;
}
break;
case NCVMemoryTypeDevice:
switch (srcType)
{
case NCVMemoryTypeHostPageable:
case NCVMemoryTypeHostPinned:
if (cuStream != 0)
{
ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
}
else
{
ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
}
ncvStat = NCV_SUCCESS;
break;
case NCVMemoryTypeDevice:
if (cuStream != 0)
{
ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR);
}
else
{
ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR);
}
ncvStat = NCV_SUCCESS;
break;
default:
ncvStat = NCV_MEM_RESIDENCE_ERROR;
}
break;
default:
ncvStat = NCV_MEM_RESIDENCE_ERROR;
}
return ncvStat;
}
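A hypothetical use of the new 2D copy helper: move a widthbytes x height region from pageable host memory into a pitched device allocation (the wrapper name is illustrative):

NCVStatus copyHostToDevice2D(void *d_dst, Ncv32u d_pitch,
                             const void *h_src, Ncv32u h_pitch,
                             Ncv32u widthbytes, Ncv32u height,
                             cudaStream_t stream)
{
    // Dispatches to cudaMemcpy2D/cudaMemcpy2DAsync depending on the stream,
    // per the switch above.
    return memSegCopyHelper2D(d_dst, d_pitch, NCVMemoryTypeDevice,
                              h_src, h_pitch, NCVMemoryTypeHostPageable,
                              widthbytes, height, stream);
}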
//===================================================================
//
// NCVMemStackAllocator class members implementation
@@ -195,8 +262,10 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
_maxSize(0),
allocBegin(NULL),
begin(NULL),
+end(NULL),
_memType(NCVMemoryTypeNone),
-_alignment(alignment)
+_alignment(alignment),
+bReusesMemory(false)
{
NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
@@ -573,4 +642,264 @@ double ncvEndQueryTimerMs(NcvTimer t)
return res;
}
-#endif /* !defined (HAVE_CUDA) */
//===================================================================
//
// Operations with rectangles
//
//===================================================================
//from OpenCV
void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);
NCVStatus ncvGroupRectangles_host(NCVVector<NcvRect32u> &hypotheses,
Ncv32u &numHypotheses,
Ncv32u minNeighbors,
Ncv32f intersectEps,
NCVVector<Ncv32u> *hypothesesWeights)
{
ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||
hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
if (hypothesesWeights != NULL)
{
ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||
hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
}
if (numHypotheses == 0)
{
return NCV_SUCCESS;
}
std::vector<NcvRect32u> rects(numHypotheses);
memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));
std::vector<Ncv32u> weights;
if (hypothesesWeights != NULL)
{
groupRectangles(rects, minNeighbors, intersectEps, &weights);
}
else
{
groupRectangles(rects, minNeighbors, intersectEps, NULL);
}
numHypotheses = (Ncv32u)rects.size();
if (numHypotheses > 0)
{
memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));
}
if (hypothesesWeights != NULL)
{
memcpy(hypothesesWeights->ptr(), &weights[0], numHypotheses * sizeof(Ncv32u));
}
return NCV_SUCCESS;
}
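A hypothetical call (assuming h_hypotheses is a host-resident NCVVector<NcvRect32u>, e.g. an NCVVectorAlloc backed by a host allocator, and that a detection step has left numHypotheses raw rectangles in it):

// Illustrative helper; 4 and 0.2f mirror the defaults used elsewhere in
// this commit (minNeighbors and RECT_SIMILARITY_PROPORTION).
NCVStatus groupDetections(NCVVector<NcvRect32u> &h_hypotheses, Ncv32u &numHypotheses)
{
    // On success, numHypotheses is reduced to the number of grouped
    // rectangles, which then occupy the front of h_hypotheses.
    return ncvGroupRectangles_host(h_hypotheses, numHypotheses, 4, 0.2f, NULL);
}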
template <class T>
static NCVStatus drawRectsWrapperHost(T *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
T color)
{
ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
ncvAssertReturn(numRects != 0, NCV_SUCCESS);
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
for (Ncv32u i=0; i<numRects; i++)
{
NcvRect32u rect = h_rects[i];
if (rect.x < dstWidth)
{
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
{
h_dst[i*dstStride+rect.x] = color;
}
}
if (rect.x+rect.width-1 < dstWidth)
{
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
{
h_dst[i*dstStride+rect.x+rect.width-1] = color;
}
}
if (rect.y < dstHeight)
{
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
{
h_dst[rect.y*dstStride+j] = color;
}
}
if (rect.y + rect.height - 1 < dstHeight)
{
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
{
h_dst[(rect.y+rect.height-1)*dstStride+j] = color;
}
}
}
return NCV_SUCCESS;
}
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv8u color)
{
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
}
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv32u color)
{
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
}
const Ncv32u NUMTHREADS_DRAWRECTS = 32;
const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;
template <class T>
__global__ void drawRects(T *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
T color)
{
Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
if (blockId > numRects * 4)
{
return;
}
NcvRect32u curRect = d_rects[blockId >> 2];
NcvBool bVertical = blockId & 0x1;
NcvBool bTopLeft = blockId & 0x2;
Ncv32u pt0x, pt0y;
if (bVertical)
{
Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
pt0y = curRect.y;
if (pt0x < dstWidth)
{
for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
{
Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
if (ptY < pt0y + curRect.height && ptY < dstHeight)
{
d_dst[ptY * dstStride + pt0x] = color;
}
}
}
}
else
{
Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
pt0x = curRect.x;
pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;
if (pt0y < dstHeight)
{
for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
{
Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
if (ptX < pt0x + curRect.width && ptX < dstWidth)
{
d_dst[pt0y * dstStride + ptX] = color;
}
}
}
}
}
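For reference, the work decomposition above assigns four consecutive blocks to every rectangle, one per border; the decode of blockId amounts to the following (the names restate the kernel's own variables):

Ncv32u rectIdx = blockId >> 2;     //rectangle this block works on
NcvBool bVertical = blockId & 0x1; //set: left/right border, else top/bottom
NcvBool bTopLeft = blockId & 0x2;  //set: left (or top), else right (or bottom)
//Each border is then striped across NUMTHREADS_DRAWRECTS threads in chunks.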
template <class T>
static NCVStatus drawRectsWrapperDevice(T *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
T color,
cudaStream_t cuStream)
{
ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
if (numRects == 0)
{
return NCV_SUCCESS;
}
dim3 grid(numRects * 4);
dim3 block(NUMTHREADS_DRAWRECTS);
//CUDA limits a grid dimension to 65535 blocks: fold the excess into Y;
//the blockId range check in the kernel discards the overshoot
if (grid.x > 65535)
{
grid.y = (grid.x + 65534) / 65535;
grid.x = 65535;
}
drawRects<T><<<grid, block, 0, cuStream>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color); //launch on the caller's stream; cuStream was previously ignored
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
return NCV_SUCCESS;
}
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv8u color,
cudaStream_t cuStream)
{
return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
}
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv32u color,
cudaStream_t cuStream)
{
return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
}
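And the corresponding device-side sketch (d_img, d_rects, and stream are assumed to be a valid device matrix, a device vector of rectangles, and a CUDA stream set up elsewhere):

//Hedged usage sketch: asynchronous rectangle drawing on the GPU.
NCVStatus ncvStat = ncvDrawRects_32u_device(d_img.ptr(), d_img.stride(),
                                            d_img.width(), d_img.height(),
                                            d_rects.ptr(), numRects,
                                            0xFFFFFFFFu, stream);
ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);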
@@ -129,8 +129,8 @@ struct NcvRect8u
 Ncv8u y;
 Ncv8u width;
 Ncv8u height;
-NcvRect8u() : x(0), y(0), width(0), height(0) {};
-NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {}
+__host__ __device__ NcvRect8u() : x(0), y(0), width(0), height(0) {};
+__host__ __device__ NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {}
 };
@@ -140,8 +140,8 @@ struct NcvRect32s
 Ncv32s y; ///< y-coordinate of upper left corner.
 Ncv32s width; ///< Rectangle width.
 Ncv32s height; ///< Rectangle height.
-NcvRect32s() : x(0), y(0), width(0), height(0) {};
-NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {}
+__host__ __device__ NcvRect32s() : x(0), y(0), width(0), height(0) {};
+__host__ __device__ NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {}
 };
@@ -151,8 +151,8 @@ struct NcvRect32u
 Ncv32u y; ///< y-coordinate of upper left corner.
 Ncv32u width; ///< Rectangle width.
 Ncv32u height; ///< Rectangle height.
-NcvRect32u() : x(0), y(0), width(0), height(0) {};
-NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {}
+__host__ __device__ NcvRect32u() : x(0), y(0), width(0), height(0) {};
+__host__ __device__ NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {}
 };
@@ -160,8 +160,8 @@ struct NcvSize32s
 {
 Ncv32s width; ///< Rectangle width.
 Ncv32s height; ///< Rectangle height.
-NcvSize32s() : width(0), height(0) {};
-NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {}
+__host__ __device__ NcvSize32s() : width(0), height(0) {};
+__host__ __device__ NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {}
 };
@@ -169,8 +169,8 @@ struct NcvSize32u
 {
 Ncv32u width; ///< Rectangle width.
 Ncv32u height; ///< Rectangle height.
-NcvSize32u() : width(0), height(0) {};
-NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
+__host__ __device__ NcvSize32u() : width(0), height(0) {};
+__host__ __device__ NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
 };
@@ -275,6 +275,7 @@ enum NCVStatus
 {
 //NCV statuses
 NCV_SUCCESS,
+NCV_UNKNOWN_ERROR,
 NCV_CUDA_ERROR,
 NCV_NPP_ERROR,
@@ -501,13 +502,18 @@ private:
 /**
- * Copy dispatcher
+ * Copy dispatchers
  */
 NCV_EXPORTS NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,
 const void *src, NCVMemoryType srcType,
 size_t sz, cudaStream_t cuStream);
+NCV_EXPORTS NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
+const void *src, Ncv32u srcPitch, NCVMemoryType srcType,
+Ncv32u widthbytes, Ncv32u height, cudaStream_t cuStream);
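A hedged sketch of what the new 2D dispatcher is for: copying a pitched region between two allocations whose memory kinds are only known at run time. Every pointer, pitch, and size below is illustrative, and NCVMemoryTypeDevice is assumed to be the device-side tag:

//Hedged usage sketch: device -> pinned-host copy of a 2D region; the
//dispatcher selects the cudaMemcpy2D direction from the two type tags.
NCVStatus ncvStat = memSegCopyHelper2D(hostPtr, hostPitchBytes, NCVMemoryTypeHostPinned,
                                       devPtr, devPitchBytes, NCVMemoryTypeDevice,
                                       widthBytes, heightRows, stream);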
 /**
  * NCVVector (1D)
  */
@@ -532,7 +538,7 @@ public:
 _memtype = NCVMemoryTypeNone;
 }
-NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
+NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0) const
 {
 if (howMuch == 0)
 {
@@ -600,7 +606,6 @@ public:
 this->_memtype = this->allocatedMem.begin.memtype;
 }
-
 ~NCVVectorAlloc()
 {
 NCVStatus ncvStat;
@@ -611,25 +616,22 @@ public:
 this->clear();
 }
-
 NcvBool isMemAllocated() const
 {
 return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
 }
-
 Ncv32u getAllocatorsAlignment() const
 {
 return allocator.alignment();
 }
-
 NCVMemSegment getSegment() const
 {
 return allocatedMem;
 }
 private:
 INCVMemAllocator &allocator;
 NCVMemSegment allocatedMem;
 };
@@ -658,7 +660,6 @@ public:
 this->bReused = true;
 }
-
 NCVVectorReuse(const NCVMemSegment &memSegment, Ncv32u length)
 {
 this->bReused = false;
@@ -674,7 +675,6 @@ public:
 this->bReused = true;
 }
-
 NcvBool isMemReused() const
 {
 return this->bReused;
@@ -703,7 +703,6 @@ public:
 virtual ~NCVMatrix() {}
-
 void clear()
 {
 _ptr = NULL;
@@ -713,14 +712,13 @@ public:
 _memtype = NCVMemoryTypeNone;
 }
 Ncv32u stride() const
 {
 return _pitch / sizeof(T);
 }
-//a side effect of this function is that it copies everything in a single chunk, so the "padding" will be overwritten
-NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
+NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0) const
 {
 if (howMuch == 0)
 {
@@ -748,6 +746,24 @@ public:
 return ncvStat;
 }
+NCVStatus copy2D(NCVMatrix<T> &dst, NcvSize32u roi, cudaStream_t cuStream) const
+{
+ncvAssertReturn(this->width() >= roi.width && this->height() >= roi.height &&
+dst.width() >= roi.width && dst.height() >= roi.height, NCV_MEM_COPY_ERROR);
+ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) &&
+(dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR);
+NCVStatus ncvStat = NCV_SUCCESS;
+if (this->_memtype != NCVMemoryTypeNone)
+{
+ncvStat = memSegCopyHelper2D(dst._ptr, dst._pitch, dst._memtype,
+this->_ptr, this->_pitch, this->_memtype,
+roi.width * sizeof(T), roi.height, cuStream);
+}
+return ncvStat;
+}
 T *ptr() const {return this->_ptr;}
 Ncv32u width() const {return this->_width;}
 Ncv32u height() const {return this->_height;}
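Unlike copySolid(), which (as the removed comment warned) copies one flat chunk and overwrites row padding, copy2D() respects both pitches. A hedged sketch, assuming src and dst are two suitably sized NCVMatrix<Ncv32u> objects and stream is a CUDA stream:

//Hedged usage sketch: pitch-aware copy of a 64x48 region of interest.
NcvSize32u roi(64, 48);
NCVStatus ncvStat = src.copy2D(dst, roi, stream);
ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);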
@@ -817,19 +833,16 @@ public:
 this->clear();
 }
-
 NcvBool isMemAllocated() const
 {
 return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
 }
-
 Ncv32u getAllocatorsAlignment() const
 {
 return allocator.alignment();
 }
-
 NCVMemSegment getSegment() const
 {
 return allocatedMem;
@@ -888,6 +901,23 @@ public:
 this->bReused = true;
 }
+NCVMatrixReuse(const NCVMatrix<T> &mat, NcvRect32u roi)
+{
+this->bReused = false;
+this->clear();
+ncvAssertPrintReturn(roi.x < mat.width() && roi.y < mat.height() && \
+roi.x + roi.width <= mat.width() && roi.y + roi.height <= mat.height(),
+"NCVMatrixReuse ctor:: memory binding failed due to mismatching ROI and source matrix dims", );
+this->_width = roi.width;
+this->_height = roi.height;
+this->_pitch = mat.pitch();
+this->_ptr = mat.ptr() + roi.y * mat.stride() + roi.x;
+this->_memtype = mat.memType();
+this->bReused = true;
+}
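This new constructor enables zero-copy sub-views. A hedged sketch, assuming mat is an allocated NCVMatrix<Ncv32u>:

//Hedged usage sketch: bind a 128x64 window at (16, 8); writes through
//'view' land directly in 'mat' since no memory is copied.
NcvRect32u roi(16, 8, 128, 64);
NCVMatrixReuse<Ncv32u> view(mat, roi);
ncvAssertReturn(view.isMemReused(), NCV_NULL_PTR);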
 NcvBool isMemReused() const
 {
@@ -899,4 +929,27 @@ private:
 NcvBool bReused;
 };
+
+/**
+ * Operations with rectangles
+ */
+NCV_EXPORTS NCVStatus ncvGroupRectangles_host(NCVVector<NcvRect32u> &hypotheses, Ncv32u &numHypotheses,
+Ncv32u minNeighbors, Ncv32f intersectEps, NCVVector<Ncv32u> *hypothesesWeights);
+NCV_EXPORTS NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
+NcvRect32u *h_rects, Ncv32u numRects, Ncv8u color);
+NCV_EXPORTS NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
+NcvRect32u *h_rects, Ncv32u numRects, Ncv32u color);
+NCV_EXPORTS NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
+NcvRect32u *d_rects, Ncv32u numRects, Ncv8u color, cudaStream_t cuStream);
+NCV_EXPORTS NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
+NcvRect32u *d_rects, Ncv32u numRects, Ncv32u color, cudaStream_t cuStream);
 #endif // _ncv_hpp_
@@ -150,14 +150,14 @@ namespace NCVRuntimeTemplateBool
 {
 //Convenience function used by the user
 //Takes a variable argument list, transforms it into a list
-static void call(Func *functor, int dummy, ...)
+static void call(Func *functor, ...)
 {
 //Vector used to collect arguments
 std::vector<int> templateParamList;
 //Variable argument list manipulation
 va_list listPointer;
-va_start(listPointer, dummy);
+va_start(listPointer, functor);
 //Collect parameters into the list
 for(int i=0; i<NumArguments; i++)
 {
...
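The dummy 0xC001C0DE argument could be dropped because va_start only needs the last named parameter as its anchor, and functor itself now plays that role. A minimal self-contained illustration of the same pattern (standalone example, not NCV code):

#include <cstdarg>
#include <vector>

//Collect 'count' boolean flags passed through a variadic list; bools are
//promoted to int by the default argument promotions, hence va_arg(ap, int).
static std::vector<int> collectFlags(int count, ...)
{
    std::vector<int> flags;
    va_list ap;
    va_start(ap, count); //anchor on the last named parameter
    for (int i = 0; i < count; i++)
    {
        flags.push_back(va_arg(ap, int));
    }
    va_end(ap);
    return flags;
}
//e.g. collectFlags(3, true, false, true) yields {1, 0, 1}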
@@ -134,7 +134,7 @@ bool TestHypothesesFilter::process()
 Ncv32u numHypothesesSrc = h_vecSrc.length();
 NCV_SKIP_COND_BEGIN
-ncvStat = ncvFilterHypotheses_host(h_vecSrc, numHypothesesSrc, this->minNeighbors, this->eps, NULL);
+ncvStat = ncvGroupRectangles_host(h_vecSrc, numHypothesesSrc, this->minNeighbors, this->eps, NULL);
 ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
 NCV_SKIP_COND_END
...
@@ -83,17 +83,17 @@ bool TestResize<T>::process()
 NCV_SKIP_COND_BEGIN
 if (sizeof(T) == sizeof(Ncv32u))
 {
-ncvStat = nppiStDownsampleNearest_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
-(Ncv32u *)d_small.ptr(), d_small.pitch(),
-srcSize, this->scaleFactor,
-this->bTextureCache);
+ncvStat = nppiStDecimate_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
+(Ncv32u *)d_small.ptr(), d_small.pitch(),
+srcSize, this->scaleFactor,
+this->bTextureCache);
 }
 else if (sizeof(T) == sizeof(Ncv64u))
 {
-ncvStat = nppiStDownsampleNearest_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
-(Ncv64u *)d_small.ptr(), d_small.pitch(),
-srcSize, this->scaleFactor,
-this->bTextureCache);
+ncvStat = nppiStDecimate_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
+(Ncv64u *)d_small.ptr(), d_small.pitch(),
+srcSize, this->scaleFactor,
+this->bTextureCache);
 }
 else
 {
@@ -107,15 +107,15 @@ bool TestResize<T>::process()
 NCV_SKIP_COND_BEGIN
 if (sizeof(T) == sizeof(Ncv32u))
 {
-ncvStat = nppiStDownsampleNearest_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
-(Ncv32u *)h_small.ptr(), h_small.pitch(),
-srcSize, this->scaleFactor);
+ncvStat = nppiStDecimate_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
+(Ncv32u *)h_small.ptr(), h_small.pitch(),
+srcSize, this->scaleFactor);
 }
 else if (sizeof(T) == sizeof(Ncv64u))
 {
-ncvStat = nppiStDownsampleNearest_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
-(Ncv64u *)h_small.ptr(), h_small.pitch(),
-srcSize, this->scaleFactor);
+ncvStat = nppiStDecimate_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
+(Ncv64u *)h_small.ptr(), h_small.pitch(),
+srcSize, this->scaleFactor);
 }
 else
 {
...