Commit 63b5cf6d authored by Vladislav Vinogradov

temporarily disabled the optimized version of CascadeClassifier (bug #1640)

fixed the HaarCascadeLoader test (incorrect behavior due to macro usage)
parent 1494da33
@@ -77,56 +77,110 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of th
 
 //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
 //assuming size <= WARP_SIZE and size is power of 2
-template <class T>
-inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
+//template <class T>
+//inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
+//{
+//    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
+//    s_Data[pos] = 0;
+//    pos += K_WARP_SIZE;
+//    s_Data[pos] = idata;
+//
+//    s_Data[pos] += s_Data[pos - 1];
+//    s_Data[pos] += s_Data[pos - 2];
+//    s_Data[pos] += s_Data[pos - 4];
+//    s_Data[pos] += s_Data[pos - 8];
+//    s_Data[pos] += s_Data[pos - 16];
+//
+//    return s_Data[pos];
+//}
+//template <class T>
+//inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
+//{
+//    return warpScanInclusive(idata, s_Data) - idata;
+//}
+//
+//
+//template <class T, Ncv32u tiNumScanThreads>
+//inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
+//{
+//    if (tiNumScanThreads > K_WARP_SIZE)
+//    {
+//        //Bottom-level inclusive warp scan
+//        T warpResult = warpScanInclusive(idata, s_Data);
+//
+//        //Save top elements of each warp for exclusive warp scan
+//        //sync to wait for warp scans to complete (because s_Data is being overwritten)
+//        __syncthreads();
+//        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
+//        {
+//            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
+//        }
+//
+//        //wait for warp scans to complete
+//        __syncthreads();
+//
+//        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
+//        {
+//            //grab top warp elements
+//            T val = s_Data[threadIdx.x];
+//            //calculate exclusive scan and write back to shared memory
+//            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
+//        }
+//
+//        //return updated warp scans with exclusive scan results
+//        __syncthreads();
+//        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
+//    }
+//    else
+//    {
+//        return warpScanInclusive(idata, s_Data);
+//    }
+//}
+template <Ncv32u size>
+__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u* s_Data)
 {
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
+    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
     s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
+    pos += size;
     s_Data[pos] = idata;
 
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
+    for(Ncv32u offset = 1; offset < size; offset <<= 1)
+        s_Data[pos] += s_Data[pos - offset];
 
     return s_Data[pos];
 }
 
-template <class T>
-inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
+template <Ncv32u size>
+__forceinline__ __device__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
 {
-    return warpScanInclusive(idata, s_Data) - idata;
+    return warpScanInclusive<size>(idata, s_Data) - idata;
 }
 
-template <class T, Ncv32u tiNumScanThreads>
-inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
+template <Ncv32u size, Ncv32u tiNumScanThreads>
+__device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
 {
-    if (tiNumScanThreads > K_WARP_SIZE)
+    if(size > K_WARP_SIZE)
     {
         //Bottom-level inclusive warp scan
-        T warpResult = warpScanInclusive(idata, s_Data);
+        Ncv32u warpResult = warpScanInclusive<K_WARP_SIZE>(idata, s_Data);
 
         //Save top elements of each warp for exclusive warp scan
         //sync to wait for warp scans to complete (because s_Data is being overwritten)
         __syncthreads();
         if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
         {
             s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
         }
 
         //wait for warp scans to complete
         __syncthreads();
 
         if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
         {
             //grab top warp elements
-            T val = s_Data[threadIdx.x];
+            Ncv32u val = s_Data[threadIdx.x];
             //calculate exclusive scan and write back to shared memory
-            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
+            s_Data[threadIdx.x] = warpScanExclusive<(size >> K_LOG2_WARP_SIZE)>(val, s_Data);
         }
 
         //return updated warp scans with exclusive scan results
@@ -135,7 +189,7 @@ inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
     }
     else
     {
-        return warpScanInclusive(idata, s_Data);
+        return warpScanInclusive<size>(idata, s_Data);
     }
 }
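The loop in the new warpScanInclusive is a Kogge-Stone inclusive scan over a zero-padded shared-memory layout (the low `size` slots hold zeros, so no bounds check is needed), replacing the fixed five-step unroll that only worked for K_WARP_SIZE == 32. Below is a minimal standalone harness, as a sketch only: nothing in it is part of the commit, and the names (warpScanInclusiveRef, scanKernel) are made up. The committed code relies on pre-Volta implicit warp-synchronous execution; the sketch adds __syncwarp() and a temporary so it also behaves on current GPUs.

#include <cstdio>

typedef unsigned int Ncv32u;

template <Ncv32u size>
__device__ Ncv32u warpScanInclusiveRef(Ncv32u idata, volatile Ncv32u *s_Data)
{
    // Same layout as the committed kernel: slots [0, size) are zero padding,
    // slots [size, 2*size) hold the per-thread inputs.
    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
    s_Data[pos] = 0;
    pos += size;
    s_Data[pos] = idata;

    for (Ncv32u offset = 1; offset < size; offset <<= 1)
    {
        __syncwarp();                                   // not in the original (warp-synchronous) code
        Ncv32u t = s_Data[pos] + s_Data[pos - offset];  // read phase
        __syncwarp();
        s_Data[pos] = t;                                // write phase
    }
    return s_Data[pos];
}

__global__ void scanKernel(const Ncv32u *in, Ncv32u *out)
{
    __shared__ Ncv32u s_Data[64];   // 2 * warp size
    out[threadIdx.x] = warpScanInclusiveRef<32>(in[threadIdx.x], s_Data);
}

int main()
{
    Ncv32u h_in[32], h_out[32];
    for (int i = 0; i < 32; ++i) h_in[i] = 1;   // all ones: inclusive scan yields i + 1

    Ncv32u *d_in, *d_out;
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
    scanKernel<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);

    for (int i = 0; i < 32; ++i)
        printf("%u ", h_out[i]);    // expect: 1 2 3 ... 32
    printf("\n");

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}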
@@ -233,30 +287,29 @@ __device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
 
 __device__ Ncv32u d_outMaskPosition;
 
-__inline __device__ void compactBlockWriteOutAnchorParallel(NcvBool threadPassFlag,
-                                                            Ncv32u threadElem,
-                                                            Ncv32u *vectorOut)
+__device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u threadElem, Ncv32u *vectorOut)
 {
 #if __CUDA_ARCH__ >= 110
-    Ncv32u passMaskElem = threadPassFlag ? 1 : 0;
     __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];
-    Ncv32u incScan = blockScanInclusive<Ncv32u, NUM_THREADS_ANCHORSPARALLEL>(passMaskElem, shmem);
-    __syncthreads();
-    Ncv32u excScan = incScan - passMaskElem;
     __shared__ Ncv32u numPassed;
     __shared__ Ncv32u outMaskOffset;
+
+    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL, NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
+    __syncthreads();
 
     if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
    {
         numPassed = incScan;
         outMaskOffset = atomicAdd(&d_outMaskPosition, incScan);
     }
-    __syncthreads();
 
     if (threadPassFlag)
     {
+        Ncv32u excScan = incScan - threadPassFlag;
         shmem[excScan] = threadElem;
     }
 
     __syncthreads();
 
     if (threadIdx.x < numPassed)
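For reference, the rewritten compactBlockWriteOutAnchorParallel is block-level stream compaction: the inclusive scan of the per-thread pass flags assigns every surviving element a dense slot, the last thread publishes the block total and reserves the output range with a single atomicAdd on d_outMaskPosition, and excScan = incScan - threadPassFlag recovers each element's slot. A sequential model of that indexing, as an illustration only (plain C++; the array contents are made up):

#include <cstdio>
#include <vector>

int main()
{
    const int n = 8;
    unsigned int flags[n] = {1, 0, 1, 1, 0, 0, 1, 0};    // threadPassFlag per "thread"
    unsigned int elems[n] = {10, 11, 12, 13, 14, 15, 16, 17};

    // What scan1Inclusive computes: incScan[i] counts passing threads in [0, i].
    unsigned int incScan[n], sum = 0;
    for (int i = 0; i < n; ++i) { sum += flags[i]; incScan[i] = sum; }

    // The last thread sees the block total; on the GPU it would also call
    // atomicAdd(&d_outMaskPosition, incScan) to reserve the output range.
    unsigned int numPassed = incScan[n - 1];
    std::vector<unsigned int> out(numPassed);

    // excScan = incScan - flag is the dense output slot of a passing element.
    for (int i = 0; i < n; ++i)
        if (flags[i])
            out[incScan[i] - flags[i]] = elems[i];

    for (unsigned int v : out)
        printf("%u ", v);   // prints: 10 12 13 16
    printf("\n");
    return 0;
}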
@@ -1047,7 +1100,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
     NcvBool bTexCacheCascade = devProp.major < 2;
     NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
-    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);
+    NcvBool bDoAtomicCompaction = false;// devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);
 
     NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
     NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;
@@ -2073,13 +2126,16 @@ static NCVStatus loadFromNVBIN(const std::string &filename,
                                std::vector<HaarClassifierNode128> &haarClassifierNodes,
                                std::vector<HaarFeature64> &haarFeatures)
 {
+    size_t readCount;
     FILE *fp = fopen(filename.c_str(), "rb");
     ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
     Ncv32u fileVersion;
-    ncvAssertReturn(1 == fread(&fileVersion, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
+    readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
+    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
     ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
     Ncv32u fsize;
-    ncvAssertReturn(1 == fread(&fsize, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
+    readCount = fread(&fsize, sizeof(Ncv32u), 1, fp);
+    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
     fseek(fp, 0, SEEK_END);
     Ncv32u fsizeActual = ftell(fp);
     ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR);
@@ -2088,7 +2144,8 @@ static NCVStatus loadFromNVBIN(const std::string &filename,
     fdata.resize(fsize);
     Ncv32u dataOffset = 0;
     fseek(fp, 0, SEEK_SET);
-    ncvAssertReturn(1 == fread(&fdata[0], fsize, 1, fp), NCV_FILE_ERROR);
+    readCount = fread(&fdata[0], fsize, 1, fp);
+    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
     fclose(fp);
 
     //data
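The readCount changes above, and the matching ones in ncvHaarGetClassifierSize below, pull each fread() out of the ncvAssertReturn argument list; per the commit message, hiding the read inside the macro is what caused the HaarCascadeLoader test's incorrect behavior. The general hazard: an assert-style macro may evaluate its argument more than once or, in some configurations, not at all, so a side effect such as fread() can silently vanish. A minimal demonstration with a hypothetical macro (MY_ASSERT is not the real ncvAssertReturn definition, and cascade.nvbin is a placeholder file name):

#include <cstddef>
#include <cstdio>

// Hypothetical assert-style macro whose predicate is compiled out in release builds.
#ifdef NDEBUG
#define MY_ASSERT(pred) ((void)0)
#else
#define MY_ASSERT(pred) do { if (!(pred)) printf("assert failed: %s\n", #pred); } while (0)
#endif

int main()
{
    FILE *fp = fopen("cascade.nvbin", "rb");
    if (!fp) return 1;

    unsigned int fileVersion = 0;

    // Fragile: if MY_ASSERT expands to nothing, the fread() never executes and
    // fileVersion keeps its stale value. This is the pattern the commit removes.
    MY_ASSERT(1 == fread(&fileVersion, sizeof(fileVersion), 1, fp));

    // Robust: perform the read unconditionally, then assert on the stored result.
    // This is the readCount pattern the commit introduces.
    size_t readCount = fread(&fileVersion, sizeof(fileVersion), 1, fp);
    MY_ASSERT(1 == readCount);

    fclose(fp);
    return 0;
}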
@@ -2130,6 +2187,7 @@ static NCVStatus loadFromNVBIN(const std::string &filename,
 NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
                                    Ncv32u &numNodes, Ncv32u &numFeatures)
 {
+    size_t readCount;
     NCVStatus ncvStat;
 
     std::string fext = filename.substr(filename.find_last_of(".") + 1);
@@ -2140,14 +2198,19 @@ NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStage
         FILE *fp = fopen(filename.c_str(), "rb");
         ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
         Ncv32u fileVersion;
-        ncvAssertReturn(1 == fread(&fileVersion, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
+        readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
+        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
         ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
         fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET);
         Ncv32u tmp;
-        ncvAssertReturn(1 == fread(&numStages, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
-        ncvAssertReturn(1 == fread(&tmp, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
-        ncvAssertReturn(1 == fread(&numNodes, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
-        ncvAssertReturn(1 == fread(&numFeatures, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
+        readCount = fread(&numStages, sizeof(Ncv32u), 1, fp);
+        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
+        readCount = fread(&tmp, sizeof(Ncv32u), 1, fp);
+        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
+        readCount = fread(&numNodes, sizeof(Ncv32u), 1, fp);
+        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
+        readCount = fread(&numFeatures, sizeof(Ncv32u), 1, fp);
+        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
         fclose(fp);
     }
     else if (fext == "xml")