Commit 63b5cf6d authored by Vladislav Vinogradov

temporarily disabled the optimized version of CascadeClassifier (bug #1640)

fixed the HaarCascadeLoader test (incorrect behavior due to macro usage)
parent 1494da33
@@ -77,56 +77,110 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of th
 
 //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
 //assuming size <= WARP_SIZE and size is power of 2
-template <class T>
-inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
+//template <class T>
+//inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
+//{
+//    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
+//    s_Data[pos] = 0;
+//    pos += K_WARP_SIZE;
+//    s_Data[pos] = idata;
+//
+//    s_Data[pos] += s_Data[pos - 1];
+//    s_Data[pos] += s_Data[pos - 2];
+//    s_Data[pos] += s_Data[pos - 4];
+//    s_Data[pos] += s_Data[pos - 8];
+//    s_Data[pos] += s_Data[pos - 16];
+//
+//    return s_Data[pos];
+//}
+//template <class T>
+//inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
+//{
+//    return warpScanInclusive(idata, s_Data) - idata;
+//}
+//
+//
+//template <class T, Ncv32u tiNumScanThreads>
+//inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
+//{
+//    if (tiNumScanThreads > K_WARP_SIZE)
+//    {
+//        //Bottom-level inclusive warp scan
+//        T warpResult = warpScanInclusive(idata, s_Data);
+//
+//        //Save top elements of each warp for exclusive warp scan
+//        //sync to wait for warp scans to complete (because s_Data is being overwritten)
+//        __syncthreads();
+//        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
+//        {
+//            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
+//        }
+//
+//        //wait for warp scans to complete
+//        __syncthreads();
+//
+//        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
+//        {
+//            //grab top warp elements
+//            T val = s_Data[threadIdx.x];
+//            //calculate exclusive scan and write back to shared memory
+//            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
+//        }
+//
+//        //return updated warp scans with exclusive scan results
+//        __syncthreads();
+//        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
+//    }
+//    else
+//    {
+//        return warpScanInclusive(idata, s_Data);
+//    }
+//}
+template <Ncv32u size>
+__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u* s_Data)
 {
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
+    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
     s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
+    pos += size;
     s_Data[pos] = idata;
 
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
+    for(Ncv32u offset = 1; offset < size; offset <<= 1)
+        s_Data[pos] += s_Data[pos - offset];
 
     return s_Data[pos];
 }
 
-template <class T>
-inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
+template <Ncv32u size>
+__forceinline__ __device__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
 {
-    return warpScanInclusive(idata, s_Data) - idata;
+    return warpScanInclusive<size>(idata, s_Data) - idata;
 }
 
-template <class T, Ncv32u tiNumScanThreads>
-inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
+template <Ncv32u size, Ncv32u tiNumScanThreads>
+__device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
 {
-    if (tiNumScanThreads > K_WARP_SIZE)
+    if(size > K_WARP_SIZE)
     {
         //Bottom-level inclusive warp scan
-        T warpResult = warpScanInclusive(idata, s_Data);
+        Ncv32u warpResult = warpScanInclusive<K_WARP_SIZE>(idata, s_Data);
 
         //Save top elements of each warp for exclusive warp scan
         //sync to wait for warp scans to complete (because s_Data is being overwritten)
         __syncthreads();
         if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
         {
             s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
         }
 
         //wait for warp scans to complete
         __syncthreads();
 
         if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
         {
             //grab top warp elements
-            T val = s_Data[threadIdx.x];
+            Ncv32u val = s_Data[threadIdx.x];
             //calculate exclusive scan and write back to shared memory
-            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
+            s_Data[threadIdx.x] = warpScanExclusive<(size >> K_LOG2_WARP_SIZE)>(val, s_Data);
         }
 
         //return updated warp scans with exclusive scan results
@@ -135,7 +189,7 @@ inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
     }
     else
     {
-        return warpScanInclusive(idata, s_Data);
+        return warpScanInclusive<size>(idata, s_Data);
     }
 }
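The loop in the new warpScanInclusive is a Kogge-Stone inclusive scan over a zero-padded shared-memory layout (the low `size` slots hold zeros, so no bounds check is needed), replacing the fixed five-step unroll that only worked for K_WARP_SIZE == 32. Below is a minimal standalone harness, as a sketch only: nothing in it is part of the commit, and the names (warpScanInclusiveRef, scanKernel) are made up. The committed code relies on pre-Volta implicit warp-synchronous execution; the sketch adds __syncwarp() and a temporary so it also behaves on current GPUs.

#include <cstdio>

typedef unsigned int Ncv32u;

template <Ncv32u size>
__device__ Ncv32u warpScanInclusiveRef(Ncv32u idata, volatile Ncv32u *s_Data)
{
    // Same layout as the committed kernel: slots [0, size) are zero padding,
    // slots [size, 2*size) hold the per-thread inputs.
    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
    s_Data[pos] = 0;
    pos += size;
    s_Data[pos] = idata;

    for (Ncv32u offset = 1; offset < size; offset <<= 1)
    {
        __syncwarp();                                   // not in the original (warp-synchronous) code
        Ncv32u t = s_Data[pos] + s_Data[pos - offset];  // read phase
        __syncwarp();
        s_Data[pos] = t;                                // write phase
    }
    return s_Data[pos];
}

__global__ void scanKernel(const Ncv32u *in, Ncv32u *out)
{
    __shared__ Ncv32u s_Data[64];   // 2 * warp size
    out[threadIdx.x] = warpScanInclusiveRef<32>(in[threadIdx.x], s_Data);
}

int main()
{
    Ncv32u h_in[32], h_out[32];
    for (int i = 0; i < 32; ++i) h_in[i] = 1;   // all ones: inclusive scan yields i + 1

    Ncv32u *d_in, *d_out;
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
    scanKernel<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);

    for (int i = 0; i < 32; ++i)
        printf("%u ", h_out[i]);    // expect: 1 2 3 ... 32
    printf("\n");

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}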
@@ -233,30 +287,29 @@ __device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
 
 __device__ Ncv32u d_outMaskPosition;
 
-__inline __device__ void compactBlockWriteOutAnchorParallel(NcvBool threadPassFlag,
-                                                            Ncv32u threadElem,
-                                                            Ncv32u *vectorOut)
+__device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u threadElem, Ncv32u *vectorOut)
 {
 #if __CUDA_ARCH__ >= 110
-    Ncv32u passMaskElem = threadPassFlag ? 1 : 0;
     __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];
-    Ncv32u incScan = blockScanInclusive<Ncv32u, NUM_THREADS_ANCHORSPARALLEL>(passMaskElem, shmem);
-    __syncthreads();
-    Ncv32u excScan = incScan - passMaskElem;
     __shared__ Ncv32u numPassed;
     __shared__ Ncv32u outMaskOffset;
+
+    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL, NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
+    __syncthreads();
 
     if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
    {
         numPassed = incScan;
         outMaskOffset = atomicAdd(&d_outMaskPosition, incScan);
     }
-    __syncthreads();
 
     if (threadPassFlag)
     {
+        Ncv32u excScan = incScan - threadPassFlag;
         shmem[excScan] = threadElem;
     }
 
     __syncthreads();
 
     if (threadIdx.x < numPassed)
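For reference, the rewritten compactBlockWriteOutAnchorParallel is block-level stream compaction: the inclusive scan of the per-thread pass flags assigns every surviving element a dense slot, the last thread publishes the block total and reserves the output range with a single atomicAdd on d_outMaskPosition, and excScan = incScan - threadPassFlag recovers each element's slot. A sequential model of that indexing, as an illustration only (plain C++; the array contents are made up):

#include <cstdio>
#include <vector>

int main()
{
    const int n = 8;
    unsigned int flags[n] = {1, 0, 1, 1, 0, 0, 1, 0};    // threadPassFlag per "thread"
    unsigned int elems[n] = {10, 11, 12, 13, 14, 15, 16, 17};

    // What scan1Inclusive computes: incScan[i] counts passing threads in [0, i].
    unsigned int incScan[n], sum = 0;
    for (int i = 0; i < n; ++i) { sum += flags[i]; incScan[i] = sum; }

    // The last thread sees the block total; on the GPU it would also call
    // atomicAdd(&d_outMaskPosition, incScan) to reserve the output range.
    unsigned int numPassed = incScan[n - 1];
    std::vector<unsigned int> out(numPassed);

    // excScan = incScan - flag is the dense output slot of a passing element.
    for (int i = 0; i < n; ++i)
        if (flags[i])
            out[incScan[i] - flags[i]] = elems[i];

    for (unsigned int v : out)
        printf("%u ", v);   // prints: 10 12 13 16
    printf("\n");
    return 0;
}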
@@ -1047,7 +1100,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
     NcvBool bTexCacheCascade = devProp.major < 2;
     NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
-    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);
+    NcvBool bDoAtomicCompaction = false;// devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);
 
     NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
     NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;
@@ -2073,13 +2126,16 @@ static NCVStatus loadFromNVBIN(const std::string &filename,
                                std::vector<HaarClassifierNode128> &haarClassifierNodes,
                                std::vector<HaarFeature64> &haarFeatures)
 {
+    size_t readCount;
     FILE *fp = fopen(filename.c_str(), "rb");
     ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
     Ncv32u fileVersion;
-    ncvAssertReturn(1 == fread(&fileVersion, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
+    readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
+    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
     ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
     Ncv32u fsize;
-    ncvAssertReturn(1 == fread(&fsize, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
+    readCount = fread(&fsize, sizeof(Ncv32u), 1, fp);
+    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
     fseek(fp, 0, SEEK_END);
     Ncv32u fsizeActual = ftell(fp);
     ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR);
@@ -2088,7 +2144,8 @@ static NCVStatus loadFromNVBIN(const std::string &filename,
     fdata.resize(fsize);
     Ncv32u dataOffset = 0;
     fseek(fp, 0, SEEK_SET);
-    ncvAssertReturn(1 == fread(&fdata[0], fsize, 1, fp), NCV_FILE_ERROR);
+    readCount = fread(&fdata[0], fsize, 1, fp);
+    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
     fclose(fp);
 
     //data
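The readCount changes above, and the matching ones in ncvHaarGetClassifierSize below, pull each fread() out of the ncvAssertReturn argument list; per the commit message, hiding the read inside the macro is what caused the HaarCascadeLoader test's incorrect behavior. The general hazard: an assert-style macro may evaluate its argument more than once or, in some configurations, not at all, so a side effect such as fread() can silently vanish. A minimal demonstration with a hypothetical macro (MY_ASSERT is not the real ncvAssertReturn definition, and cascade.nvbin is a placeholder file name):

#include <cstddef>
#include <cstdio>

// Hypothetical assert-style macro whose predicate is compiled out in release builds.
#ifdef NDEBUG
#define MY_ASSERT(pred) ((void)0)
#else
#define MY_ASSERT(pred) do { if (!(pred)) printf("assert failed: %s\n", #pred); } while (0)
#endif

int main()
{
    FILE *fp = fopen("cascade.nvbin", "rb");
    if (!fp) return 1;

    unsigned int fileVersion = 0;

    // Fragile: if MY_ASSERT expands to nothing, the fread() never executes and
    // fileVersion keeps its stale value. This is the pattern the commit removes.
    MY_ASSERT(1 == fread(&fileVersion, sizeof(fileVersion), 1, fp));

    // Robust: perform the read unconditionally, then assert on the stored result.
    // This is the readCount pattern the commit introduces.
    size_t readCount = fread(&fileVersion, sizeof(fileVersion), 1, fp);
    MY_ASSERT(1 == readCount);

    fclose(fp);
    return 0;
}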
@@ -2130,6 +2187,7 @@ static NCVStatus loadFromNVBIN(const std::string &filename,
 NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
                                    Ncv32u &numNodes, Ncv32u &numFeatures)
 {
+    size_t readCount;
     NCVStatus ncvStat;
 
     std::string fext = filename.substr(filename.find_last_of(".") + 1);
@@ -2140,14 +2198,19 @@ NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStage
         FILE *fp = fopen(filename.c_str(), "rb");
         ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
         Ncv32u fileVersion;
-        ncvAssertReturn(1 == fread(&fileVersion, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
+        readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
+        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
         ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
         fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET);
         Ncv32u tmp;
-        ncvAssertReturn(1 == fread(&numStages, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
-        ncvAssertReturn(1 == fread(&tmp, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
-        ncvAssertReturn(1 == fread(&numNodes, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
-        ncvAssertReturn(1 == fread(&numFeatures, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
+        readCount = fread(&numStages, sizeof(Ncv32u), 1, fp);
+        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
+        readCount = fread(&tmp, sizeof(Ncv32u), 1, fp);
+        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
+        readCount = fread(&numNodes, sizeof(Ncv32u), 1, fp);
+        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
+        readCount = fread(&numFeatures, sizeof(Ncv32u), 1, fp);
+        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
         fclose(fp);
     }
     else if (fext == "xml")