GK107 Policy

aa92be34 · marina.kolpakova · 580d8173 · aa92be34 · aa92be34 · aa92be34
Commit aa92be34 authored Nov 12, 2012 by marina.kolpakova
Showing with 128 additions and 190 deletions

isf-sc.cu modules/gpu/src/cuda/isf-sc.cu +89 -152

icf.hpp modules/gpu/src/icf.hpp +18 -8

softcascade.cpp modules/gpu/src/softcascade.cpp +16 -25

test_softcascade.cpp modules/gpu/test/test_softcascade.cpp +5 -5

No files found.
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -161,192 +161,128 @@ namespace icf {
    }
    texture<float2,  cudaTextureType2D, cudaReadModeElementType> troi;
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
-    template<bool isUp>
-    __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr,
-        const int downscales)
-    {
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        const int x = blockIdx.x;
-        // load Lavel
+template<typename Policy>
-        __shared__ Level level;
+template<bool isUp>
+__device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const
+{
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const int x = blockIdx.x;
-        // check POI
+    // load Lavel
-        __shared__ volatile char roiCache[8];
+    __shared__ Level level;
-        if (!threadIdx.y && !threadIdx.x)
-            ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
-        __syncthreads();
+    // check POI
+    __shared__ volatile char roiCache[Policy::STA_Y];
-        if (!roiCache[threadIdx.y]) return;
+    if (!threadIdx.y && !threadIdx.x)
+        ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
-        if (!threadIdx.x)
+    __syncthreads();
-            level = levels[downscales + blockIdx.z];
-        if(x >= level.workRect.x || y >= level.workRect.y) return;
+    if (!roiCache[threadIdx.y]) return;
-        Octave octave = octaves[level.octave];
+    if (!threadIdx.x)
-        int st = octave.index * octave.stages;
+        level = levels[downscales + blockIdx.z];
-        const int stEnd = st + 1024;
-        float confidence = 0.f;
+    if(x >= level.workRect.x || y >= level.workRect.y) return;
-        for(; st < stEnd; st += 32)
+    int st = level.octave * level.step;
-        {
+    const int stEnd = st + level.step;
-            const int nId = (st + threadIdx.x) * 3;
+    float confidence = 0.f;
-            dprintf("\n\n%d: stage: %d %d\n",threadIdx.x, st, nId);
+    for(; st < stEnd; st += Policy::WARP)
-            Node node = nodes[nId];
+    {
+        const int nId = (st + threadIdx.x) * 3;
-            float threshold = rescale<isUp>(level, node);
-            int sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
-            int next = 1 + (int)(sum >= threshold);
+        Node node = nodes[nId];
-            dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold);
-            node = nodes[nId + next];
+        float threshold = rescale<isUp>(level, node);
-            threshold = rescale<isUp>(level, node);
+        int sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
-            sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
-            const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
+        int next = 1 + (int)(sum >= threshold);
-            float impact = leaves[(st + threadIdx.x) * 4 + lShift];
-            dprintf("%d: decided: %d (%d >= %f) %d %f\n\n" ,threadIdx.x, next, sum, threshold, lShift, impact);
+        node = nodes[nId + next];
-            dprintf("%d: extracted stage: %f\n",threadIdx.x, stages[(st + threadIdx.x)]);
+        threshold = rescale<isUp>(level, node);
-            dprintf("%d: computed  score: %f\n",threadIdx.x, impact);
+        sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
-#pragma unroll
-            // scan on shuffl functions
-            for (int i = 1; i < 32; i *= 2)
-            {
-                const float n = __shfl_up(impact, i, 32);
-                if (threadIdx.x >= i)
+        const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
-                    impact += n;
+        float impact = leaves[(st + threadIdx.x) * 4 + lShift];
-            }
-            dprintf("%d: impact scaned %f\n" ,threadIdx.x, impact);
+#pragma unroll
+        // scan on shuffl functions
+        for (int i = 1; i < Policy::WARP; i *= 2)
+        {
+            const float n = __shfl_up(impact, i, Policy::WARP);
-            confidence += impact;
+            if (threadIdx.x >= i)
-            if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048;
+                impact += n;
        }
-        if(!threadIdx.x && st == stEnd &&  ((confidence - FLT_EPSILON) >= 0))
+        confidence += impact;
-        {
+        if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048;
-            int idx = atomicInc(ctr, ndetections);
-            // store detection
-            objects[idx] = Detection(__float2int_rn(x * octave.shrinkage),
-                __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence);
-        }
    }
-#else
-    template<bool isUp>
-    __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr,
-        const int downscales)
-    {
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        Level level = levels[blockIdx.z];
-        // if (blockIdx.z != 31) return;
-        if(x >= level.workRect.x || y >= level.workRect.y) return;
-        // int roi = tex2D(troi, x, y);
-        // printf("%d\n", roi);
-        // if (!roi) return;
-        Octave octave = octaves[level.octave];
-        int st = octave.index * octave.stages;
-        const int stEnd = st + 1000;//octave.stages;
-        float confidence = 0.f;
+    if(!threadIdx.x && st == stEnd &&  ((confidence - FLT_EPSILON) >= 0))
+    {
+        int idx = atomicInc(ctr, ndetections);
+        objects[idx] = Detection(__float2int_rn(x * Policy::SHRINKAGE),
+            __float2int_rn(y * Policy::SHRINKAGE), level.objSize.x, level.objSize.y, confidence);
+    }
+}
-        for(; st < stEnd; ++st)
+template<typename Policy, bool isUp>
-        {
+__global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* objects, const uint n, uint* ctr, const int downs)
-            dprintf("\n\nstage: %d\n", st);
+{
-            const int nId = st * 3;
+    invoker.template detect<isUp>(objects, n, ctr, downs);
-            Node node = nodes[nId];
+}
-            dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w,
+template<typename Policy>
-                node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
+void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
+    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const
+{
+    int fw = 160;
+    int fh = 120;
-            float threshold = rescale<isUp>(level, node);
+    dim3 grid(fw, fh / Policy::STA_Y, (scale == -1) ? downscales : 1);
-            int sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
-            dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
+    uint* ctr = (uint*)(counter.ptr(0));
-                node.rect.w, threshold);
+    Detection* det = (Detection*)objects.ptr();
+    uint max_det = objects.cols / sizeof(Detection);
-            int next = 1 + (int)(sum >= threshold);
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
-            dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold);
+    cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
-            node = nodes[nId + next];
+    cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
-            threshold = rescale<isUp>(level, node);
+    cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
-            sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
-            const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
+    const CascadeInvoker<Policy> inv = *this;
-            float impact = leaves[st * 4 + lShift];
-            confidence += impact;
-            if (confidence <= stages[st]) st = stEnd + 10;
+    if (scale == -1)
-            dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
+    {
-            dprintf("extracted stage: %f\n", stages[st]);
+        soft_cascade<Policy, false><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, 0);
-            dprintf("computed  score: %f\n\n", confidence);
+        cudaSafeCall( cudaGetLastError());
-        }
-        if(st == stEnd)
+        grid = dim3(fw, fh / Policy::STA_Y, scales - downscales);
-        {
+        soft_cascade<Policy, true><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, downscales);
-            int idx = atomicInc(ctr, ndetections);
-            // store detection
-            objects[idx] = Detection(__float2int_rn(x * octave.shrinkage),
-                __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence);
-        }
    }
-#endif
+    else
-    template<>
-    void CascadeInvoker<CascadePolicy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const
    {
-        int fw = 160;
+        if (scale >= downscales)
-        int fh = 120;
+            soft_cascade<Policy, true><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, scale);
-        dim3 block(32, 8);
-        dim3 grid(fw, fh / 8, (scale == -1) ? downscales : 1);
-        uint* ctr = (uint*)(counter.ptr(0));
-        Detection* det = (Detection*)objects.ptr();
-        uint max_det = objects.cols / sizeof(Detection);
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
-        cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
-        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
-        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
-        if (scale == -1)
-        {
-            test_kernel_warp<false><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0);
-            cudaSafeCall( cudaGetLastError());
-            grid = dim3(fw, fh / 8, 47 - downscales);
-            test_kernel_warp<true><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales);
-        }
        else
-        {
+            soft_cascade<Policy, false><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, scale);
-            if (scale >= downscales)
+    }
-                test_kernel_warp<true><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
-            else
-                test_kernel_warp<false><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
-        }
-        if (!stream)
+    if (!stream)
-        {
+    {
-            cudaSafeCall( cudaGetLastError());
+        cudaSafeCall( cudaGetLastError());
-            cudaSafeCall( cudaDeviceSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize());
-        }
    }
 }
+template void CascadeInvoker<GK107PolicyX4>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
+    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const;
+}
 }}}
\ No newline at end of file
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -72,9 +72,9 @@ struct __align__(16) Octave
 struct __align__(8) Level //is actually 24 bytes
 {
    int octave;
+    int step;
    float relScale;
-    float shrScale;   // used for marking detection
    float scaling[2]; // calculated according to Dollal paper
    // for 640x480 we can not get overflow
@@ -115,31 +115,41 @@ struct __align__(16) Detection
    : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {};
 };
-struct CascadePolicy
+struct GK107PolicyX4
 {
-    enum {STA_X = 32, STA_Y = 8};
+    enum {WARP = 32, STA_X = WARP, STA_Y = 8, SHRINKAGE = 4};
+    static const dim3 block()
+    {
+        return dim3(GK107PolicyX4::STA_X, GK107PolicyX4::STA_Y);
+    }
 };
 template<typename Policy>
 struct CascadeInvoker
 {
-    CascadeInvoker(): levels(0), octaves(0), stages(0), nodes(0), leaves(0) {}
+    CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {}
    CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzb& _octaves, const PtrStepSzf& _stages,
                   const PtrStepSzb& _nodes,  const PtrStepSzf& _leaves)
-    : levels((const Level*)_levels.ptr()), octaves((const Octave*)_octaves.ptr()), stages((const float*)_stages.ptr()),
+    : levels((const Level*)_levels.ptr()),
-       nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr())
+      stages((const float*)_stages.ptr()),
+      nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()),
+      scales(_levels.cols / sizeof(Level))
    {}
    const Level*  levels;
-    const Octave* octaves;
    const float*  stages;
    const Node*   nodes;
    const float*  leaves;
+    int scales;
    void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
        PtrStepSzi counter, const int downscales, const int csale = -1, const cudaStream_t& stream = 0) const;
+    template<bool isUp>
+    __device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const;
 };
 }

--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -63,7 +63,7 @@ void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
 #include <icf.hpp>
 cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h)
-:  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
+:  octave(idx), step(oct.stages), relScale(scale / oct.scale)
 {
    workRect.x = round(w / (float)oct.shrinkage);
    workRect.y = round(h / (float)oct.shrinkage);
@@ -100,7 +100,7 @@ namespace imgproc {
 struct cv::gpu::SCascade::Fields
 {
-    static Fields* parseCascade(const FileNode &root, const float mins, const float maxs)
+    static Fields* parseCascade(const FileNode &root, const float mins, const float maxs, const int totals)
    {
        static const char *const SC_STAGE_TYPE          = "stageType";
        static const char *const SC_BOOST               = "BOOST";
@@ -119,11 +119,8 @@ struct cv::gpu::SCascade::Fields
        static const char *const SC_ORIG_W              = "width";
        static const char *const SC_ORIG_H              = "height";
-        int origWidth = (int)root[SC_ORIG_W];
+        int origWidth  = (int)root[SC_ORIG_W];
-        CV_Assert(origWidth  == ORIG_OBJECT_WIDTH);
        int origHeight = (int)root[SC_ORIG_H];
-        CV_Assert(origHeight == ORIG_OBJECT_HEIGHT);
        static const char *const SC_OCTAVES             = "octaves";
        static const char *const SC_STAGES              = "stages";
@@ -142,7 +139,6 @@ struct cv::gpu::SCascade::Fields
        static const char * const SC_F_CHANNEL          = "channel";
        static const char * const SC_F_RECT             = "rect";
        FileNode fn = root[SC_OCTAVES];
            if (fn.empty()) return false;
@@ -167,8 +163,8 @@ struct cv::gpu::SCascade::Fields
            ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
            ushort2 size;
-            size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
+            size.x = cvRound(origWidth * scale);
-            size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
+            size.y = cvRound(origHeight * scale);
            shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
            Octave octave(octIndex, nstages, shrinkage, size, scale);
@@ -245,11 +241,11 @@ struct cv::gpu::SCascade::Fields
        CV_Assert(!hleaves.empty());
        std::vector<Level> vlevels;
-        float logFactor = (::log(maxs) - ::log(mins)) / (TOTAL_SCALES -1);
+        float logFactor = (::log(maxs) - ::log(mins)) / (totals -1);
        float scale = mins;
        int downscales = 0;
-        for (int sc = 0; sc < TOTAL_SCALES; ++sc)
+        for (int sc = 0; sc < totals; ++sc)
        {
            int width  = ::std::max(0.0f, FRAME_WIDTH - (origWidth  * scale));
            int height = ::std::max(0.0f, FRAME_HEIGHT - (origHeight * scale));
@@ -302,7 +298,7 @@ struct cv::gpu::SCascade::Fields
        leaves.upload(hleaves);
        levels.upload(hlevels);
-        invoker = device::icf::CascadeInvoker<device::icf::CascadePolicy>(levels, octaves, stages, nodes, leaves);
+        invoker = device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, octaves, stages, nodes, leaves);
    }
@@ -456,16 +452,13 @@ public:
    GpuMat sobelBuf;
-    device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker;
+    device::icf::CascadeInvoker<device::icf::GK107PolicyX4> invoker;
    enum { BOOST = 0 };
    enum
    {
        FRAME_WIDTH        = 640,
        FRAME_HEIGHT       = 480,
-        TOTAL_SCALES       = 55,
-        ORIG_OBJECT_WIDTH  = 64,
-        ORIG_OBJECT_HEIGHT = 128,
        HOG_BINS           = 6,
        LUV_BINS           = 3,
        HOG_LUV_BINS       = 10
@@ -480,21 +473,19 @@ cv::gpu::SCascade::~SCascade() { delete fields; }
 bool cv::gpu::SCascade::load(const FileNode& fn)
 {
    if (fields) delete fields;
-    fields = Fields::parseCascade(fn, minScale, maxScale);
+    fields = Fields::parseCascade(fn, minScale, maxScale, scales);
    return fields != 0;
 }
 void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, Stream& s) const
 {
+    CV_Assert(fields);
    const GpuMat colored = image.getGpuMat();
    // only color images are supperted
    CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
-    // we guess user knows about shrincage
-    // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
    Fields& flds = *fields;
    if (colored.type() == CV_8UC3)
@@ -518,15 +509,13 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
 void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, const int level, Stream& s) const
 {
+    CV_Assert(fields);
    const GpuMat colored = image.getGpuMat();
    // only color images are supperted
    CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
-    // we guess user knows about shrincage
-    // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
    Fields& flds = *fields;
    if (colored.type() == CV_8UC3)
    {
        // only this window size allowed
@@ -549,6 +538,8 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
 void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const
 {
+    CV_Assert(fields);
    const GpuMat roi = _roi.getGpuMat();
    _mask.create( roi.cols / 4, roi.rows / 4, roi.type() );
    GpuMat mask = _mask.getGpuMat();

--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -172,7 +172,7 @@ GPU_TEST_P(SCascadeTestRoi, detect,
        sub.setTo(1);
        cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1);
    }
+    objectBoxes.setTo(0);
    cascade.genRoi(rois, trois);
    cascade.detect(colored, trois, objectBoxes);
@@ -222,7 +222,7 @@ GPU_TEST_P(SCascadeTestLevel, detect,
    cv::gpu::GpuMat trois;
    cascade.genRoi(rois, trois);
+    objectBoxes.setTo(0);
    int level = GET_PARAM(3);
    cascade.detect(colored, trois, objectBoxes, level);
@@ -281,7 +281,7 @@ GPU_TEST_P(SCascadeTestAll, detect,
    cv::gpu::GpuMat trois;
    cascade.genRoi(rois, trois);
+    objectBoxes.setTo(0);
    cascade.detect(colored, trois, objectBoxes);
    typedef cv::gpu::SCascade::Detection Detection;
@@ -321,7 +321,7 @@ GPU_TEST_P(SCascadeTestAll, detectOnIntegral,
    cv::gpu::GpuMat trois;
    cascade.genRoi(rois, trois);
+    objectBoxes.setTo(0);
    cascade.detect(hogluv, trois, objectBoxes);
    typedef cv::gpu::SCascade::Detection Detection;
@@ -357,7 +357,7 @@ GPU_TEST_P(SCascadeTestAll, detectStream,
    cv::gpu::GpuMat trois;
    cascade.genRoi(rois, trois, s);
+    objectBoxes.setTo(0);
    cascade.detect(colored, trois, objectBoxes, s);
    cudaDeviceSynchronize();