changed hog to work with variable parameters and changed the hog sample to test…

Changed HOG to work with variable parameters and changed the HOG sample to test it with more options; added comments and tests.

changed hog to work with variable parameters and changed the hog sample to test…
Changed HOG to work with variable parameters and changed the HOG sample to test it with more options; added comments and tests.
89889ae8 · ManuelFreudenreich · 424c2bdd · 89889ae8 · 89889ae8 · 89889ae8
Commit 89889ae8 authored Jun 11, 2015 by ManuelFreudenreich
4 changed files
--- a/modules/cudaobjdetect/src/cuda/hog.cu
+++ b/modules/cudaobjdetect/src/cuda/hog.cu
@@ -49,11 +49,6 @@
 namespace cv { namespace cuda { namespace device
 {
-    // Other values are not supported
-    #define CELL_WIDTH 8
-    #define CELL_HEIGHT 8
-    #define CELLS_PER_BLOCK_X 2
-    #define CELLS_PER_BLOCK_Y 2
    namespace hog
    {
@@ -62,6 +57,8 @@ namespace cv { namespace cuda { namespace device
        __constant__ int cblock_stride_y;
        __constant__ int cnblocks_win_x;
        __constant__ int cnblocks_win_y;
+        __constant__ int cncells_block_x;
+        __constant__ int cncells_block_y;
        __constant__ int cblock_hist_size;
        __constant__ int cblock_hist_size_2up;
        __constant__ int cdescr_size;
@@ -72,31 +69,47 @@ namespace cv { namespace cuda { namespace device
        the typical GPU thread count (per block) values */
        int power_2up(unsigned int n)
        {
-            if (n < 1) return 1;
+            if (n <= 1) return 1;
-            else if (n < 2) return 2;
+            else if (n <= 2) return 2;
-            else if (n < 4) return 4;
+            else if (n <= 4) return 4;
-            else if (n < 8) return 8;
+            else if (n <= 8) return 8;
-            else if (n < 16) return 16;
+            else if (n <= 16) return 16;
-            else if (n < 32) return 32;
+            else if (n <= 32) return 32;
-            else if (n < 64) return 64;
+            else if (n <= 64) return 64;
-            else if (n < 128) return 128;
+            else if (n <= 128) return 128;
-            else if (n < 256) return 256;
+            else if (n <= 256) return 256;
-            else if (n < 512) return 512;
+            else if (n <= 512) return 512;
-            else if (n < 1024) return 1024;
+            else if (n <= 1024) return 1024;
            return -1; // Input is too big
        }
+        /* Returns the max size for nblocks */
+        int max_nblocks(int nthreads, int ncells_block = 1)
+        {
+            int threads = nthreads * ncells_block;
+            if(threads * 4 <= 256)
+                return 4;
+            else if(threads * 3 <= 256)
+                return 3;
+            else if(threads * 2 <= 256)
+                return 2;
+            else
+                return 1;
+        }
        void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-                              int nblocks_win_x, int nblocks_win_y)
+                              int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y)
        {
            cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
            cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
            cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
            cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
            cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y)) );
-            int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
+            int block_hist_size = nbins * ncells_block_x * ncells_block_y;
            cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );
            int block_hist_size_2up = power_2up(block_hist_size);
@@ -112,44 +125,48 @@ namespace cv { namespace cuda { namespace device
        //----------------------------------------------------------------------------
        // Histogram computation
+        //
+        // CUDA kernel to compute the histograms
        template <int nblocks> // Number of histogram blocks processed by single GPU thread block
        __global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrStepf grad,
-                                                         const PtrStepb qangle, float scale, float* block_hists)
+                                                         const PtrStepb qangle, float scale, float* block_hists,
+                                                         int cell_size, int patch_size, int block_patch_size,
+                                                         int threads_cell, int threads_block, int half_cell_size)
        {
            const int block_x = threadIdx.z;
-            const int cell_x = threadIdx.x / 16;
+            const int cell_x = threadIdx.x / threads_cell;
            const int cell_y = threadIdx.y;
-            const int cell_thread_x = threadIdx.x & 0xF;
+            const int cell_thread_x = threadIdx.x & (threads_cell - 1);
            if (blockIdx.x * blockDim.z + block_x >= img_block_width)
                return;
            extern __shared__ float smem[];
            float* hists = smem;
-            float* final_hist = smem + cnbins * 48 * nblocks;
+            float* final_hist = smem + cnbins * block_patch_size * nblocks;
-            const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x +
+            // patch_size means that patch_size pixels (in one row) affect a block's cell
-                                 4 * cell_x + cell_thread_x;
+            if (cell_thread_x < patch_size)
-            const int offset_y = blockIdx.y * cblock_stride_y + 4 * cell_y;
+            {
+                const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x +
+                                     half_cell_size * cell_x + cell_thread_x;
+                const int offset_y = blockIdx.y * cblock_stride_y + half_cell_size * cell_y;
-            const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2;
+                const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2;
-            const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2;
+                const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2;
-            // 12 means that 12 pixels affect on block's cell (in one row)
-            if (cell_thread_x < 12)
+                float* hist = hists + patch_size * (cell_y * blockDim.z * cncells_block_y +
-            {
+                                            cell_x + block_x * cncells_block_x) +
-                float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y +
-                                            cell_x + block_x * CELLS_PER_BLOCK_X) +
                                           cell_thread_x;
                for (int bin_id = 0; bin_id < cnbins; ++bin_id)
-                    hist[bin_id * 48 * nblocks] = 0.f;
+                    hist[bin_id * block_patch_size * nblocks] = 0.f;
-                const int dist_x = -4 + (int)cell_thread_x - 4 * cell_x;
+                //(dist_x, dist_y) : distance between current pixel in patch and cell's center
+                const int dist_x = -half_cell_size + (int)cell_thread_x - half_cell_size * cell_x;
-                const int dist_y_begin = -4 - 4 * (int)threadIdx.y;
+                const int dist_y_begin = -half_cell_size - half_cell_size * (int)threadIdx.y;
-                for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
+                for (int dist_y = dist_y_begin; dist_y < dist_y_begin + patch_size; ++dist_y)
                {
                    float2 vote = *(const float2*)grad_ptr;
                    uchar2 bin = *(const uchar2*)qangle_ptr;
@@ -157,25 +174,29 @@ namespace cv { namespace cuda { namespace device
                    grad_ptr += grad.step/sizeof(float);
                    qangle_ptr += qangle.step;
-                    int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
+                    //(dist_center_x, dist_center_y) : distance between current pixel in patch and block's center
-                    int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
+                    int dist_center_y = dist_y - half_cell_size * (1 - 2 * cell_y);
+                    int dist_center_x = dist_x - half_cell_size * (1 - 2 * cell_x);
                    float gaussian = ::expf(-(dist_center_y * dist_center_y +
                                              dist_center_x * dist_center_x) * scale);
-                    float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) *
-                                          (8.f - ::fabs(dist_x + 0.5f)) / 64.f;
-                    hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;
+                    float interp_weight = ((float)cell_size - ::fabs(dist_y + 0.5f)) *
-                    hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y;
+                                          ((float)cell_size - ::fabs(dist_x + 0.5f)) / (float)threads_block;
+                    hist[bin.x * block_patch_size * nblocks] += gaussian * interp_weight * vote.x;
+                    hist[bin.y * block_patch_size * nblocks] += gaussian * interp_weight * vote.y;
                }
+                //reduction of the histograms
                volatile float* hist_ = hist;
-                for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48 * nblocks)
+                for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += block_patch_size * nblocks)
                {
-                    if (cell_thread_x < 6) hist_[0] += hist_[6];
+                    if (cell_thread_x < patch_size/2) hist_[0] += hist_[patch_size/2];
-                    if (cell_thread_x < 3) hist_[0] += hist_[3];
+                    if (cell_thread_x < patch_size/4 && (!((patch_size/4) < 3 && cell_thread_x == 0)))
+                            hist_[0] += hist_[patch_size/4];
                    if (cell_thread_x == 0)
-                        final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id]
+                        final_hist[((cell_x + block_x * cncells_block_x) * cncells_block_y + cell_y) * cnbins + bin_id]
                            = hist_[0] + hist_[1] + hist_[2];
                }
            }
@@ -186,37 +207,69 @@ namespace cv { namespace cuda { namespace device
                                               blockIdx.x * blockDim.z + block_x) *
                                              cblock_hist_size;
-            int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;
+            //copying from final_hist to block_hist
+            int tid;
+            if(threads_cell < cnbins)
+            {
+                tid = (cell_y * cncells_block_y + cell_x) * cnbins + cell_thread_x;
+            } else
+            {
+                tid = (cell_y * cncells_block_y + cell_x) * threads_cell + cell_thread_x;
+            }
            if (tid < cblock_hist_size)
+            {
                block_hist[tid] = final_hist[block_x * cblock_hist_size + tid];
+                if(threads_cell < cnbins && cell_thread_x == (threads_cell-1))
+                {
+                    for(int i=1;i<=(cnbins - threads_cell);++i)
+                    {
+                        block_hist[tid + i] = final_hist[block_x * cblock_hist_size + tid + i];
+                    }
+                }
+            }
        }
+        //declare the launch variables and invoke the kernel with the calculated number of blocks
        void compute_hists(int nbins, int block_stride_x, int block_stride_y,
                           int height, int width, const PtrStepSzf& grad,
-                           const PtrStepSzb& qangle, float sigma, float* block_hists)
+                           const PtrStepSzb& qangle, float sigma, float* block_hists,
+                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y)
        {
-            const int nblocks = 1;
+            const int ncells_block = ncells_block_x * ncells_block_y;
+            const int patch_side = cell_size_x / 4;
-            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+            const int patch_size = cell_size_x + (patch_side * 2);
+            const int block_patch_size = ncells_block * patch_size;
+            const int threads_cell = power_2up(patch_size);
+            const int threads_block = ncells_block * threads_cell;
+            const int half_cell_size = cell_size_x / 2;
+            int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) /
                                  block_stride_x;
-            int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) /
+            int img_block_height = (height - ncells_block_y * cell_size_y + block_stride_y) /
                                   block_stride_y;
+            const int nblocks = max_nblocks(threads_cell, ncells_block);
            dim3 grid(divUp(img_block_width, nblocks), img_block_height);
-            dim3 threads(32, 2, nblocks);
+            dim3 threads(threads_cell * ncells_block_x, ncells_block_y, nblocks);
-            cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>,
-                                                cudaFuncCachePreferL1));
            // Precompute gaussian spatial window parameter
            float scale = 1.f / (2.f * sigma * sigma);
-            int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12 * nblocks) * sizeof(float);
+            int hists_size = (nbins * ncells_block * patch_size * nblocks) * sizeof(float);
-            int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * nblocks) * sizeof(float);
+            int final_hists_size = (nbins * ncells_block * nblocks) * sizeof(float);
            int smem = hists_size + final_hists_size;
-            compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(
+            if (nblocks == 4)
-                img_block_width, grad, qangle, scale, block_hists);
+                compute_hists_kernel_many_blocks<4><<<grid, threads, smem>>>(
+                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+            else if (nblocks == 3)
+                compute_hists_kernel_many_blocks<3><<<grid, threads, smem>>>(
+                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+            else if (nblocks == 2)
+                compute_hists_kernel_many_blocks<2><<<grid, threads, smem>>>(
+                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+            else
+                compute_hists_kernel_many_blocks<1><<<grid, threads, smem>>>(
+                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
            cudaSafeCall( cudaGetLastError() );
            cudaSafeCall( cudaDeviceSynchronize() );
@@ -293,16 +346,16 @@ namespace cv { namespace cuda { namespace device
        void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                             int height, int width, float* block_hists, float threshold)
+                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y)
        {
            const int nblocks = 1;
-            int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
+            int block_hist_size = nbins * ncells_block_x * ncells_block_y;
            int nthreads = power_2up(block_hist_size);
            dim3 threads(nthreads, 1, nblocks);
-            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+            int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
-            int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
+            int img_block_height = (height - ncells_block_y * cell_size_y + block_stride_y) / block_stride_y;
            dim3 grid(divUp(img_block_width, nblocks), img_block_height);
            if (nthreads == 32)
@@ -310,7 +363,7 @@ namespace cv { namespace cuda { namespace device
            else if (nthreads == 64)
                normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
            else if (nthreads == 128)
-                normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+                normalize_hists_kernel_many_blocks<128, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
            else if (nthreads == 256)
                normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
            else if (nthreads == 512)
@@ -365,7 +418,7 @@ namespace cv { namespace cuda { namespace device
       void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
                                               int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                                               float* coefs, float free_coef, float threshold, float *confidences)
+                                               float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences)
       {
           const int nthreads = 256;
           const int nblocks = 1;
@@ -381,7 +434,7 @@ namespace cv { namespace cuda { namespace device
           cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
                                                                                   cudaFuncCachePreferL1));
-           int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+           int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) /
                                                       block_stride_x;
           compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
                   img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
@@ -427,7 +480,7 @@ namespace cv { namespace cuda { namespace device
        void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
                            int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                            float* coefs, float free_coef, float threshold, unsigned char* labels)
+                            float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, unsigned char* labels)
        {
            const int nthreads = 256;
            const int nblocks = 1;
@@ -442,7 +495,7 @@ namespace cv { namespace cuda { namespace device
            cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFuncCachePreferL1));
-            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+            int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
            classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
                img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
                block_hists, coefs, free_coef, threshold, labels);
@@ -477,7 +530,7 @@ namespace cv { namespace cuda { namespace device
        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
-                                    int height, int width, float* block_hists, PtrStepSzf descriptors)
+                                    int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, PtrStepSzf descriptors)
        {
            const int nthreads = 256;
@@ -488,7 +541,7 @@ namespace cv { namespace cuda { namespace device
            dim3 threads(nthreads, 1);
            dim3 grid(img_win_width, img_win_height);
-            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+            int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
            extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
                img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
            cudaSafeCall( cudaGetLastError() );
@@ -525,7 +578,7 @@ namespace cv { namespace cuda { namespace device
        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
+                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
                                    PtrStepSzf descriptors)
        {
            const int nthreads = 256;
@@ -537,7 +590,7 @@ namespace cv { namespace cuda { namespace device
            dim3 threads(nthreads, 1);
            dim3 grid(img_win_width, img_win_height);
-            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+            int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
            extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
                img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
            cudaSafeCall( cudaGetLastError() );

--- a/modules/cudaobjdetect/src/hog.cpp
+++ b/modules/cudaobjdetect/src/hog.cpp
@@ -51,34 +51,45 @@ Ptr<cuda::HOG> cv::cuda::HOG::create(Size, Size, Size, Size, int) { throw_no_cud
 #else
+/****************************************************************************************\
+      The code below is an implementation of the HOG (Histogram-of-Oriented Gradients)
+      descriptor and object detection, introduced by Navneet Dalal and Bill Triggs.
+      The computed feature vectors are compatible with the
+      INRIA Object Detection and Localization Toolkit
+      (http://pascal.inrialpes.fr/soft/olt/)
+\****************************************************************************************/
 namespace cv { namespace cuda { namespace device
 {
    namespace hog
    {
        void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-                              int nblocks_win_x, int nblocks_win_y);
+                              int nblocks_win_x, int nblocks_win_y,
+                              int ncells_block_x, int ncells_block_y);
-        void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
+        void compute_hists(int nbins, int block_stride_x, int block_stride_y,
-                           int height, int width, const cv::cuda::PtrStepSzf& grad,
+                           int height, int width, const PtrStepSzf& grad,
-                           const cv::cuda::PtrStepSzb& qangle, float sigma, float* block_hists);
+                           const PtrStepSzb& qangle, float sigma, float* block_hists,
+                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y);
        void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                             int height, int width, float* block_hists, float threshold);
+                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y);
        void classify_hists(int win_height, int win_width, int block_stride_y,
                            int block_stride_x, int win_stride_y, int win_stride_x, int height,
                            int width, float* block_hists, float* coefs, float free_coef,
-                            float threshold, unsigned char* labels);
+                            float threshold, int cell_size_x, int ncells_block_x, unsigned char* labels);
        void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
                           int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                           float* coefs, float free_coef, float threshold, float *confidences);
+                           float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences);
        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
+                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
                                    cv::cuda::PtrStepSzf descriptors);
        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
+                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
                                    cv::cuda::PtrStepSzf descriptors);
        void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
@@ -167,6 +178,7 @@ namespace
        double scale0_;
        int group_threshold_;
        int descr_format_;
+        Size cells_per_block_;
    private:
        int getTotalHistSize(Size img_size) const;
@@ -197,7 +209,8 @@ namespace
        win_stride_(block_stride),
        scale0_(1.05),
        group_threshold_(2),
-        descr_format_(DESCR_FORMAT_COL_BY_COL)
+        descr_format_(DESCR_FORMAT_COL_BY_COL),
+        cells_per_block_(block_size.width / cell_size.width, block_size.height / cell_size.height)
    {
        CV_Assert((win_size.width  - block_size.width ) % block_stride.width  == 0 &&
                  (win_size.height - block_size.height) % block_stride.height == 0);
@@ -205,12 +218,13 @@ namespace
        CV_Assert(block_size.width % cell_size.width == 0 &&
                  block_size.height % cell_size.height == 0);
-        CV_Assert(block_stride == cell_size);
+        // Navneet Dalal and Bill Triggs. Histograms of oriented gradients for
+        // human detection. In International Conference on Computer Vision and
-        CV_Assert(cell_size == Size(8, 8));
+        // Pattern Recognition, volume 2, pages 886–893, June 2005
+        // http://lear.inrialpes.fr/people/triggs/pubs/Dalal-cvpr05.pdf (28.07.2015) [Figure 5]
+        CV_Assert(block_stride == (block_size / 2));
-        Size cells_per_block(block_size.width / cell_size.width, block_size.height / cell_size.height);
+        CV_Assert(cell_size.width == cell_size.height);
-        CV_Assert(cells_per_block == Size(2, 2));
    }
    static int numPartsWithin(int size, int part_size, int stride)
@@ -231,8 +245,7 @@ namespace
    size_t HOG_Impl::getBlockHistogramSize() const
    {
-        Size cells_per_block(block_size_.width / cell_size_.width, block_size_.height / cell_size_.height);
+        return nbins_ * cells_per_block_.area();
-        return nbins_ * cells_per_block.area();
    }
    double HOG_Impl::getWinSigma() const
@@ -313,6 +326,7 @@ namespace
                                detector_.ptr<float>(),
                                (float)free_coef_,
                                (float)hit_threshold_,
+                                cell_size_.width, cells_per_block_.width,
                                labels.ptr());
            Mat labels_host;
@@ -339,6 +353,7 @@ namespace
                                          detector_.ptr<float>(),
                                          (float)free_coef_,
                                          (float)hit_threshold_,
+                                          cell_size_.width, cells_per_block_.width,
                                          labels.ptr<float>());
            Mat labels_host;
@@ -465,6 +480,7 @@ namespace
                                        win_stride_.height, win_stride_.width,
                                        img.rows, img.cols,
                                        block_hists.ptr<float>(),
+                                        cell_size_.width, cells_per_block_.width,
                                        descriptors);
            break;
        case DESCR_FORMAT_COL_BY_COL:
@@ -473,6 +489,7 @@ namespace
                                        win_stride_.height, win_stride_.width,
                                        img.rows, img.cols,
                                        block_hists.ptr<float>(),
+                                        cell_size_.width, cells_per_block_.width,
                                        descriptors);
            break;
        default:
@@ -490,7 +507,7 @@ namespace
    void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists)
    {
        cv::Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
-        hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height);
+        hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height);
        BufferPool pool(Stream::Null());
@@ -505,13 +522,17 @@ namespace
                           img.rows, img.cols,
                           grad, qangle,
                           (float)getWinSigma(),
-                           block_hists.ptr<float>());
+                           block_hists.ptr<float>(),
+                           cell_size_.width, cell_size_.height,
+                           cells_per_block_.width, cells_per_block_.height);
        hog::normalize_hists(nbins_,
                             block_stride_.width, block_stride_.height,
                             img.rows, img.cols,
                             block_hists.ptr<float>(),
-                             (float)threshold_L2hys_);
+                             (float)threshold_L2hys_,
+                             cell_size_.width, cell_size_.height,
+                             cells_per_block_.width, cells_per_block_.height);
    }
    void HOG_Impl::computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle)

--- a/modules/cudaobjdetect/test/test_objdetect.cpp
+++ b/modules/cudaobjdetect/test/test_objdetect.cpp
@@ -217,9 +217,9 @@ CUDA_TEST_P(HOG, GetDescriptors)
                              r[(x * blocks_per_win_y + y) * block_hist_size + k]);
    }
 }
+/*
 INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES);
+*/
 //============== caltech hog tests =====================//
 struct CalTech : public ::testing::TestWithParam<std::tr1::tuple<cv::cuda::DeviceInfo, std::string> >
@@ -269,8 +269,204 @@ INSTANTIATE_TEST_CASE_P(detect, CalTech, testing::Combine(ALL_DEVICES,
        "caltech/image_00000527_0.png", "caltech/image_00000574_0.png")));
+//------------------------variable GPU HOG Tests------------------------//
+// Fixture for the variable-parameter GPU HOG test. SetUp() selects the CUDA
+// device passed as the first test parameter, loads the image named by the
+// second parameter as grayscale and keeps a 16x32 view of its top-left corner.
+struct Hog_var : public ::testing::TestWithParam<std::tr1::tuple<cv::cuda::DeviceInfo, std::string> >
+{
+    cv::cuda::DeviceInfo devInfo;
+    cv::Mat img;    // full grayscale input image
+    cv::Mat c_img;  // 16x32 region of img starting at (0, 0)
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        cv::cuda::setDevice(devInfo.deviceID());
+        img = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(img.empty());
+        const cv::Rect top_left_window(0, 0, 16, 32);
+        c_img = img(top_left_window);
+    }
+};
+// Computes HOG descriptors for the fixture's 16x32 image with a non-default
+// geometry (16x32 window, 8x8 blocks, 4x4 block stride, 4x4 cells, 9 bins) on
+// both the GPU and the CPU implementation and checks that both produce
+// non-empty results of the same size. Descriptor values are not compared here.
+CUDA_TEST_P(Hog_var, HOG)
+{
+    const int nbins = 9;
+    const Size win_stride(8, 8);
+    const Size win_size(16, 32);
+    const Size block_size(8, 8);
+    const Size block_stride(4, 4);
+    const Size cell_size(4, 4);
+    // GPU path
+    cv::cuda::GpuMat d_src(c_img);
+    cv::cuda::GpuMat d_descriptors;
+    cv::Ptr<cv::cuda::HOG> gpu_hog = cv::cuda::HOG::create(win_size, block_size, block_stride, cell_size, nbins);
+    gpu_hog->setNumLevels(13);
+    gpu_hog->setHitThreshold(0);
+    gpu_hog->setWinStride(win_stride);
+    gpu_hog->setScaleFactor(1.05);
+    gpu_hog->setGroupThreshold(8);
+    gpu_hog->compute(d_src, d_descriptors);
+    // Check the output after compute() instead of asserting on untouched locals.
+    ASSERT_FALSE(d_descriptors.empty());
+    cv::Mat gpu_descriptors(d_descriptors);
+    // CPU reference path with the same geometry
+    cv::HOGDescriptor cpu_hog(win_size, block_size, block_stride, cell_size, nbins);
+    cpu_hog.nlevels = 13;
+    vector<float> cpu_desc_vec;
+    cpu_hog.compute(c_img, cpu_desc_vec, win_stride, Size(0,0));
+    ASSERT_FALSE(cpu_desc_vec.empty());
+    // Both implementations must agree on the per-window descriptor length and
+    // on the total number of values produced for this image.
+    ASSERT_EQ(cpu_hog.getDescriptorSize(), gpu_hog->getDescriptorSize());
+    ASSERT_EQ(cpu_desc_vec.size(), gpu_descriptors.total());
+}
+INSTANTIATE_TEST_CASE_P(detect, Hog_var, testing::Combine(ALL_DEVICES,
+    ::testing::Values<std::string>("/hog/road.png")));
+// Fixture for the GPU HOG cell-size tests. SetUp() selects the CUDA device
+// passed as the first test parameter, loads the image named by the second
+// parameter as grayscale and keeps four views of its top-left corner, each
+// twice as tall as it is wide.
+struct Hog_var_cell : public ::testing::TestWithParam<std::tr1::tuple<cv::cuda::DeviceInfo, std::string> >
+{
+    cv::cuda::DeviceInfo devInfo;
+    cv::Mat img;     // full grayscale input image
+    cv::Mat c_img;   // 48x96 region
+    cv::Mat c_img2;  // 54x108 region
+    cv::Mat c_img3;  // 64x128 region
+    cv::Mat c_img4;  // 32x64 region
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        cv::cuda::setDevice(devInfo.deviceID());
+        img = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(img.empty());
+        c_img = img(cv::Rect(0, 0, 48, 96));
+        c_img2 = img(cv::Rect(0, 0, 54, 108));
+        c_img3 = img(cv::Rect(0, 0, 64, 128));
+        c_img4 = img(cv::Rect(0, 0, 32, 64));
+    }
+};
+// Creates a GPU HOG whose window is win_width x (2 * win_width) and whose
+// blocks, block stride, cells and window stride are square with the given widths.
+static cv::Ptr<cv::cuda::HOG> createSquareGpuHog(int win_width, int block_width, int block_stride_width,
+                                                 int cell_width, int win_stride_width, int nbins)
+{
+    cv::Ptr<cv::cuda::HOG> hog = cv::cuda::HOG::create(Size(win_width, win_width * 2),
+                                                       Size(block_width, block_width),
+                                                       Size(block_stride_width, block_stride_width),
+                                                       Size(cell_width, cell_width),
+                                                       nbins);
+    hog->setWinStride(Size(win_stride_width, win_stride_width));
+    return hog;
+}
+// Runs hog->compute() on image and checks that a non-empty descriptor matrix
+// was produced whose row length equals the descriptor size reported by hog.
+static void computeAndCheckGpuHog(const cv::Ptr<cv::cuda::HOG>& hog, const cv::cuda::GpuMat& image)
+{
+    cv::cuda::GpuMat descriptors;
+    hog->compute(image, descriptors);
+    ASSERT_FALSE(descriptors.empty());
+    ASSERT_EQ(static_cast<int>(hog->getDescriptorSize()), descriptors.cols);
+}
+// Exercises the GPU HOG descriptor computation with several window, block and
+// cell geometries. Each case uses an image the same size as its window.
+CUDA_TEST_P(Hog_var_cell, HOG)
+{
+    const int nbins = 9;
+    cv::cuda::GpuMat d_img48(c_img);   // 48x96
+    cv::cuda::GpuMat d_img54(c_img2);  // 54x108
+    cv::cuda::GpuMat d_img64(c_img3);  // 64x128
+    ASSERT_FALSE(d_img48.empty());
+    ASSERT_FALSE(d_img54.empty());
+    ASSERT_FALSE(d_img64.empty());
+    // Case 1: 48x96 window, 16x16 blocks, 8x8 cells (2x2 cells per block).
+    // The detection-related settings are kept from the original test.
+    cv::Ptr<cv::cuda::HOG> gpu_hog = createSquareGpuHog(48, 16, 8, 8, 8, nbins);
+    gpu_hog->setNumLevels(13);
+    gpu_hog->setHitThreshold(0);
+    gpu_hog->setScaleFactor(1.05);
+    gpu_hog->setGroupThreshold(8);
+    ASSERT_NO_FATAL_FAILURE(computeAndCheckGpuHog(gpu_hog, d_img48));
+    // Case 2: 48x96 window, 16x16 blocks, 4x4 cells (4x4 cells per block).
+    cv::Ptr<cv::cuda::HOG> gpu_hog2 = createSquareGpuHog(48, 16, 8, 4, 8, nbins);
+    ASSERT_NO_FATAL_FAILURE(computeAndCheckGpuHog(gpu_hog2, d_img48));
+    // Case 3: 54x108 window, 18x18 blocks, 6x6 cells (3x3 cells per block).
+    cv::Ptr<cv::cuda::HOG> gpu_hog3 = createSquareGpuHog(54, 18, 9, 6, 9, nbins);
+    ASSERT_NO_FATAL_FAILURE(computeAndCheckGpuHog(gpu_hog3, d_img54));
+    // Case 4: 64x128 window, 32x32 blocks, 8x8 cells (4x4 cells per block).
+    cv::Ptr<cv::cuda::HOG> gpu_hog4 = createSquareGpuHog(64, 32, 16, 8, 16, nbins);
+    ASSERT_NO_FATAL_FAILURE(computeAndCheckGpuHog(gpu_hog4, d_img64));
+    // Case 5: 64x128 window, 32x32 blocks, 16x16 cells (2x2 cells per block).
+    cv::Ptr<cv::cuda::HOG> gpu_hog5 = createSquareGpuHog(64, 32, 16, 16, 16, nbins);
+    ASSERT_NO_FATAL_FAILURE(computeAndCheckGpuHog(gpu_hog5, d_img64));
+}
+INSTANTIATE_TEST_CASE_P(detect, Hog_var_cell, testing::Combine(ALL_DEVICES,
+    ::testing::Values<std::string>("/hog/road.png")));
 //////////////////////////////////////////////////////////////////////////////////////////
 /// LBP classifier

--- a/samples/gpu/hog.cpp
+++ b/samples/gpu/hog.cpp
@@ -22,10 +22,14 @@ public:
    static Args read(int argc, char** argv);
    string src;
+    bool src_is_folder;
    bool src_is_video;
    bool src_is_camera;
    int camera_id;
+    bool svm_load;
+    string svm;
    bool write_video;
    string dst_video;
    double dst_video_fps;
@@ -44,6 +48,10 @@ public:
    int win_width;
    int win_stride_width, win_stride_height;
+    int block_width;
+    int block_stride_width, block_stride_height;
+    int cell_width;
+    int nbins;
    bool gamma_corr;
 };
@@ -93,6 +101,9 @@ static void printHelp()
    cout << "Histogram of Oriented Gradients descriptor and detector sample.\n"
         << "\nUsage: hog_gpu\n"
         << "  (<image>|--video <vide>|--camera <camera_id>) # frames source\n"
+         << "  or"
+         << "  (--folder <folder_path>) # load images from folder\n"
+         << "  [--svm <file> # load svm file"
         << "  [--make_gray <true/false>] # convert image to gray one or not\n"
         << "  [--resize_src <true/false>] # do resize of the source image or not\n"
         << "  [--width <int>] # resized image width\n"
@@ -100,9 +111,14 @@ static void printHelp()
         << "  [--hit_threshold <double>] # classifying plane distance threshold (0.0 usually)\n"
         << "  [--scale <double>] # HOG window scale factor\n"
         << "  [--nlevels <int>] # max number of HOG window scales\n"
-         << "  [--win_width <int>] # width of the window (48 or 64)\n"
+         << "  [--win_width <int>] # width of the window\n"
         << "  [--win_stride_width <int>] # distance by OX axis between neighbour wins\n"
         << "  [--win_stride_height <int>] # distance by OY axis between neighbour wins\n"
+         << "  [--block_width <int>] # width of the block\n"
+         << "  [--block_stride_width <int>] # distance by 0X axis between neighbour blocks\n"
+         << "  [--block_stride_height <int>] # distance by 0Y axis between neighbour blocks\n"
+         << "  [--cell_width <int>] # width of the cell\n"
+         << "  [--nbins <int>] # number of bins\n"
         << "  [--gr_threshold <int>] # merging similar rects constant\n"
         << "  [--gamma_correct <int>] # do gamma correction or not\n"
         << "  [--write_video <bool>] # write video or not\n"
@@ -142,6 +158,8 @@ Args::Args()
 {
    src_is_video = false;
    src_is_camera = false;
+    src_is_folder = false;
+    svm_load = false;
    camera_id = 0;
    write_video = false;
@@ -162,6 +180,11 @@ Args::Args()
    win_width = 48;
    win_stride_width = 8;
    win_stride_height = 8;
+    block_width = 16;
+    block_stride_width = 8;
+    block_stride_height = 8;
+    cell_width = 8;
+    nbins = 9;
    gamma_corr = true;
 }
@@ -186,6 +209,11 @@ Args Args::read(int argc, char** argv)
        else if (string(argv[i]) == "--win_width") args.win_width = atoi(argv[++i]);
        else if (string(argv[i]) == "--win_stride_width") args.win_stride_width = atoi(argv[++i]);
        else if (string(argv[i]) == "--win_stride_height") args.win_stride_height = atoi(argv[++i]);
+        else if (string(argv[i]) == "--block_width") args.block_width = atoi(argv[++i]);
+        else if (string(argv[i]) == "--block_stride_width") args.block_stride_width = atoi(argv[++i]);
+        else if (string(argv[i]) == "--block_stride_height") args.block_stride_height = atoi(argv[++i]);
+        else if (string(argv[i]) == "--cell_width") args.cell_width = atoi(argv[++i]);
+        else if (string(argv[i]) == "--nbins") args.nbins = atoi(argv[++i]);
        else if (string(argv[i]) == "--gr_threshold") args.gr_threshold = atoi(argv[++i]);
        else if (string(argv[i]) == "--gamma_correct") args.gamma_corr = (string(argv[++i]) == "true");
        else if (string(argv[i]) == "--write_video") args.write_video = (string(argv[++i]) == "true");
@@ -194,6 +222,8 @@ Args Args::read(int argc, char** argv)
        else if (string(argv[i]) == "--help") printHelp();
        else if (string(argv[i]) == "--video") { args.src = argv[++i]; args.src_is_video = true; }
        else if (string(argv[i]) == "--camera") { args.camera_id = atoi(argv[++i]); args.src_is_camera = true; }
+        else if (string(argv[i]) == "--folder") { args.src = argv[++i]; args.src_is_folder = true;}
+        else if (string(argv[i]) == "--svm") { args.svm = argv[++i]; args.svm_load = true;}
        else if (args.src.empty()) args.src = argv[i];
        else throw runtime_error((string("unknown key: ") + argv[i]));
    }
@@ -228,16 +258,17 @@ App::App(const Args& s)
    gamma_corr = args.gamma_corr;
-    if (args.win_width != 64 && args.win_width != 48)
-        args.win_width = 64;
    cout << "Scale: " << scale << endl;
    if (args.resize_src)
        cout << "Resized source: (" << args.width << ", " << args.height << ")\n";
    cout << "Group threshold: " << gr_threshold << endl;
    cout << "Levels number: " << nlevels << endl;
-    cout << "Win width: " << args.win_width << endl;
+    cout << "Win size: (" << args.win_width << ", " << args.win_width*2 << ")\n";
    cout << "Win stride: (" << args.win_stride_width << ", " << args.win_stride_height << ")\n";
+    cout << "Block size: (" << args.block_width << ", " << args.block_width << ")\n";
+    cout << "Block stride: (" << args.block_stride_width << ", " << args.block_stride_height << ")\n";
+    cout << "Cell size: (" << args.cell_width << ", " << args.cell_width << ")\n";
+    cout << "Bins number: " << args.nbins << endl;
    cout << "Hit threshold: " << hit_threshold << endl;
    cout << "Gamma correction: " << gamma_corr << endl;
    cout << endl;
@@ -249,22 +280,58 @@ void App::run()
    running = true;
    cv::VideoWriter video_writer;
-    Size win_size(args.win_width, args.win_width * 2); //(64, 128) or (48, 96)
    Size win_stride(args.win_stride_width, args.win_stride_height);
+    Size win_size(args.win_width, args.win_width * 2);
+    Size block_size(args.block_width, args.block_width);
+    Size block_stride(args.block_stride_width, args.block_stride_height);
+    Size cell_size(args.cell_width, args.cell_width);
+    cv::Ptr<cv::cuda::HOG> gpu_hog = cv::cuda::HOG::create(win_size, block_size, block_stride, cell_size, args.nbins);
+    cv::HOGDescriptor cpu_hog(win_size, block_size, block_stride, cell_size, args.nbins);
+    if(args.svm_load) {
+        std::vector<float> svm_model;
+        const std::string model_file_name = args.svm;
+        FileStorage ifs(model_file_name, FileStorage::READ);
+        if (ifs.isOpened()) {
+            ifs["svm_detector"] >> svm_model;
+        } else {
+            const std::string what =
+                    "could not load model for hog classifier from file: "
+                    + model_file_name;
+            throw std::runtime_error(what);
+        }
-    cv::Ptr<cv::cuda::HOG> gpu_hog = cv::cuda::HOG::create(win_size);
+        // check if the variables are initialized
-    cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9);
+        if (svm_model.empty()) {
+            const std::string what =
+                    "HoG classifier: svm model could not be loaded from file"
+                    + model_file_name;
+            throw std::runtime_error(what);
+        }
+        gpu_hog->setSVMDetector(svm_model);
+        cpu_hog.setSVMDetector(svm_model);
+    } else {
+        // Create HOG descriptors and detectors here
+        Mat detector = gpu_hog->getDefaultPeopleDetector();
-    // Create HOG descriptors and detectors here
+        gpu_hog->setSVMDetector(detector);
-    Mat detector = gpu_hog->getDefaultPeopleDetector();
+        cpu_hog.setSVMDetector(detector);
+    }
-    gpu_hog->setSVMDetector(detector);
+    cout << "gpusvmDescriptorSize : " << gpu_hog->getDescriptorSize()
-    cpu_hog.setSVMDetector(detector);
+         << endl;
+    cout << "cpusvmDescriptorSize : " << cpu_hog.getDescriptorSize()
+         << endl;
    while (running)
    {
        VideoCapture vc;
        Mat frame;
+        vector<String> filenames;
+        unsigned int count = 1;
        if (args.src_is_video)
        {
@@ -273,6 +340,14 @@ void App::run()
                throw runtime_error(string("can't open video file: " + args.src));
            vc >> frame;
        }
+        else if (args.src_is_folder) {
+            String folder = args.src;
+            cout << folder << endl;
+            glob(folder, filenames);
+            frame = imread(filenames[count]);	// 0 --> .gitignore
+            if (!frame.data)
+                cerr << "Problem loading image from folder!!!" << endl;
+        }
        else if (args.src_is_camera)
        {
            vc.open(args.camera_id);
@@ -327,7 +402,7 @@ void App::run()
            {
                cpu_hog.nlevels = nlevels;
                cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
-                                          Size(0, 0), scale, gr_threshold);
+                                         Size(0, 0), scale, gr_threshold);
            }
            hogWorkEnd();
@@ -342,11 +417,20 @@ void App::run()
                putText(img_to_show, "Mode: GPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
            else
                putText(img_to_show, "Mode: CPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
-            putText(img_to_show, "FPS (HOG only): " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
+            putText(img_to_show, "FPS HOG: " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
-            putText(img_to_show, "FPS (total): " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
+            putText(img_to_show, "FPS total: " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
            imshow("opencv_gpu_hog", img_to_show);
            if (args.src_is_video || args.src_is_camera) vc >> frame;
+            if (args.src_is_folder) {
+                count++;
+                if (count < filenames.size()) {
+                    frame = imread(filenames[count]);
+                } else {
+                    Mat empty;
+                    frame = empty;
+                }
+            }
            workEnd();