Merge pull request #2234 from KonstantinMatskevich:ocl_tapi_hog

22146e4b · Andrey Pavlenko · OpenCV Buildbot · bd6620fa · 3495c595 · 22146e4b
Commit 22146e4b authored Feb 04, 2014 by Andrey Pavlenko Committed by OpenCV Buildbot Feb 04, 2014
5 changed files
--- a/modules/objdetect/include/opencv2/objdetect.hpp
+++ b/modules/objdetect/include/opencv2/objdetect.hpp
@@ -285,10 +285,11 @@ public:
    CV_WRAP virtual void save(const String& filename, const String& objname = String()) const;
    virtual void copyTo(HOGDescriptor& c) const;

-    CV_WRAP virtual void compute(const Mat& img,
+    CV_WRAP virtual void compute(InputArray img,
                         CV_OUT std::vector<float>& descriptors,
                         Size winStride = Size(), Size padding = Size(),
                         const std::vector<Point>& locations = std::vector<Point>()) const;
+
    //with found weights output
    CV_WRAP virtual void detect(const Mat& img, CV_OUT std::vector<Point>& foundLocations,
                        CV_OUT std::vector<double>& weights,
@@ -300,13 +301,14 @@ public:
                        double hitThreshold = 0, Size winStride = Size(),
                        Size padding = Size(),
                        const std::vector<Point>& searchLocations=std::vector<Point>()) const;
+
    //with result weights output
-    CV_WRAP virtual void detectMultiScale(const Mat& img, CV_OUT std::vector<Rect>& foundLocations,
+    CV_WRAP virtual void detectMultiScale(InputArray img, CV_OUT std::vector<Rect>& foundLocations,
                                  CV_OUT std::vector<double>& foundWeights, double hitThreshold = 0,
                                  Size winStride = Size(), Size padding = Size(), double scale = 1.05,
                                  double finalThreshold = 2.0,bool useMeanshiftGrouping = false) const;
    //without found weights output
-    virtual void detectMultiScale(const Mat& img, CV_OUT std::vector<Rect>& foundLocations,
+    virtual void detectMultiScale(InputArray img, CV_OUT std::vector<Rect>& foundLocations,
                                  double hitThreshold = 0, Size winStride = Size(),
                                  Size padding = Size(), double scale = 1.05,
                                  double finalThreshold = 2.0, bool useMeanshiftGrouping = false) const;
@@ -328,25 +330,27 @@ public:
    CV_PROP double L2HysThreshold;
    CV_PROP bool gammaCorrection;
    CV_PROP std::vector<float> svmDetector;
+    UMat oclSvmDetector;
+    float free_coef;
    CV_PROP int nlevels;


-   // evaluate specified ROI and return confidence value for each location
-   virtual void detectROI(const cv::Mat& img, const std::vector<cv::Point> &locations,
+    // evaluate specified ROI and return confidence value for each location
+    virtual void detectROI(const cv::Mat& img, const std::vector<cv::Point> &locations,
                                   CV_OUT std::vector<cv::Point>& foundLocations, CV_OUT std::vector<double>& confidences,
                                   double hitThreshold = 0, cv::Size winStride = Size(),
                                   cv::Size padding = Size()) const;

-   // evaluate specified ROI and return confidence value for each location in multiple scales
-   virtual void detectMultiScaleROI(const cv::Mat& img,
+    // evaluate specified ROI and return confidence value for each location in multiple scales
+    virtual void detectMultiScaleROI(const cv::Mat& img,
                                                       CV_OUT std::vector<cv::Rect>& foundLocations,
                                                       std::vector<DetectionROI>& locations,
                                                       double hitThreshold = 0,
                                                       int groupThreshold = 0) const;

-   // read/parse Dalal's alt model file
-   void readALTModel(String modelfile);
-   void groupRectangles(std::vector<cv::Rect>& rectList, std::vector<double>& weights, int groupThreshold, double eps) const;
+    // read/parse Dalal's alt model file
+    void readALTModel(String modelfile);
+    void groupRectangles(std::vector<cv::Rect>& rectList, std::vector<double>& weights, int groupThreshold, double eps) const;
 };



--- a/modules/objdetect/perf/opencl/perf_hogdetect.cpp
+++ b/modules/objdetect/perf/opencl/perf_hogdetect.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+///////////// HOG////////////////////////
+
+struct RectLess :
+        public std::binary_function<cv::Rect, cv::Rect, bool>
+{
+    bool operator()(const cv::Rect& a,
+        const cv::Rect& b) const
+    {
+        if (a.x != b.x)
+            return a.x < b.x;
+        else if (a.y != b.y)
+            return a.y < b.y;
+        else if (a.width != b.width)
+            return a.width < b.width;
+        else
+            return a.height < b.height;
+    }
+};
+
+OCL_PERF_TEST(HOGFixture, HOG)
+{
+    UMat src;
+    imread(getDataPath("gpu/hog/road.png"), cv::IMREAD_GRAYSCALE).copyTo(src);
+    ASSERT_FALSE(src.empty());
+
+    vector<cv::Rect> found_locations;
+    declare.in(src);
+
+    HOGDescriptor hog;
+    hog.setSVMDetector(hog.getDefaultPeopleDetector());
+
+    OCL_TEST_CYCLE() hog.detectMultiScale(src, found_locations);
+
+    std::sort(found_locations.begin(), found_locations.end(), RectLess());
+    SANITY_CHECK(found_locations, 1 + DBL_EPSILON);
+}
+
+}
+}
+
+#endif
--- a/modules/objdetect/src/hog.cpp
+++ b/modules/objdetect/src/hog.cpp
@@ -42,6 +42,7 @@

 #include "precomp.hpp"
 #include "opencv2/core/core_c.h"
+#include "opencl_kernels.hpp"

 #include <cstdio>
 #include <iterator>
@@ -58,6 +59,29 @@
 namespace cv
 {

+#define NTHREADS 256
+
+enum {DESCR_FORMAT_COL_BY_COL, DESCR_FORMAT_ROW_BY_ROW};
+
+static int numPartsWithin(int size, int part_size, int stride)
+{
+    return (size - part_size + stride) / stride;
+}
+
+static Size numPartsWithin(cv::Size size, cv::Size part_size,
+                                                cv::Size stride)
+{
+    return Size(numPartsWithin(size.width, part_size.width, stride.width),
+        numPartsWithin(size.height, part_size.height, stride.height));
+}
+
+static size_t getBlockHistogramSize(Size block_size, Size cell_size, int nbins)
+{
+    Size cells_per_block = Size(block_size.width / cell_size.width,
+        block_size.height / cell_size.height);
+    return (size_t)(nbins * cells_per_block.area());
+}
+
 size_t HOGDescriptor::getDescriptorSize() const
 {
    CV_Assert(blockSize.width % cellSize.width == 0 &&
@@ -88,7 +112,24 @@ bool HOGDescriptor::checkDetectorSize() const
 void HOGDescriptor::setSVMDetector(InputArray _svmDetector)
 {
    _svmDetector.getMat().convertTo(svmDetector, CV_32F);
-    CV_Assert( checkDetectorSize() );
+    CV_Assert(checkDetectorSize());
+
+    Mat detector_reordered(1, (int)svmDetector.size(), CV_32FC1);
+
+    size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
+    cv::Size blocks_per_img = numPartsWithin(winSize, blockSize, blockStride);
+
+    for (int i = 0; i < blocks_per_img.height; ++i)
+        for (int j = 0; j < blocks_per_img.width; ++j)
+        {
+            const float *src = &svmDetector[0] + (j * blocks_per_img.height + i) * block_hist_size;
+            float *dst = (float*)detector_reordered.data + (i * blocks_per_img.width + j) * block_hist_size;
+            for (size_t k = 0; k < block_hist_size; ++k)
+                dst[k] = src[k];
+        }
+    size_t descriptor_size = getDescriptorSize();
+    free_coef = svmDetector.size() > descriptor_size ? svmDetector[descriptor_size] : 0;
+    detector_reordered.copyTo(oclSvmDetector);
 }

 #define CV_TYPE_NAME_HOG_DESCRIPTOR "opencv-object-detector-hog"
@@ -1029,7 +1070,318 @@ static inline int gcd(int a, int b)
    return a;
 }

-void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
+#ifdef HAVE_OPENCL
+
+static bool ocl_compute_gradients_8UC1(int height, int width, InputArray _img, float angle_scale,
+                                       UMat grad, UMat qangle, bool correct_gamma, int nbins)
+{
+    ocl::Kernel k("compute_gradients_8UC1_kernel", ocl::objdetect::objdetect_hog_oclsrc);
+    if(k.empty())
+        return false;
+
+    UMat img = _img.getUMat();
+
+    size_t localThreads[3] = { NTHREADS, 1, 1 };
+    size_t globalThreads[3] = { width, height, 1 };
+    char correctGamma = (correct_gamma) ? 1 : 0;
+    int grad_quadstep = (int)grad.step >> 3;
+    int qangle_step_shift = 0;
+    int qangle_step = (int)qangle.step >> (1 + qangle_step_shift);
+
+    int idx = 0;
+    idx = k.set(idx, height);
+    idx = k.set(idx, width);
+    idx = k.set(idx, (int)img.step1());
+    idx = k.set(idx, grad_quadstep);
+    idx = k.set(idx, qangle_step);
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(img));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(grad));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(qangle));
+    idx = k.set(idx, angle_scale);
+    idx = k.set(idx, correctGamma);
+    idx = k.set(idx, nbins);
+
+    return k.run(2, globalThreads, localThreads, false);
+}
+
+static bool ocl_computeGradient(InputArray img, UMat grad, UMat qangle, int nbins, Size effect_size, bool gamma_correction)
+{
+    float angleScale = (float)(nbins / CV_PI);
+
+    return ocl_compute_gradients_8UC1(effect_size.height, effect_size.width, img,
+         angleScale, grad, qangle, gamma_correction, nbins);
+}
+
+#define CELL_WIDTH 8
+#define CELL_HEIGHT 8
+#define CELLS_PER_BLOCK_X 2
+#define CELLS_PER_BLOCK_Y 2
+
+static bool ocl_compute_hists(int nbins, int block_stride_x, int block_stride_y, int height, int width,
+                              UMat grad, UMat qangle, UMat gauss_w_lut, UMat block_hists, size_t block_hist_size)
+{
+    ocl::Kernel k("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc);
+    if(k.empty())
+        return false;
+    bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
+    cv::String opts;
+    if(is_cpu)
+       opts = "-D CPU ";
+    else
+        opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
+    k.create("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+    if(k.empty())
+        return false;
+
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)/block_stride_x;
+    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)/block_stride_y;
+    int blocks_total = img_block_width * img_block_height;
+
+    int qangle_step_shift = 0;
+    int grad_quadstep = (int)grad.step >> 2;
+    int qangle_step = (int)qangle.step >> qangle_step_shift;
+
+    int blocks_in_group = 4;
+    size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
+    size_t globalThreads[3] = {((img_block_width * img_block_height + blocks_in_group - 1)/blocks_in_group) * localThreads[0], 2, 1 };
+
+    int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float);
+    int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float);
+
+    int smem = (hists_size + final_hists_size) * blocks_in_group;
+
+    int idx = 0;
+    idx = k.set(idx, block_stride_x);
+    idx = k.set(idx, block_stride_y);
+    idx = k.set(idx, nbins);
+    idx = k.set(idx, (int)block_hist_size);
+    idx = k.set(idx, img_block_width);
+    idx = k.set(idx, blocks_in_group);
+    idx = k.set(idx, blocks_total);
+    idx = k.set(idx, grad_quadstep);
+    idx = k.set(idx, qangle_step);
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(grad));
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(qangle));
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(gauss_w_lut));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(block_hists));
+    idx = k.set(idx, (void*)NULL, (size_t)smem);
+
+    return k.run(2, globalThreads, localThreads, false);
+}
+
+static int power_2up(unsigned int n)
+{
+    for(unsigned int i = 1; i<=1024; i<<=1)
+        if(n < i)
+            return i;
+    return -1; // Input is too big
+}
+
+static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_y,
+                                int height, int width, UMat block_hists, float threshold)
+{
+    int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
+        / block_stride_x;
+    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
+        / block_stride_y;
+    int nthreads;
+    size_t globalThreads[3] = { 1, 1, 1  };
+    size_t localThreads[3] = { 1, 1, 1  };
+
+    int idx = 0;
+    bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
+    cv::String opts;
+    ocl::Kernel k;
+    if ( nbins == 9 )
+    {
+        k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
+        if(k.empty())
+            return false;
+        if(is_cpu)
+           opts = "-D CPU ";
+        else
+            opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
+        k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        if(k.empty())
+            return false;
+
+        int blocks_in_group = NTHREADS / block_hist_size;
+        nthreads = blocks_in_group * block_hist_size;
+        int num_groups = (img_block_width * img_block_height + blocks_in_group - 1)/blocks_in_group;
+        globalThreads[0] = nthreads * num_groups;
+        localThreads[0] = nthreads;
+    }
+    else
+    {
+        k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
+        if(k.empty())
+            return false;
+        if(is_cpu)
+           opts = "-D CPU ";
+        else
+            opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
+        k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        if(k.empty())
+            return false;
+
+        nthreads = power_2up(block_hist_size);
+        globalThreads[0] = img_block_width * nthreads;
+        globalThreads[1] = img_block_height;
+        localThreads[0] = nthreads;
+
+        if ((nthreads < 32) || (nthreads > 512) )
+            return false;
+
+        idx = k.set(idx, nthreads);
+        idx = k.set(idx, block_hist_size);
+        idx = k.set(idx, img_block_width);
+    }
+    idx = k.set(idx, ocl::KernelArg::PtrReadWrite(block_hists));
+    idx = k.set(idx, threshold);
+    idx = k.set(idx, (void*)NULL,  nthreads * sizeof(float));
+
+    return k.run(2, globalThreads, localThreads, false);
+}
+
+static bool ocl_extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
+                                       int height, int width, UMat block_hists, UMat descriptors,
+                                       int block_hist_size, int descr_size, int descr_width)
+{
+    ocl::Kernel k("extract_descrs_by_rows_kernel", ocl::objdetect::objdetect_hog_oclsrc);
+    if(k.empty())
+        return false;
+
+    int win_block_stride_x = win_stride_x / block_stride_x;
+    int win_block_stride_y = win_stride_y / block_stride_y;
+    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
+    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+        block_stride_x;
+
+    int descriptors_quadstep = (int)descriptors.step >> 2;
+
+    size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
+    size_t localThreads[3] = { NTHREADS, 1, 1 };
+
+    int idx = 0;
+    idx = k.set(idx, block_hist_size);
+    idx = k.set(idx, descriptors_quadstep);
+    idx = k.set(idx, descr_size);
+    idx = k.set(idx, descr_width);
+    idx = k.set(idx, img_block_width);
+    idx = k.set(idx, win_block_stride_x);
+    idx = k.set(idx, win_block_stride_y);
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(descriptors));
+
+    return k.run(2, globalThreads, localThreads, false);
+}
+
+static bool ocl_extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
+                                       int height, int width, UMat block_hists, UMat descriptors,
+                                       int block_hist_size, int descr_size, int nblocks_win_x, int nblocks_win_y)
+{
+    ocl::Kernel k("extract_descrs_by_cols_kernel", ocl::objdetect::objdetect_hog_oclsrc);
+    if(k.empty())
+        return false;
+
+    int win_block_stride_x = win_stride_x / block_stride_x;
+    int win_block_stride_y = win_stride_y / block_stride_y;
+    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
+    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+        block_stride_x;
+
+    int descriptors_quadstep = (int)descriptors.step >> 2;
+
+    size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
+    size_t localThreads[3] = { NTHREADS, 1, 1 };
+
+    int idx = 0;
+    idx = k.set(idx, block_hist_size);
+    idx = k.set(idx, descriptors_quadstep);
+    idx = k.set(idx, descr_size);
+    idx = k.set(idx, nblocks_win_x);
+    idx = k.set(idx, nblocks_win_y);
+    idx = k.set(idx, img_block_width);
+    idx = k.set(idx, win_block_stride_x);
+    idx = k.set(idx, win_block_stride_y);
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(descriptors));
+
+    return k.run(2, globalThreads, localThreads, false);
+}
+
+static bool ocl_compute(InputArray _img, Size win_stride, std::vector<float>& _descriptors, int descr_format, Size blockSize,
+                        Size cellSize, int nbins, Size blockStride, Size winSize, float sigma, bool gammaCorrection, double L2HysThreshold)
+{
+     Size imgSize = _img.size();
+    Size effect_size = imgSize;
+
+    UMat grad(imgSize, CV_32FC2);
+    UMat qangle(imgSize, CV_8UC2);
+
+    const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
+    const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
+    UMat block_hists(1, static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
+
+    Size wins_per_img = numPartsWithin(imgSize, winSize, win_stride);
+    UMat labels(1, wins_per_img.area(), CV_8U);
+
+    float scale = 1.f / (2.f * sigma * sigma);
+    Mat gaussian_lut(1, 512, CV_32FC1);
+    int idx = 0;
+    for(int i=-8; i<8; i++)
+        for(int j=-8; j<8; j++)
+            gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
+    for(int i=-8; i<8; i++)
+        for(int j=-8; j<8; j++)
+            gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
+
+    if(!ocl_computeGradient(_img, grad, qangle, nbins, effect_size, gammaCorrection))
+        return false;
+
+    UMat gauss_w_lut;
+    gaussian_lut.copyTo(gauss_w_lut);
+    if(!ocl_compute_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
+        effect_size.width, grad, qangle, gauss_w_lut, block_hists, block_hist_size))
+        return false;
+
+    if(!ocl_normalize_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
+        effect_size.width, block_hists, (float)L2HysThreshold))
+        return false;
+
+    Size blocks_per_win = numPartsWithin(winSize, blockSize, blockStride);
+    wins_per_img = numPartsWithin(effect_size, winSize, win_stride);
+
+    int descr_size = blocks_per_win.area()*(int)block_hist_size;
+    int descr_width = (int)block_hist_size*blocks_per_win.width;
+
+    UMat descriptors(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
+    switch (descr_format)
+    {
+    case DESCR_FORMAT_ROW_BY_ROW:
+        if(!ocl_extract_descrs_by_rows(winSize.height, winSize.width,
+            blockStride.height, blockStride.width, win_stride.height, win_stride.width, effect_size.height,
+            effect_size.width, block_hists, descriptors, (int)block_hist_size, descr_size, descr_width))
+            return false;
+        break;
+    case DESCR_FORMAT_COL_BY_COL:
+        if(!ocl_extract_descrs_by_cols(winSize.height, winSize.width,
+            blockStride.height, blockStride.width, win_stride.height, win_stride.width, effect_size.height, effect_size.width,
+            block_hists, descriptors, (int)block_hist_size, descr_size, blocks_per_win.width, blocks_per_win.height))
+            return false;
+        break;
+    default:
+        return false;
+    }
+    descriptors.reshape(1, (int)descriptors.total()).getMat(ACCESS_READ).copyTo(_descriptors);
+    return true;
+}
+#endif //HAVE_OPENCL
+
+void HOGDescriptor::compute(InputArray _img, std::vector<float>& descriptors,
    Size winStride, Size padding, const std::vector<Point>& locations) const
 {
    if( winStride == Size() )
@@ -1037,11 +1389,18 @@ void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
    Size cacheStride(gcd(winStride.width, blockStride.width),
                     gcd(winStride.height, blockStride.height));

+    Size imgSize = _img.size();
+
    size_t nwindows = locations.size();
    padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
    padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
-    Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2);
+    Size paddedImgSize(imgSize.width + padding.width*2, imgSize.height + padding.height*2);
+
+    CV_OCL_RUN(_img.dims() <= 2 && _img.type() == CV_8UC1 && _img.isUMat(),
+        ocl_compute(_img, winStride, descriptors, DESCR_FORMAT_COL_BY_COL, blockSize,
+        cellSize, nbins, blockStride, winSize, (float)getWinSigma(), gammaCorrection, L2HysThreshold))

+    Mat img = _img.getMat();
    HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride);

    if( !nwindows )
@@ -1263,20 +1622,215 @@ private:
    Mutex* mtx;
 };

+#ifdef HAVE_OPENCL
+
+static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
+                               int win_stride_y, int win_stride_x, int height, int width,
+                               const UMat& block_hists, UMat detector,
+                               float free_coef, float threshold, UMat& labels, Size descr_size, int block_hist_size)
+{
+    int nthreads;
+    bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
+    cv::String opts;
+
+    ocl::Kernel k;
+    int idx = 0;
+    switch (descr_size.width)
+    {
+    case 180:
+        nthreads = 180;
+        k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
+        if(k.empty())
+            return false;
+        if(is_cpu)
+           opts = "-D CPU ";
+        else
+            opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
+        k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        if(k.empty())
+            return false;
+        idx = k.set(idx, descr_size.width);
+        idx = k.set(idx, descr_size.height);
+        break;
+
+    case 252:
+        nthreads = 256;
+        k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
+        if(k.empty())
+            return false;
+        if(is_cpu)
+           opts = "-D CPU ";
+        else
+            opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
+        k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        if(k.empty())
+            return false;
+        idx = k.set(idx, descr_size.width);
+        idx = k.set(idx, descr_size.height);
+        break;
+
+    default:
+        nthreads = 256;
+        k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
+        if(k.empty())
+            return false;
+        if(is_cpu)
+           opts = "-D CPU ";
+        else
+            opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
+        k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        if(k.empty())
+            return false;
+        idx = k.set(idx, descr_size.area());
+        idx = k.set(idx, descr_size.height);
+    }
+
+    int win_block_stride_x = win_stride_x / block_stride_x;
+    int win_block_stride_y = win_stride_y / block_stride_y;
+    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
+    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+        block_stride_x;
+
+    size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 };
+    size_t localThreads[3] = { nthreads, 1, 1 };
+
+    idx = k.set(idx, block_hist_size);
+    idx = k.set(idx, img_win_width);
+    idx = k.set(idx, img_block_width);
+    idx = k.set(idx, win_block_stride_x);
+    idx = k.set(idx, win_block_stride_y);
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(detector));
+    idx = k.set(idx, free_coef);
+    idx = k.set(idx, threshold);
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(labels));
+
+    return k.run(2, globalThreads, localThreads, false);
+}
+
+static bool ocl_detect(InputArray img, std::vector<Point> &hits, double hit_threshold, Size win_stride,
+                       const UMat& oclSvmDetector, Size blockSize, Size cellSize, int nbins, Size blockStride, Size winSize,
+                       bool gammaCorrection, double L2HysThreshold, float sigma, float free_coef)
+{
+    hits.clear();
+    if (oclSvmDetector.empty())
+        return false;
+
+    Size imgSize = img.size();
+    Size effect_size = imgSize;
+    UMat grad(imgSize, CV_32FC2);
+    UMat qangle(imgSize, CV_8UC2);
+
+    const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
+    const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
+    UMat block_hists(1, static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
+
+    Size wins_per_img = numPartsWithin(imgSize, winSize, win_stride);
+    UMat labels(1, wins_per_img.area(), CV_8U);
+
+    float scale = 1.f / (2.f * sigma * sigma);
+    Mat gaussian_lut(1, 512, CV_32FC1);
+    int idx = 0;
+    for(int i=-8; i<8; i++)
+        for(int j=-8; j<8; j++)
+            gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
+    for(int i=-8; i<8; i++)
+        for(int j=-8; j<8; j++)
+            gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
+
+    if(!ocl_computeGradient(img, grad, qangle, nbins, effect_size, gammaCorrection))
+        return false;
+
+    UMat gauss_w_lut;
+    gaussian_lut.copyTo(gauss_w_lut);
+    if(!ocl_compute_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
+        effect_size.width, grad, qangle, gauss_w_lut, block_hists, block_hist_size))
+        return false;
+
+    if(!ocl_normalize_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
+        effect_size.width, block_hists, (float)L2HysThreshold))
+        return false;
+
+    Size blocks_per_win = numPartsWithin(winSize, blockSize, blockStride);
+
+    Size descr_size((int)block_hist_size*blocks_per_win.width, blocks_per_win.height);
+
+    if(!ocl_classify_hists(winSize.height, winSize.width, blockStride.height,
+        blockStride.width, win_stride.height, win_stride.width,
+        effect_size.height, effect_size.width, block_hists, oclSvmDetector,
+        free_coef, (float)hit_threshold, labels, descr_size, (int)block_hist_size))
+        return false;
+
+    Mat labels_host = labels.getMat(ACCESS_READ);
+    unsigned char *vec = labels_host.ptr();
+    for (int i = 0; i < wins_per_img.area(); i++)
+    {
+        int y = i / wins_per_img.width;
+        int x = i - wins_per_img.width * y;
+        if (vec[i])
+        {
+            hits.push_back(Point(x * win_stride.width, y * win_stride.height));
+        }
+    }
+    return true;
+}
+
+static bool ocl_detectMultiScale(InputArray _img, std::vector<Rect> &found_locations, std::vector<double>& level_scale,
+                                              double hit_threshold, Size win_stride, double group_threshold,
+                                              const UMat& oclSvmDetector, Size blockSize, Size cellSize,
+                                              int nbins, Size blockStride, Size winSize, bool gammaCorrection,
+                                              double L2HysThreshold, float sigma, float free_coef)
+{
+    std::vector<Rect> all_candidates;
+    std::vector<Point> locations;
+    UMat image_scale;
+    Size imgSize = _img.size();
+    image_scale.create(imgSize, _img.type());
+
+    for (size_t i = 0; i<level_scale.size() ; i++)
+    {
+        double scale = level_scale[i];
+        Size effect_size = Size(cvRound(imgSize.width / scale), cvRound(imgSize.height / scale));
+        if (effect_size == imgSize)
+        {
+            if(!ocl_detect(_img, locations, hit_threshold, win_stride, oclSvmDetector, blockSize, cellSize, nbins,
+                blockStride, winSize, gammaCorrection, L2HysThreshold, sigma, free_coef))
+                return false;
+        }
+        else
+        {
+            resize(_img, image_scale, effect_size);
+            if(!ocl_detect(image_scale, locations, hit_threshold, win_stride, oclSvmDetector, blockSize, cellSize, nbins,
+                blockStride, winSize, gammaCorrection, L2HysThreshold, sigma, free_coef))
+                return false;
+        }
+        Size scaled_win_size(cvRound(winSize.width * scale),
+            cvRound(winSize.height * scale));
+        for (size_t j = 0; j < locations.size(); j++)
+            all_candidates.push_back(Rect(Point2d(locations[j]) * scale, scaled_win_size));
+    }
+    found_locations.assign(all_candidates.begin(), all_candidates.end());
+    cv::groupRectangles(found_locations, (int)group_threshold, 0.2);
+    return true;
+}
+#endif //HAVE_OPENCL
+
 void HOGDescriptor::detectMultiScale(
-    const Mat& img, std::vector<Rect>& foundLocations, std::vector<double>& foundWeights,
+    InputArray _img, std::vector<Rect>& foundLocations, std::vector<double>& foundWeights,
    double hitThreshold, Size winStride, Size padding,
    double scale0, double finalThreshold, bool useMeanshiftGrouping) const
 {
    double scale = 1.;
    int levels = 0;

+    Size imgSize = _img.size();
    std::vector<double> levelScale;
    for( levels = 0; levels < nlevels; levels++ )
    {
        levelScale.push_back(scale);
-        if( cvRound(img.cols/scale) < winSize.width ||
-                cvRound(img.rows/scale) < winSize.height ||
+        if( cvRound(imgSize.width/scale) < winSize.width ||
+            cvRound(imgSize.height/scale) < winSize.height ||
                scale0 <= 1 )
            break;
        scale *= scale0;
@@ -1284,12 +1838,21 @@ void HOGDescriptor::detectMultiScale(
    levels = std::max(levels, 1);
    levelScale.resize(levels);

+    if(winStride == Size())
+        winStride = blockStride;
+
+    CV_OCL_RUN(_img.dims() <= 2 && _img.type() == CV_8UC1 && scale0 > 1 && winStride.width % blockStride.width == 0 &&
+        winStride.height % blockStride.height == 0 && padding == Size(0,0) && _img.isUMat(),
+        ocl_detectMultiScale(_img, foundLocations, levelScale, hitThreshold, winStride, finalThreshold, oclSvmDetector,
+        blockSize, cellSize, nbins, blockStride, winSize, gammaCorrection, L2HysThreshold, (float)getWinSigma(), free_coef));
+
    std::vector<Rect> allCandidates;
    std::vector<double> tempScales;
    std::vector<double> tempWeights;
    std::vector<double> foundScales;
-    Mutex mtx;

+    Mutex mtx;
+    Mat img = _img.getMat();
    Range range(0, (int)levelScale.size());
    HOGInvoker invoker(this, img, hitThreshold, winStride, padding, &levelScale[0], &allCandidates, &mtx, &tempWeights, &tempScales);
    parallel_for_(range, invoker);
@@ -1306,7 +1869,7 @@ void HOGDescriptor::detectMultiScale(
        groupRectangles(foundLocations, foundWeights, (int)finalThreshold, 0.2);
 }

-void HOGDescriptor::detectMultiScale(const Mat& img, std::vector<Rect>& foundLocations,
+void HOGDescriptor::detectMultiScale(InputArray img, std::vector<Rect>& foundLocations,
    double hitThreshold, Size winStride, Size padding,
    double scale0, double finalThreshold, bool useMeanshiftGrouping) const
 {

--- a/modules/objdetect/src/opencl/objdetect_hog.cl
+++ b/modules/objdetect/src/opencl/objdetect_hog.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Wenju He, wenju@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CELL_WIDTH 8
+#define CELL_HEIGHT 8
+#define CELLS_PER_BLOCK_X 2
+#define CELLS_PER_BLOCK_Y 2
+#define NTHREADS 256
+#define CV_PI_F 3.1415926535897932384626433832795f
+
+#ifdef INTEL_DEVICE
+#define QANGLE_TYPE		int
+#define QANGLE_TYPE2	int2
+#else
+#define QANGLE_TYPE		uchar
+#define QANGLE_TYPE2	uchar2
+#endif
+
+//----------------------------------------------------------------------------
+// Histogram computation
+// 12 threads for a cell, 12x4 threads per block
+// Use pre-computed gaussian and interp_weight lookup tables
+__kernel void compute_hists_lut_kernel(
+    const int cblock_stride_x, const int cblock_stride_y,
+    const int cnbins, const int cblock_hist_size, const int img_block_width,
+    const int blocks_in_group, const int blocks_total,
+    const int grad_quadstep, const int qangle_step,
+    __global const float* grad, __global const QANGLE_TYPE* qangle,
+    __global const float* gauss_w_lut,
+    __global float* block_hists, __local float* smem)
+{
+    const int lx = get_local_id(0);
+    const int lp = lx / 24; /* local group id */
+    const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
+    const int gidY = gid / img_block_width;
+    const int gidX = gid - gidY * img_block_width;
+
+    const int lidX = lx - lp * 24;
+    const int lidY = get_local_id(1);
+
+    const int cell_x = lidX / 12;
+    const int cell_y = lidY;
+    const int cell_thread_x = lidX - cell_x * 12;
+
+    __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
+        CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
+    __local float* final_hist = hists + cnbins *
+        (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
+
+    const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
+    const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
+
+    __global const float* grad_ptr = (gid < blocks_total) ?
+        grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
+    __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?
+        qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
+
+    __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
+        cell_thread_x;
+    for (int bin_id = 0; bin_id < cnbins; ++bin_id)
+        hist[bin_id * 48] = 0.f;
+
+    const int dist_x = -4 + cell_thread_x - 4 * cell_x;
+    const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
+
+    const int dist_y_begin = -4 - 4 * lidY;
+    for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
+    {
+        float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
+        QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);
+
+        grad_ptr += grad_quadstep;
+        qangle_ptr += qangle_step;
+
+        int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
+
+        int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);
+        float gaussian = gauss_w_lut[idx];
+        idx = (dist_y + 8) * 16 + (dist_x + 8);
+        float interp_weight = gauss_w_lut[256+idx];
+
+        hist[bin.x * 48] += gaussian * interp_weight * vote.x;
+        hist[bin.y * 48] += gaussian * interp_weight * vote.y;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    volatile __local float* hist_ = hist;
+    for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
+    {
+        if (cell_thread_x < 6)
+            hist_[0] += hist_[6];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (cell_thread_x < 3)
+            hist_[0] += hist_[3];
+#ifdef CPU
+        barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+        if (cell_thread_x == 0)
+            final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
+                hist_[0] + hist_[1] + hist_[2];
+    }
+#ifdef CPU
+    barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+    int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
+    if ((tid < cblock_hist_size) && (gid < blocks_total))
+    {
+        __global float* block_hist = block_hists +
+            (gidY * img_block_width + gidX) * cblock_hist_size;
+        block_hist[tid] = final_hist[tid];
+    }
+}
+
+//-------------------------------------------------------------
+//  Normalization of histograms via L2Hys_norm
+//  optimized for the case of 9 bins
+__kernel void normalize_hists_36_kernel(__global float* block_hists,
+                                        const float threshold, __local float *squares)
+{
+    const int tid = get_local_id(0);
+    const int gid = get_global_id(0);
+    const int bid = tid / 36;      /* block-hist id, (0 - 6) */
+    const int boffset = bid * 36;  /* block-hist offset in the work-group */
+    const int hid = tid - boffset; /* histogram bin id, (0 - 35) */
+
+    float elem = block_hists[gid];
+    squares[tid] = elem * elem;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    __local float* smem = squares + boffset;
+    float sum = smem[hid];
+    if (hid < 18)
+        smem[hid] = sum = sum + smem[hid + 18];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (hid < 9)
+        smem[hid] = sum = sum + smem[hid + 9];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (hid < 4)
+        smem[hid] = sum + smem[hid + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
+
+    elem = elem / (sqrt(sum) + 3.6f);
+    elem = min(elem, threshold);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    squares[tid] = elem * elem;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    sum = smem[hid];
+    if (hid < 18)
+      smem[hid] = sum = sum + smem[hid + 18];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (hid < 9)
+        smem[hid] = sum = sum + smem[hid + 9];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (hid < 4)
+        smem[hid] = sum + smem[hid + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
+
+    block_hists[gid] = elem / (sqrt(sum) + 1e-3f);
+}
+
+//-------------------------------------------------------------
+//  Normalization of histograms via L2Hys_norm
+//
+inline float reduce_smem(volatile __local float* smem, int size)
+{
+    unsigned int tid = get_local_id(0);
+    float sum = smem[tid];
+
+    if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
+        barrier(CLK_LOCAL_MEM_FENCE); }
+#ifdef CPU
+    if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8];
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4];
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2];
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];
+        barrier(CLK_LOCAL_MEM_FENCE); }
+#else
+    if (tid < 32)
+    {
+        if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
+#if WAVE_SIZE < 32
+    } barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) {
+#endif
+        if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
+        if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
+        if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
+        if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
+        if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
+    }
+#endif
+
+    return sum;
+}
+
+__kernel void normalize_hists_kernel(
+    const int nthreads, const int block_hist_size, const int img_block_width,
+    __global float* block_hists, const float threshold, __local float *squares)
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global float* hist = block_hists + (gidY * img_block_width + gidX) *
+        block_hist_size + tid;
+
+    float elem = 0.f;
+    if (tid < block_hist_size)
+        elem = hist[0];
+
+    squares[tid] = elem * elem;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    float sum = reduce_smem(squares, nthreads);
+
+    float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size);
+    elem = min(elem * scale, threshold);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    squares[tid] = elem * elem;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    sum = reduce_smem(squares, nthreads);
+    scale = 1.0f / (sqrt(sum) + 1e-3f);
+
+    if (tid < block_hist_size)
+        hist[0] = elem * scale;
+}
+
+//---------------------------------------------------------------------
+//  Linear SVM based classification
+//  48x96 window, 9 bins and default parameters
+//  180 threads, each thread corresponds to a bin in a row
+__kernel void classify_hists_180_kernel(
+    const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
+    const int img_win_width, const int img_block_width,
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float * block_hists, __global const float* coefs,
+    float free_coef, float threshold, __global uchar* labels)
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const float* hist = block_hists + (gidY * win_block_stride_y *
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    float product = 0.f;
+
+    for (int i = 0; i < cdescr_height; i++)
+    {
+        product += coefs[i * cdescr_width + tid] *
+            hist[i * img_block_width * cblock_hist_size + tid];
+    }
+
+    __local float products[180];
+
+    products[tid] = product;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 90) products[tid] = product = product + products[tid + 90];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 45) products[tid] = product = product + products[tid + 45];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    volatile __local float* smem = products;
+#ifdef CPU
+    if (tid < 13) smem[tid] = product = product + smem[tid + 32];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) smem[tid] = product = product + smem[tid + 16];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(tid<8) smem[tid] = product = product + smem[tid + 8];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(tid<4) smem[tid] = product = product + smem[tid + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(tid<2) smem[tid] = product = product + smem[tid + 2];
+    barrier(CLK_LOCAL_MEM_FENCE);
+#else
+    if (tid < 13)
+    {
+        smem[tid] = product = product + smem[tid + 32];
+    }
+#if WAVE_SIZE < 32
+    barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+    if (tid < 16)
+    {
+        smem[tid] = product = product + smem[tid + 16];
+        smem[tid] = product = product + smem[tid + 8];
+        smem[tid] = product = product + smem[tid + 4];
+        smem[tid] = product = product + smem[tid + 2];
+    }
+#endif
+
+    if (tid == 0){
+        product = product + smem[tid + 1];
+        labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+    }
+}
+
+//---------------------------------------------------------------------
+//  Linear SVM based classification
+//  64x128 window, 9 bins and default parameters
+//  256 threads, 252 of them are used
+__kernel void classify_hists_252_kernel(
+    const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
+    const int img_win_width, const int img_block_width,
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float * block_hists, __global const float* coefs,
+    float free_coef, float threshold, __global uchar* labels)
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const float* hist = block_hists + (gidY * win_block_stride_y *
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    float product = 0.f;
+    if (tid < cdescr_width)
+    {
+        for (int i = 0; i < cdescr_height; i++)
+            product += coefs[i * cdescr_width + tid] *
+                hist[i * img_block_width * cblock_hist_size + tid];
+    }
+
+    __local float products[NTHREADS];
+
+    products[tid] = product;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128) products[tid] = product = product + products[tid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64) products[tid] = product = product + products[tid + 64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    volatile __local float* smem = products;
+#ifdef CPU
+    if(tid<32) smem[tid] = product = product + smem[tid + 32];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(tid<16) smem[tid] = product = product + smem[tid + 16];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(tid<8) smem[tid] = product = product + smem[tid + 8];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(tid<4) smem[tid] = product = product + smem[tid + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(tid<2) smem[tid] = product = product + smem[tid + 2];
+    barrier(CLK_LOCAL_MEM_FENCE);
+#else
+    if (tid < 32)
+    {
+        smem[tid] = product = product + smem[tid + 32];
+#if WAVE_SIZE < 32
+    } barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) {
+#endif
+        smem[tid] = product = product + smem[tid + 16];
+        smem[tid] = product = product + smem[tid + 8];
+        smem[tid] = product = product + smem[tid + 4];
+        smem[tid] = product = product + smem[tid + 2];
+    }
+#endif
+    if (tid == 0){
+        product = product + smem[tid + 1];
+        labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+    }
+}
+
+//---------------------------------------------------------------------
+//  Linear SVM based classification
+//  256 threads
+__kernel void classify_hists_kernel(
+    const int cdescr_size, const int cdescr_width, const int cblock_hist_size,
+    const int img_win_width, const int img_block_width,
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float * block_hists, __global const float* coefs,
+    float free_coef, float threshold, __global uchar* labels)
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const float* hist = block_hists + (gidY * win_block_stride_y *
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    float product = 0.f;
+    for (int i = tid; i < cdescr_size; i += NTHREADS)
+    {
+        int offset_y = i / cdescr_width;
+        int offset_x = i - offset_y * cdescr_width;
+        product += coefs[i] *
+            hist[offset_y * img_block_width * cblock_hist_size + offset_x];
+    }
+
+    __local float products[NTHREADS];
+
+    products[tid] = product;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128) products[tid] = product = product + products[tid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64) products[tid] = product = product + products[tid + 64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    volatile __local float* smem = products;
+#ifdef CPU
+    if(tid<32) smem[tid] = product = product + smem[tid + 32];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(tid<16) smem[tid] = product = product + smem[tid + 16];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(tid<8) smem[tid] = product = product + smem[tid + 8];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(tid<4) smem[tid] = product = product + smem[tid + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(tid<2) smem[tid] = product = product + smem[tid + 2];
+    barrier(CLK_LOCAL_MEM_FENCE);
+#else
+    if (tid < 32)
+    {
+        smem[tid] = product = product + smem[tid + 32];
+#if WAVE_SIZE < 32
+    } barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) {
+#endif
+        smem[tid] = product = product + smem[tid + 16];
+        smem[tid] = product = product + smem[tid + 8];
+        smem[tid] = product = product + smem[tid + 4];
+        smem[tid] = product = product + smem[tid + 2];
+    }
+#endif
+    if (tid == 0){
+        smem[tid] = product = product + smem[tid + 1];
+        labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+    }
+}
+
+//----------------------------------------------------------------------------
+// Extract descriptors
+
+__kernel void extract_descrs_by_rows_kernel(
+    const int cblock_hist_size, const int descriptors_quadstep,
+    const int cdescr_size, const int cdescr_width, const int img_block_width,
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float* block_hists, __global float* descriptors)
+{
+    int tid = get_local_id(0);
+    int gidX = get_group_id(0);
+    int gidY = get_group_id(1);
+
+    // Get left top corner of the window in src
+    __global const float* hist = block_hists + (gidY * win_block_stride_y *
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    // Get left top corner of the window in dst
+    __global float* descriptor = descriptors +
+        (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
+
+    // Copy elements from src to dst
+    for (int i = tid; i < cdescr_size; i += NTHREADS)
+    {
+        int offset_y = i / cdescr_width;
+        int offset_x = i - offset_y * cdescr_width;
+        descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
+    }
+}
+
+__kernel void extract_descrs_by_cols_kernel(
+    const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
+    const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width,
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float* block_hists, __global float* descriptors)
+{
+    int tid = get_local_id(0);
+    int gidX = get_group_id(0);
+    int gidY = get_group_id(1);
+
+    // Get left top corner of the window in src
+    __global const float* hist = block_hists +  (gidY * win_block_stride_y *
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    // Get left top corner of the window in dst
+    __global float* descriptor = descriptors +
+        (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
+
+    // Copy elements from src to dst
+    for (int i = tid; i < cdescr_size; i += NTHREADS)
+    {
+        int block_idx = i / cblock_hist_size;
+        int idx_in_block = i - block_idx * cblock_hist_size;
+
+        int y = block_idx / cnblocks_win_x;
+        int x = block_idx - y * cnblocks_win_x;
+
+        descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] =
+            hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
+    }
+}
+
+//----------------------------------------------------------------------------
+// Gradients computation
+
+__kernel void compute_gradients_8UC4_kernel(
+    const int height, const int width,
+    const int img_step, const int grad_quadstep, const int qangle_step,
+    const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,
+    const float angle_scale, const char correct_gamma, const int cnbins)
+{
+    const int x = get_global_id(0);
+    const int tid = get_local_id(0);
+    const int gSizeX = get_local_size(0);
+    const int gidY = get_group_id(1);
+
+    __global const uchar4* row = img + gidY * img_step;
+
+    __local float sh_row[(NTHREADS + 2) * 3];
+
+    uchar4 val;
+    if (x < width)
+        val = row[x];
+    else
+        val = row[width - 2];
+
+    sh_row[tid + 1] = val.x;
+    sh_row[tid + 1 + (NTHREADS + 2)] = val.y;
+    sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z;
+
+    if (tid == 0)
+    {
+        val = row[max(x - 1, 1)];
+        sh_row[0] = val.x;
+        sh_row[(NTHREADS + 2)] = val.y;
+        sh_row[2 * (NTHREADS + 2)] = val.z;
+    }
+
+    if (tid == gSizeX - 1)
+    {
+        val = row[min(x + 1, width - 2)];
+        sh_row[gSizeX + 1] = val.x;
+        sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y;
+        sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (x < width)
+    {
+        float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
+            sh_row[tid + 2 * (NTHREADS + 2)]);
+        float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
+            sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
+
+        float3 dx;
+        if (correct_gamma == 1)
+            dx = sqrt(b) - sqrt(a);
+        else
+            dx = b - a;
+
+        float3 dy = (float3) 0.f;
+
+        if (gidY > 0 && gidY < height - 1)
+        {
+            a = convert_float3(img[(gidY - 1) * img_step + x].xyz);
+            b = convert_float3(img[(gidY + 1) * img_step + x].xyz);
+
+            if (correct_gamma == 1)
+                dy = sqrt(b) - sqrt(a);
+            else
+                dy = b - a;
+        }
+
+        float best_dx = dx.x;
+        float best_dy = dy.x;
+
+        float mag0 = dx.x * dx.x + dy.x * dy.x;
+        float mag1 = dx.y * dx.y + dy.y * dy.y;
+        if (mag0 < mag1)
+        {
+            best_dx = dx.y;
+            best_dy = dy.y;
+            mag0 = mag1;
+        }
+
+        mag1 = dx.z * dx.z + dy.z * dy.z;
+        if (mag0 < mag1)
+        {
+            best_dx = dx.z;
+            best_dy = dy.z;
+            mag0 = mag1;
+        }
+
+        mag0 = sqrt(mag0);
+
+        float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
+        int hidx = (int)floor(ang);
+        ang -= hidx;
+        hidx = (hidx + cnbins) % cnbins;
+
+        qangle[(gidY * qangle_step + x) << 1] = hidx;
+        qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;
+        grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang);
+        grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang;
+    }
+}
+
+__kernel void compute_gradients_8UC1_kernel(
+    const int height, const int width,
+    const int img_step, const int grad_quadstep, const int qangle_step,
+    __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,
+    const float angle_scale, const char correct_gamma, const int cnbins)
+{
+    const int x = get_global_id(0);
+    const int tid = get_local_id(0);
+    const int gSizeX = get_local_size(0);
+    const int gidY = get_group_id(1);
+
+    __global const uchar* row = img + gidY * img_step;
+
+    __local float sh_row[NTHREADS + 2];
+
+    if (x < width)
+        sh_row[tid + 1] = row[x];
+    else
+        sh_row[tid + 1] = row[width - 2];
+
+    if (tid == 0)
+        sh_row[0] = row[max(x - 1, 1)];
+
+    if (tid == gSizeX - 1)
+        sh_row[gSizeX + 1] = row[min(x + 1, width - 2)];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (x < width)
+    {
+        float dx;
+
+        if (correct_gamma == 1)
+            dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);
+        else
+            dx = sh_row[tid + 2] - sh_row[tid];
+
+        float dy = 0.f;
+        if (gidY > 0 && gidY < height - 1)
+        {
+            float a = (float) img[ (gidY + 1) * img_step + x ];
+            float b = (float) img[ (gidY - 1) * img_step + x ];
+            if (correct_gamma == 1)
+                dy = sqrt(a) - sqrt(b);
+            else
+                dy = a - b;
+        }
+        float mag = sqrt(dx * dx + dy * dy);
+
+        float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
+        int hidx = (int)floor(ang);
+        ang -= hidx;
+        hidx = (hidx + cnbins) % cnbins;
+
+        qangle[ (gidY * qangle_step + x) << 1 ]     = hidx;
+        qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins;
+        grad[ (gidY * grad_quadstep + x) << 1 ]       = mag * (1.f - ang);
+        grad[ ((gidY * grad_quadstep + x) << 1) + 1 ]   = mag * ang;
+    }
+}
--- a/modules/objdetect/test/opencl/test_hogdetector.cpp
+++ b/modules/objdetect/test/opencl/test_hogdetector.cpp
+///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Shengen Yan, yanshengen@gmail.com
+//    Jiang Liyuan,jlyuan001.good@163.com
+//    Rock Li, Rock.Li@amd.com
+//    Zailong Wu, bullet@yeah.net
+//    Yao Wang, bitwangyaoyao@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////////////// HOG /////////////////////////////
+PARAM_TEST_CASE(HOG, Size, MatType)
+{
+    Size winSize;
+    int type;
+    Mat img;
+    UMat uimg;
+    virtual void SetUp()
+    {
+        winSize = GET_PARAM(0);
+        type = GET_PARAM(1);
+        img = readImage("cascadeandhog/images/image_00000000_0.png", IMREAD_GRAYSCALE);
+        ASSERT_FALSE(img.empty());
+        img.copyTo(uimg);
+    }
+};
+
+OCL_TEST_P(HOG, GetDescriptors)
+{
+    HOGDescriptor hog;
+    hog.gammaCorrection = true;
+
+    hog.setSVMDetector(hog.getDefaultPeopleDetector());
+
+    std::vector<float> cpu_descriptors;
+    std::vector<float> gpu_descriptors;
+
+    OCL_OFF(hog.compute(img, cpu_descriptors, hog.winSize));
+    OCL_ON(hog.compute(uimg, gpu_descriptors, hog.winSize));
+
+    Mat cpu_desc(cpu_descriptors), gpu_desc(gpu_descriptors);
+
+    EXPECT_MAT_SIMILAR(cpu_desc, gpu_desc, 1e-1);
+}
+
+OCL_TEST_P(HOG, Detect)
+{
+    HOGDescriptor hog;
+    hog.winSize = winSize;
+    hog.gammaCorrection = true;
+
+    if (winSize.width == 48 && winSize.height == 96)
+        hog.setSVMDetector(hog.getDaimlerPeopleDetector());
+    else
+        hog.setSVMDetector(hog.getDefaultPeopleDetector());
+
+    std::vector<Rect> cpu_found;
+    std::vector<Rect> gpu_found;
+
+    OCL_OFF(hog.detectMultiScale(img, cpu_found, 0, Size(8, 8), Size(0, 0), 1.05, 6));
+    OCL_ON(hog.detectMultiScale(uimg, gpu_found, 0, Size(8, 8), Size(0, 0), 1.05, 6));
+
+    EXPECT_LT(checkRectSimilarity(img.size(), cpu_found, gpu_found), 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
+                            testing::Values(Size(64, 128), Size(48, 96)),
+                            testing::Values( MatType(CV_8UC1) ) ) );
+
+}}
+#endif