Merge pull request #1417 from Yujun-Shi:segmentation_hfs

Segmentation hfs (#1417) * add hfs module * fix some warning * fix a CMakeLists.txt bug * fix some warnings * try to fix some warning * fix static check warning * final fix

Merge pull request #1417 from Yujun-Shi:segmentation_hfs
Segmentation hfs (#1417) * add hfs module * fix some warning * fix a CMakeLists.txt bug * fix some warnings * try to fix some warning * fix static check warning * final fix
7d4a4526 · Yujun Shi · Alexander Alekhin · 7e6ae5ce · 7d4a4526 · 7d4a4526
Commit 7d4a4526 authored Feb 04, 2018 by Yujun Shi Committed by Alexander Alekhin Feb 04, 2018
26 changed files
--- a/modules/hfs/CMakeLists.txt
+++ b/modules/hfs/CMakeLists.txt
+# When the build has CUDA available, define _HFS_CUDA_ON_ for the sources.
+if(HAVE_CUDA)
+  add_definitions(-D_HFS_CUDA_ON_)
+endif()
+
+# Register the "hfs" module; it depends on opencv_core and opencv_imgproc
+# and requests Python wrappers.
+set(the_description "Hierarchical Feature Selection for Efficient Image Segmentation")
+ocv_define_module(hfs opencv_core opencv_imgproc WRAP python)
\ No newline at end of file
--- a/modules/hfs/README.md
+++ b/modules/hfs/README.md
+##   OpenCV Hierarchical Feature Selection for Efficient Image Segmentation module
+
+Author and maintainers: Yujun Shi (shiyujun1016@gmail.com), Yun Liu (nk12csly@mail.nankai.edu.cn).
+
+Hierarchical Feature Selection (HFS) is a real-time system for image segmentation. It was originally proposed in [1]. Here is the original project website: http://mmcheng.net/zh/hfs/
+
+The algorithm is executed in 3 stages. In the first stage, it obtains an over-segmented image using SLIC (simple linear iterative clustering). In the last 2 stages, it iteratively merges the over-segmented image using the merging method described in EGB (Efficient Graph-based Image Segmentation) and learned SVM parameters.
+
+In our implementation, we wrapped these stages into one single member function of the interface class.
+
+Since parts of this module are implemented in CUDA, it has to be compiled with CUDA support.
+
+For more details about the algorithm, please refer to the original paper: [1]
+
+### usage
+
+c++ interface:
+
+```c++
+// read an image
+Mat img = imread(image_path), res;
+int _h = img.rows, _w = img.cols;
+
+// create engine
+Ptr<HfsSegment> seg = HfsSegment::create( _h, _w );
+
+// perform segmentation
+// now "res" is a matrix of indices
+// change the second parameter to "true" to get an RGB image for "res"
+res = seg->performSegmentGpu(img, false);
+```
+
+python interface:
+
+```python
+import cv2
+import numpy as np
+
+img = cv2.imread(image_path)
+
+# create engine
+engine = cv2.hfs.HfsSegment_create(img.shape[0], img.shape[1])
+
+# perform segmentation
+# now "res" is a matrix of indices
+# change the second parameter to "True" to get an RGB image for "res"
+res = engine.performSegmentGpu(img, False)
+```
+
+
+
+### Reference
+
+[1]: M. Cheng, Y. Liu, Q. Hou, J. Bian, P. Torr, S. Hu, Z. Tu. HFS: Hierarchical Feature Selection for Efficient Image Segmentation. ECCV, Oct. 2016.
\ No newline at end of file
--- a/modules/hfs/doc/hfs.bib
+++ b/modules/hfs/doc/hfs.bib
+@inproceedings{cheng2016hfs,
+  title={HFS: Hierarchical Feature Selection for Efficient Image Segmentation},
+  author={Cheng, Ming-Ming and Liu, Yun and Hou, Qibin and Bian, Jiawang and Torr, Philip and Hu, Shi-Min and Tu, Zhuowen},
+  booktitle={European Conference on Computer Vision},
+  pages={867--882},
+  year={2016},
+  organization={Springer}
+}
--- a/modules/hfs/include/opencv2/hfs.hpp
+++ b/modules/hfs/include/opencv2/hfs.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "opencv2/core.hpp"
+
+namespace cv { namespace hfs {
+
+/** @defgroup hfs Hierarchical Feature Selection for Efficient Image Segmentation
+
+The opencv hfs module contains an efficient algorithm to segment an image.
+This module is implemented based on the paper Hierarchical Feature Selection for Efficient
+Image Segmentation, ECCV 2016. The original project was developed by
+Yun Liu(https://github.com/yun-liu/hfs).
+
+
+Introduction to Hierarchical Feature Selection
+----------------------------------------------
+
+
+This algorithm is executed in 3 stages:
+
+In the first stage, the algorithm uses SLIC (simple linear iterative clustering) algorithm
+to obtain the superpixel of the input image.
+
+In the second stage, the algorithm views each superpixel as a node in the graph.
+It will calculate a feature vector for each edge of the graph. It then calculates a weight
+for each edge based on the feature vector and trained SVM parameters. After obtaining
+weight for each edge, it will exploit the EGB (Efficient Graph-based Image Segmentation)
+algorithm to merge some nodes in the graph, thus obtaining a coarser segmentation.
+After these operations, a post-process will be executed to merge regions that are smaller
+than a specific number of pixels into their nearby region.
+
+In the third stage, the algorithm exploits the similar mechanism to further merge
+the small regions obtained in the second stage into even coarser segmentation.
+
+After these three stages, we can obtain the final segmentation of the image.
+For further details about the algorithm, please refer to the original paper:
+Hierarchical Feature Selection for Efficient Image Segmentation, ECCV 2016
+
+*/
+
+//! @addtogroup hfs
+//! @{
+class CV_EXPORTS_W HfsSegment : public Algorithm {
+public:
+
+/** @brief set and get the parameter segEgbThresholdI.
+* This parameter is used in the second stage mentioned above.
+* It is a constant used to threshold the weights of the edges when merging
+* adjacent nodes with the EGB algorithm. The segmentation result
+* tends to retain more regions if this value is large and vice versa.
+*/
+CV_WRAP virtual void setSegEgbThresholdI(float c) = 0;
+CV_WRAP virtual float getSegEgbThresholdI() = 0;
+
+
+/** @brief set and get the parameter minRegionSizeI.
+* This parameter is used in the second stage
+* mentioned above. After the EGB segmentation, regions that have fewer
+* pixels than this parameter will be merged into their adjacent region.
+*/
+CV_WRAP virtual void setMinRegionSizeI(int n) = 0;
+CV_WRAP virtual int getMinRegionSizeI() = 0;
+
+
+/** @brief set and get the parameter segEgbThresholdII.
+* This parameter is used in the third stage
+* mentioned above. It serves the same purpose as segEgbThresholdI.
+* The segmentation result tends to retain more regions if
+* this value is large and vice versa.
+*/
+CV_WRAP virtual void setSegEgbThresholdII(float c) = 0;
+CV_WRAP virtual float getSegEgbThresholdII() = 0;
+
+
+/** @brief set and get the parameter minRegionSizeII.
+* This parameter is used in the third stage
+* mentioned above. It serves the same purpose as minRegionSizeI.
+*/
+CV_WRAP virtual void setMinRegionSizeII(int n) = 0;
+CV_WRAP virtual int getMinRegionSizeII() = 0;
+
+
+/** @brief set and get the parameter spatialWeight.
+* This parameter is used in the first stage
+* mentioned above (the SLIC stage). It describes how important the role
+* of position is when calculating the distance between each pixel and its
+* center. The exact formula to calculate the distance is
+* \f$colorDistance + spatialWeight \times spatialDistance\f$.
+* The segmentation result tends to have more local consistency
+* if this value is larger.
+*/
+CV_WRAP virtual void setSpatialWeight(float w) = 0;
+CV_WRAP virtual float getSpatialWeight() = 0;
+
+
+/** @brief set and get the parameter slicSpixelSize.
+* This parameter is used in the first stage mentioned
+* above (the SLIC stage). It describes the size of each
+* superpixel when initializing SLIC. Every superpixel
+* approximately has \f$slicSpixelSize \times slicSpixelSize\f$
+* pixels in the beginning.
+*/
+CV_WRAP virtual void setSlicSpixelSize(int n) = 0;
+CV_WRAP virtual int getSlicSpixelSize() = 0;
+
+
+/** @brief set and get the parameter numSlicIter.
+* This parameter is used in the first stage. It
+* describes how many iterations to perform when executing SLIC.
+*/
+CV_WRAP virtual void setNumSlicIter(int n) = 0;
+CV_WRAP virtual int getNumSlicIter() = 0;
+
+/** @brief do segmentation on the GPU
+* @param src the input image
+* @param ifDraw whether to draw the image in the returned Mat. If this parameter is false,
+* then the content of the returned Mat is a matrix of indices, describing the region
+* each pixel belongs to, and its data type is CV_16U. If this parameter is true,
+* then the returned Mat is a segmented picture, and the color of each region is the
+* average color of all pixels in that region, and its data type is the same as
+* the input image.
+*/
+CV_WRAP virtual Mat performSegmentGpu(InputArray src, bool ifDraw = true) = 0;
+
+/** @brief do segmentation on the CPU
+* This method is only implemented for reference.
+* It is highly NOT recommended to use it.
+* @param src the input image
+* @param ifDraw same meaning as in performSegmentGpu
+*/
+CV_WRAP virtual Mat performSegmentCpu(InputArray src, bool ifDraw = true) = 0;
+
+/** @brief create a hfs object
+* @param height the height of the input image
+* @param width the width of the input image
+* @param segEgbThresholdI parameter segEgbThresholdI
+* @param minRegionSizeI parameter minRegionSizeI
+* @param segEgbThresholdII parameter segEgbThresholdII
+* @param minRegionSizeII parameter minRegionSizeII
+* @param spatialWeight parameter spatialWeight
+* @param slicSpixelSize parameter slicSpixelSize
+* @param numSlicIter parameter numSlicIter
+*/
+CV_WRAP static Ptr<HfsSegment> create(int height, int width,
+    float segEgbThresholdI = 0.08f, int minRegionSizeI = 100,
+    float segEgbThresholdII = 0.28f, int minRegionSizeII = 200,
+    float spatialWeight = 0.6f, int slicSpixelSize = 8, int numSlicIter = 5);
+
+};
+
+//! @}
+
+}} // namespace cv { namespace hfs {
--- a/modules/hfs/samples/CMakeLists.txt
+++ b/modules/hfs/samples/CMakeLists.txt
+# Stand-alone build script for the hfs sample; expects an installed OpenCV
+# that find_package() can locate.
+cmake_minimum_required(VERSION 2.8)
+project(example)
+find_package(OpenCV REQUIRED)
+
+set(SOURCES example.cpp)
+
+include_directories(${OpenCV_INCLUDE_DIRS})
+# NOTE(review): HEADERS is not set anywhere in this script, so it expands to nothing here.
+add_executable(example ${SOURCES} ${HEADERS})
+target_link_libraries(example ${OpenCV_LIBS})
--- a/modules/hfs/samples/data/000.jpg
+++ b/modules/hfs/samples/data/000.jpg
--- a/modules/hfs/samples/data/001.jpg
+++ b/modules/hfs/samples/data/001.jpg
--- a/modules/hfs/samples/data/002.jpg
+++ b/modules/hfs/samples/data/002.jpg
--- a/modules/hfs/samples/example.cpp
+++ b/modules/hfs/samples/example.cpp
+#include <cstdlib>
+#include <iostream>
+
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/hfs.hpp"
+
+using namespace cv;
+using namespace cv::hfs;
+/**
+ * Sample driver: segments the image given on the command line with the
+ * default parameters (GPU and CPU paths), then once more after changing the
+ * SLIC superpixel size, writing each result to a JPEG in the working directory.
+ *
+ * @param argc must be 2 (program name + image path)
+ * @param argv argv[1] is the path of the image to segment
+ * @return EXIT_SUCCESS on success, EXIT_FAILURE on bad usage or unreadable image
+ */
+int main(int argc, char *argv[])
+{
+    // invalid number of command line parameters
+    if( argc != 2 ) {
+        std::cerr << "Usage: example <image_path>" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    char* path = argv[1];
+    // read in a picture to initialize the height and width
+    Mat src = imread(path), res;
+    // imread() yields an empty Mat when the file cannot be read; creating the
+    // engine with a 0x0 size and running it would be meaningless, so stop here.
+    if( src.empty() ) {
+        std::cerr << "Could not read image: " << path << std::endl;
+        return EXIT_FAILURE;
+    }
+    int _h = src.rows, _w = src.cols;
+
+    // initialize the HfsSegment object
+    // In this example, we use the default parameters.
+    // However, bear in mind that you can pass your
+    // own parameters to this function.
+    Ptr<HfsSegment> h = HfsSegment::create( _h, _w );
+
+    // segment and write the first result.
+    res = h->performSegmentGpu(src);
+    imwrite( "segment_default_gpu.jpg", res );
+    // also, there is a CPU interface for that
+    res = h->performSegmentCpu(src);
+    imwrite( "segment_default_cpu.jpg", res );
+
+    // also, instead of getting a segmented image
+    // from our interface, you can choose not to
+    // draw the result on the Mat and only get a matrix
+    // of indices. Note that the data type of the returned
+    // Mat in this case is CV_16U
+    Mat idx_mat = h->performSegmentGpu( src, false );
+
+    // also, you can change any parameters as you want
+    h->setSlicSpixelSize(10);
+    res = h->performSegmentGpu(src);
+    imwrite( "segment_changed_param.jpg", res );
+
+    return EXIT_SUCCESS;
+}
--- a/modules/hfs/src/cuda/gslic_seg_engine_gpu.cu
+++ b/modules/hfs/src/cuda/gslic_seg_engine_gpu.cu
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// #ifdef _HFS_CUDA_ON_
+
+#include "../precomp.hpp"
+#include "../slic/slic.hpp"
+
+namespace cv { namespace hfs { namespace slic { namespace engines {
+
+
+// Forward declarations of the kernels defined at the bottom of this file, so
+// that the SegEngineGPU member functions below can launch them.
+
+// Per-pixel colour-space conversion of the input image.
+__global__ void cvtImgSpaceDevice(const Vector4u* inimg,
+    Vector2i img_size, Vector4f* outimg);
+
+// Per-superpixel initialisation of the cluster centers.
+__global__ void initClusterCentersDevice(const Vector4f* inimg,
+    Vector2i map_size, Vector2i img_size, int spixel_size,
+    gSpixelInfo* out_spixel);
+
+// Per-pixel assignment to a cluster center; writes the index image.
+__global__ void findCenterAssociationDevice(const Vector4f* inimg,
+    const gSpixelInfo* in_spixel_map, Vector2i map_size,
+    Vector2i img_size, int spixel_size, float weight,
+    float max_xy_dist, float max_color_dist, int* out_idx_img);
+
+// Per-superpixel partial sums (one block per search-window tile) into accum_map.
+__global__ void updateClusterCenterDevice(const Vector4f* inimg,
+    const int* in_idx_img, Vector2i map_size, Vector2i img_size,
+    int spixel_size, int no_blocks_per_line, gSpixelInfo* accum_map);
+
+// Combines the partial sums of accum_map into the superpixel list.
+__global__ void finalizeReductionResultDevice(const gSpixelInfo* accum_map,
+    Vector2i map_size, int no_blocks_per_spixel,
+    gSpixelInfo* spixel_list);
+
+// First connectivity pass over the index image (in -> out).
+__global__ void enforceConnectivityDevice(const int* in_idx_img,
+    Vector2i img_size, int* out_idx_img);
+
+// Second connectivity pass over the index image (in -> out).
+__global__ void enforceConnectivityDevice1_2(const int* in_idx_img,
+    Vector2i img_size, int* out_idx_img);
+
+
+
+// Allocates all image-sized and superpixel-map-sized buffers for the given
+// settings and precomputes the distance normalisation factors.
+SegEngineGPU::SegEngineGPU(const slicSettings& in_settings) : SegEngine(in_settings)
+{
+    spixel_size = in_settings.spixel_size;
+
+    // buffers with one element per image pixel
+    source_img = Ptr<UChar4Image>(new UChar4Image(in_settings.img_size));
+    cvt_img = Ptr<Float4Image>(new Float4Image(in_settings.img_size));
+    idx_img = Ptr<IntImage>(new IntImage(in_settings.img_size));
+    tmp_idx_img = Ptr<IntImage>(new IntImage(in_settings.img_size));
+
+    // number of superpixels along x and y (rounded up so the image is covered)
+    int spixels_along_x = (int)ceil((float)in_settings.img_size.x / (float)spixel_size);
+    int spixels_along_y = (int)ceil((float)in_settings.img_size.y / (float)spixel_size);
+    map_size = Vector2i(spixels_along_x, spixels_along_y);
+    spixel_map = Ptr<gSpixelMap>(new gSpixelMap(map_size));
+
+    // a (3 * spixel_size)^2 window is tiled by HFS_BLOCK_DIM-wide blocks;
+    // the tile count is the per-side count squared
+    int tiles_per_side = (int)ceil(spixel_size*3.0f / HFS_BLOCK_DIM);
+    no_grid_per_center = tiles_per_side * tiles_per_side;
+
+    // one accumulator slot per (superpixel, tile) pair
+    Vector2i accum_size(map_size.x*no_grid_per_center, map_size.y);
+    accum_map = Ptr<gSpixelMap>(new gSpixelMap(accum_size));
+
+    // normalizing factors
+    max_color_dist = 15.0f / (1.7321f * 128);
+    max_color_dist *= max_color_dist;
+    max_xy_dist = 1.0f / (2 * spixel_size * spixel_size);
+}
+
+// Intentionally empty: nothing is released by hand here (the buffers created
+// in the constructor are assigned from Ptr<> objects — presumably reference
+// counted members that clean up on their own; confirm in the class declaration).
+SegEngineGPU::~SegEngineGPU() {}
+
+
+// Launches cvtImgSpaceDevice with one thread per pixel of the image.
+// inimg:  source image (device data is read)
+// outimg: converted image (device data is written)
+void SegEngineGPU::cvtImgSpace(Ptr<UChar4Image> inimg, Ptr<Float4Image> outimg)
+{
+    Vector4u* src_dev = inimg->getGpuData();
+    Vector4f* dst_dev = outimg->getGpuData();
+
+    dim3 threads(HFS_BLOCK_DIM, HFS_BLOCK_DIM);
+    dim3 blocks = getGridSize(img_size, threads);
+
+    cvtImgSpaceDevice<<<blocks, threads>>>(src_dev, img_size, dst_dev);
+}
+
+// Launches initClusterCentersDevice with one thread per superpixel of the map.
+void SegEngineGPU::initClusterCenters()
+{
+    gSpixelInfo* centers_dev = spixel_map->getGpuData();
+    Vector4f* image_dev = cvt_img->getGpuData();
+
+    dim3 threads(HFS_BLOCK_DIM, HFS_BLOCK_DIM);
+    dim3 blocks = getGridSize(map_size, threads);
+
+    initClusterCentersDevice<<<blocks, threads>>>(
+        image_dev, map_size, img_size, spixel_size, centers_dev);
+}
+
+// Launches findCenterAssociationDevice with one thread per image pixel; the
+// kernel writes the resulting superpixel index of every pixel into idx_img.
+void SegEngineGPU::findCenterAssociation()
+{
+    gSpixelInfo* centers_dev = spixel_map->getGpuData();
+    Vector4f* image_dev = cvt_img->getGpuData();
+    int* labels_dev = idx_img->getGpuData();
+
+    dim3 threads(HFS_BLOCK_DIM, HFS_BLOCK_DIM);
+    dim3 blocks = getGridSize(img_size, threads);
+
+    findCenterAssociationDevice<<<blocks, threads>>>(
+        image_dev, centers_dev, map_size, img_size,
+        spixel_size, slic_settings.coh_weight,
+        max_xy_dist, max_color_dist, labels_dev);
+}
+
+// Recomputes every cluster center in two kernel launches:
+//  1. updateClusterCenterDevice: for each superpixel, no_grid_per_center blocks
+//     each reduce one tile of its search window into a slot of accum_map;
+//  2. finalizeReductionResultDevice: one thread per superpixel combines those
+//     slots into spixel_map.
+void SegEngineGPU::updateClusterCenter()
+{
+    gSpixelInfo* accum_map_ptr = accum_map->getGpuData();
+    gSpixelInfo* spixel_list_ptr = spixel_map->getGpuData();
+    Vector4f* img_ptr = cvt_img->getGpuData();
+    int* idx_ptr = idx_img->getGpuData();
+
+    int no_blocks_per_line = (int)ceil(spixel_size * 3.0f / HFS_BLOCK_DIM);
+
+    dim3 blockSize(HFS_BLOCK_DIM, HFS_BLOCK_DIM);
+    // x/y select the superpixel, z selects the tile of its search window
+    dim3 gridSize(map_size.x, map_size.y, no_grid_per_center);
+
+    updateClusterCenterDevice << <gridSize, blockSize >> >
+        (img_ptr, idx_ptr, map_size, img_size,
+            spixel_size, no_blocks_per_line, accum_map_ptr);
+
+    // The finalize kernel maps one *thread* (not one block) to each superpixel
+    // and bounds-checks against map_size, so only ceil(map_size / blockSize)
+    // blocks are needed. Launching map_size.x * map_size.y blocks, as before,
+    // produced the same result but started far more threads than superpixels.
+    dim3 gridSize2 = getGridSize(map_size, blockSize);
+
+    finalizeReductionResultDevice << <gridSize2, blockSize >> >
+        (accum_map_ptr, map_size, no_grid_per_center, spixel_list_ptr);
+}
+
+// Runs four clean-up passes over the index image, ping-ponging between
+// idx_img and tmp_idx_img: two passes of enforceConnectivityDevice followed by
+// two passes of enforceConnectivityDevice1_2. The final result ends up in idx_img.
+void SegEngineGPU::enforceConnectivity()
+{
+    int* labels_dev = idx_img->getGpuData();
+    int* scratch_dev = tmp_idx_img->getGpuData();
+
+    dim3 threads(HFS_BLOCK_DIM, HFS_BLOCK_DIM);
+    dim3 blocks = getGridSize(img_size, threads);
+
+    // first kind of pass: labels -> scratch -> labels
+    enforceConnectivityDevice<<<blocks, threads>>>(labels_dev, img_size, scratch_dev);
+    enforceConnectivityDevice<<<blocks, threads>>>(scratch_dev, img_size, labels_dev);
+
+    // second kind of pass: labels -> scratch -> labels
+    enforceConnectivityDevice1_2<<<blocks, threads>>>(labels_dev, img_size, scratch_dev);
+    enforceConnectivityDevice1_2<<<blocks, threads>>>(scratch_dev, img_size, labels_dev);
+}
+
+
+// Kernel: one thread per pixel; converts inimg[pixel] into outimg[pixel]
+// through rgb2CIELab. Threads that fall outside the image do nothing.
+__global__ void cvtImgSpaceDevice(const Vector4u* inimg, Vector2i img_size,
+    Vector4f* outimg)
+{
+    int col = blockIdx.x * blockDim.x + threadIdx.x;
+    int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (col < img_size.x && row < img_size.y)
+    {
+        int pixel = row * img_size.x + col;
+        rgb2CIELab(inimg[pixel], outimg[pixel]);
+    }
+}
+
+// Kernel: one thread per cell of the superpixel map; forwards its cell
+// coordinates to initClusterCentersShared. Out-of-map threads do nothing.
+__global__ void initClusterCentersDevice(const Vector4f* inimg,
+    Vector2i map_size, Vector2i img_size, int spixel_size,
+    gSpixelInfo* out_spixel)
+{
+    int cell_x = blockIdx.x * blockDim.x + threadIdx.x;
+    int cell_y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (cell_x < map_size.x && cell_y < map_size.y)
+    {
+        initClusterCentersShared(inimg, map_size,
+            img_size, spixel_size, cell_x, cell_y, out_spixel);
+    }
+}
+
+// Kernel: one thread per image pixel; forwards its pixel coordinates to
+// findCenterAssociationShared. Out-of-image threads do nothing.
+__global__ void findCenterAssociationDevice(const Vector4f* inimg,
+    const gSpixelInfo* in_spixel_map, Vector2i map_size,
+    Vector2i img_size, int spixel_size, float weight,
+    float max_xy_dist, float max_color_dist, int* out_idx_img)
+{
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (px < img_size.x && py < img_size.y)
+    {
+        findCenterAssociationShared(inimg, in_spixel_map, map_size, img_size,
+            spixel_size, weight, px, py, max_xy_dist, max_color_dist, out_idx_img);
+    }
+}
+
+// Kernel: partial reduction of colour, position and pixel count for one
+// superpixel over one tile of its search window.
+//
+// Launch layout (see the host code): blockIdx.x / blockIdx.y select the
+// superpixel, blockIdx.z selects a HFS_BLOCK_DIM x HFS_BLOCK_DIM tile of the
+// (3 * spixel_size)^2 window centred on that superpixel's grid cell.
+// Each thread contributes its pixel if the pixel is labelled with this
+// superpixel; the block then sums the contributions in shared memory and
+// thread 0 stores the sums in accum_map[spixel_id * gridDim.z + blockIdx.z].
+//
+// inimg:              converted image, one Vector4f per pixel
+// in_idx_img:         current superpixel label of every pixel
+// map_size:           size of the superpixel grid
+// img_size:           size of the image
+// spixel_size:        side length of a superpixel grid cell, in pixels
+// no_blocks_per_line: number of tiles per row of the search window
+// accum_map:          output, one slot per (superpixel, tile)
+__global__ void updateClusterCenterDevice(const Vector4f* inimg,
+    const int* in_idx_img, Vector2i map_size, Vector2i img_size,
+    int spixel_size, int no_blocks_per_line, gSpixelInfo* accum_map)
+{
+    int local_id = threadIdx.y * blockDim.x + threadIdx.x;
+
+    __shared__ Float4_ color_shared[HFS_BLOCK_DIM*HFS_BLOCK_DIM];
+    __shared__ Float2_ xy_shared[HFS_BLOCK_DIM*HFS_BLOCK_DIM];
+    __shared__ volatile int count_shared[HFS_BLOCK_DIM*HFS_BLOCK_DIM];
+    __shared__ bool should_add;
+
+    color_shared[local_id] = Float4_(0, 0, 0, 0);
+    xy_shared[local_id] = Float2_(0, 0);
+    count_shared[local_id] = 0;
+    should_add = false;
+    __syncthreads();
+
+    int no_blocks_per_spixel = gridDim.z;
+
+    int spixel_id = blockIdx.y * map_size.x + blockIdx.x;
+
+    // compute the relative position in the search window
+    int block_x = blockIdx.z % no_blocks_per_line;
+    int block_y = blockIdx.z / no_blocks_per_line;
+
+    int x_offset = block_x * HFS_BLOCK_DIM + threadIdx.x;
+    int y_offset = block_y * HFS_BLOCK_DIM + threadIdx.y;
+
+    if (x_offset < spixel_size * 3 && y_offset < spixel_size * 3)
+    {
+        // compute the start of the search window
+        int x_start = blockIdx.x * spixel_size - spixel_size;
+        int y_start = blockIdx.y * spixel_size - spixel_size;
+
+        int x_img = x_start + x_offset;
+        int y_img = y_start + y_offset;
+
+        if (x_img >= 0 && x_img < img_size.x && y_img >= 0 && y_img < img_size.y)
+        {
+            int img_idx = y_img * img_size.x + x_img;
+            if (in_idx_img[img_idx] == spixel_id)
+            {
+                color_shared[local_id] =
+                    Float4_(inimg[img_idx].x, inimg[img_idx].y,
+                        inimg[img_idx].z, inimg[img_idx].w);
+                xy_shared[local_id] = Float2_(x_img, y_img);
+                count_shared[local_id] = 1;
+                should_add = true;
+            }
+        }
+    }
+    __syncthreads();
+
+    // should_add is read after the barrier above, so every thread of the block
+    // sees the same value and the barriers inside the branch are reached by
+    // all threads or by none.
+    if (should_add)
+    {
+        // Tree reduction with a block-wide barrier after every step.
+        // The previous code dropped the barriers for the last 32 elements and
+        // relied on the threads of a warp running in lockstep, on shared
+        // arrays that are not volatile; that is a data race on GPUs with
+        // independent thread scheduling and lets the compiler keep partial
+        // sums in registers. The pairing (i, i + stride) and therefore the
+        // floating-point summation order are the same as before.
+        // NOTE(review): assumes HFS_BLOCK_DIM * HFS_BLOCK_DIM is a power of
+        // two (the former hard-coded 128 / 64 strides imply 256) — confirm.
+        for (int stride = (HFS_BLOCK_DIM*HFS_BLOCK_DIM) / 2; stride > 0; stride >>= 1)
+        {
+            if (local_id < stride)
+            {
+                color_shared[local_id] += color_shared[local_id + stride];
+                xy_shared[local_id] += xy_shared[local_id + stride];
+                count_shared[local_id] += count_shared[local_id + stride];
+            }
+            __syncthreads();
+        }
+    }
+    __syncthreads();
+
+    // thread 0 publishes the block's sums (all zero when no pixel matched)
+    if (local_id == 0)
+    {
+        int accum_map_idx = spixel_id * no_blocks_per_spixel + blockIdx.z;
+        accum_map[accum_map_idx].center = Vector2f(xy_shared[0].x, xy_shared[0].y);
+        accum_map[accum_map_idx].color_info =
+            Vector4f(color_shared[0].x, color_shared[0].y,
+                color_shared[0].z, color_shared[0].w);
+        accum_map[accum_map_idx].num_pixels = count_shared[0];
+    }
+}
+
+// Kernel: one thread per cell of the superpixel map; forwards its cell
+// coordinates to finalizeReductionResultShared. Out-of-map threads do nothing.
+__global__ void finalizeReductionResultDevice(const gSpixelInfo* accum_map,
+    Vector2i map_size, int no_blocks_per_spixel, gSpixelInfo* spixel_list)
+{
+    int cell_x = blockIdx.x * blockDim.x + threadIdx.x;
+    int cell_y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (cell_x < map_size.x && cell_y < map_size.y)
+    {
+        finalizeReductionResultShared(accum_map,
+            map_size, no_blocks_per_spixel, cell_x, cell_y, spixel_list);
+    }
+}
+
+// Kernel: one thread per image pixel; forwards its pixel coordinates to
+// supressLocalLable, which reads in_idx_img and writes out_idx_img.
+__global__ void enforceConnectivityDevice(const int* in_idx_img,
+    Vector2i img_size, int* out_idx_img)
+{
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (px < img_size.x && py < img_size.y)
+    {
+        supressLocalLable(in_idx_img, img_size, px, py, out_idx_img);
+    }
+}
+
+// Kernel: one thread per image pixel; forwards its pixel coordinates to
+// supressLocalLable2, which reads in_idx_img and writes out_idx_img.
+__global__ void enforceConnectivityDevice1_2(const int* in_idx_img,
+    Vector2i img_size, int* out_idx_img)
+{
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (px < img_size.x && py < img_size.y)
+    {
+        supressLocalLable2(in_idx_img, img_size, px, py, out_idx_img);
+    }
+}
+
+}}}}
+
+// #endif
--- a/modules/hfs/src/cuda/magnitude.cu
+++ b/modules/hfs/src/cuda/magnitude.cu
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+
+// #if defined _HFS_CUDA_ON_
+
+#include "../precomp.hpp"
+#include "../magnitude/magnitude.hpp"
+
+namespace cv { namespace hfs {
+
+// Kernel: one thread per pixel; computes the horizontal and vertical intensity
+// differences of gray_img and the rounded gradient magnitude.
+//
+// Interior pixels use the two-sided difference (next - previous); pixels on an
+// image border use the one-sided difference towards the inside.
+//
+// gray_img: input, one uchar per pixel, row-major with row length img_size.x
+// delta_x:  output, horizontal difference per pixel
+// delta_y:  output, vertical difference per pixel
+// mag:      output, (int)(0.5 + sqrt(dx^2 + dy^2)) per pixel
+// img_size: image width (x) and height (y)
+__global__ void derrivativeXYDevice(const uchar *gray_img,
+    int *delta_x, int *delta_y, int *mag, Vector2i img_size)
+{
+    int x = threadIdx.x + blockIdx.x * blockDim.x;
+    int y = threadIdx.y + blockIdx.y * blockDim.y;
+    if (x > img_size.x - 1 || y > img_size.y - 1)
+        return;
+
+    int idx = y*img_size.x + x;
+
+    // Clamp the neighbour coordinates to the image. For images at least two
+    // pixels wide/high this selects exactly the same samples as the former
+    // border special cases; for a one-pixel-wide (or -high) image it yields a
+    // zero difference instead of reading past the end of the row (or buffer).
+    int x_prev = (x > 0) ? x - 1 : x;
+    int x_next = (x < img_size.x - 1) ? x + 1 : x;
+    int y_prev = (y > 0) ? y - 1 : y;
+    int y_next = (y < img_size.y - 1) ? y + 1 : y;
+
+    int dx = gray_img[y*img_size.x + x_next] - gray_img[y*img_size.x + x_prev];
+    int dy = gray_img[y_next*img_size.x + x] - gray_img[y_prev*img_size.x + x];
+
+    delta_x[idx] = dx;
+    delta_y[idx] = dy;
+
+    // magnitude rounded to the nearest integer
+    mag[idx] = (int)(0.5 +
+        sqrt((double)(dx * dx + dy * dy)));
+}
+
+// Device helper: smaller of two ints.
+__device__ __forceinline__ int dmin(int a, int b)
+{
+    if (a < b)
+        return a;
+    return b;
+}
+
+// Device helper: larger of two ints.
+__device__ __forceinline__ int dmax(int a, int b)
+{
+    if (a > b)
+        return a;
+    return b;
+}
+
+// Kernel: one thread per pixel; non-maximum suppression of the gradient
+// magnitude.
+//
+// nms_mag:  output, one uchar per pixel (0 = suppressed, otherwise the
+//           magnitude clamped to [0, 255])
+// delta_x:  input, horizontal difference per pixel
+// delta_y:  input, vertical difference per pixel
+// mag:      input, gradient magnitude per pixel
+// img_size: image width (x) and height (y)
+//
+// Pixels on the image border and pixels with zero magnitude are always
+// written as 0. For every other pixel the signs and relative sizes of
+// (gx, gy) select one of eight cases; each case picks a pair of neighbouring
+// magnitudes (z1, z2) on either side of the pixel and combines them with the
+// centre magnitude into mag1 and mag2. The pixel is kept only when
+// mag1 <= 0 and mag2 < 0.
+__global__ void nonMaxSuppDevice(uchar *nms_mag,
+    int *delta_x, int *delta_y, int *mag, Vector2i img_size)
+{
+    int x = threadIdx.x + blockIdx.x * blockDim.x;
+    int y = threadIdx.y + blockIdx.y * blockDim.y;
+    if (x > img_size.x - 1 || y > img_size.y - 1) return;
+
+    int idx = y*img_size.x + x;
+
+    // border pixels have no complete 3x3 neighbourhood: always suppressed
+    if (x == 0 || x == img_size.x - 1 || y == 0 || y == img_size.y - 1)
+    {
+        nms_mag[idx] = 0;
+        return;
+    }
+
+    int m00, gx, gy, z1, z2;
+    double mag1, mag2, xprep, yprep;
+
+    m00 = mag[idx];
+    if (m00 == 0)
+    {
+        // no gradient here; also avoids the division by m00 below
+        nms_mag[idx] = 0;
+        return;
+    }
+    else
+    {
+        // gradient components normalised by the magnitude (x is negated)
+        xprep = -(gx = delta_x[idx]) / ((double)m00);
+        yprep = (gy = delta_y[idx]) / ((double)m00);
+    }
+
+    if (gx >= 0)
+    {
+        if (gy >= 0)
+        {
+            if (gx >= gy)
+            {
+                // gx >= gy >= 0: left / upper-left, then right / lower-right
+                z1 = mag[idx - 1];
+                z2 = mag[idx - img_size.x - 1];
+                mag1 = (m00 - z1)*xprep + (z2 - z1)*yprep;
+
+                z1 = mag[idx + 1];
+                z2 = mag[idx + img_size.x + 1];
+                mag2 = (m00 - z1)*xprep + (z2 - z1)*yprep;
+            }
+            else
+            {
+                // gy > gx >= 0: upper / upper-left, then lower / lower-right
+                z1 = mag[idx - img_size.x];
+                z2 = mag[idx - img_size.x - 1];
+                mag1 = (z1 - z2)*xprep + (z1 - m00)*yprep;
+
+                z1 = mag[idx + img_size.x];
+                z2 = mag[idx + img_size.x + 1];
+                mag2 = (z1 - z2)*xprep + (z1 - m00)*yprep;
+            }
+        }
+        else
+        {
+            if (gx >= -gy)
+            {
+                // gx >= -gy > 0: left / lower-left, then right / upper-right
+                z1 = mag[idx - 1];
+                z2 = mag[idx + img_size.x - 1];
+                mag1 = (m00 - z1)*xprep + (z1 - z2)*yprep;
+
+                z1 = mag[idx + 1];
+                z2 = mag[idx - img_size.x + 1];
+                mag2 = (m00 - z1)*xprep + (z1 - z2)*yprep;
+            }
+            else
+            {
+                // -gy > gx >= 0: lower / lower-left, then upper / upper-right
+                z1 = mag[idx + img_size.x];
+                z2 = mag[idx + img_size.x - 1];
+                mag1 = (z1 - z2)*xprep + (m00 - z1)*yprep;
+
+                z1 = mag[idx - img_size.x];
+                z2 = mag[idx - img_size.x + 1];
+                mag2 = (z1 - z2)*xprep + (m00 - z1)*yprep;
+            }
+        }
+    }
+    else
+    {
+        if (gy >= 0)
+        {
+            if (-gx >= gy)
+            {
+                // -gx >= gy >= 0: right / upper-right, then left / lower-left
+                z1 = mag[idx + 1];
+                z2 = mag[idx - img_size.x + 1];
+                mag1 = (z1 - m00)*xprep + (z2 - z1)*yprep;
+
+                z1 = mag[idx - 1];
+                z2 = mag[idx + img_size.x - 1];
+                mag2 = (z1 - m00)*xprep + (z2 - z1)*yprep;
+            }
+            else
+            {
+                // gy > -gx > 0: upper / upper-right, then lower / lower-left
+                z1 = mag[idx - img_size.x];
+                z2 = mag[idx - img_size.x + 1];
+                mag1 = (z2 - z1)*xprep + (z1 - m00)*yprep;
+
+                z1 = mag[idx + img_size.x];
+                z2 = mag[idx + img_size.x - 1];
+                mag2 = (z2 - z1)*xprep + (z1 - m00)*yprep;
+            }
+        }
+        else
+        {
+            if (-gx > -gy)
+            {
+                // -gx > -gy > 0: right / lower-right, then left / upper-left
+                z1 = mag[idx + 1];
+                z2 = mag[idx + img_size.x + 1];
+                mag1 = (z1 - m00)*xprep + (z1 - z2)*yprep;
+
+                z1 = mag[idx - 1];
+                z2 = mag[idx - img_size.x - 1];
+                mag2 = (z1 - m00)*xprep + (z1 - z2)*yprep;
+            }
+            else
+            {
+                // -gy >= -gx > 0: lower / lower-right, then upper / upper-left
+                z1 = mag[idx + img_size.x];
+                z2 = mag[idx + img_size.x + 1];
+                mag1 = (z2 - z1)*xprep + (m00 - z1)*yprep;
+
+                z1 = mag[idx - img_size.x];
+                z2 = mag[idx - img_size.x - 1];
+                mag2 = (z2 - z1)*xprep + (m00 - z1)*yprep;
+            }
+        }
+    }
+
+    // keep the pixel only if mag1 <= 0 and mag2 < 0 (the asymmetric
+    // comparison is deliberate in the code: a tie on the mag2 side suppresses)
+    if (mag1 > 0 || mag2 >= 0)
+        nms_mag[idx] = 0;
+    else
+        nms_mag[idx] = (uchar)dmin(dmax(m00, 0), 255);
+}
+
+// Launches derrivativeXYDevice with one thread per pixel: reads gray_img and
+// fills delta_x, delta_y and mag on the device.
+void Magnitude::derrivativeXYGpu()
+{
+    uchar *gray_dev = gray_img->getGpuData();
+    int *dx_dev = delta_x->getGpuData();
+    int *dy_dev = delta_y->getGpuData();
+    int *mag_dev = mag->getGpuData();
+
+    dim3 threads(HFS_BLOCK_DIM, HFS_BLOCK_DIM);
+    // enough blocks to cover the whole image (rounded up)
+    dim3 blocks((int)ceil((float)img_size.x / (float)threads.x),
+        (int)ceil((float)img_size.y / (float)threads.y));
+
+    derrivativeXYDevice<<<blocks, threads>>>(
+        gray_dev, dx_dev, dy_dev, mag_dev, img_size);
+}
+
+// Launches nonMaxSuppDevice with one thread per pixel: reads delta_x, delta_y
+// and mag and fills nms_mag on the device.
+void Magnitude::nonMaxSuppGpu()
+{
+    int *dx_dev = delta_x->getGpuData();
+    int *dy_dev = delta_y->getGpuData();
+    int *mag_dev = mag->getGpuData();
+    uchar *nms_dev = nms_mag->getGpuData();
+
+    dim3 threads(HFS_BLOCK_DIM, HFS_BLOCK_DIM);
+    // enough blocks to cover the whole image (rounded up)
+    dim3 blocks((int)ceil((float)img_size.x / (float)threads.x),
+        (int)ceil((float)img_size.y / (float)threads.y));
+
+    nonMaxSuppDevice<<<blocks, threads>>>(
+        nms_dev, dx_dev, dy_dev, mag_dev, img_size);
+}
+
+// Computes the non-maximum-suppressed gradient magnitude of an image on the GPU.
+//
+// bgr3u: input image; it is converted with COLOR_BGR2GRAY, so a BGR image is
+//        expected
+// mag1u: output, (re)created as CV_8UC1 with the size of the input
+void Magnitude::processImgGpu(const Mat& bgr3u, Mat& mag1u)
+{
+    // host-side preprocessing: grayscale, then 7x7 Gaussian blur with sigma 1
+    Mat gray, blur1u;
+    cvtColor(bgr3u, gray, COLOR_BGR2GRAY);
+    GaussianBlur(gray, blur1u, Size(7, 7), 1, 1);
+
+    // the kernels below take their dimensions from img_size
+    img_size.x = bgr3u.cols;
+    img_size.y = bgr3u.rows;
+
+    // NOTE(review): loadImage(Mat, image) presumably fills the host side of
+    // gray_img, which is then pushed to the device — confirm in loadImage.
+    loadImage(blur1u, gray_img);
+    gray_img->updateDeviceFromHost();
+    derrivativeXYGpu();
+    nonMaxSuppGpu();
+    // bring the result back and copy it into the output Mat
+    mag1u.create(bgr3u.rows, bgr3u.cols, CV_8UC1);
+    nms_mag->updateHostFromDevice();
+    loadImage(nms_mag, mag1u);
+}
+
+}}
+
+// #endif
--- a/modules/hfs/src/hfs.cpp
+++ b/modules/hfs/src/hfs.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+
+#include "precomp.hpp"
+#include "opencv2/hfs.hpp"
+#include "hfs_core.hpp"
+
+namespace cv{ namespace hfs{
+
+// Concrete HfsSegment: every parameter lives in the settings of an HfsCore
+// instance; setters of the SLIC-stage parameters additionally rebuild the
+// engine through HfsCore::reconstructEngine().
+class HfsSegmentImpl : public cv::hfs::HfsSegment{
+public:
+
+    // Creates the underlying HfsCore for images of the given size.
+    HfsSegmentImpl(int height, int width,
+        float segEgbThresholdI, int minRegionSizeI, float segEgbThresholdII, int minRegionSizeII,
+        float spatialWeight, int spixelSize, int numIter)
+        : core(new HfsCore(height, width,
+            segEgbThresholdI, minRegionSizeI, segEgbThresholdII, minRegionSizeII,
+            spatialWeight, spixelSize, numIter))
+    {
+    }
+
+    // ---- second stage parameters ----
+    void setSegEgbThresholdI(float c) { core->hfsSettings.egbThresholdI = c; }
+    float getSegEgbThresholdI() { return core->hfsSettings.egbThresholdI; }
+
+    void setMinRegionSizeI(int n) { core->hfsSettings.minRegionSizeI = n; }
+    int getMinRegionSizeI() { return core->hfsSettings.minRegionSizeI; }
+
+    // ---- third stage parameters ----
+    void setSegEgbThresholdII(float c) { core->hfsSettings.egbThresholdII = c; }
+    float getSegEgbThresholdII() { return core->hfsSettings.egbThresholdII; }
+
+    void setMinRegionSizeII(int n) { core->hfsSettings.minRegionSizeII = n; }
+    int getMinRegionSizeII() { return core->hfsSettings.minRegionSizeII; }
+
+    // ---- SLIC stage parameters (changing one rebuilds the engine) ----
+    void setSpatialWeight(float w)
+    {
+        core->hfsSettings.slicSettings.coh_weight = w;
+        core->reconstructEngine();
+    }
+    float getSpatialWeight() { return core->hfsSettings.slicSettings.coh_weight; }
+
+    void setSlicSpixelSize(int n)
+    {
+        core->hfsSettings.slicSettings.spixel_size = n;
+        core->reconstructEngine();
+    }
+    int getSlicSpixelSize() { return core->hfsSettings.slicSettings.spixel_size; }
+
+    void setNumSlicIter(int n)
+    {
+        core->hfsSettings.slicSettings.num_iters = n;
+        core->reconstructEngine();
+    }
+    int getNumSlicIter() { return core->hfsSettings.slicSettings.num_iters; }
+
+    // ---- segmentation entry points (defined out of class) ----
+    Mat performSegmentGpu(InputArray src, bool ifDraw = true);
+    Mat performSegmentCpu(InputArray src, bool ifDraw = true);
+
+private:
+    Ptr<HfsCore> core;
+};
+
+// Segments `src` with HfsCore::processImageGpu.
+// src    : image whose size must equal the height/width the object was
+//          created with; must be 8-bit, 3-channel.
+// ifDraw : when true, returns the image produced by
+//          HfsCore::drawSegmentationRes; when false, returns the CV_16U
+//          label matrix.
+// Fails a CV_Assert when the size or type check does not hold.
+Mat HfsSegmentImpl::performSegmentGpu(InputArray src, bool ifDraw) {
+    Mat src_ = src.getMat();
+
+    // HfsCore::loadImage and HfsCore::drawSegmentationRes read the pixels as
+    // Vec3b, so any other element type would be read out of bounds.
+    CV_Assert(src_.type() == CV_8UC3);
+    CV_Assert(src_.rows == core->hfsSettings.slicSettings.img_size.y);
+    CV_Assert(src_.cols == core->hfsSettings.slicSettings.img_size.x);
+
+    Mat seg;
+    const int num_css = core->processImageGpu(src_, seg);
+    if (!ifDraw)
+        return seg;
+
+    Mat res;
+    core->drawSegmentationRes(seg, src_, num_css, res);
+    return res;
+}
+
+// Segments `src` with HfsCore::processImageCpu.
+// src    : image whose size must equal the height/width the object was
+//          created with; must be 8-bit, 3-channel.
+// ifDraw : when true, returns the image produced by
+//          HfsCore::drawSegmentationRes; when false, returns the CV_16U
+//          label matrix.
+// Fails a CV_Assert when the size or type check does not hold.
+Mat HfsSegmentImpl::performSegmentCpu(InputArray src, bool ifDraw) {
+    Mat src_ = src.getMat();
+
+    // HfsCore::getSegmentationI and HfsCore::drawSegmentationRes read pixels
+    // as Vec3b, so any other element type would be read out of bounds.
+    CV_Assert(src_.type() == CV_8UC3);
+    CV_Assert(src_.rows == core->hfsSettings.slicSettings.img_size.y);
+    CV_Assert(src_.cols == core->hfsSettings.slicSettings.img_size.x);
+
+    Mat seg;
+    const int num_css = core->processImageCpu(src_, seg);
+    if (!ifDraw)
+        return seg;
+
+    Mat res;
+    core->drawSegmentationRes(seg, src_, num_css, res);
+    return res;
+}
+
+// Factory: builds an HfsSegmentImpl with the given parameters and hands it
+// back through the HfsSegment interface pointer.
+Ptr<HfsSegment> HfsSegment::create(int height, int width, float segEgbThresholdI, int minRegionSizeI,
+                                   float segEgbThresholdII, int minRegionSizeII,
+                                   float spatialWeight, int spixelSize, int numIter)
+{
+    Ptr<HfsSegmentImpl> impl(new HfsSegmentImpl(
+        height, width,
+        segEgbThresholdI, minRegionSizeI,
+        segEgbThresholdII, minRegionSizeII,
+        spatialWeight, spixelSize, numIter));
+    return impl;
+}
+
+}}
--- a/modules/hfs/src/hfs_core.cpp
+++ b/modules/hfs/src/hfs_core.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+#include "hfs_core.hpp"
+#include <vector>
+using namespace std;
+
+namespace cv{ namespace hfs{
+
+
+// Neighbour offsets as Point(dx, dy): x-1, y-1, x+1, y+1, then the zero
+// offset. The loops in this file that walk the table stop at k < 4, so only
+// the four real neighbours are visited there.
+const Point DIRECTION4[5] =
+{
+    Point(-1, 0),
+    Point(0, -1),
+    Point(1, 0),
+    Point(0, 1),
+    Point(0, 0)
+};
+
+// The 13 offsets Point(dx, dy) with |dx| + |dy| <= 2, the zero offset last.
+// getAvgGradientBdry scans this neighbourhood around each boundary pixel.
+const Point CIRCLE2[13] =
+{
+    Point(0, 1), Point(0, 2), Point(1, 1),
+    Point(1, 0), Point(2, 0), Point(1, -1),
+    Point(0, -1), Point(0, -2), Point(-1, -1),
+    Point(-1, 0), Point(-2, 0), Point(-1, 1),
+    Point(0, 0)
+};
+
+// Stores all parameters in hfsSettings, builds the processing engines and
+// loads the two fixed 5-element weight vectors used by the merge stages
+// (w1 by getSegmentationI, w2 by getSegmentationII).
+HfsCore::HfsCore(int height, int width,
+    float segThresholdI, int minRegionSizeI, float segThresholdII, int minRegionSizeII,
+    float spatialWeight, int spixelSize, int numIter)
+{
+    // Merge-stage parameters.
+    hfsSettings.egbThresholdI = segThresholdI;
+    hfsSettings.minRegionSizeI = minRegionSizeI;
+    hfsSettings.egbThresholdII = segThresholdII;
+    hfsSettings.minRegionSizeII = minRegionSizeII;
+
+    // SLIC parameters; img_size uses x = width, y = height.
+    slic::slicSettings& slic_cfg = hfsSettings.slicSettings;
+    slic_cfg.img_size.y = height;
+    slic_cfg.img_size.x = width;
+    slic_cfg.coh_weight = spatialWeight;
+    slic_cfg.spixel_size = spixelSize;
+    slic_cfg.num_iters = numIter;
+
+    // Engines are sized from the settings written above.
+    constructEngine();
+
+    const float stage_one[] = { -0.0024710407f, 0.00608298f,
+        0.0047505307f, 0.0051097558f, 0.00089799752f };
+    const float stage_two[] = { -0.0040629096f, 0.010430338f,
+        0.0092625152f, 0.004976281f, 0.0037279273f };
+    w1.assign(stage_one, stage_one + sizeof(stage_one) / sizeof(stage_one[0]));
+    w2.assign(stage_two, stage_two + sizeof(stage_two) / sizeof(stage_two[0]));
+}
+
+// Creates the magnitude engine for the configured image size and, in CUDA
+// builds, the SLIC engine plus the two UChar4Image buffers.
+void HfsCore::constructEngine()
+{
+    const int img_h = hfsSettings.slicSettings.img_size.y;
+    const int img_w = hfsSettings.slicSettings.img_size.x;
+    mag_engine = Ptr<Magnitude>(new Magnitude(img_h, img_w));
+
+#ifdef _HFS_CUDA_ON_
+    Ptr<slic::engines::CoreEngine> engine(
+        new slic::engines::CoreEngine(hfsSettings.slicSettings));
+    gslic_engine = engine;
+
+    Ptr<UChar4Image> input_buf(new UChar4Image(hfsSettings.slicSettings.img_size));
+    in_img = input_buf;
+
+    Ptr<UChar4Image> output_buf(new UChar4Image(hfsSettings.slicSettings.img_size));
+    out_img = output_buf;
+#endif
+}
+
+// Replaces the SLIC engine with one built from the current slicSettings.
+// Does nothing in builds without _HFS_CUDA_ON_.
+void HfsCore::reconstructEngine()
+{
+#ifdef _HFS_CUDA_ON_
+    Ptr<slic::engines::CoreEngine> rebuilt(
+        new slic::engines::CoreEngine(hfsSettings.slicSettings));
+    gslic_engine = rebuilt;
+#endif
+}
+
+// Empty body: the engines and buffers are held through Ptr members.
+HfsCore::~HfsCore(){}
+
+// Copies a Vec3b image into the host buffer of `outimg`, row-major with a
+// stride of inimg.cols. Channel 0/1/2 of each source pixel is written to the
+// z/y/x field respectively; no other field of the destination is written.
+void HfsCore::loadImage( const Mat& inimg, Ptr<UChar4Image> outimg )
+{
+    Vector4u* dst_base = outimg->getCpuData();
+    const int rows = inimg.rows;
+    const int cols = inimg.cols;
+    for ( int row = 0; row < rows; row++ )
+    {
+        const Vec3b *src_row = inimg.ptr<Vec3b>(row);
+        Vector4u* dst_row = dst_base + row * cols;
+        for ( int col = 0; col < cols; col++ )
+        {
+            const Vec3b& px = src_row[col];
+            dst_row[col].x = px[2];
+            dst_row[col].y = px[1];
+            dst_row[col].z = px[0];
+        }
+    }
+}
+
+
+
+// Runs the CPU SLIC implementation and returns a CV_16U label image whose
+// labels are compacted to 0 .. num_css-1 (original label order is kept).
+// img3u   : input image handed to cSLIC::generate_superpixels.
+// num_css : output, number of distinct labels that actually occur.
+Mat HfsCore::getSLICIdxCpu(const Mat& img3u, int &num_css)
+{
+    const int _h = img3u.rows;
+    const int _w = img3u.cols;
+    const int _s = _h*_w;
+
+    slic::cSLIC cslic;
+    vector<int> idx_img = cslic.generate_superpixels(img3u,
+        hfsSettings.slicSettings.spixel_size, hfsSettings.slicSettings.coh_weight);
+    CV_Assert(idx_img.size() >= (size_t)_s);
+
+    // Size the relabel table from the labels that are really present. A
+    // table sized for 8x8 superpixels is too small whenever spixel_size is
+    // configured below 8, which made the counting loop write out of bounds.
+    int min_label = 0, max_label = -1;
+    for (int i = 0; i < _s; i++)
+    {
+        if (idx_img[i] > max_label) max_label = idx_img[i];
+        if (idx_img[i] < min_label) min_label = idx_img[i];
+    }
+    CV_Assert(min_label >= 0);
+    const int _max = max_label + 1;
+
+    num_css = 0;
+    vector<int> indexes(_max, 0);
+    // Histogram of labels, then turn every used label into its compact id.
+    for (int i = 0; i < _s; i++)
+        indexes[idx_img[i]]++;
+    for (int i = 0; i < _max; i++)
+        indexes[i] = (indexes[i] != 0) ? num_css++ : 0;
+    for (int i = 0; i < _s; i++)
+        idx_img[i] = indexes[idx_img[i]];
+
+    // The header below only wraps idx_img; convertTo to a different depth
+    // stores the result in freshly allocated memory owned by idx_mat.
+    Mat idx_mat(_h, _w, CV_32S, idx_img.data());
+    idx_mat.convertTo(idx_mat, CV_16U);
+    return idx_mat;
+}
+
+// Builds the 4-component colour feature of two colours: the absolute
+// per-channel differences followed by getEulerDistance(in1, in2).
+Vec4f HfsCore::getColorFeature( const Vec3f& in1, const Vec3f& in2 )
+{
+    const Vec3f delta = in1 - in2;
+    Vec4f feature;
+    for (int ch = 0; ch < 3; ch++)
+        feature[ch] = abs(delta[ch]);
+    feature[3] = getEulerDistance( in1, in2 );
+    return feature;
+}
+
+// Computes, for every pair of adjacent regions, the average boundary
+// magnitude in each of the given magnitude maps.
+// idx_mat   : CV_16U label image.
+// mag1us    : magnitude maps read as uchar.
+// num_css   : side length of the square output matrices.
+// bd_num    : output CV_32F matrix; (a, b) counts the boundary samples
+//             between regions a and b (symmetric).
+// gradients : output, one CV_32F matrix per entry of mag1us holding the
+//             accumulated maxima divided by bd_num.
+// Returns the number of non-zero entries of bd_num; since the matrix is
+// filled symmetrically, each adjacent pair is counted twice.
+int HfsCore::getAvgGradientBdry( const Mat& idx_mat,
+    const vector<Mat>& mag1us, int num_css, Mat& bd_num,
+    vector<Mat>& gradients )
+{
+    const int _h = idx_mat.rows;
+    const int _w = idx_mat.cols;
+    const int size = (int)mag1us.size();
+
+    gradients.resize(size);
+    for (int i = 0; i < size; i++)
+    {
+        gradients[i].create(num_css, num_css, CV_32F);
+        gradients[i] = Scalar::all(0);
+    }
+    bd_num.create(num_css, num_css, CV_32F);
+    bd_num = Scalar::all(0);
+
+    // Only interior pixels are visited, so the 4-neighbour reads stay in range.
+    for (int r = 1; r < _h - 1; r++)
+    for (int c = 1; c < _w - 1; c++)
+    {
+        ushort curr = idx_mat.at<ushort>(r, c);
+        // v[0..tmp) collects the distinct foreign labels among the 4 neighbours.
+        ushort pre, tmp = 0, v[4];
+        Point p1(c, r), p2;
+        for (int k = 0; k < 4; k++)
+        {
+            p2 = p1 + DIRECTION4[k];
+            pre = idx_mat.at<ushort>(p2);
+            if (pre != curr)
+            {
+                bool flag = true;
+                for (int t = 0; t < tmp; t++)
+                {
+                    if (v[t] == pre)
+                        flag = false;
+                }
+                if (flag)
+                    v[tmp++] = pre;
+            }
+        }
+
+        if (tmp > 0)
+        {
+            for (int n = 0; n < size; n++)
+            {
+                // Labels and magnitudes of the CIRCLE2 neighbourhood; offsets
+                // that fall outside the image get label -1 and magnitude 0.
+                int u[13]; float m[13];
+                for (int k = 0; k < 13; k++)
+                {
+                    p2 = p1 + CIRCLE2[k];
+                    if (!(p2.x >= 0 && p2.x < _w && p2.y >= 0 && p2.y < _h))
+                    {
+                        u[k] = -1, m[k] = 0;
+                        continue;
+                    }
+                    u[k] = idx_mat.at<ushort>(p2);
+                    m[k] = mag1us[n].at<uchar>(p2);
+                }
+
+                for (int t = 0; t < tmp; t++)
+                {
+                    // Largest magnitude among neighbourhood pixels that
+                    // belong to either of the two regions.
+                    float m_max = 0;
+                    for (int k = 0; k < 13; k++)
+                    {
+                        if ((u[k] == curr || u[k] == v[t]) && m[k] > m_max)
+                            m_max = m[k];
+                    }
+                    gradients[n].at<float>(curr, v[t]) += m_max;
+                    gradients[n].at<float>(v[t], curr) += m_max;
+                    // Count each boundary sample once, not once per
+                    // magnitude map; otherwise the averages below would be
+                    // divided by `size` times the real sample count.
+                    if (n == 0)
+                    {
+                        bd_num.at<float>(curr, v[t])++;
+                        bd_num.at<float>(v[t], curr)++;
+                    }
+                }
+            }
+        }
+    }
+
+    // Turn the accumulated sums into averages and count the used entries.
+    int num = 0;
+    for (int r_ = 0; r_ < num_css; r_++)
+    for (int c_ = 0; c_ < num_css; c_++)
+    {
+        if (abs(bd_num.at<float>(r_, c_)) > DOUBLE_EPS)
+        {
+            for (int i = 0; i < size; i++)
+                gradients[i].at<float>(r_, c_) /= bd_num.at<float>(r_, c_);
+            num++;
+        }
+    }
+    return num;
+}
+
+// First merge stage: builds a region adjacency graph over the labels of
+// idx_mat, weights its edges with w1 and merges regions through egb_merge.
+// lab3u    : image read as Vec3b; per-region mean colours are taken from it.
+// mag1u    : magnitude image read as uchar.
+// idx_mat  : CV_16U label image with labels in [0, num_css).
+// c        : threshold constant forwarded to egb_merge.
+// min_size : regions with fewer pixels than this are joined to a neighbour.
+// seg      : output CV_16U label image; labels start at 1.
+// num_css  : in: number of input labels; out: number of output regions + 1.
+void HfsCore::getSegmentationI( const Mat &lab3u, const Mat &mag1u,
+    const Mat &idx_mat, float c, int min_size, Mat &seg, int &num_css)
+{
+    const int _h = lab3u.rows;
+    const int _w = lab3u.cols;
+
+    // Per-region adjacency lists with, in parallel, the summed boundary
+    // magnitude and the number of boundary samples for each neighbour.
+    vector<vector<int> > adjacent(num_css), bdPixNum(num_css);
+    vector<vector<float> > bdGradient(num_css);
+    for (int r_ = 1; r_ < _h - 1; r_++)
+    for (int c_ = 1; c_ < _w - 1; c_++)
+    {
+        ushort curr = idx_mat.at<ushort>(r_, c_);
+        for (int k = 0; k < 4; k++)
+        {
+            Point p = Point(c_, r_) + DIRECTION4[k];
+            ushort pre = idx_mat.at<ushort>(p);
+            // Each pair is recorded only under its larger label.
+            if (curr > pre)
+            {
+                float maxG = max(mag1u.at<uchar>(p), mag1u.at<uchar>(r_, c_));
+                vector<int>::iterator iter =
+                    find(adjacent[curr].begin(), adjacent[curr].end(), pre);
+                if (iter == adjacent[curr].end())
+                {
+                    adjacent[curr].push_back(pre);
+                    bdGradient[curr].push_back(maxG);
+                    bdPixNum[curr].push_back(1);
+                }
+                else
+                {
+                    int temp = (int)(iter - adjacent[curr].begin());
+                    bdGradient[curr][temp] += maxG;
+                    bdPixNum[curr][temp] += 1;
+                }
+            }
+        }
+    }
+    // Sums -> averages.
+    for (size_t i = 0; i < (size_t)num_css; i++)
+    for (size_t j = 0; j < adjacent[i].size(); j++)
+        bdGradient[i][j] /= bdPixNum[i][j];
+
+    // Total number of graph edges.
+    int num = 0;
+    for (int i = 0; i < num_css; i++)
+        num += (int)adjacent[i].size();
+
+    // Pixel count and mean colour of every region.
+    vector<int> numR(num_css, 0);
+    vector<Vec3f> avg_color(num_css, Vec3f(0, 0, 0));
+    for (int r_ = 0; r_ < _h; r_++)
+    {
+        const ushort *iP = idx_mat.ptr<ushort>(r_);
+        const Vec3b *cP = lab3u.ptr<Vec3b>(r_);
+        for (int c_ = 0; c_ < _w; c_++)
+            avg_color[iP[c_]] += cP[c_], numR[iP[c_]]++;
+    }
+    for (int i = 0; i < num_css; i++)
+        avg_color[i] /= numR[i];
+
+    // Edge weight = w1 . (colour feature, average boundary magnitude).
+    vector<Edge> edges(num);
+    int index = 0;
+    for (int i = 0; i < num_css; i++)
+    {
+        int adjaNum = (int)adjacent[i].size();
+        for (int j = 0; j < adjaNum; j++)
+        {
+            edges[index].a = i;
+            edges[index].b = adjacent[i][j];
+            Vec4f fcolor =
+                getColorFeature(avg_color[i], avg_color[adjacent[i][j]]);
+
+            edges[index++].w =
+                fcolor[0] * w1[0] + fcolor[1] * w1[1]+
+                fcolor[2] * w1[2] + fcolor[3] * w1[3]+
+                bdGradient[i][j] * w1[4];
+        }
+    }
+    CV_Assert(num == index);
+
+    // Graph merge, followed by joining regions smaller than min_size.
+    Ptr<RegionSet> regions = egb_merge(num_css, num, edges, c, numR);
+    for (int i = 0; i < num; i++)
+    {
+        int a = regions->find(edges[i].a);
+        int b = regions->find(edges[i].b);
+        if ((a != b) && ((regions->numPix(a) < min_size) || (regions->numPix(b) < min_size)))
+            regions->join(a, b);
+    }
+
+    // Map every set representative to a compact label starting at 1.
+    int idx = 1; vector<int> reg2ind(num_css), indexes(num_css);
+    std::memset(indexes.data(), 0, num_css*sizeof(int));
+    for (int i = 0; i < num_css; i++)
+    {
+        int comp = regions->find(i);
+        if (!indexes[comp])
+            indexes[comp] = idx++;
+        reg2ind[i] = indexes[comp];
+    }
+    CV_Assert(regions->num_sets() == idx - 1);
+    seg.create(_h, _w, CV_16U);
+    for (int r_ = 0; r_ < _h; r_++)
+    {
+        ushort *sP = seg.ptr<ushort>(r_);
+        const ushort *iP = idx_mat.ptr<ushort>(r_);
+        for (int c_ = 0; c_ < _w; c_++)
+            sP[c_] = (ushort)reg2ind[iP[c_]];
+    }
+    // Labels run 1 .. idx-1, so this is the region count plus one.
+    num_css = idx;
+}
+
+// Second merge stage: builds the region adjacency graph from
+// getAvgGradientBdry, weights its edges with w2 and merges through egb_merge.
+// lab3u    : image read as Vec3b; per-region mean colours are taken from it.
+// mag1u    : magnitude image passed to getAvgGradientBdry.
+// idx_mat  : CV_16U label image; indices 1 .. num_css-1 are treated as
+//            regions here (index 0 is skipped by the loops below).
+// c        : threshold constant forwarded to egb_merge.
+// min_size : regions with fewer pixels than this are joined to a neighbour.
+// seg      : output CV_16U label image.
+// num_css  : in: number of graph vertices; out: idx - 1, where idx - 1 is
+//            the number of labels assigned in the final relabel loop.
+void HfsCore::getSegmentationII(
+    const Mat &lab3u, const Mat &mag1u, const Mat &idx_mat,
+    float c, int min_size, Mat &seg, int &num_css)
+{
+    const int _h = lab3u.rows;
+    const int _w = lab3u.cols;
+
+    vector<Mat> mag1us, gradients;
+    Mat bd_num;
+    mag1us.push_back(mag1u);
+    int num = getAvgGradientBdry(idx_mat, mag1us,
+        num_css, bd_num, gradients);
+    // bd_num is symmetric, so every adjacent pair was counted twice.
+    CV_Assert(num % 2 == 0);
+    num /= 2;
+
+    // Pixel count and mean colour of every region.
+    vector<int> num_pix(num_css, 0);
+    vector<Vec3f> avg_color(num_css, Vec3f(0, 0, 0));
+    for (int r_ = 0; r_ < _h; r_++)
+    {
+        const ushort *idx_ptr = idx_mat.ptr<ushort>(r_);
+        const Vec3b *clr_ptr = lab3u.ptr<Vec3b>(r_);
+        for (int c_ = 0; c_ < _w; c_++)
+            num_pix[idx_ptr[c_]]++, avg_color[idx_ptr[c_]] += clr_ptr[c_];
+    }
+    for (int i = 1; i < num_css; i++)
+        avg_color[i] /= num_pix[i];
+
+    // One edge per adjacent pair, taken from the lower triangle of bd_num.
+    vector<Edge> edges(num);
+    int index = 0;
+    for (int r_ = 0; r_ < num_css; r_++)
+    for (int c_ = 0; c_ < r_; c_++)
+    {
+        // bd_num is CV_32F, so it must be read as float. The predicate is
+        // the same one getAvgGradientBdry uses when it counts `num`.
+        if (!(abs(bd_num.at<float>(r_, c_)) > DOUBLE_EPS)) continue;
+        edges[index].a = r_;
+        edges[index].b = c_;
+        Vec4f fcolor = getColorFeature(avg_color[r_], avg_color[c_]);
+        edges[index].w =
+            fcolor[0] * w2[0] + fcolor[1] * w2[1]+
+            fcolor[2] * w2[2] + fcolor[3] * w2[3];
+        edges[index].w += gradients[0].at<float>(r_, c_)*w2[4];
+        index++;
+    }
+    CV_Assert(num == index);
+
+    // Graph merge, followed by joining regions smaller than min_size.
+    Ptr<RegionSet> regions = egb_merge(num_css, num, edges, c, num_pix);
+    for (int i = 0; i < num; i++)
+    {
+        int a = regions->find(edges[i].a);
+        int b = regions->find(edges[i].b);
+        if ((a != b) && ((regions->numPix(a) < min_size)
+            || (regions->numPix(b) < min_size)))
+            regions->join(a, b);
+    }
+
+    // Compact labels starting at 1; vertex 0 is skipped and keeps label 0.
+    int idx = 1;
+    vector<int> reg2ind(num_css), indexes(num_css, 0);
+    for (int i = 1; i < num_css; i++)
+    {
+        int comp = regions->find(i);
+        if (!indexes[comp])
+            indexes[comp] = idx++;
+        reg2ind[i] = indexes[comp];
+    }
+    CV_Assert(regions->num_sets() == idx);
+    seg.create(_h, _w, CV_16U);
+    for (int r_ = 0; r_ < _h; r_++)
+    {
+        ushort *sP = seg.ptr<ushort>(r_);
+        const ushort *iP = idx_mat.ptr<ushort>(r_);
+        for (int c_ = 0; c_ < _w; c_++)
+            sP[c_] = (ushort)reg2ind[iP[c_]];
+    }
+
+    num_css = idx - 1;
+}
+
+// Paints every region with the mean colour of its pixels.
+// seg     : CV_16U label image; label L is stored at index L - 1, so the
+//           labels are expected to lie in 1 .. num_css.
+// img3u   : source image read as Vec3b.
+// num_css : number of regions.
+// show    : output, created with the size and type of img3u.
+void HfsCore::drawSegmentationRes( const Mat& seg,
+    const Mat& img3u, int num_css, Mat &show )
+{
+    const int rows = img3u.rows;
+    const int cols = img3u.cols;
+
+    // Accumulate colour sums and pixel counts per region.
+    vector<Vec3f> mean_color(num_css, Vec3f(0, 0, 0));
+    vector<int> pixel_count(num_css, 0);
+    for (int y = 0; y < rows; y++)
+    {
+        const ushort* label_row = seg.ptr<ushort>(y);
+        const Vec3b* pixel_row = img3u.ptr<Vec3b>(y);
+        for (int x = 0; x < cols; x++)
+        {
+            const int region = label_row[x] - 1;
+            mean_color[region] += pixel_row[x];
+            pixel_count[region]++;
+        }
+    }
+    for (int region = 0; region < num_css; region++)
+        mean_color[region] /= pixel_count[region];
+
+    // Write the mean colour of each pixel's region into the output.
+    show.create(img3u.size(), img3u.type());
+    for (int y = 0; y < rows; y++)
+    {
+        const ushort* label_row = seg.ptr<ushort>(y);
+        Vec3b *out_row = show.ptr<Vec3b>(y);
+        for (int x = 0; x < cols; x++)
+            out_row[x] = mean_color[label_row[x] - 1];
+    }
+}
+
+// Full CPU pipeline: SLIC labels, Lab conversion, magnitude map, then the
+// two merge stages. Writes the final label image to `seg` and returns the
+// region count reported by getSegmentationII.
+int HfsCore::processImageCpu(const Mat &img3u, Mat &seg)
+{
+    int num_css = 0;
+    Mat slic_labels = getSLICIdxCpu(img3u, num_css);
+
+    Mat lab3u;
+    cv::cvtColor(img3u, lab3u, COLOR_BGR2Lab);
+
+    Mat mag1u;
+    mag_engine->processImgCpu(img3u, mag1u);
+
+    Mat stage_one;
+    getSegmentationI(lab3u, mag1u, slic_labels,
+        hfsSettings.egbThresholdI, hfsSettings.minRegionSizeI,
+        stage_one, num_css);
+    getSegmentationII(lab3u, mag1u, stage_one,
+        hfsSettings.egbThresholdII, hfsSettings.minRegionSizeII,
+        seg, num_css);
+    return num_css;
+}
+
+// Full pipeline using the GPU SLIC and magnitude stages. In builds without
+// _HFS_CUDA_ON_ it simply forwards to processImageCpu. Writes the final
+// label image to `seg` and returns the region count.
+int HfsCore::processImageGpu(const Mat &img3u, Mat &seg)
+{
+#ifdef _HFS_CUDA_ON_
+    int num_css = 0;
+    Mat slic_labels = getSLICIdxGpu(img3u, num_css);
+
+    Mat lab3u;
+    cv::cvtColor(img3u, lab3u, COLOR_BGR2Lab);
+
+    Mat mag1u;
+    mag_engine->processImgGpu(img3u, mag1u);
+
+    Mat stage_one;
+    getSegmentationI(lab3u, mag1u, slic_labels,
+        hfsSettings.egbThresholdI, hfsSettings.minRegionSizeI,
+        stage_one, num_css);
+    getSegmentationII(lab3u, mag1u, stage_one,
+        hfsSettings.egbThresholdII, hfsSettings.minRegionSizeII,
+        seg, num_css);
+    return num_css;
+#else
+    return processImageCpu(img3u, seg);
+#endif
+}
+
+#ifdef _HFS_CUDA_ON_
+// Runs the GPU SLIC engine and returns a CV_16U label image whose labels are
+// compacted to 0 .. num_css-1 (original label order is kept).
+// img3u   : input image, copied into in_img through loadImage.
+// num_css : output, number of distinct labels that actually occur.
+// Note: the relabelling is done in place on the engine's host result buffer.
+Mat HfsCore::getSLICIdxGpu(const Mat& img3u, int &num_css)
+{
+    const int _h = img3u.rows;
+    const int _w = img3u.cols;
+    const int _s = _h*_w;
+
+    loadImage(img3u, in_img);
+    gslic_engine->setImageSize(img3u.cols, img3u.rows);
+
+    gslic_engine->processFrame(in_img);
+    const IntImage *idx_img = gslic_engine->getSegRes();
+    int* idx_img_ptr = (int*)idx_img->getCpuData();
+
+    // Size the relabel table from the labels that are really present. A
+    // table sized for 8x8 superpixels is too small whenever spixel_size is
+    // configured below 8, which made the counting loop write out of bounds.
+    int min_label = 0, max_label = -1;
+    for (int i = 0; i < _s; i++)
+    {
+        if (idx_img_ptr[i] > max_label) max_label = idx_img_ptr[i];
+        if (idx_img_ptr[i] < min_label) min_label = idx_img_ptr[i];
+    }
+    CV_Assert(min_label >= 0);
+    const int _max = max_label + 1;
+
+    num_css = 0;
+    vector<int> indexes(_max, 0);
+    // Histogram of labels, then turn every used label into its compact id.
+    for (int i = 0; i < _s; i++)
+        indexes[idx_img_ptr[i]]++;
+    for (int i = 0; i < _max; i++)
+        indexes[i] = (indexes[i] != 0) ? num_css++ : 0;
+    for (int i = 0; i < _s; i++)
+        idx_img_ptr[i] = indexes[idx_img_ptr[i]];
+
+    // The header below only wraps the engine buffer; convertTo to a
+    // different depth stores the result in memory owned by idx_mat.
+    Mat idx_mat(_h, _w, CV_32S, idx_img_ptr);
+    idx_mat.convertTo(idx_mat, CV_16U);
+    return idx_mat;
+}
+
+#endif
+
+}}
--- a/modules/hfs/src/hfs_core.hpp
+++ b/modules/hfs/src/hfs_core.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef _OPENCV_HFS_CORE_HPP_
+#define _OPENCV_HFS_CORE_HPP_
+
+#include "opencv2/core.hpp"
+
+#include "magnitude/magnitude.hpp"
+#include "merge/merge.hpp"
+
+#include "or_utils/or_types.hpp"
+#include "slic/slic.hpp"
+
+#define DOUBLE_EPS 1E-6
+
+namespace cv { namespace hfs {
+
+
+// All tunable parameters of the pipeline, filled by the HfsCore constructor.
+struct HfsSettings
+{
+    // Threshold constant and minimum region size passed to getSegmentationI.
+    float egbThresholdI;
+    int minRegionSizeI;
+    // Threshold constant and minimum region size passed to getSegmentationII.
+    float egbThresholdII;
+    int minRegionSizeII;
+    // SLIC parameters (img_size, coh_weight, spixel_size, num_iters).
+    cv::hfs::slic::slicSettings slicSettings;
+};
+
+// Implementation core of the HFS segmentation: holds the settings, the
+// processing engines and the individual pipeline stages.
+class HfsCore
+{
+public:
+    // Stores the parameters in hfsSettings and builds the engines.
+    HfsCore(int height, int width,
+        float segThresholdI, int minRegionSizeI,
+        float segThresholdII, int minRegionSizeII,
+        float spatialWeight, int spixelSize, int numIter);
+    ~HfsCore();
+
+    // Copies a Vec3b image into the host buffer of outimg.
+    void loadImage( const cv::Mat& inimg, Ptr<UChar4Image> outimg );
+    // Euclidean (L2) distance between two 3-component vectors.
+    inline float getEulerDistance( cv::Vec3f in1, cv::Vec3f in2 )
+    {
+        cv::Vec3f diff = in1 - in2;
+        return sqrt(diff.dot(diff));
+    }
+
+    // Absolute per-channel differences followed by getEulerDistance.
+    cv::Vec4f getColorFeature( const cv::Vec3f& in1, const cv::Vec3f& in2 );
+    // Average boundary magnitude between adjacent regions; returns the
+    // number of non-zero entries of bd_num.
+    int getAvgGradientBdry( const cv::Mat& idx_mat,
+        const std::vector<cv::Mat> &mag1u, int num_css, cv::Mat &bd_num,
+        std::vector<cv::Mat> &gradients );
+
+    // First and second merge stage; num_css is both input and output.
+    void getSegmentationI( const cv::Mat& lab3u,
+        const cv::Mat& mag1u, const cv::Mat& idx_mat,
+        float c, int min_size, cv::Mat& seg, int& num_css);
+    void getSegmentationII(
+        const cv::Mat& lab3u, const cv::Mat& mag1u, const cv::Mat& idx_mat,
+        float c, int min_size, cv::Mat& seg, int &num_css );
+    // Paints every region of seg with its mean colour from img3u.
+    void drawSegmentationRes( const cv::Mat& seg, const cv::Mat& img3u,
+                              int num_css, cv::Mat& show );
+
+    // CPU SLIC labels, compacted to 0 .. num_css-1.
+    cv::Mat getSLICIdxCpu(const cv::Mat& img3u, int &num_css);
+    // Full pipelines; both return the final region count.
+    int processImageCpu( const cv::Mat& img3u, cv::Mat& seg );
+    int processImageGpu(const cv::Mat& img3u, cv::Mat& seg);
+
+    // Build all engines / rebuild only the SLIC engine from hfsSettings.
+    void constructEngine();
+    void reconstructEngine();
+
+public:
+    HfsSettings hfsSettings;
+
+private:
+    // Edge-weight coefficients of the first (w1) and second (w2) merge stage.
+    std::vector<float> w1, w2;
+    Ptr<Magnitude> mag_engine;
+
+#ifdef _HFS_CUDA_ON_
+public:
+    // GPU SLIC labels, compacted to 0 .. num_css-1.
+    cv::Mat getSLICIdxGpu(const cv::Mat& img3u, int &num_css);
+private:
+    cv::Ptr<UChar4Image> in_img, out_img;
+    cv::Ptr<slic::engines::CoreEngine> gslic_engine;
+#endif
+
+};
+
+}}
+
+#endif
--- a/modules/hfs/src/magnitude/magnitude.cpp
+++ b/modules/hfs/src/magnitude/magnitude.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#include "../precomp.hpp"
+#include "magnitude.hpp"
+
+namespace cv { namespace hfs {
+
+// Allocates the working images for frames of the given size.
+// height, width : image rows and columns.
+Magnitude::Magnitude(int height, int width)
+{
+    // Everywhere else in this class x is the column count and y the row
+    // count (img_size.x = cols, idx = y * img_size.x + x). Set the fields by
+    // name so the buffers and img_size follow that convention instead of
+    // storing height in x and width in y.
+    Vector2i size;
+    size.x = width;
+    size.y = height;
+
+    delta_x = Ptr<IntImage>(new IntImage(size));
+    delta_y = Ptr<IntImage>(new IntImage(size));
+    mag = Ptr<IntImage>(new IntImage(size));
+    gray_img = Ptr<UCharImage>(new UCharImage(size));
+    nms_mag = Ptr<UCharImage>(new UCharImage(size));
+    img_size = size;
+}
+
+// Empty body: all image buffers are held through Ptr members.
+Magnitude::~Magnitude()
+{
+}
+
+// Copies a single-channel uchar Mat into the host buffer of `outimg`,
+// row-major with a stride of inimg.cols.
+void Magnitude::loadImage(const Mat& inimg, Ptr<UCharImage> outimg)
+{
+    const int rows = inimg.rows;
+    const int cols = inimg.cols;
+    uchar* dst_base = outimg->getCpuData();
+    for (int row = 0; row < rows; row++)
+    {
+        const uchar *src_row = inimg.ptr<uchar>(row);
+        uchar *dst_row = dst_base + row * cols;
+        for (int col = 0; col < cols; col++)
+            dst_row[col] = src_row[col];
+    }
+}
+
+// Copies the host buffer of `inimg` (row-major, stride outimg.cols) into an
+// already allocated single-channel uchar Mat.
+void Magnitude::loadImage(const Ptr<UCharImage> inimg, Mat& outimg)
+{
+    const int rows = outimg.rows;
+    const int cols = outimg.cols;
+    const uchar* src_base = inimg->getCpuData();
+    for (int row = 0; row < rows; row++)
+    {
+        uchar *dst_row = outimg.ptr<uchar>(row);
+        const uchar *src_row = src_base + row * cols;
+        for (int col = 0; col < cols; col++)
+            dst_row[col] = src_row[col];
+    }
+}
+
+// Fills delta_x, delta_y and mag from gray_img for an img_size.x by
+// img_size.y frame. Interior pixels use the difference of the two opposite
+// neighbours; border pixels use the difference between the pixel itself and
+// its single in-range neighbour. mag is the rounded Euclidean norm of
+// (dx, dy).
+void Magnitude::derrivativeXYCpu()
+{
+    const uchar *gray_ptr = gray_img->getCpuData();
+    int *dx_ptr = delta_x->getCpuData();
+    int *dy_ptr = delta_y->getCpuData();
+    int *mag_ptr = mag->getCpuData();
+
+    const int w = img_size.x;
+    const int h = img_size.y;
+    for (int y = 0; y < h; ++y) {
+        for (int x = 0; x < w; ++x) {
+            const int idx = y * w + x;
+
+            // Clamp each neighbour to the pixel itself at the border. This
+            // matches the one-sided differences for w, h >= 2 and avoids
+            // reading past the row / buffer when w == 1 or h == 1.
+            const int left  = (x > 0)     ? idx - 1 : idx;
+            const int right = (x < w - 1) ? idx + 1 : idx;
+            const int up    = (y > 0)     ? idx - w : idx;
+            const int down  = (y < h - 1) ? idx + w : idx;
+
+            const int dx = gray_ptr[right] - gray_ptr[left];
+            const int dy = gray_ptr[down] - gray_ptr[up];
+            dx_ptr[idx] = dx;
+            dy_ptr[idx] = dy;
+
+            // +0.5 then truncation rounds to the nearest integer.
+            mag_ptr[idx] = (int)(0.5 + sqrt((double)(dx * dx + dy * dy)));
+        }
+    }
+}
+
+// Fills nms_mag from delta_x, delta_y and mag.
+// A pixel gets 0 when it lies on the image border, when its magnitude is 0,
+// or when the test at the end of the loop body fails; otherwise it gets its
+// magnitude clamped to [0, 255].
+// NOTE(review): the eight branches look like the usual non-maximum
+// suppression that compares the pixel with values interpolated on both
+// sides along the gradient direction - confirm against the reference
+// implementation before changing any sign or comparison.
+void Magnitude::nonMaxSuppCpu()
+{
+    int *dx_ptr = delta_x->getCpuData();
+    int *dy_ptr = delta_y->getCpuData();
+    int *mag_ptr = mag->getCpuData();
+    uchar *nms_ptr = nms_mag->getCpuData();
+
+    for (int y = 0; y < img_size.y; ++y) {
+        for (int x = 0; x < img_size.x; ++x) {
+            int idx = y*img_size.x + x;
+            // Border pixels have no full 3x3 neighbourhood.
+            if (x == 0 || x == img_size.x - 1 || y == 0 || y == img_size.y - 1) {
+                nms_ptr[idx] = 0;
+                continue;
+            }
+            int m00, gx, gy, z1, z2;
+            double mag1, mag2, xprep, yprep;
+
+            m00 = mag_ptr[idx];
+            if (m00 == 0) {
+                nms_ptr[idx] = 0;
+                continue;
+            }
+            else {
+                // xprep = -gx / m00, yprep = gy / m00.
+                xprep = -(gx = dx_ptr[idx]) / ((double)m00);
+                yprep = (gy = dy_ptr[idx]) / ((double)m00);
+            }
+
+            // Branch on the signs of gx, gy and on which of the two is
+            // larger in absolute value; each branch reads two neighbours on
+            // one side (mag1) and the two opposite neighbours (mag2).
+            if (gx >= 0) {
+                if (gy >= 0) {
+                    if (gx >= gy) {
+                        z1 = mag_ptr[idx - 1];
+                        z2 = mag_ptr[idx - img_size.x - 1];
+                        mag1 = (m00 - z1)*xprep + (z2 - z1)*yprep;
+
+                        z1 = mag_ptr[idx + 1];
+                        z2 = mag_ptr[idx + img_size.x + 1];
+                        mag2 = (m00 - z1)*xprep + (z2 - z1)*yprep;
+                    }
+                    else {
+                        z1 = mag_ptr[idx - img_size.x];
+                        z2 = mag_ptr[idx - img_size.x - 1];
+                        mag1 = (z1 - z2)*xprep + (z1 - m00)*yprep;
+
+                        z1 = mag_ptr[idx + img_size.x];
+                        z2 = mag_ptr[idx + img_size.x + 1];
+                        mag2 = (z1 - z2)*xprep + (z1 - m00)*yprep;
+                    }
+                }
+                else {
+                    if (gx >= -gy) {
+                        z1 = mag_ptr[idx - 1];
+                        z2 = mag_ptr[idx + img_size.x - 1];
+                        mag1 = (m00 - z1)*xprep + (z1 - z2)*yprep;
+
+                        z1 = mag_ptr[idx + 1];
+                        z2 = mag_ptr[idx - img_size.x + 1];
+                        mag2 = (m00 - z1)*xprep + (z1 - z2)*yprep;
+                    }
+                    else {
+                        z1 = mag_ptr[idx + img_size.x];
+                        z2 = mag_ptr[idx + img_size.x - 1];
+                        mag1 = (z1 - z2)*xprep + (m00 - z1)*yprep;
+
+                        z1 = mag_ptr[idx - img_size.x];
+                        z2 = mag_ptr[idx - img_size.x + 1];
+                        mag2 = (z1 - z2)*xprep + (m00 - z1)*yprep;
+                    }
+                }
+            }
+            else {
+                if (gy >= 0) {
+                    if (-gx >= gy) {
+                        z1 = mag_ptr[idx + 1];
+                        z2 = mag_ptr[idx - img_size.x + 1];
+                        mag1 = (z1 - m00)*xprep + (z2 - z1)*yprep;
+
+                        z1 = mag_ptr[idx - 1];
+                        z2 = mag_ptr[idx + img_size.x - 1];
+                        mag2 = (z1 - m00)*xprep + (z2 - z1)*yprep;
+                    }
+                    else {
+                        z1 = mag_ptr[idx - img_size.x];
+                        z2 = mag_ptr[idx - img_size.x + 1];
+                        mag1 = (z2 - z1)*xprep + (z1 - m00)*yprep;
+
+                        z1 = mag_ptr[idx + img_size.x];
+                        z2 = mag_ptr[idx + img_size.x - 1];
+                        mag2 = (z2 - z1)*xprep + (z1 - m00)*yprep;
+                    }
+                }
+                else {
+                    // Unlike the other three quadrants this comparison is
+                    // strict.
+                    if (-gx > -gy) {
+                        z1 = mag_ptr[idx + 1];
+                        z2 = mag_ptr[idx + img_size.x + 1];
+                        mag1 = (z1 - m00)*xprep + (z1 - z2)*yprep;
+
+                        z1 = mag_ptr[idx - 1];
+                        z2 = mag_ptr[idx - img_size.x - 1];
+                        mag2 = (z1 - m00)*xprep + (z1 - z2)*yprep;
+                    }
+                    else {
+                        z1 = mag_ptr[idx + img_size.x];
+                        z2 = mag_ptr[idx + img_size.x + 1];
+                        mag1 = (z2 - z1)*xprep + (m00 - z1)*yprep;
+
+                        z1 = mag_ptr[idx - img_size.x];
+                        z2 = mag_ptr[idx - img_size.x - 1];
+                        mag2 = (z2 - z1)*xprep + (m00 - z1)*yprep;
+                    }
+                }
+            }
+
+            // Keep the pixel only when mag1 <= 0 and mag2 < 0.
+            if (mag1 > 0 || mag2 >= 0)
+                nms_ptr[idx] = 0;
+            else
+                nms_ptr[idx] = (uchar)min(max(m00, 0), 255);
+        }
+    }
+}
+
+// CPU variant of the edge-magnitude pipeline.
+// bgr3u : input colour image (converted with COLOR_BGR2GRAY).
+// mag1u : output, (re)created as CV_8UC1 with the size of bgr3u and filled
+//         from nms_mag.
+void Magnitude::processImgCpu(const Mat &bgr3u, Mat &mag1u)
+{
+    // Grayscale, then 7x7 Gaussian blur (sigma 1).
+    Mat gray_host, smoothed;
+    cvtColor(bgr3u, gray_host, COLOR_BGR2GRAY);
+    GaussianBlur(gray_host, smoothed, Size(7, 7), 1, 1);
+
+    // The two stages below read img_size, so refresh it for this frame.
+    img_size.x = bgr3u.cols;
+    img_size.y = bgr3u.rows;
+
+    loadImage(smoothed, gray_img);
+    derrivativeXYCpu();
+    nonMaxSuppCpu();
+
+    mag1u.create(bgr3u.rows, bgr3u.cols, CV_8UC1);
+    loadImage(nms_mag, mag1u);
+}
+
+}}
--- a/modules/hfs/src/magnitude/magnitude.hpp
+++ b/modules/hfs/src/magnitude/magnitude.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef _OPENCV_MAGNITUDE_HPP_
+#define _OPENCV_MAGNITUDE_HPP_
+#include "../or_utils/or_types.hpp"
+#include "opencv2/core.hpp"
+
+//------------------------------------------------------
+//
+// Compile time GPU Settings
+//
+//------------------------------------------------------
+#ifndef HFS_BLOCK_DIM
+#define HFS_BLOCK_DIM 16
+#endif
+
+namespace cv { namespace hfs {
+
+// Computes a non-maximum-suppressed gradient magnitude map of an image,
+// with a CPU and a GPU code path working on the same member buffers.
+class Magnitude
+{
+    // Horizontal / vertical differences and their magnitude.
+    cv::Ptr<IntImage> delta_x, delta_y, mag;
+    // Blurred grayscale input and the suppressed magnitude output.
+    cv::Ptr<UCharImage> gray_img, nms_mag;
+    // Size of the frame being processed (x = columns, y = rows).
+    Vector2i img_size;
+
+public:
+    Magnitude(int height, int width);
+    ~Magnitude();
+
+    // Copy between a cv::Mat and the host buffer of a UCharImage.
+    void loadImage(const cv::Mat& inimg, cv::Ptr<UCharImage> outimg);
+    void loadImage(const cv::Ptr<UCharImage> inimg, cv::Mat& outimg);
+
+    // The two stages on the host buffers.
+    void derrivativeXYCpu();
+    void nonMaxSuppCpu();
+
+    // The same two stages launched as device kernels.
+    void derrivativeXYGpu();
+    void nonMaxSuppGpu();
+
+    // Full pipeline: grayscale, blur, derivatives, suppression.
+    void processImgCpu(const cv::Mat& bgr3u, cv::Mat& mag1u);
+    void processImgGpu(const cv::Mat& bgr3u, cv::Mat& mag1u);
+};
+
+}}
+
+#endif
--- a/modules/hfs/src/merge/merge.cpp
+++ b/modules/hfs/src/merge/merge.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "merge.hpp"
+
+using namespace std;
+
+namespace cv { namespace hfs {
+
+// Graph-based merge: sorts `edges` by weight (in place) and joins the two
+// end regions of an edge whenever its weight does not exceed the current
+// threshold of either region.
+// num_vertices : number of initial regions.
+// num_edges    : number of entries of `edges` to process.
+// c            : initial threshold of every region; after a join the merged
+//                region's threshold becomes w + c / mergedSize.
+// size         : per-region sizes handed to the RegionSet constructor.
+// Returns the resulting disjoint-set structure.
+Ptr<RegionSet> egb_merge(int num_vertices, int num_edges,
+    vector<Edge> &edges, float c, vector<int> size)
+{
+    sort(edges.begin(), edges.end());
+
+    Ptr<RegionSet> regions(new RegionSet(num_vertices, size));
+
+    vector<float> threshold(num_vertices, c);
+
+    for (int e = 0; e < num_edges; ++e) {
+        const Edge &edge = edges[e];
+
+        int root_a = regions->find(edge.a);
+        const int root_b = regions->find(edge.b);
+        if (root_a == root_b)
+            continue;
+
+        const bool mergeable =
+            (edge.w <= threshold[root_a]) && (edge.w <= threshold[root_b]);
+        if (!mergeable)
+            continue;
+
+        regions->join(root_a, root_b);
+        // After the join, look up the representative of the merged set.
+        root_a = regions->find(root_a);
+        threshold[root_a] = edge.w + c / regions->mergedSize(root_a);
+    }
+    return regions;
+}
+
+}}
--- a/modules/hfs/src/merge/merge.hpp
+++ b/modules/hfs/src/merge/merge.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef _OPENCV_EGB_SEGMENT_HPP_
+#define _OPENCV_EGB_SEGMENT_HPP_
+
+
+#include <vector>
+#include <opencv2/core.hpp>
+
+
+namespace cv { namespace hfs {
+
+// Per-element bookkeeping of the disjoint-set forest kept by RegionSet.
+struct Region
+{
+    int rank;       // union-by-rank value of the tree rooted at this element
+    int p;          // parent index; an element is a root when p equals its own index
+    int mergedSize; // number of initial elements in the set (maintained at the root)
+    int numPix;     // sum of the initial per-element sizes in the set (maintained at the root)
+};
+
+// Weighted edge between the elements a and b. Edges compare by weight only,
+// so sorting a range of edges orders them from lightest to heaviest.
+class Edge
+{
+public:
+    float w;
+    int a, b;
+
+    bool operator<(const Edge &other) const
+    {
+        return this->w < other.w;
+    }
+};
+
+// Disjoint-set (union-find) structure over `elements` regions that also
+// tracks, per set, how many initial regions and how many pixels it contains.
+class RegionSet
+{
+public:
+    // elements  number of initial single-element sets
+    // size      initial numPix of every element; must hold at least
+    //           `elements` entries. Taken by const reference so constructing
+    //           the set no longer copies the whole vector.
+    RegionSet(int elements, const std::vector<int>& size)
+        : elts(elements), num(elements)
+    {
+        for (int i = 0; i < elements; i++)
+        {
+            elts[i].rank = 0;
+            elts[i].mergedSize = 1;
+            elts[i].numPix = size[i];
+            elts[i].p = i;
+        }
+    }
+
+    ~RegionSet() {}
+
+    // Returns the root of the set containing x. Every element visited on the
+    // way is re-parented directly to the root (full path compression), which
+    // keeps later lookups short without changing any returned value.
+    int find(int x)
+    {
+        int root = x;
+        while (root != elts[root].p)
+            root = elts[root].p;
+
+        while (elts[x].p != root)
+        {
+            const int next = elts[x].p;
+            elts[x].p = root;
+            x = next;
+        }
+        return root;
+    }
+
+    // Unites the sets rooted at x and y. Both arguments must be distinct
+    // roots (as returned by find()); the root with the larger rank survives
+    // and accumulates mergedSize and numPix of the other one.
+    void join(int x, int y)
+    {
+        if (elts[x].rank > elts[y].rank)
+        {
+            elts[y].p = x;
+            elts[x].mergedSize += elts[y].mergedSize;
+            elts[x].numPix += elts[y].numPix;
+        }
+        else
+        {
+            elts[x].p = y;
+            elts[y].mergedSize += elts[x].mergedSize;
+            elts[y].numPix += elts[x].numPix;
+            if (elts[x].rank == elts[y].rank)
+                elts[y].rank++;
+        }
+        num--;
+    }
+
+    // Accumulated values of element x; they describe the whole set only when
+    // x is a root.
+    int mergedSize(int x) const { return elts[x].mergedSize; }
+    int numPix(int x) const { return elts[x].numPix; }
+
+    // Number of disjoint sets currently in the structure.
+    int num_sets() const { return num; }
+
+private:
+    std::vector<Region> elts;
+    int num;
+};
+
+Ptr<RegionSet> egb_merge(int num_vertices, int num_edges, // merges regions along weighted edges and returns the resulting RegionSet
+    std::vector<Edge>& edges, float c, std::vector<int> size); // edges is sorted in place by the definition in merge.cpp; c is the merge-threshold constant
+
+}}
+
+#endif
--- a/modules/hfs/src/or_utils/or_image.hpp
+++ b/modules/hfs/src/or_utils/or_image.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef _OPENCV_OR_IMAGE_HPP_
+#define _OPENCV_OR_IMAGE_HPP_
+
+
+#include "or_memory_block.hpp"
+namespace cv { namespace hfs { namespace orutils {
+
+// 2-D view on a MemoryBlock: a block of noDims.x * noDims.y elements of T
+// together with the dimensions it was created for.
+template <typename T>
+class Image : public MemoryBlock < T >
+{
+public:
+    // Dimensions the image was created with.
+    Vector2<int> noDims;
+
+    // Allocates storage for noDims_.x * noDims_.y elements.
+    Image( Vector2<int> noDims_ )
+        : MemoryBlock<T>( noDims_.x * noDims_.y ), noDims( noDims_ )
+    {
+    }
+
+    // Declared without a definition in this header.
+    Image(const Image&);
+    Image& operator=(const Image&);
+};
+
+}}}
+
+#endif
--- a/modules/hfs/src/or_utils/or_memory_block.hpp
+++ b/modules/hfs/src/or_utils/or_memory_block.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef _OPENCV_OR_MEMORY_BLOCK_HPP_
+#define _OPENCV_OR_MEMORY_BLOCK_HPP_
+
+#ifdef _HFS_CUDA_ON_
+#include "opencv2/core/cuda/common.hpp"
+#endif
+
+#include "stddef.h"
+
+namespace cv { namespace hfs { namespace orutils {
+
+// Flat buffer of dataSize elements of T. A host copy always exists; when the
+// module is built with _HFS_CUDA_ON_ the host copy is allocated with
+// cudaMallocHost and a device copy of the same size is kept next to it.
+// The copy constructor and the assignment operator are declared but have no
+// definition in this header.
+template <typename T>
+class MemoryBlock
+{
+protected:
+    T* data_cpu;
+
+#ifdef _HFS_CUDA_ON_
+    T* data_cuda;
+#endif
+
+public:
+
+    // Number of elements (not bytes) currently allocated; 0 after Free().
+    size_t dataSize;
+
+    inline T* getCpuData()
+    {
+        return data_cpu;
+    }
+
+    inline const T* getCpuData() const
+    {
+        return data_cpu;
+    }
+
+    // Allocates dataSize_ elements and zero-fills them.
+    MemoryBlock(size_t dataSize_)
+        : data_cpu(NULL), dataSize(0)
+    {
+#ifdef _HFS_CUDA_ON_
+        data_cuda = NULL;
+#endif
+        Allocate(dataSize_);
+        clear();
+    }
+
+    // Sets every byte of the buffer(s) to defaultValue (memset semantics).
+    // Does nothing when no elements are allocated.
+    void clear(unsigned char defaultValue = 0)
+    {
+        if (dataSize == 0)
+            return;
+        memset(data_cpu, defaultValue, dataSize * sizeof(T));
+#ifdef _HFS_CUDA_ON_
+        cudaSafeCall(cudaMemset(data_cuda,
+                defaultValue, dataSize * sizeof(T)));
+#endif
+    }
+
+#ifdef _HFS_CUDA_ON_
+    enum MemoryCopyDirection
+    {
+        CPU_TO_CPU, CPU_TO_CUDA, CUDA_TO_CPU, CUDA_TO_CUDA
+    };
+    inline const T* getGpuData() const
+    {
+        return data_cuda;
+    }
+
+    inline T* getGpuData()
+    {
+        return data_cuda;
+    }
+    // Copies the whole host buffer to the device buffer.
+    void updateDeviceFromHost()
+    {
+        cudaSafeCall(cudaMemcpy(data_cuda,
+                data_cpu, dataSize * sizeof(T), cudaMemcpyHostToDevice));
+    }
+    // Copies the whole device buffer to the host buffer.
+    void updateHostFromDevice()
+    {
+        cudaSafeCall(cudaMemcpy(data_cpu,
+                data_cuda, dataSize * sizeof(T), cudaMemcpyDeviceToHost));
+    }
+    // Copies source->dataSize elements from `source` into this block.
+    // The byte count comes from the source, so this block must be at least
+    // as large as `source`. The two directions that write to the device use
+    // cudaMemcpyAsync, the others copy synchronously.
+    void setFrom(const MemoryBlock<T> *source,
+        MemoryCopyDirection memoryCopyDirection)
+    {
+        switch (memoryCopyDirection)
+        {
+        case CPU_TO_CPU:
+            memcpy(this->data_cpu, source->data_cpu,
+                source->dataSize * sizeof(T));
+            break;
+        case CPU_TO_CUDA:
+            cudaSafeCall(cudaMemcpyAsync(this->data_cuda, source->data_cpu,
+                source->dataSize * sizeof(T), cudaMemcpyHostToDevice));
+            break;
+        case CUDA_TO_CPU:
+            cudaSafeCall(cudaMemcpy(this->data_cpu, source->data_cuda,
+                source->dataSize * sizeof(T), cudaMemcpyDeviceToHost));
+            break;
+        case CUDA_TO_CUDA:
+            cudaSafeCall(cudaMemcpyAsync(this->data_cuda, source->data_cuda,
+                source->dataSize * sizeof(T), cudaMemcpyDeviceToDevice));
+            break;
+        default: break;
+        }
+    }
+#endif
+    virtual ~MemoryBlock() { this->Free(); }
+
+    // (Re)allocates storage for dataSize_ elements. Storage held from an
+    // earlier allocation is released first, so calling this on a live block
+    // does not leak. The new contents are not initialised.
+    void Allocate(size_t dataSize_)
+    {
+        Free();
+        this->dataSize = dataSize_;
+#ifdef _HFS_CUDA_ON_
+        cudaSafeCall(cudaMallocHost((void**)&data_cpu, dataSize_ * sizeof(T)));
+        cudaSafeCall(cudaMalloc((void**)&data_cuda, dataSize_ * sizeof(T)));
+#else
+        data_cpu = new T[dataSize_];
+#endif
+    }
+
+    // Releases the storage. Safe to call repeatedly: the pointers are reset
+    // to NULL before each release, so an explicit Free() followed by the
+    // destructor (or a second Free()) releases nothing twice.
+    void Free() {
+#ifdef _HFS_CUDA_ON_
+        if (data_cpu != NULL)
+        {
+            T* host = data_cpu;
+            data_cpu = NULL;
+            cudaSafeCall(cudaFreeHost(host));
+        }
+        if (data_cuda != NULL)
+        {
+            T* device = data_cuda;
+            data_cuda = NULL;
+            cudaSafeCall(cudaFree(device));
+        }
+#else
+        delete[] data_cpu;
+        data_cpu = NULL;
+#endif
+        dataSize = 0;
+    }
+
+    // Declared without a definition in this header.
+    MemoryBlock(const MemoryBlock&);
+    MemoryBlock& operator=(const MemoryBlock&);
+};
+
+
+}}}
+
+
+#endif
--- a/modules/hfs/src/or_utils/or_types.hpp
+++ b/modules/hfs/src/or_utils/or_types.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef _OPENCV_OR_TYPES_HPP_
+#define _OPENCV_OR_TYPES_HPP_
+
+#include "or_vector.hpp"
+#include "or_image.hpp"
+
+//------------------------------------------------------
+//
+// math defines
+//
+//------------------------------------------------------
+
+typedef cv::hfs::orutils::Vector2<int> Vector2i; // 2-component int vector (x, y)
+typedef cv::hfs::orutils::Vector2<float> Vector2f; // 2-component float vector (x, y)
+
+typedef cv::hfs::orutils::Vector4<float> Vector4f; // 4-component float vector (x, y, z, w)
+typedef cv::hfs::orutils::Vector4<int> Vector4i; // 4-component int vector (x, y, z, w)
+typedef cv::hfs::orutils::Vector4<unsigned char> Vector4u; // 4-component 8-bit vector (x, y, z, w)
+
+//------------------------------------------------------
+//
+// image defines
+//
+//------------------------------------------------------
+
+typedef  cv::hfs::orutils::Image<int> IntImage; // image with one int per element
+typedef  cv::hfs::orutils::Image<unsigned char> UCharImage; // image with one 8-bit value per element
+typedef  cv::hfs::orutils::Image<float> FloatImage; // image with one float per element
+typedef  cv::hfs::orutils::Image<Vector4f> Float4Image; // image with one Vector4f per element
+typedef  cv::hfs::orutils::Image<Vector4u> UChar4Image; // image with one Vector4u per element
+
+#endif
--- a/modules/hfs/src/or_utils/or_vector.hpp
+++ b/modules/hfs/src/or_utils/or_vector.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef _OPENCV_OR_VECTOR_HPP_
+#define _OPENCV_OR_VECTOR_HPP_
+
+// Qualifier put in front of every member below: under nvcc the functions are
+// compiled for host and device and force-inlined, otherwise it expands to
+// nothing.
+#ifdef __CUDACC__
+#define __CV_CUDA_HOST_DEVICE__ __host__ __device__ __forceinline__
+#else
+#define __CV_CUDA_HOST_DEVICE__
+#endif
+
+namespace cv { namespace hfs { namespace orutils {
+
+
+// Plain storage of a 2-component vector.
+template <class T>
+struct Vector2_ {
+    T x, y;
+};
+
+// Plain storage of a 4-component vector.
+template <class T>
+struct Vector4_ {
+    T x, y, z, w;
+};
+
+// 2-component vector with the small set of operators the module needs.
+template <class T> class Vector2 : public Vector2_ < T >
+{
+public:
+    // Leaves x and y uninitialised.
+    __CV_CUDA_HOST_DEVICE__ Vector2() {}
+
+    __CV_CUDA_HOST_DEVICE__ Vector2(const T v0, const T v1)
+    {
+        this->x = v0;
+        this->y = v1;
+    }
+
+    __CV_CUDA_HOST_DEVICE__ Vector2(const Vector2_<T> &v)
+    {
+        this->x = v.x;
+        this->y = v.y;
+    }
+
+    // Component-wise division by d; a zero divisor leaves lhs unchanged.
+    __CV_CUDA_HOST_DEVICE__ friend Vector2<T> &operator /= (Vector2<T> &lhs, T d)
+    {
+        if (d == 0) {
+            return lhs;
+        }
+        lhs.x /= d;
+        lhs.y /= d;
+        return lhs;
+    }
+
+    // Component-wise addition.
+    __CV_CUDA_HOST_DEVICE__ friend Vector2<T>&
+        operator += (Vector2<T> &lhs, const Vector2<T> &rhs)
+    {
+        lhs.x += rhs.x;
+        lhs.y += rhs.y;
+        return lhs;
+    }
+};
+
+// 4-component vector with the small set of operators the module needs.
+template <class T> class Vector4 : public Vector4_ < T >
+{
+public:
+    // Leaves x, y, z and w uninitialised.
+    __CV_CUDA_HOST_DEVICE__ Vector4() {}
+
+    __CV_CUDA_HOST_DEVICE__ Vector4(const T v0, const T v1, const T v2, const T v3)
+    {
+        this->x = v0;
+        this->y = v1;
+        this->z = v2;
+        this->w = v3;
+    }
+
+    // Component-wise division by d. As in Vector2, a zero divisor leaves lhs
+    // unchanged instead of dividing by zero.
+    __CV_CUDA_HOST_DEVICE__ friend Vector4<T> &operator /= (Vector4<T> &lhs, T d)
+    {
+        if (d == 0) {
+            return lhs;
+        }
+        lhs.x /= d;
+        lhs.y /= d;
+        lhs.z /= d;
+        lhs.w /= d;
+        return lhs;
+    }
+
+    // Component-wise addition.
+    __CV_CUDA_HOST_DEVICE__ friend Vector4<T>&
+        operator += (Vector4<T> &lhs, const Vector4<T> &rhs)
+    {
+        lhs.x += rhs.x;
+        lhs.y += rhs.y;
+        lhs.z += rhs.z;
+        lhs.w += rhs.w;
+        return lhs;
+    }
+};
+
+
+}}}
+
+#endif
--- a/modules/hfs/src/precomp.hpp
+++ b/modules/hfs/src/precomp.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef _OPENCV_PRECOMP_HPP_
+#define _OPENCV_PRECOMP_HPP_
+
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc.hpp"
+#include <vector>
+#include "math.h"
+#include "or_utils/or_types.hpp"
+
+#endif
--- a/modules/hfs/src/slic/gslic_engine.cpp
+++ b/modules/hfs/src/slic/gslic_engine.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#ifdef _HFS_CUDA_ON_
+
+#include "slic.hpp"
+
+namespace cv { namespace hfs { namespace slic { namespace engines {
+
+// Stores a copy of the settings; every other member is left for the derived
+// engine to set up.
+SegEngine::SegEngine(const slicSettings& in_settings)
+    : slic_settings(in_settings)
+{
+}
+
+SegEngine::~SegEngine()
+{
+}
+
+// Runs the whole SLIC pipeline on in_img: uploads the image to the device,
+// converts it with cvtImgSpace(), seeds the cluster centres, alternates
+// centre update and pixel association slic_settings.num_iters times and
+// finally enforces connectivity.
+// The final device synchronisation is checked with cudaSafeCall so that a
+// failure in any of the preceding asynchronous operations is reported
+// instead of being silently dropped.
+void SegEngine::performSegmentation(Ptr<UChar4Image> in_img)
+{
+    source_img->setFrom(in_img, UChar4Image::CPU_TO_CUDA);
+    cvtImgSpace(source_img, cvt_img);
+
+    initClusterCenters();
+    findCenterAssociation();
+
+    for (int i = 0; i < slic_settings.num_iters; i++)
+    {
+        updateClusterCenter();
+        findCenterAssociation();
+    }
+
+    enforceConnectivity();
+    cudaSafeCall(cudaDeviceSynchronize());
+}
+
+
+// Creates the GPU segmentation engine that all other members forward to.
+CoreEngine::CoreEngine(const slicSettings& in_settings)
+    : slic_seg_engine(new SegEngineGPU(in_settings))
+{
+}
+
+CoreEngine::~CoreEngine()
+{
+}
+
+// Forwards the image size to the segmentation engine.
+void CoreEngine::setImageSize(int x, int y)
+{
+    SegEngine& engine = *slic_seg_engine;
+    engine.setImageSize(x, y);
+}
+
+// Segments in_img with the segmentation engine.
+void CoreEngine::processFrame(Ptr<UChar4Image> in_img)
+{
+    SegEngine& engine = *slic_seg_engine;
+    engine.performSegmentation(in_img);
+}
+
+// Returns the segmentation mask of the engine.
+const Ptr<IntImage> CoreEngine::getSegRes()
+{
+    const Ptr<IntImage> mask = slic_seg_engine->getSegMask();
+    return mask;
+}
+
+
+}}}}
+
+#endif
--- a/modules/hfs/src/slic/slic.cpp
+++ b/modules/hfs/src/slic/slic.cpp
+#include "../precomp.hpp"
+#include "slic.hpp"
+using namespace std;
+
+#define vec2db vector<vector<bool> > // NOTE(review): no use of this macro is visible in this file -- confirm before removing
+
+namespace cv{ namespace hfs{ namespace slic{
+
+// Prepares all per-image state of a SLIC run: converts the image, sizes the
+// grid of cluster cells, sets the distance normalisers, marks every pixel as
+// unassigned (-1) and seeds one cluster per grid cell.
+void cSLIC::init_data(Mat image_) {
+    image = image_;
+    lab = cvt_img_space();
+
+    // number of cluster cells along x ([0]) and y ([1]), rounded up
+    map_size[0] = (int)ceil((float)lab.cols / (float)spixel_size);
+    map_size[1] = (int)ceil((float)lab.rows / (float)spixel_size);
+
+    // normalising factors of the colour and the spatial distance term
+    const float color_scale = 15.0f / (1.7321f * 128);
+    max_color_dist = color_scale * color_scale;
+    max_xy_dist = 1.0f / (2.0f * spixel_size * spixel_size);
+
+    // index map: no pixel belongs to a cluster yet
+    idx_img.assign(lab.rows * lab.cols, -1);
+
+    // one cluster per grid cell
+    spixel_list = vector<cSpixelInfo>(map_size[0] * map_size[1]);
+
+    for (int cy = 0; cy < map_size[1]; ++cy) {
+        for (int cx = 0; cx < map_size[0]; ++cx) {
+            const int cluster_idx = cy * map_size[0] + cx;
+            cSpixelInfo &seed = spixel_list[cluster_idx];
+
+            // centre of the cell; for a cell cut off by the image border use
+            // the middle between the cell start and the border instead
+            int px = cx * spixel_size + spixel_size / 2;
+            if (px >= lab.cols)
+                px = (cx * spixel_size + lab.cols) / 2;
+
+            int py = cy * spixel_size + spixel_size / 2;
+            if (py >= lab.rows)
+                py = (cy * spixel_size + lab.rows) / 2;
+
+            seed.id = cluster_idx;
+            seed.center = Vec2f((float)px, (float)py);
+            seed.color_info = lab.at<Vec3f>(py, px);
+            seed.num_pixels = 0;
+        }
+    }
+}
+
+// Linearises one channel value that was scaled to [0, 1]: values up to
+// 0.04045 are divided by 12.92, larger ones follow a 2.4 power curve.
+static float slic_linearize_channel(float v)
+{
+    if (v <= 0.04045f)
+        return v / 12.92f;
+    return pow((v + 0.055f) / 1.055f, 2.4f);
+}
+
+// Lab companding of a white-normalised X, Y or Z value.
+static float slic_lab_f(float t)
+{
+    const float epsilon = 0.008856f; //actual CIE standard
+    const float kappa = 903.3f;      //actual CIE standard
+
+    if (t > epsilon)
+        return pow(t, 1.0f / 3.0f);
+    return (kappa*t + 16.0f) / 116.0f;
+}
+
+// Converts the 8-bit 3-channel member `image` into a CV_32FC3 matrix holding
+// (L, a, b) per pixel and returns it.
+Mat cSLIC::cvt_img_space() {
+    // reference white
+    const float Xr = 0.950456f;
+    const float Yr = 1.0f;
+    const float Zr = 1.088754f;
+
+    Mat lab_(image.size(), CV_32FC3);
+
+    for (int row = 0; row < image.rows; ++row) {
+        for (int col = 0; col < image.cols; ++col) {
+            const Vec3b pix_in = image.at<Vec3b>(row, col);
+
+            // NOTE(review): channel 0 is used as R and channel 2 as B here,
+            // while rgb2CIELab() in slic.hpp reads its first component as B
+            // -- confirm which channel order the callers pass.
+            const float _r = slic_linearize_channel((float)pix_in[0] / 255);
+            const float _g = slic_linearize_channel((float)pix_in[1] / 255);
+            const float _b = slic_linearize_channel((float)pix_in[2] / 255);
+
+            // linear RGB -> XYZ
+            const float x = _r*0.4124564f + _g*0.3575761f + _b*0.1804375f;
+            const float y = _r*0.2126729f + _g*0.7151522f + _b*0.0721750f;
+            const float z = _r*0.0193339f + _g*0.1191920f + _b*0.9503041f;
+
+            // XYZ -> Lab
+            const float fx = slic_lab_f(x / Xr);
+            const float fy = slic_lab_f(y / Yr);
+            const float fz = slic_lab_f(z / Zr);
+
+            lab_.at<Vec3f>(row, col) = Vec3f(116.0f*fy - 16.0f,
+                                             500.0f*(fx - fy),
+                                             200.0f*(fy - fz));
+        }
+    }
+
+    return lab_;
+}
+
+// Distance between the pixel at `pix` and a cluster centre: the square root
+// of the squared colour difference scaled by max_color_dist plus the squared
+// pixel distance scaled by spatial_weight and max_xy_dist.
+float cSLIC::compute_dist(Point pix, cSpixelInfo center_info) {
+    const Vec3f color = lab.at<Vec3f>(pix.y, pix.x);
+
+    // squared colour difference
+    const float d0 = color[0] - center_info.color_info[0];
+    const float d1 = color[1] - center_info.color_info[1];
+    const float d2 = color[2] - center_info.color_info[2];
+    const float dcolor = d0*d0 + d1*d1 + d2*d2;
+
+    // squared spatial distance
+    const float dx = pix.x - center_info.center[0];
+    const float dy = pix.y - center_info.center[1];
+    const float dxy = (float)(dx * dx + dy * dy);
+
+    return sqrtf(dcolor * max_color_dist + spatial_weight * dxy * max_xy_dist);
+}
+
+// Segments image_ into superpixels and returns the cluster id of every pixel
+// in row-major order.
+//   spixel_size_     edge length of a superpixel cell in pixels
+//   spatial_weight_  weight of the spatial term of the distance
+// Runs one initial association, five update/association rounds and four
+// connectivity passes (two with padding 2 / threshold 16, two with
+// padding 1 / threshold 5).
+vector<int> cSLIC::generate_superpixels(Mat image_, int spixel_size_, float spatial_weight_) {
+    spixel_size = spixel_size_;
+    spatial_weight = spatial_weight_;
+
+    init_data(image_);
+    find_association();
+
+    int rounds_left = 5;
+    while (rounds_left-- > 0) {
+        update_cluster_center();
+        find_association();
+    }
+
+    for (int pass = 0; pass < 2; ++pass)
+        enforce_connect(2, 16);
+    for (int pass = 0; pass < 2; ++pass)
+        enforce_connect(1, 5);
+
+    return idx_img;
+}
+
+// Assigns every pixel to the nearest cluster (by compute_dist) among the
+// clusters of its own grid cell and the up to eight neighbouring cells.
+// Cells are visited row by row; on equal distance the first one visited wins.
+void cSLIC::find_association() {
+    for (int y = 0; y < lab.rows; ++y) {
+        // cell rows around this pixel row, clipped to the grid
+        const int cell_y = y / spixel_size;
+        const int cy_first = (cell_y - 1 < 0) ? 0 : cell_y - 1;
+        const int cy_last = (cell_y + 1 >= map_size[1]) ? map_size[1] - 1 : cell_y + 1;
+
+        for (int x = 0; x < lab.cols; ++x) {
+            // cell columns around this pixel column, clipped to the grid
+            const int cell_x = x / spixel_size;
+            const int cx_first = (cell_x - 1 < 0) ? 0 : cell_x - 1;
+            const int cx_last = (cell_x + 1 >= map_size[0]) ? map_size[0] - 1 : cell_x + 1;
+
+            int best_id = -1;
+            float best_dist = FLT_MAX;
+
+            for (int cy = cy_first; cy <= cy_last; ++cy) {
+                for (int cx = cx_first; cx <= cx_last; ++cx) {
+                    const cSpixelInfo &candidate = spixel_list[cy*map_size[0] + cx];
+                    const float cdist = compute_dist(Point(x, y), candidate);
+                    if (cdist < best_dist) {
+                        best_dist = cdist;
+                        best_id = candidate.id;
+                    }
+                }
+            }
+
+            // keep the previous assignment when nothing was found
+            if (best_id >= 0) {
+                idx_img[y * lab.cols + x] = best_id;
+            }
+        }
+    }
+}
+
+// Recomputes every cluster as the mean position and mean colour of the
+// pixels currently assigned to it. A cluster without pixels gets centre and
+// colour set to -100 in every component.
+void cSLIC::update_cluster_center() {
+    const int num_clusters = map_size[0] * map_size[1];
+
+    // reset the accumulators
+    for (int k = 0; k < num_clusters; ++k) {
+        cSpixelInfo &cluster = spixel_list[k];
+        cluster.center = Vec2f(0.0f, 0.0f);
+        cluster.color_info = Vec3f(0.0f, 0.0f, 0.0f);
+        cluster.num_pixels = 0;
+    }
+
+    // sum position and colour of every pixel into its cluster
+    for (int row = 0; row < lab.rows; ++row) {
+        for (int col = 0; col < lab.cols; ++col) {
+            cSpixelInfo &cluster = spixel_list[idx_img[row * lab.cols + col]];
+            cluster.center += Vec2f((float)col, (float)row);
+            cluster.color_info += lab.at<Vec3f>(row, col);
+            cluster.num_pixels += 1;
+        }
+    }
+
+    // turn the sums into means
+    for (int k = 0; k < num_clusters; ++k) {
+        cSpixelInfo &cluster = spixel_list[k];
+        if (cluster.num_pixels == 0) {
+            cluster.center = Vec2f(-100.0f, -100.0f);
+            cluster.color_info = Vec3f(-100.0f, -100.0f, -100.0f);
+            continue;
+        }
+        cluster.center /= cluster.num_pixels;
+        cluster.color_info /= cluster.num_pixels;
+    }
+}
+
+// One label-smoothing pass. For every pixel at least `padding` pixels away
+// from each image border, the (2*padding+1)^2 window around it is inspected
+// in a snapshot of the index map; if more than diff_threshold entries carry
+// a label different from the pixel's own, the pixel takes the label of the
+// last such entry (row-major order). Border pixels are left untouched.
+void cSLIC::enforce_connect(int padding, int diff_threshold) {
+    const vector<int> snapshot = idx_img;
+
+    // interior range, i.e. the rows/columns the border test lets through
+    const int row_begin = padding > 0 ? padding : 0;
+    const int row_end = (lab.rows - padding < lab.rows) ? lab.rows - padding : lab.rows;
+    const int col_begin = padding > 0 ? padding : 0;
+    const int col_end = (lab.cols - padding < lab.cols) ? lab.cols - padding : lab.cols;
+
+    for (int r = row_begin; r < row_end; ++r) {
+        for (int c = col_begin; c < col_end; ++c) {
+            const int here = r*lab.cols + c;
+            const int own_label = snapshot[here];
+
+            int mismatches = 0;
+            int other_label = -1;
+            for (int dr = -padding; dr <= padding; ++dr) {
+                for (int dc = -padding; dc <= padding; ++dc) {
+                    const int neighbour = snapshot[(r + dr)*lab.cols + (c + dc)];
+                    if (own_label != neighbour) {
+                        ++mismatches;
+                        other_label = neighbour;
+                    }
+                }
+            }
+
+            if (mismatches > diff_threshold) {
+                idx_img[here] = other_label;
+            }
+        }
+    }
+}
+
+}}}
--- a/modules/hfs/src/slic/slic.hpp
+++ b/modules/hfs/src/slic/slic.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef _OPENCV_SLIC_HPP_
+#define _OPENCV_SLIC_HPP_
+
+#include "../or_utils/or_types.hpp"
+#include "opencv2/core.hpp"
+//------------------------------------------------------
+//
+// Compile time GPU Settings
+//
+//------------------------------------------------------
+
+#ifndef HFS_BLOCK_DIM // keep a value already supplied by the build or an earlier header
+#define HFS_BLOCK_DIM 16 // fallback value; presumably a CUDA block edge length -- TODO confirm at the kernel launch sites
+#endif
+
+namespace cv { namespace hfs { namespace slic {
+
+struct slicSettings // parameters handed to the GPU SLIC engines
+{
+    Vector2i img_size; // image dimensions
+    int spixel_size; // presumably the superpixel edge length in pixels -- confirm in the engine implementation
+    int num_iters; // number of update/association rounds run by SegEngine::performSegmentation
+    float coh_weight; // presumably the weight of the spatial term of the distance -- confirm in the engine implementation
+};
+
+struct cSpixelInfo // one cluster (superpixel) of the CPU implementation
+{
+    Vec2f center; // cluster centre as (x, y) in pixel coordinates
+    Vec3f color_info; // cluster colour, taken from the converted (lab) image
+    int id; // index of the cluster in cSLIC::spixel_list
+    int num_pixels; // number of pixels accumulated into the cluster by update_cluster_center()
+};
+
+class cSLIC { // CPU SLIC superpixel segmenter; generate_superpixels() is the only public operation
+private:
+    cv::Mat image; // input image as handed to generate_superpixels()
+    cv::Mat lab; // CV_32FC3 conversion of image produced by cvt_img_space()
+    std::vector<int> idx_img; // cluster id per pixel, row-major (index = y * cols + x)
+
+    cv::Vec2i map_size; // number of cluster cells along x ([0]) and y ([1])
+    std::vector<cSpixelInfo> spixel_list; // one entry per cluster cell
+    int spixel_size; // edge length of a cluster cell in pixels
+    float spatial_weight; // weight of the spatial term in compute_dist()
+    float max_xy_dist, max_color_dist; // scale factors of the spatial / colour term in compute_dist()
+
+    float compute_dist(cv::Point pix, cSpixelInfo center_info); // distance between the pixel at pix and a cluster
+
+    void init_data(cv::Mat image_); // sets up lab, map_size, idx_img and the initial clusters
+    cv::Mat cvt_img_space(); // converts image to a CV_32FC3 Lab matrix
+    void find_association(); // assigns every pixel to its nearest neighbouring cluster
+    void update_cluster_center(); // recomputes clusters as the mean of their pixels
+    void enforce_connect(int padding, int diff_threshold); // relabels pixels that disagree with most of their window
+public:
+    cSLIC() {}
+    ~cSLIC() {}
+
+    std::vector<int> generate_superpixels(cv::Mat image, int spixel_size_, float spatial_weight_); // returns the cluster id of every pixel, row-major
+};
+
+#ifdef _HFS_CUDA_ON_
+
+// utils used only by GPU version of slic
+struct gSpixelInfo // one cluster (superpixel) of the GPU implementation
+{
+    Vector2f center; // cluster centre as (x, y) in pixel coordinates
+    Vector4f color_info; // cluster colour, copied from the converted image
+    int id; // index of the cluster in the cluster map
+    int num_pixels; // number of pixels accumulated into the cluster
+};
+
+typedef orutils::Image<gSpixelInfo> gSpixelMap; // image whose elements are clusters
+
+namespace engines
+{
+    class SegEngine // abstract SLIC engine: performSegmentation() drives the pure-virtual stages below
+    {
+    protected:
+
+        float max_color_dist; // presumably the colour-term normaliser -- set by the derived engine, confirm there
+        float max_xy_dist; // presumably the spatial-term normaliser -- set by the derived engine, confirm there
+
+        cv::Ptr<UChar4Image> source_img; // input image; performSegmentation() uploads into it
+        cv::Ptr<Float4Image> cvt_img; // output of cvtImgSpace()
+        cv::Ptr<IntImage> idx_img; // segmentation mask returned by getSegMask()
+
+        cv::Ptr<gSpixelMap> spixel_map; // cluster map
+        int spixel_size; // NOTE(review): read by setImageSize() but not set by the SegEngine constructor -- confirm the derived class initialises it first
+
+        Vector2i img_size; // set by setImageSize()
+        Vector2i map_size; // number of cluster cells per axis, derived in setImageSize()
+
+        slicSettings slic_settings; // copy of the constructor argument
+        virtual void cvtImgSpace(cv::Ptr<UChar4Image> inimg,
+                                 cv::Ptr<Float4Image> outimg) = 0; // stage: colour conversion of inimg into outimg
+        virtual void initClusterCenters() = 0; // stage: seed the clusters
+        virtual void findCenterAssociation() = 0; // stage: assign pixels to clusters
+        virtual void updateClusterCenter() = 0; // stage: recompute the clusters
+        virtual void enforceConnectivity() = 0; // stage: final clean-up of the mask
+
+    public:
+
+        SegEngine(const slicSettings& in_settings);
+        virtual ~SegEngine();
+
+        const cv::Ptr<IntImage> getSegMask() const // copies the mask from device to host, then returns it
+        {
+            idx_img->updateHostFromDevice();
+            return idx_img;
+        };
+
+        void setImageSize( int x, int y ) // stores the size and derives the cluster grid size (rounded up)
+        {
+            img_size.x = x;
+            img_size.y = y;
+            map_size.x = (int)ceil((float)x / (float)spixel_size);
+            map_size.y = (int)ceil((float)y / (float)spixel_size);
+        };
+
+        Vector2i getImageSize() // size last passed to setImageSize()
+        {
+            return img_size;
+        };
+
+        void performSegmentation(cv::Ptr<UChar4Image> in_img); // runs all stages on in_img
+    };
+
+    class SegEngineGPU : public SegEngine // concrete engine implementing every SegEngine stage
+    {
+    private:
+        int no_grid_per_center; // presumably the number of partial accumulators per cluster -- confirm in the .cu implementation
+        cv::Ptr<gSpixelMap> accum_map; // presumably partial cluster sums used by updateClusterCenter() -- confirm
+        cv::Ptr<IntImage> tmp_idx_img; // second index image; presumably scratch space for enforceConnectivity() -- confirm
+    protected:
+        void cvtImgSpace(cv::Ptr<UChar4Image> inimg,
+                         cv::Ptr<Float4Image> outimg); // overrides SegEngine::cvtImgSpace
+        void initClusterCenters(); // overrides SegEngine::initClusterCenters
+        void findCenterAssociation(); // overrides SegEngine::findCenterAssociation
+        void updateClusterCenter(); // overrides SegEngine::updateClusterCenter
+        void enforceConnectivity(); // overrides SegEngine::enforceConnectivity
+    public:
+        SegEngineGPU(const slicSettings& in_settings);
+        ~SegEngineGPU();
+    };
+
+    class CoreEngine // thin front end that owns a SegEngine and forwards every call to it
+    {
+    private:
+        cv::Ptr<SegEngine> slic_seg_engine; // created as a SegEngineGPU in the constructor
+
+    public:
+
+        CoreEngine(const slicSettings& in_settings);
+        ~CoreEngine();
+
+        void setImageSize(int x, int y); // forwards to SegEngine::setImageSize
+
+        void processFrame(cv::Ptr<UChar4Image> in_img); // forwards to SegEngine::performSegmentation
+
+        const cv::Ptr<IntImage> getSegRes(); // forwards to SegEngine::getSegMask
+    };
+} // end namespace engines
+
+// Converts one 8-bit pixel to CIE Lab. The input is read as x = B, y = G,
+// z = R; the result is written as x = L, y = a, z = b. pix_out.w is not
+// written.
+__host__ __device__ __forceinline__ void rgb2CIELab( const Vector4u& pix_in,
+                                           Vector4f& pix_out )
+{
+    const float epsilon = 0.008856f;
+    const float kappa = 903.3f;
+
+    // reference white
+    const float Xr = 0.950456f;
+    const float Yr = 1.0f;
+    const float Zr = 1.088754f;
+
+    // scale the channels to [0, 1]
+    float _b = (float)pix_in.x / 255;
+    float _g = (float)pix_in.y / 255;
+    float _r = (float)pix_in.z / 255;
+
+    // linearise: small values are divided, the rest follows a power curve
+    _b = (_b <= 0.04045f) ? _b / 12.92f : pow( (_b + 0.055f) / 1.055f, 2.4f );
+    _g = (_g <= 0.04045f) ? _g / 12.92f : pow( (_g + 0.055f) / 1.055f, 2.4f );
+    _r = (_r <= 0.04045f) ? _r / 12.92f : pow( (_r + 0.055f) / 1.055f, 2.4f );
+
+    // linear RGB -> XYZ, normalised by the reference white
+    const float xr = (_r*0.4124564f + _g*0.3575761f + _b*0.1804375f) / Xr;
+    const float yr = (_r*0.2126729f + _g*0.7151522f + _b*0.0721750f) / Yr;
+    const float zr = (_r*0.0193339f + _g*0.1191920f + _b*0.9503041f) / Zr;
+
+    // Lab companding
+    const float fx = ( xr > epsilon ) ? pow( xr, 1.0f / 3.0f ) : ( kappa*xr + 16.0f ) / 116.0f;
+    const float fy = ( yr > epsilon ) ? pow( yr, 1.0f / 3.0f ) : ( kappa*yr + 16.0f ) / 116.0f;
+    const float fz = ( zr > epsilon ) ? pow( zr, 1.0f / 3.0f ) : ( kappa*zr + 16.0f ) / 116.0f;
+
+    pix_out.x = 116.0f*fy - 16.0f;
+    pix_out.y = 500.0f*(fx - fy);
+    pix_out.z = 200.0f*(fy - fz);
+}
+
+// Seeds the cluster of grid cell (x, y): its centre is the middle of the
+// cell (or, for a cell cut off by the image border, the middle between the
+// cell start and the border) and its colour is the image value there.
+__host__ __device__ __forceinline__ void initClusterCentersShared(
+    const Vector4f* inimg, Vector2i map_size, Vector2i img_size,
+    int spixel_size, int x, int y, cv::hfs::slic::gSpixelInfo* out_spixel)
+{
+    const int cluster_idx = y * map_size.x + x;
+
+    int px = x * spixel_size + spixel_size / 2;
+    if (px >= img_size.x)
+        px = (x * spixel_size + img_size.x) / 2;
+
+    int py = y * spixel_size + spixel_size / 2;
+    if (py >= img_size.y)
+        py = (y * spixel_size + img_size.y) / 2;
+
+    cv::hfs::slic::gSpixelInfo& seed = out_spixel[cluster_idx];
+    seed.id = cluster_idx;
+    seed.center = Vector2f((float)px, (float)py);
+    seed.color_info = inimg[py*img_size.x + px];
+    seed.num_pixels = 0;
+}
+
+// Distance between the pixel value `pix` at (x, y) and a cluster: the square
+// root of the squared colour difference (x, y, z components) scaled by
+// normalizer_color plus the squared pixel distance scaled by weight and
+// normalizer_xy.
+__host__ __device__ __forceinline__ float computeSlicDistance(
+    const Vector4f& pix, int x, int y,
+    const cv::hfs::slic::gSpixelInfo& center_info,
+    float weight, float normalizer_xy, float normalizer_color)
+{
+    // squared colour difference
+    const float d0 = pix.x - center_info.color_info.x;
+    const float d1 = pix.y - center_info.color_info.y;
+    const float d2 = pix.z - center_info.color_info.z;
+    const float dcolor = d0*d0 + d1*d1 + d2*d2;
+
+    // squared spatial distance
+    const float dx = x - center_info.center.x;
+    const float dy = y - center_info.center.y;
+    const float dxy = dx * dx + dy * dy;
+
+    return sqrtf(dcolor * normalizer_color + weight * dxy * normalizer_xy);
+}
+
+// Assigns pixel (x, y) to the nearest cluster (by computeSlicDistance) among
+// the clusters of its own grid cell and the up to eight neighbouring cells.
+// Cells are visited row by row; on equal distance the first one visited
+// wins. The output is only written when a cluster closer than the start
+// distance 999999.9999 was found.
+__host__ __device__ __forceinline__ void findCenterAssociationShared(
+    const Vector4f* inimg,
+    const cv::hfs::slic::gSpixelInfo* in_spixel_map,
+    Vector2i map_size, Vector2i img_size,
+    int spixel_size, float weight, int x, int y,
+    float max_xy_dist, float max_color_dist, int* out_idx_img)
+{
+    const int pixel_idx = y * img_size.x + x;
+
+    // neighbouring cells, clipped to the grid
+    const int cell_x = x / spixel_size;
+    const int cell_y = y / spixel_size;
+    const int cx_first = (cell_x - 1 < 0) ? 0 : cell_x - 1;
+    const int cx_last = (cell_x + 1 >= map_size.x) ? map_size.x - 1 : cell_x + 1;
+    const int cy_first = (cell_y - 1 < 0) ? 0 : cell_y - 1;
+    const int cy_last = (cell_y + 1 >= map_size.y) ? map_size.y - 1 : cell_y + 1;
+
+    int best_id = -1;
+    float best_dist = 999999.9999f;
+
+    for ( int cy = cy_first; cy <= cy_last; cy++ )
+    {
+        for ( int cx = cx_first; cx <= cx_last; cx++ )
+        {
+            const cv::hfs::slic::gSpixelInfo& candidate =
+                in_spixel_map[cy*map_size.x + cx];
+            const float cdist =
+                computeSlicDistance(inimg[pixel_idx], x, y,
+                    candidate, weight,
+                    max_xy_dist, max_color_dist);
+            if (cdist < best_dist)
+            {
+                best_dist = cdist;
+                best_id = candidate.id;
+            }
+        }
+    }
+
+    if (best_id >= 0)
+        out_idx_img[pixel_idx] = best_id;
+}
+
+// Combines the num_blocks_per_spixel partial sums of cluster (x, y) stored
+// in accum_map into spixel_list: centre and colour become the mean over all
+// accumulated pixels. A cluster without pixels gets centre and colour set to
+// -100 in every component.
+__host__ __device__ __forceinline__ void finalizeReductionResultShared(
+    const cv::hfs::slic::gSpixelInfo* accum_map,
+    Vector2i map_size, int num_blocks_per_spixel, int x, int y,
+    cv::hfs::slic::gSpixelInfo* spixel_list)
+{
+    const int spixel_idx = y * map_size.x + x;
+    cv::hfs::slic::gSpixelInfo& merged = spixel_list[spixel_idx];
+
+    merged.center = Vector2f(0, 0);
+    merged.color_info = Vector4f(0, 0, 0, 0);
+    merged.num_pixels = 0;
+
+    // add up the partial results of this cluster
+    for (int k = 0; k < num_blocks_per_spixel; k++)
+    {
+        const cv::hfs::slic::gSpixelInfo& part =
+            accum_map[spixel_idx * num_blocks_per_spixel + k];
+
+        merged.center += part.center;
+        merged.color_info += part.color_info;
+        merged.num_pixels += part.num_pixels;
+    }
+
+    if (merged.num_pixels == 0)
+    {
+        merged.center = Vector2f(-100, -100);
+        merged.color_info = Vector4f(-100, -100, -100, -100);
+        return;
+    }
+
+    merged.center /= (float)merged.num_pixels;
+    merged.color_info /= (float)merged.num_pixels;
+}
+
+// Label smoothing over a 5x5 window. Pixels closer than two pixels to an
+// image border keep their label. Otherwise, if more than 16 window entries
+// carry a label different from the pixel's own, the pixel takes the label of
+// the last such entry (row-major order); else it keeps its label.
+__host__ __device__ __forceinline__ void supressLocalLable(
+    const int* in_idx_img, Vector2i img_size,
+    int x, int y, int* out_idx_img)
+{
+    const int pixel_idx = y*img_size.x + x;
+    const int own_label = in_idx_img[pixel_idx];
+    int new_label = own_label;
+
+    const bool interior =
+        !(x < 2 || y < 2 || x >= img_size.x - 2 || y >= img_size.y - 2);
+
+    if (interior)
+    {
+        int mismatches = 0;
+        int other_label = -1;
+
+        for ( int dy = -2; dy <= 2; dy++ )
+        {
+            for ( int dx = -2; dx <= 2; dx++ )
+            {
+                const int neighbour = in_idx_img[(y + dy)*img_size.x + (x + dx)];
+                if (neighbour != own_label)
+                {
+                    other_label = neighbour;
+                    mismatches++;
+                }
+            }
+        }
+
+        if (mismatches > 16)
+            new_label = other_label;
+    }
+
+    out_idx_img[pixel_idx] = new_label;
+}
+
+// Label smoothing over a 3x3 window. Pixels on the image border keep their
+// label. Otherwise, if at least 6 window entries carry a label different
+// from the pixel's own, the pixel takes the label of the last such entry
+// (row-major order); else it keeps its label.
+__host__ __device__ __forceinline__ void supressLocalLable2(const int* in_idx_img,
+    Vector2i img_size, int x, int y, int* out_idx_img)
+{
+    const int pixel_idx = y*img_size.x + x;
+    const int own_label = in_idx_img[pixel_idx];
+    int new_label = own_label;
+
+    const bool interior =
+        !(x < 1 || y < 1 || x >= img_size.x - 1 || y >= img_size.y - 1);
+
+    if (interior)
+    {
+        int mismatches = 0;
+        int other_label = -1;
+
+        for (int dy = -1; dy <= 1; dy++)
+        {
+            for (int dx = -1; dx <= 1; dx++)
+            {
+                const int neighbour = in_idx_img[(y + dy)*img_size.x + (x + dx)];
+                if (neighbour != own_label)
+                {
+                    other_label = neighbour;
+                    mismatches++;
+                }
+            }
+        }
+
+        if (mismatches >= 6)
+            new_label = other_label;
+    }
+
+    out_idx_img[pixel_idx] = new_label;
+}
+
+// Number of blocks of size blockSz needed to cover dataSz elements along x
+// and y (each quotient rounded up).
+__host__ __device__ __forceinline__ dim3 getGridSize( Vector2i dataSz, dim3 blockSz )
+{
+    const unsigned int blocks_x = (dataSz.x + blockSz.x - 1) / blockSz.x;
+    const unsigned int blocks_y = (dataSz.y + blockSz.y - 1) / blockSz.y;
+    return dim3(blocks_x, blocks_y);
+}
+
+// Four floats with volatile members; the default constructor leaves them
+// uninitialised.
+struct Float4_
+{
+    __host__ __device__ Float4_() {}
+    __host__ __device__ Float4_( float x_, float y_, float z_, float w_ ) {
+        x = x_;
+        y = y_;
+        z = z_;
+        w = w_;
+    }
+    volatile float x, y, z, w;
+};
+
+// Two floats with volatile members; the default constructor leaves them
+// uninitialised.
+struct Float2_
+{
+    __host__ __device__ Float2_() {}
+    __host__ __device__ Float2_( float x_, float y_ ) {
+        x = x_;
+        y = y_;
+    }
+    volatile float x, y;
+};
+
+// Component-wise a += b; returns a copy of the updated a.
+__host__ __device__ __forceinline__ Float4_ operator+= ( Float4_ &a, Float4_ b )
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+
+    return a;
+}
+
+// Component-wise a += b; returns a copy of the updated a.
+__host__ __device__ __forceinline__ Float2_ operator+= ( Float2_ &a, Float2_ b )
+{
+    a.x += b.x;
+    a.y += b.y;
+
+    return a;
+}
+
+
+#endif
+
+
+}}}
+
+#endif