Commit 37478fc5 authored by Muresan Mircea Paul

Parallelized methods using the OpenCV parallel_for_

reordered initializations

removed unused variable

removed tabs
put const cv::Mat & where needed
replaced macros with enums
introduced a step variable for the census kernel iteration
removed the whitespace from the stereo binary SGBM
parent c639de6a
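The pattern this commit applies throughout descriptor.cpp is to move each per-row loop into a functor derived from cv::ParallelLoopBody and hand the row range to cv::parallel_for_, which splits it across the available threads. A minimal, self-contained sketch of that pattern (the functor and the matrix contents here are illustrative, not the commit's classes):

    #include <opencv2/core.hpp>
    #include <opencv2/core/utility.hpp>

    //illustrative body: squares every element of the rows assigned to this worker
    class RowSquare : public cv::ParallelLoopBody
    {
        cv::Mat &m;
    public:
        RowSquare(cv::Mat &mat) : m(mat) {}
        virtual void operator()(const cv::Range &r) const
        {
            //cv::Range is half-open, so each worker touches rows [r.start, r.end)
            for (int i = r.start; i < r.end; i++)
            {
                int *row = m.ptr<int>(i);
                for (int j = 0; j < m.cols; j++)
                    row[j] = row[j] * row[j];
            }
        }
    };

    int main()
    {
        cv::Mat m(480, 640, CV_32S, cv::Scalar(3));
        //OpenCV decides how many sub-ranges to create and on which threads they run
        cv::parallel_for_(cv::Range(0, m.rows), RowSquare(m));
        return 0;
    }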
@@ -41,11 +41,8 @@
 //the use of this software, even if advised of the possibility of such damage.
 /*****************************************************************************************************************\
 * The interface contains the main descriptors that will be implemented in the descriptor class                    *
-*                                                                                                                  *
 \******************************************************************************************************************/
 #include "descriptor.hpp"
 using namespace cv;
@@ -53,364 +50,60 @@ using namespace stereo;
 Descriptor::Descriptor()
 {
 }
 //!Implementation for computing the Census transform on the given image
-void Descriptor::applyCensusOnImage(const cv::Mat img, int kernelSize, cv::Mat &dist, const int type)
+void Descriptor::applyCensusOnImage(const cv::Mat &img, int kernelSize, cv::Mat &dist, const int type)
 {
     int n2 = (kernelSize - 1) / 2;
-    int height = img.rows;
-    int width = img.cols;
-    uint8_t *image = img.data;
-    int *dst = (int *)dist.data;
-    //#pragma omp parallel for
-    for (int i = n2; i <= height - n2; i++)
-    {
-        int rWidth = i * width;
-        for (int j = n2; j <= width - n2; j++)
-        {
-            if (type == CV_SSE_CENSUS)
-            {
-                //to do
-            }
-            else
-            {
-                int c = 0;
-                for (int ii = i - n2; ii <= i + n2; ii++)
-                {
-                    int rrWidth = ii * width;
-                    for (int jj = j - n2; jj <= j + n2; jj++)
-                    {
-                        if (ii != i || jj != j)
-                        {
-                            if (image[(rrWidth + jj)] > image[(rWidth + j)])
-                            {
-                                c = c + 1;
-                            }
-                            c = c * 2;
-                        }
-                    }
-                }
-                dst[(rWidth + j)] = c;
-            }
-        }
-    }
+    parallel_for_(cv::Range(n2, img.rows - n2), singleImageCensus(img.data, img.cols, img.rows, n2, (int *)dist.data, type));
 }
 /**
 Two variations of census applied on input images
 Implementation of a census transform which is taking into account just some of the pixels from the census kernel, thus allowing for larger block sizes
 **/
-void Descriptor::applyCensusOnImages(const cv::Mat im1, cv::Mat im2, int kernelSize, cv::Mat &dist, cv::Mat &dist2, const int type)
+void Descriptor::applyCensusOnImages(const cv::Mat &im1, cv::Mat &im2, int kernelSize, cv::Mat &dist, cv::Mat &dist2, const int type)
 {
     int n2 = (kernelSize - 1) / 2;
-    int width = im1.cols;
-    int height = im2.rows;
-    uint8_t *image1 = im1.data;
-    uint8_t *image2 = im2.data;
-    int *dst1 = (int *)dist.data;
-    int *dst2 = (int *)dist2.data;
-    //#pragma omp parallel for
-    for (int i = n2; i <= height - n2; i++)
-    {
-        int rWidth = i * width;
-        int distV = (i) * width;
-        for (int j = n2; j <= width - n2; j++)
-        {
-            int c = 0;
-            int c2 = 0;
-            if (type == CV_DENSE_CENSUS)
-            {
-                for (int ii = i - n2; ii <= i + n2; ii++)
-                {
-                    int rrWidth = ii * width;
-                    for (int jj = j - n2; jj <= j + n2; jj++)
-                    {
-                        if (ii != i || jj != j)
-                        {
-                            if (image1[rrWidth + jj] > image1[rWidth + j])
-                            {
-                                c = c + 1;
-                            }
-                            c = c * 2;
-                        }
-                        if (ii != i || jj != j)
-                        {
-                            if (image2[rrWidth + jj] > image2[rWidth + j])
-                            {
-                                c2 = c2 + 1;
-                            }
-                            c2 = c2 * 2;
-                        }
-                    }
-                }
-            }
-            else if (type == CV_SPARSE_CENSUS)
-            {
-                for (int ii = i - n2; ii <= i + n2; ii += 2)
-                {
-                    int rrWidth = ii * width;
-                    for (int jj = j - n2; jj <= j + n2; jj += 2)
-                    {
-                        if (ii != i || jj != j)
-                        {
-                            if (image1[(rrWidth + jj)] > image1[(rWidth + j)])
-                            {
-                                c = c + 1;
-                            }
-                            c = c * 2;
-                        }
-                        if (ii != i || jj != j)
-                        {
-                            if (image2[(rrWidth + jj)] > image2[(rWidth + j)])
-                            {
-                                c2 = c2 + 1;
-                            }
-                            c2 = c2 * 2;
-                        }
-                    }
-                }
-            }
-            dst1[(distV + j)] = c;
-            dst2[(distV + j)] = c2;
-        }
-    }
+    parallel_for_(cv::Range(n2, im1.rows - n2), parallelCensus(im1.data, im2.data, im1.cols, im2.rows, n2, (int *)dist.data, (int *)dist2.data, type));
 }
 /**
 STANDARD_MCT - Modified census which is memorizing for each pixel 2 bits and includes a tolerance to the pixel comparison
 MCT_MEAN_VARIATION - Implementation of a modified census transform which is also taking into account the variation to the mean of the window, not just the center pixel
 **/
-void Descriptor::applyMCTOnImages(const cv::Mat img1, const cv::Mat img2, int kernelSize, int t, cv::Mat &dist, cv::Mat &dist2, const int type)
+void Descriptor::applyMCTOnImages(const cv::Mat &img1, const cv::Mat &img2, int kernelSize, int t, cv::Mat &dist, cv::Mat &dist2, const int type)
 {
     int n2 = (kernelSize - 1) >> 1;
-    int width = img1.cols;
-    int height = img1.rows;
-    uint8_t *image1 = img1.data;
-    uint8_t *image2 = img2.data;
-    int *dst1 = (int *)dist.data;
-    int *dst2 = (int *)dist2.data;
-    //#pragma omp parallel for
-    for (int i = n2 + 2; i <= height - n2 - 2; i++)
-    {
-        int rWidth = i * width;
-        int distV = (i) * width;
-        for (int j = n2 + 2; j <= width - n2 - 2; j++)
-        {
-            int c = 0;
-            int c2 = 0;
-            if (type == CV_STANDARD_MCT)
-            {
-                for (int ii = i - n2; ii <= i + n2; ii += 2)
-                {
-                    int rrWidth = ii * width;
-                    for (int jj = j - n2; jj <= j + n2; jj += 2)
-                    {
-                        if (ii != i || jj != j)
-                        {
-                            if (image1[rrWidth + jj] > image1[rWidth + j] - t)
-                            {
-                                c <<= 2;
-                                c |= 0x3;
-                            }
-                            else if (image1[rWidth + j] - t < image1[rrWidth + jj] && image1[rWidth + j] + t >= image1[rrWidth + jj])
-                            {
-                                c <<= 2;
-                                c = c + 1;
-                            }
-                            else
-                            {
-                                c <<= 2;
-                            }
-                        }
-                        if (ii != i || jj != j)
-                        {
-                            if (image2[rrWidth + jj] > image2[rWidth + j] - t)
-                            {
-                                c2 <<= 2;
-                                c2 |= 0x3;
-                            }
-                            else if (image2[rWidth + j] - t < image2[rrWidth + jj] && image2[rWidth + j] + t >= image2[rrWidth + jj])
-                            {
-                                c2 <<= 2;
-                                c2 = c2 + 1;
-                            }
-                            else
-                            {
-                                c2 <<= 2;
-                            }
-                        }
-                    }
-                }
-                for (int ii = i - n2; ii <= i + n2; ii += 4)
-                {
-                    int rrWidth = ii * width;
-                    for (int jj = j - n2; jj <= j + n2; jj += 4)
-                    {
-                        if (ii != i || jj != j)
-                        {
-                            if (image1[rrWidth + jj] > image1[rWidth + j] - t)
-                            {
-                                c <<= 2;
-                                c |= 0x3;
-                            }
-                            else if (image1[rWidth + j] - t < image1[rrWidth + jj] && image1[rWidth + j] + t >= image1[rrWidth + jj])
-                            {
-                                c <<= 2;
-                                c += 1;
-                            }
-                            else
-                            {
-                                c <<= 2;
-                            }
-                        }
-                        if (ii != i || jj != j)
-                        {
-                            if (image2[rrWidth + jj] > image2[rWidth + j] - t)
-                            {
-                                c2 <<= 2;
-                                c2 |= 0x3;
-                            }
-                            else if (image2[rWidth + j] - t < image2[rrWidth + jj] && image2[rWidth + j] + t >= image2[rrWidth + jj])
-                            {
-                                c2 <<= 2;
-                                c2 = c2 + 1;
-                            }
-                            else
-                            {
-                                c2 <<= 2;
-                            }
-                        }
-                    }
-                }
-            }
-            else if (type == CV_MCT_MEAN_VARIATION)
-            {
-                //to do mean variation
-            }
-            dst1[distV + j] = c;
-            dst2[distV + j] = c2;
-        }
-    }
+    parallel_for_(cv::Range(n2, img1.rows - n2), parallelMctDescriptor(img1.data, img2.data, img1.cols, img2.rows, n2, t, (int *)dist.data, (int *)dist2.data, type));
 }
 /**The classical center symetric census
 A modified version of cs census which is comparing a pixel with its correspondent after the center
 **/
-void Descriptor::applySimetricCensus(const cv::Mat img1, const cv::Mat img2, int kernelSize, cv::Mat &dist, cv::Mat &dist2, const int type)
+void Descriptor::applySimetricCensus(const cv::Mat &img1, const cv::Mat &img2, int kernelSize, cv::Mat &dist, cv::Mat &dist2, const int type)
 {
-    int n2 = (kernelSize - 1) / 2;
-    int height = img1.rows;
-    int width = img1.cols;
-    uint8_t *image1 = img1.data;
-    uint8_t *image2 = img2.data;
-    int *dst1 = (int *)dist.data;
-    int *dst2 = (int *)dist2.data;
-    //#pragma omp parallel for
-    for (int i = + n2; i <= height - n2; i++)
-    {
-        int distV = (i) * width;
-        for (int j = n2; j <= width - n2; j++)
-        {
-            int c = 0;
-            int c2 = 0;
-            if (type == CV_CLASSIC_CENTER_SYMETRIC_CENSUS)
-            {
-                for (int ii = -n2; ii < 0; ii++)
-                {
-                    int rrWidth = (ii + i) * width;
-                    for (int jj = -n2; jj <= +n2; jj++)
-                    {
-                        if (ii != i || jj != j)
-                        {
-                            if (image1[(rrWidth + (jj + j))] > image1[((ii * (-1) + i) * width + (-1 * jj) + j)])
-                            {
-                                c = c + 1;
-                            }
-                            c = c * 2;
-                        }
-                        if (ii != i || jj != j)
-                        {
-                            if (image2[(rrWidth + (jj + j))] > image2[((ii * (-1) + i) * width + (-1 * jj) + j)])
-                            {
-                                c2 = c2 + 1;
-                            }
-                            c2 = c2 * 2;
-                        }
-                    }
-                }
-                for (int jj = -n2; jj < 0; jj++)
-                {
-                    if (image1[(i * width + (jj + j))] > image1[(i * width + (-1 * jj) + j)])
-                    {
-                        c = c + 1;
-                    }
-                    c = c * 2;
-                    if (image2[(i * width + (jj + j))] > image2[(i * width + (-1 * jj) + j)])
-                    {
-                        c2 = c2 + 1;
-                    }
-                    c2 = c2 * 2;
-                }
-            }
-            else if (type == CV_MODIFIED_CENTER_SIMETRIC_CENSUS)
-            {
-                for (int ii = i - n2; ii <= i + 1; ii++)
-                {
-                    int rrWidth = ii * width;
-                    int rrWidthC = (ii + n2) * width;
-                    for (int jj = j - n2; jj <= j + n2; jj += 2)
-                    {
-                        if (ii != i || jj != j)
-                        {
-                            if (image1[(rrWidth + jj)] > image1[(rrWidthC + (jj + n2))])
-                            {
-                                c = c + 1;
-                            }
-                            c = c * 2;
-                        }
-                        if (ii != i || jj != j)
-                        {
-                            if (image2[(rrWidth + jj)] > image2[(rrWidthC + (jj + n2))])
-                            {
-                                c2 = c2 + 1;
-                            }
-                            c2 = c2 * 2;
-                        }
-                    }
-                }
-            }
-            dst1[(distV + j)] = c;
-            dst2[(distV + j)] = c2;
-        }
-    }
+    int n2 = (kernelSize - 1) >> 1;
+    parallel_for_(cv::Range(n2, img1.rows - n2), parallelSymetricCensus(img1.data, img2.data, img1.cols, img2.rows, n2, (int *)dist.data, (int *)dist2.data, type));
 }
 //!brief binary descriptor used in stereo correspondence
-void Descriptor::applyBrifeDescriptor(const cv::Mat image1, const cv::Mat image2, int kernelSize, cv::Mat &dist, cv::Mat &dist2)
+void Descriptor::applyBrifeDescriptor(const cv::Mat &image1, const cv::Mat &image2, int kernelSize, cv::Mat &dist, cv::Mat &dist2)
 {
     //TO DO
     //marked the variables in order to avoid warnings
-    (void) image1;
-    (void) image2;
-    (void) dist;
-    (void) dist2;
-    (void) kernelSize;
+    (void)image1;
+    (void)image2;
+    (void)dist;
+    (void)dist2;
+    (void)kernelSize;
 }
 //The classical Rank Transform
-void Descriptor::applyRTDescriptor(const cv::Mat image1, const cv::Mat image2, int kernelSize, cv::Mat &dist, cv::Mat &dist2)
+void Descriptor::applyRTDescriptor(const cv::Mat &image1, const cv::Mat &image2, int kernelSize, cv::Mat &dist, cv::Mat &dist2)
 {
     //TO DO
     //marked the variables in order to avoid warnings
-    (void) image1;
-    (void) image2;
-    (void) dist;
-    (void) dist2;
-    (void) kernelSize;
+    (void)image1;
+    (void)image2;
+    (void)dist;
+    (void)dist2;
+    (void)kernelSize;
 }
 Descriptor::~Descriptor(void)
...
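For reference, the code each census loop above produces is a bit string built MSB-first over the kernel scan: every neighbor contributes one bit, set when the neighbor is brighter than the window center. A tiny standalone sketch of that encoding on a hypothetical 3x3 patch (not library code):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint8_t patch[3][3] = { { 10, 60, 20 },
                                { 90, 50, 40 },
                                { 30, 80, 70 } };
        int c = 0;
        for (int i = 0; i < 3; i++)
            for (int j = 0; j < 3; j++)
            {
                if (i == 1 && j == 1) continue;          //skip the center pixel
                c = c * 2 + (patch[i][j] > patch[1][1]); //append one comparison bit
            }
        printf("census = 0x%02x\n", c);                  //prints 0x53 for this patch
        return 0;
    }

The sparse variant simply steps the kernel by 2 in both directions, halving the number of comparisons so larger block sizes stay affordable.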
@@ -41,8 +41,7 @@
 //the use of this software, even if advised of the possibility of such damage.
 /*****************************************************************************************************************\
 * The interface contains the main descriptors that will be implemented in the descriptor class                    *
-*                                                                                                                  *
 \******************************************************************************************************************/
 #include "precomp.hpp"
@@ -50,42 +49,341 @@
 #ifndef _OPENCV_DESCRIPTOR_HPP_
 #define _OPENCV_DESCRIPTOR_HPP_
 #ifdef __cplusplus
-#define CV_DENSE_CENSUS 0
-#define CV_SPARSE_CENSUS 1
-#define CV_MODIFIED_CENTER_SIMETRIC_CENSUS 0
-#define CV_CLASSIC_CENTER_SYMETRIC_CENSUS 1
-#define CV_STANDARD_MCT 0
-#define CV_MCT_MEAN_VARIATION 1
-#define CV_SSE_CENSUS 1
 namespace cv
 {
 namespace stereo
 {
-class Descriptor
-{
-public:
-    //Implementation for computing the Census transform on the given image
-    void applyCensusOnImage(const cv::Mat image, int kernelSize, cv::Mat &dist, const int type = 0);
-    //two variations of census applied on input images
-    //Implementation of a census transform which is taking into account just some of the pixels from the census kernel, thus allowing for larger block sizes
-    void applyCensusOnImages(const cv::Mat image1, cv::Mat image2, int kernelSize, cv::Mat &dist, cv::Mat &dist2, const int type = CV_SPARSE_CENSUS);
-    // STANDARD_MCT - Modified census which is memorizing for each pixel 2 bits and includes a tolerance to the pixel comparison
-    //MCT_MEAN_VARIATION - Implementation of a modified census transform which is also taking into account the variation to the mean of the window, not just the center pixel
-    void applyMCTOnImages(const cv::Mat image1, const cv::Mat image2, int kernelSize, int t, cv::Mat &dist, cv::Mat &dist2, const int type = CV_STANDARD_MCT);
-    //The classical center symetric census
-    //A modified version of cs census which is comparing a pixel with its correspondent after the center
-    void applySimetricCensus(const cv::Mat image1, const cv::Mat image2, int kernelSize, cv::Mat &dist, cv::Mat &dist2, const int type = CV_CLASSIC_CENTER_SYMETRIC_CENSUS);
-    //The brief binary descriptor
-    void applyBrifeDescriptor(const cv::Mat image1, const cv::Mat image2, int kernelSize, cv::Mat &dist, cv::Mat &dist2);
-    //The classical Rank Transform
-    void applyRTDescriptor(const cv::Mat image1, const cv::Mat image2, int kernelSize, cv::Mat &dist, cv::Mat &dist2);
-    Descriptor();
-    ~Descriptor(void);
-};
+enum ClassicCensus { Dense_Census, Sparse_Census };
+enum SymetricCensus { ClassicCenterSymetricCensus, ModifiedCenterSymetricCensus };
+enum MCT { StandardMct, MeanVariation };
+enum CensusImage { SSE, NonSSE };
+//!class implemented to run the census descriptor in parallel
+class parallelCensus : public ParallelLoopBody
+{
+private:
+    uint8_t *image1, *image2;
+    int *dst1, *dst2;
+    int n2, width, height, type;
+public:
+    parallelCensus(uint8_t *img1, uint8_t *img2, int w, int h, int k2, int *distance1, int *distance2, const int t) :
+        image1(img1), image2(img2), dst1(distance1), dst2(distance2), n2(k2), width(w), height(h), type(t) {}
+    virtual void operator()(const cv::Range &r) const
+    {
+        int step = (type == ClassicCensus::Sparse_Census) ? 2 : 1;
+        for (int i = r.start; i <= r.end; i++)
+        {
+            int rWidth = i * width;
+            for (int j = n2; j <= width - n2; j++)
+            {
+                //initialize the costs for the 2 census descriptors
+                int c = 0;
+                int c2 = 0;
+                for (int ii = i - n2; ii <= i + n2; ii += step)
+                {
+                    int rrWidth = ii * width;
+                    for (int jj = j - n2; jj <= j + n2; jj += step)
+                    {
+                        if (ii != i || jj != j)
+                        {
+                            //compare a pixel with the center from the kernel
+                            if (image1[rrWidth + jj] > image1[rWidth + j])
+                            {
+                                c = c + 1;
+                            }
+                            c = c * 2;
+                        }
+                        if (ii != i || jj != j)
+                        {
+                            //compare pixel with center for image 2
+                            if (image2[rrWidth + jj] > image2[rWidth + j])
+                            {
+                                c2 = c2 + 1;
+                            }
+                            c2 = c2 * 2;
+                        }
+                    }
+                }
+                dst1[(rWidth + j)] = c;
+                dst2[(rWidth + j)] = c2;
+            }
+        }
+    }
+};
+//!class that implements the census descriptor on single images
+class singleImageCensus : public ParallelLoopBody
+{
+private:
+    uint8_t *image;
+    int *dst;
+    int n2, width, height, type;
+public:
+    singleImageCensus(uint8_t *img1, int w, int h, int k2, int *distance1, const int t) :
+        image(img1), dst(distance1), n2(k2), width(w), height(h), type(t) {}
+    virtual void operator()(const cv::Range &r) const
+    {
+        for (int i = r.start; i <= r.end; i++)
+        {
+            int rWidth = i * width;
+            for (int j = n2; j <= width - n2; j++)
+            {
+                if (type == CensusImage::SSE)
+                {
+                    //to do
+                }
+                else
+                {
+                    int c = 0;
+                    for (int ii = i - n2; ii <= i + n2; ii++)
+                    {
+                        int rrWidth = ii * width;
+                        for (int jj = j - n2; jj <= j + n2; jj++)
+                        {
+                            if (ii != i || jj != j)
+                            {
+                                if (image[(rrWidth + jj)] > image[(rWidth + j)])
+                                {
+                                    c = c + 1;
+                                }
+                                c = c * 2;
+                            }
+                        }
+                    }
+                    dst[(rWidth + j)] = c;
+                }
+            }
+        }
+    }
+};
+//!parallel implementation of MCT type of descriptors
+class parallelMctDescriptor : public ParallelLoopBody
+{
+private:
+    uint8_t *image1, *image2;
+    int *dst1, *dst2;
+    int n2, t, width, height, type;
+public:
+    parallelMctDescriptor(uint8_t *img1, uint8_t *img2, int w, int h, int k2, int threshold, int *distance1, int *distance2, const int tip) :
+        image1(img1), image2(img2), dst1(distance1), dst2(distance2), n2(k2), t(threshold), width(w), height(h), type(tip) {}
+    virtual void operator()(const cv::Range &r) const
+    {
+        for (int i = r.start; i <= r.end; i++)
+        {
+            int rWidth = i * width;
+            int distV = (i) * width;
+            for (int j = n2 + 2; j <= width - n2 - 2; j++)
+            {
+                int c = 0;
+                int c2 = 0;
+                if (type == MCT::StandardMct)
+                {
+                    for (int ii = i - n2; ii <= i + n2; ii += 2)
+                    {
+                        int rrWidth = ii * width;
+                        for (int jj = j - n2; jj <= j + n2; jj += 2)
+                        {
+                            if (ii != i || jj != j)
+                            {
+                                if (image1[rrWidth + jj] > image1[rWidth + j] - t)
+                                {
+                                    c <<= 2;
+                                    c |= 0x3;
+                                }
+                                else if (image1[rWidth + j] - t < image1[rrWidth + jj] && image1[rWidth + j] + t >= image1[rrWidth + jj])
+                                {
+                                    c <<= 2;
+                                    c = c + 1;
+                                }
+                                else
+                                {
+                                    c <<= 2;
+                                }
+                            }
+                            if (ii != i || jj != j)
+                            {
+                                if (image2[rrWidth + jj] > image2[rWidth + j] - t)
+                                {
+                                    c2 <<= 2;
+                                    c2 |= 0x3;
+                                }
+                                else if (image2[rWidth + j] - t < image2[rrWidth + jj] && image2[rWidth + j] + t >= image2[rrWidth + jj])
+                                {
+                                    c2 <<= 2;
+                                    c2 = c2 + 1;
+                                }
+                                else
+                                {
+                                    c2 <<= 2;
+                                }
+                            }
+                        }
+                    }
+                    for (int ii = i - n2; ii <= i + n2; ii += 4)
+                    {
+                        int rrWidth = ii * width;
+                        for (int jj = j - n2; jj <= j + n2; jj += 4)
+                        {
+                            if (ii != i || jj != j)
+                            {
+                                if (image1[rrWidth + jj] > image1[rWidth + j] - t)
+                                {
+                                    c <<= 2;
+                                    c |= 0x3;
+                                }
+                                else if (image1[rWidth + j] - t < image1[rrWidth + jj] && image1[rWidth + j] + t >= image1[rrWidth + jj])
+                                {
+                                    c <<= 2;
+                                    c += 1;
+                                }
+                                else
+                                {
+                                    c <<= 2;
+                                }
+                            }
+                            if (ii != i || jj != j)
+                            {
+                                if (image2[rrWidth + jj] > image2[rWidth + j] - t)
+                                {
+                                    c2 <<= 2;
+                                    c2 |= 0x3;
+                                }
+                                else if (image2[rWidth + j] - t < image2[rrWidth + jj] && image2[rWidth + j] + t >= image2[rrWidth + jj])
+                                {
+                                    c2 <<= 2;
+                                    c2 = c2 + 1;
+                                }
+                                else
+                                {
+                                    c2 <<= 2;
+                                }
+                            }
+                        }
+                    }
+                }
+                else if (type == MCT::MeanVariation)
+                {
+                    //to do mean variation
+                }
+                dst1[distV + j] = c;
+                dst2[distV + j] = c2;
+            }
+        }
+    }
+};
+//!parallel implementation of the center symetric census
+class parallelSymetricCensus : public ParallelLoopBody
+{
+private:
+    uint8_t *image1, *image2;
+    int *dst1, *dst2;
+    int n2, width, height, type;
+public:
+    parallelSymetricCensus(uint8_t *img1, uint8_t *img2, int w, int h, int k2, int *distance1, int *distance2, const int t) :
+        image1(img1), image2(img2), dst1(distance1), dst2(distance2), n2(k2), width(w), height(h), type(t) {}
+    virtual void operator()(const cv::Range &r) const
+    {
+        for (int i = r.start; i <= r.end; i++)
+        {
+            int distV = (i) * width;
+            for (int j = n2; j <= width - n2; j++)
+            {
+                int c = 0;
+                int c2 = 0;
+                //the classic center symetric census which compares the current pixel with its symetric, not its center
+                if (type == SymetricCensus::ClassicCenterSymetricCensus)
+                {
+                    for (int ii = -n2; ii < 0; ii++)
+                    {
+                        int rrWidth = (ii + i) * width;
+                        for (int jj = -n2; jj <= +n2; jj++)
+                        {
+                            if (ii != i || jj != j)
+                            {
+                                if (image1[(rrWidth + (jj + j))] > image1[((ii * (-1) + i) * width + (-1 * jj) + j)])
+                                {
+                                    c = c + 1;
+                                }
+                                c = c * 2;
+                            }
+                            if (ii != i || jj != j)
+                            {
+                                if (image2[(rrWidth + (jj + j))] > image2[((ii * (-1) + i) * width + (-1 * jj) + j)])
+                                {
+                                    c2 = c2 + 1;
+                                }
+                                c2 = c2 * 2;
+                            }
+                        }
+                    }
+                    for (int jj = -n2; jj < 0; jj++)
+                    {
+                        if (image1[(i * width + (jj + j))] > image1[(i * width + (-1 * jj) + j)])
+                        {
+                            c = c + 1;
+                        }
+                        c = c * 2;
+                        if (image2[(i * width + (jj + j))] > image2[(i * width + (-1 * jj) + j)])
+                        {
+                            c2 = c2 + 1;
+                        }
+                        c2 = c2 * 2;
+                    }
+                }
+                //a modified version of cs census which compares each pixel with its correspondent from
+                //the same distance from the center
+                else if (type == SymetricCensus::ModifiedCenterSymetricCensus)
+                {
+                    for (int ii = i - n2; ii <= i + 1; ii++)
+                    {
+                        int rrWidth = ii * width;
+                        int rrWidthC = (ii + n2) * width;
+                        for (int jj = j - n2; jj <= j + n2; jj += 2)
+                        {
+                            if (ii != i || jj != j)
+                            {
+                                if (image1[(rrWidth + jj)] > image1[(rrWidthC + (jj + n2))])
+                                {
+                                    c = c + 1;
+                                }
+                                c = c * 2;
+                            }
+                            if (ii != i || jj != j)
+                            {
+                                if (image2[(rrWidth + jj)] > image2[(rrWidthC + (jj + n2))])
+                                {
+                                    c2 = c2 + 1;
+                                }
+                                c2 = c2 * 2;
+                            }
+                        }
+                    }
+                }
+                dst1[(distV + j)] = c;
+                dst2[(distV + j)] = c2;
+            }
+        }
+    }
+};
+class Descriptor
+{
+public:
+    //Implementation for computing the Census transform on the given image
+    void applyCensusOnImage(const cv::Mat &image, int kernelSize, cv::Mat &dist, const int type = 0);
+    //two variations of census applied on input images
+    //Implementation of a census transform which is taking into account just some of the pixels from the census kernel, thus allowing for larger block sizes
+    void applyCensusOnImages(const cv::Mat &image1, cv::Mat &image2, int kernelSize, cv::Mat &dist, cv::Mat &dist2, const int type = ClassicCensus::Sparse_Census);
+    // STANDARD_MCT - Modified census which is memorizing for each pixel 2 bits and includes a tolerance to the pixel comparison
+    //MCT_MEAN_VARIATION - Implementation of a modified census transform which is also taking into account the variation to the mean of the window, not just the center pixel
+    void applyMCTOnImages(const cv::Mat &image1, const cv::Mat &image2, int kernelSize, int t, cv::Mat &dist, cv::Mat &dist2, const int type = MCT::StandardMct);
+    //The classical center symetric census
+    //A modified version of cs census which is comparing a pixel with its correspondent after the center
+    void applySimetricCensus(const cv::Mat &image1, const cv::Mat &image2, int kernelSize, cv::Mat &dist, cv::Mat &dist2, const int type = SymetricCensus::ClassicCenterSymetricCensus);
+    //The brief binary descriptor
+    void applyBrifeDescriptor(const cv::Mat &image1, const cv::Mat &image2, int kernelSize, cv::Mat &dist, cv::Mat &dist2);
+    //The classical Rank Transform
+    void applyRTDescriptor(const cv::Mat &image1, const cv::Mat &image2, int kernelSize, cv::Mat &dist, cv::Mat &dist2);
+    Descriptor();
+    ~Descriptor(void);
+};
 }
 }
 #endif
 #endif
...
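With the macros replaced by the enums above, a call site selects a census variant by enum value. A hypothetical usage sketch of the refactored interface (the images, kernel size, and output allocation are assumptions, not part of this commit):

    #include <opencv2/core.hpp>
    #include "descriptor.hpp"

    void computeSparseCensus(const cv::Mat &left, cv::Mat &right)
    {
        //the descriptors write one 32-bit census code per pixel
        cv::Mat censusLeft(left.size(), CV_32S, cv::Scalar(0));
        cv::Mat censusRight(right.size(), CV_32S, cv::Scalar(0));
        cv::stereo::Descriptor desc;
        //9x9 kernel; Sparse_Census samples every other pixel in the window
        desc.applyCensusOnImages(left, right, 9, censusLeft, censusRight,
                                 cv::stereo::Sparse_Census);
    }

Note that applyCensusOnImages takes its second image by non-const reference, so the caller's right image cannot be const here.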
@@ -40,12 +40,10 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/*
This is a variation of
"Stereo Processing by Semiglobal Matching and Mutual Information"
by Heiko Hirschmuller.
We match blocks rather than individual pixels, thus the algorithm is called
SGBM (Semi-global block matching)
*/
@@ -62,8 +60,6 @@ namespace cv
typedef short DispType;
enum { NR = 16, NR2 = NR/2 };
struct StereoBinarySGBMParams
{
    StereoBinarySGBMParams()
@@ -96,7 +92,6 @@ namespace cv
    speckleRange = _speckleRange;
    mode = _mode;
}
int minDisparity;
int numDisparities;
int SADWindowSize;
@@ -109,7 +104,6 @@ namespace cv
int disp12MaxDiff;
int mode;
};
/*
For each pixel row1[x], max(-maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
and for each disparity minD<=d<maxD the function
@@ -117,7 +111,6 @@ namespace cv
row1[x] and row2[x-d]. The subpixel algorithm from
"Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
is used, hence the suffix BT.
the temporary buffer should contain width2*2 elements
*/
static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
@@ -131,25 +124,20 @@ namespace cv
int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
tab += tabOfs;
for( c = 0; c < cn*2; c++ )
{
    prow1[width*c] = prow1[width*c + width-1] =
    prow2[width*c] = prow2[width*c + width-1] = tab[0];
}
int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
if( cn == 1 )
{
    for( x = 1; x < width-1; x++ )
    {
        prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
        prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
        prow1[x+width] = row1[x];
        prow2[width-1-x+width] = row2[x];
    }
@@ -161,21 +149,17 @@ namespace cv
        prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
        prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
        prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
        prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
        prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
        prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
        prow1[x+width*3] = row1[x*3];
        prow1[x+width*4] = row1[x*3+1];
        prow1[x+width*5] = row1[x*3+2];
        prow2[width-1-x+width*3] = row2[x*3];
        prow2[width-1-x+width*4] = row2[x*3+1];
        prow2[width-1-x+width*5] = row2[x*3+2];
    }
}
memset( cost, 0, width1*D*sizeof(cost[0]) );
buffer -= minX2;
@@ -184,12 +168,10 @@ namespace cv
#if CV_SSE2
volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
#if 1
for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
{
    int diff_scale = c < cn ? 0 : 2;
    // precompute
    // v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
    // v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
@@ -203,7 +185,6 @@ namespace cv
        buffer[x] = (PixType)v0;
        buffer[x + width2] = (PixType)v1;
    }
    for( x = minX1; x < maxX1; x++ )
    {
        int u = prow1[x];
@@ -211,14 +192,12 @@ namespace cv
        int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
        int u0 = std::min(ul, ur); u0 = std::min(u0, u);
        int u1 = std::max(ul, ur); u1 = std::max(u1, u);
#if CV_SSE2
        if( useSIMD )
        {
            __m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0);
            __m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128();
            __m128i ds = _mm_cvtsi32_si128(diff_scale);
            for( int d = minD; d < maxD; d += 16 )
            {
                __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d));
@@ -227,10 +206,8 @@ namespace cv
                __m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u));
                __m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v));
                __m128i diff = _mm_min_epu8(c0, c1);
                c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
                c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
                _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_srl_epi16(_mm_unpacklo_epi8(diff,z), ds)));
                _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_srl_epi16(_mm_unpackhi_epi8(diff,z), ds)));
            }
@@ -245,7 +222,6 @@ namespace cv
                int v1 = buffer[width-x-1 + d + width2];
                int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
                int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);
                cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
            }
        }
@@ -286,25 +262,20 @@ namespace cv
    }
#endif
}
/*
computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
minD <= d < maxD.
disp2full is the reverse disparity map, that is:
disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
note that disp1buf will have the same size as the roi and
disp2full will have the same size as img1 (or img2).
On exit disp2buf is not the final disparity, it is an intermediate result that becomes
final after all the tiles are processed.
the disparity in disp1buf is written with sub-pixel accuracy
(4 fractional bits, see StereoSGBM::DISP_SCALE),
using quadratic interpolation, while the disparity in disp2buf
is written as is, without interpolation.
disp2cost also has the same size as img1 (or img2).
It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
*/
@@ -332,7 +303,6 @@ namespace cv
const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
const int DISP_SCALE = (1 << DISP_SHIFT);
const CostType MAX_COST = SHRT_MAX;
int minD = params.minDisparity, maxD = minD + params.numDisparities;
Size SADWindowSize;
SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
@@ -349,28 +319,22 @@ namespace cv
int npasses = fullDP ? 2 : 1;
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
PixType clipTab[TAB_SIZE];
for( k = 0; k < TAB_SIZE; k++ )
    clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
if( minX1 >= maxX1 )
{
    disp1 = Scalar::all(INVALID_DISP_SCALED);
    return;
}
CV_Assert( D % 16 == 0 );
// NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
// if you change NR, please, modify the loop as well.
int D2 = D+16, NRD2 = NR2*D2;
// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
// for 8-way dynamic programming we need the current row and
// the previous row, i.e. 2 rows in total
const int NLR = 2;
const int LrBorder = NLR - 1;
// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
// we keep pixel difference cost (C) and the summary cost over NR directions (S).
// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
@@ -383,29 +347,23 @@ namespace cv
    CSBufSize*2*sizeof(CostType) + // C, S
    width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
    width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
if( buffer.empty() || !buffer.isContinuous() ||
    buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
    buffer.create(1, (int)totalBufSize, CV_8U);
// summary cost over different (nDirs) directions
CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
CostType* Sbuf = Cbuf + CSBufSize;
CostType* hsumBuf = Sbuf + CSBufSize;
CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
DispType* disp2ptr = (DispType*)(disp2cost + width);
PixType* tempBuf = (PixType*)(disp2ptr + width);
// add P2 to every C(x,y). it saves a few operations in the inner loops
for( k = 0; k < width1*D; k++ )
    Cbuf[k] = (CostType)P2;
for( int pass = 1; pass <= npasses; pass++ )
{
    int x1, y1, x2, y2, dx, dy;
    if( pass == 1 )
    {
        y1 = 0; y2 = height; dy = 1;
@@ -416,9 +374,7 @@ namespace cv
        y1 = height-1; y2 = -1; dy = -1;
        x1 = width1-1; x2 = -1; dx = -1;
    }
    CostType *Lr[NLR]={0}, *minLr[NLR]={0};
    for( k = 0; k < NLR; k++ )
    {
        // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
@@ -431,18 +387,15 @@ namespace cv
        minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
        memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
    }
    for( int y = y1; y != y2; y += dy )
    {
        int x, d;
        DispType* disp1ptr = disp1.ptr<DispType>(y);
        CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
        CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
        if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
        {
            int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
            for( k = dy1; k <= dy2; k++ )
            {
                CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
@@ -450,7 +403,6 @@ namespace cv
                if( k < height )
                {
                    calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
                    memset(hsumAdd, 0, D*sizeof(CostType));
                    for( x = 0; x <= SW2*D; x += D )
                    {
@@ -503,13 +455,11 @@ namespace cv
                    {
                        const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                        const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
                        for( d = 0; d < D; d++ )
                            hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                    }
                }
            }
            if( y == 0 )
            {
                int scale = k == 0 ? SH2 + 1 : 1;
@@ -517,18 +467,15 @@ namespace cv
                    C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
            }
        }
        // also, clear the S buffer
        for( k = 0; k < width1*D; k++ )
            S[k] = 0;
        }
        // clear the left and the right borders
        memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
        memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
        memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
        memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
        /*
        [formula 13 in the paper]
        compute L_r(p, d) = C(p, d) +
@@ -550,87 +497,65 @@ namespace cv
        for( x = x1; x != x2; x += dx )
        {
            int xm = x*NR2, xd = xm*D2;
            int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
            int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
            CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
            CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
            CostType* Lr_p2 = Lr[1] + xd + D2*2;
            CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
            Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
            Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
            CostType* Lr_p = Lr[0] + xd;
            const CostType* Cp = C + x*D;
            CostType* Sp = S + x*D;
#if CV_SSE2
            if( useSIMD )
            {
                __m128i _P1 = _mm_set1_epi16((short)P1);
                __m128i _delta0 = _mm_set1_epi16((short)delta0);
                __m128i _delta1 = _mm_set1_epi16((short)delta1);
                __m128i _delta2 = _mm_set1_epi16((short)delta2);
                __m128i _delta3 = _mm_set1_epi16((short)delta3);
                __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
                for( d = 0; d < D; d += 8 )
                {
                    __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
                    __m128i L0, L1, L2, L3;
                    L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
                    L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
                    L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
                    L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
                    L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
                    L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
                    L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
                    L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
                    L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
                    L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
                    L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
                    L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
                    L0 = _mm_min_epi16(L0, _delta0);
                    L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
                    L1 = _mm_min_epi16(L1, _delta1);
                    L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
                    L2 = _mm_min_epi16(L2, _delta2);
                    L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
                    L3 = _mm_min_epi16(L3, _delta3);
                    L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
                    _mm_store_si128( (__m128i*)(Lr_p + d), L0);
                    _mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
                    _mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
                    _mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
                    __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
                    __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
                    t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
                    _minL0 = _mm_min_epi16(_minL0, t0);
                    __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
                    L0 = _mm_adds_epi16(L0, L1);
                    L2 = _mm_adds_epi16(L2, L3);
                    Sval = _mm_adds_epi16(Sval, L0);
                    Sval = _mm_adds_epi16(Sval, L2);
                    _mm_store_si128((__m128i*)(Sp + d), Sval);
                }
                _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
                _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
            }
@@ -703,22 +628,18 @@ namespace cv
                __m128i _minL0 = _mm_set1_epi16((short)minL0);
                __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
                __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
                for( d = 0; d < D; d += 8 )
                {
                    __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
                    L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
                    L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
                    L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
                    L0 = _mm_min_epi16(L0, _delta0);
                    L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
                    _mm_store_si128((__m128i*)(Lr_p + d), L0);
                    _minL0 = _mm_min_epi16(_minL0, L0);
                    L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
                    _mm_store_si128((__m128i*)(Sp + d), L0);
                    __m128i mask = _mm_cmpgt_epi16(_minS, L0);
                    _minS = _mm_min_epi16(_minS, L0);
                    _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
@@ -727,22 +648,17 @@ namespace cv
                short CV_DECL_ALIGNED(16) bestDispBuf[8];
                _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
                _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
                _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
                _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
                __m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
                qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
                qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
                minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
                minS = (CostType)_mm_cvtsi128_si32(qS);
                qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
                qS = _mm_cmpeq_epi16(_minS, qS);
                int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
                bestDisp = bestDispBuf[LSBTab[idx]];
            }
            else
@@ -751,10 +667,8 @@ namespace cv
                for( d = 0; d < D; d++ )
                {
                    int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
                    Lr_p[d] = (CostType)L0;
                    minL0 = std::min(minL0, L0);
                    int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
                    if( Sval < minS )
                    {
@@ -777,7 +691,6 @@ namespace cv
            }
        }
    }
    for( d = 0; d < D; d++ )
    {
        if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
@@ -792,7 +705,6 @@ namespace cv
        disp2cost[_x2] = (CostType)minS;
        disp2ptr[_x2] = (DispType)(d + minD);
    }
    if( 0 < d && d < D-1 )
    {
        // do subpixel quadratic interpolation:
@@ -805,7 +717,6 @@ namespace cv
        d *= DISP_SCALE;
        disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
    }
    for( x = minX1; x < maxX1; x++ )
    {
        // we round the computed disparity both towards -inf and +inf and check
@@ -822,14 +733,12 @@ namespace cv
        disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
    }
}
// now shift the cyclic buffers
std::swap( Lr[0], Lr[1] );
std::swap( minLr[0], minLr[1] );
}
}
}
class StereoBinarySGBMImpl : public StereoBinarySGBM
{
public:
@@ -837,7 +746,6 @@ namespace cv
    {
        params = StereoBinarySGBMParams();
    }
    StereoBinarySGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
        int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
        int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
@@ -848,16 +756,13 @@ namespace cv
        _uniquenessRatio, _speckleWindowSize, _speckleRange,
        _mode );
    }
    void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr )
    {
        Mat left = leftarr.getMat(), right = rightarr.getMat();
        CV_Assert( left.size() == right.size() && left.type() == right.type() &&
            left.depth() == CV_8U );
        disparr.create( left.size(), CV_16S );
        Mat disp = disparr.getMat();
        computeDisparityBinarySGBM( left, right, disp, params, buffer );
        medianBlur(disp, disp, 3);
@@ -865,7 +770,6 @@ namespace cv
        filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
            StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
    }
    int getMinDisparity() const { return params.minDisparity; }
    void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
...
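For reference, the truncated "[formula 13 in the paper]" comment above refers to Hirschmuller's path cost recurrence, which the Lr/minLr updates in this file implement for each direction r:

$$
L_r(p,d) = C(p,d) + \min\Big( L_r(p-r,d),\; L_r(p-r,d-1)+P_1,\; L_r(p-r,d+1)+P_1,\; \min_k L_r(p-r,k)+P_2 \Big) - \min_k L_r(p-r,k)
$$

Subtracting the previous pixel's minimum keeps L_r from growing without bound along a path. In the code the subtracted quantity appears as delta0..delta3, precomputed as minLr[...] + P2; the extra P2 is balanced out because P2 was pre-added to every C(x,y), as the earlier comment notes.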