I have put all the modules in the stereo namespace

changed the ptr<StereBinaryBM> to ptr<cv::stereo::StereoBinaryBM> modified the documentation modified documentation for the stereo_c documentation doc fixed two issues modfified the precomp.hpp header by explicitly adding the cvdef header from core modified comments for documentation for stereo and removed some headers added a header and modified some function definition test test 2 changed exports_w to exports removed the correct matches module

I have put all the modules in the stereo namespace
changed the ptr<StereBinaryBM> to ptr<cv::stereo::StereoBinaryBM> modified the documentation modified documentation for the stereo_c documentation doc fixed two issues modfified the precomp.hpp header by explicitly adding the cvdef header from core modified comments for documentation for stereo and removed some headers added a header and modified some function definition test test 2 changed exports_w to exports removed the correct matches module
9b55c04e · Muresan Mircea Paul · 0d1fd8de · 9b55c04e · 9b55c04e · 9b55c04e
Commit 9b55c04e authored Jun 10, 2015 by Muresan Mircea Paul
8 changed files
--- a/modules/stereo/include/opencv2/stereo.hpp
+++ b/modules/stereo/include/opencv2/stereo.hpp
@@ -47,218 +47,232 @@
 #include "opencv2/core.hpp"
 #include "opencv2/features2d.hpp"
 #include "opencv2/core/affine.hpp"
+#include "opencv2/core/cvdef.h"

-namespace cv
-{
-CV_EXPORTS_W void correctMatches( InputArray F, InputArray points1, InputArray points2,
-                                  OutputArray newPoints1, OutputArray newPoints2 );
-
-/** @brief Filters off small noise blobs (speckles) in the disparity map
-
-@param img The input 16-bit signed disparity image
-@param newVal The disparity value used to paint-off the speckles
-@param maxSpeckleSize The maximum speckle size to consider it a speckle. Larger blobs are not
-affected by the algorithm
-@param maxDiff Maximum difference between neighbor disparity pixels to put them into the same
-blob. Note that since StereoBM, StereoSGBM and may be other algorithms return a fixed-point
-disparity map, where disparity values are multiplied by 16, this scale factor should be taken into
-account when specifying this parameter value.
-@param buf The optional temporary buffer to avoid memory allocation within the function.
- */
-CV_EXPORTS_W void filterSpeckles( InputOutputArray img, double newVal,
-                                  int maxSpeckleSize, double maxDiff,
-                                  InputOutputArray buf = noArray() );

-//! computes valid disparity ROI from the valid ROIs of the rectified images (that are returned by cv::stereoRectify())
-CV_EXPORTS_W Rect getValidDisparityROI( Rect roi1, Rect roi2,
-                                        int minDisparity, int numberOfDisparities,
-                                        int SADWindowSize );
+/**
+@defgroup stereo Stereo Correspondance Algorithms

-//! validates disparity using the left-right check. The matrix "cost" should be computed by the stereo correspondence algorithm
-CV_EXPORTS_W void validateDisparity( InputOutputArray disparity, InputArray cost,
-                                     int minDisparity, int numberOfDisparities,
-                                     int disp12MaxDisp = 1 );
-
-/** @brief The base class for stereo correspondence algorithms.
 */
-class CV_EXPORTS_W StereoMatcher : public Algorithm
-{
-public:
-    enum { DISP_SHIFT = 4,
-           DISP_SCALE = (1 << DISP_SHIFT)
-         };
-
-    /** @brief Computes disparity map for the specified stereo pair
-
-    @param left Left 8-bit single-channel image.
-    @param right Right image of the same size and the same type as the left one.
-    @param disparity Output disparity map. It has the same size as the input images. Some algorithms,
-    like StereoBM or StereoSGBM compute 16-bit fixed-point disparity map (where each disparity value
-    has 4 fractional bits), whereas other algorithms output 32-bit floating-point disparity map.
-     */
-    CV_WRAP virtual void compute( InputArray left, InputArray right,
-                                  OutputArray disparity ) = 0;
-
-    CV_WRAP virtual int getMinDisparity() const = 0;
-    CV_WRAP virtual void setMinDisparity(int minDisparity) = 0;
-
-    CV_WRAP virtual int getNumDisparities() const = 0;
-    CV_WRAP virtual void setNumDisparities(int numDisparities) = 0;
-
-    CV_WRAP virtual int getBlockSize() const = 0;
-    CV_WRAP virtual void setBlockSize(int blockSize) = 0;

-    CV_WRAP virtual int getSpeckleWindowSize() const = 0;
-    CV_WRAP virtual void setSpeckleWindowSize(int speckleWindowSize) = 0;
-
-    CV_WRAP virtual int getSpeckleRange() const = 0;
-    CV_WRAP virtual void setSpeckleRange(int speckleRange) = 0;
-
-    CV_WRAP virtual int getDisp12MaxDiff() const = 0;
-    CV_WRAP virtual void setDisp12MaxDiff(int disp12MaxDiff) = 0;
-};
-
-
-/** @brief Class for computing stereo correspondence using the block matching algorithm, introduced and
-contributed to OpenCV by K. Konolige.
- */
-class CV_EXPORTS_W StereoBinaryBM : public StereoMatcher
-{
-public:
-    enum { PREFILTER_NORMALIZED_RESPONSE = 0,
-           PREFILTER_XSOBEL              = 1
-         };
-
-    CV_WRAP virtual int getPreFilterType() const = 0;
-    CV_WRAP virtual void setPreFilterType(int preFilterType) = 0;
-
-    CV_WRAP virtual int getPreFilterSize() const = 0;
-    CV_WRAP virtual void setPreFilterSize(int preFilterSize) = 0;
-
-    CV_WRAP virtual int getPreFilterCap() const = 0;
-    CV_WRAP virtual void setPreFilterCap(int preFilterCap) = 0;
-
-    CV_WRAP virtual int getTextureThreshold() const = 0;
-    CV_WRAP virtual void setTextureThreshold(int textureThreshold) = 0;
-
-    CV_WRAP virtual int getUniquenessRatio() const = 0;
-    CV_WRAP virtual void setUniquenessRatio(int uniquenessRatio) = 0;
-
-    CV_WRAP virtual int getSmallerBlockSize() const = 0;
-    CV_WRAP virtual void setSmallerBlockSize(int blockSize) = 0;
-
-    CV_WRAP virtual Rect getROI1() const = 0;
-    CV_WRAP virtual void setROI1(Rect roi1) = 0;
-
-    CV_WRAP virtual Rect getROI2() const = 0;
-    CV_WRAP virtual void setROI2(Rect roi2) = 0;
-
-    /** @brief Creates StereoBM object
-
-    @param numDisparities the disparity search range. For each pixel algorithm will find the best
-    disparity from 0 (default minimum disparity) to numDisparities. The search range can then be
-    shifted by changing the minimum disparity.
-    @param blockSize the linear size of the blocks compared by the algorithm. The size should be odd
-    (as the block is centered at the current pixel). Larger block size implies smoother, though less
-    accurate disparity map. Smaller block size gives more detailed disparity map, but there is higher
-    chance for algorithm to find a wrong correspondence.
-
-    The function create StereoBM object. You can then call StereoBM::compute() to compute disparity for
-    a specific stereo pair.
-     */
-    CV_WRAP static Ptr<StereoBinaryBM> create(int numDisparities = 0, int blockSize = 21);
-};
-
-/** @brief The class implements the modified H. Hirschmuller algorithm @cite HH08 that differs from the original
-one as follows:
-
-   By default, the algorithm is single-pass, which means that you consider only 5 directions
-instead of 8. Set mode=StereoSGBM::MODE_HH in createStereoSGBM to run the full variant of the
-algorithm but beware that it may consume a lot of memory.
-   The algorithm matches blocks, not individual pixels. Though, setting blockSize=1 reduces the
-blocks to single pixels.
-   Mutual information cost function is not implemented. Instead, a simpler Birchfield-Tomasi
-sub-pixel metric from @cite BT98 is used. Though, the color images are supported as well.
-   Some pre- and post- processing steps from K. Konolige algorithm StereoBM are included, for
-example: pre-filtering (StereoBM::PREFILTER_XSOBEL type) and post-filtering (uniqueness
-check, quadratic interpolation and speckle filtering).
-
-@note
-   -   (Python) An example illustrating the use of the StereoSGBM matching algorithm can be found
-        at opencv_source_code/samples/python2/stereo_match.py
- */
-class CV_EXPORTS_W StereoBinarySGBM : public StereoMatcher
+namespace cv
 {
-public:
-    enum
-    {
-        MODE_SGBM = 0,
-        MODE_HH   = 1
-    };
-
-    CV_WRAP virtual int getPreFilterCap() const = 0;
-    CV_WRAP virtual void setPreFilterCap(int preFilterCap) = 0;
-
-    CV_WRAP virtual int getUniquenessRatio() const = 0;
-    CV_WRAP virtual void setUniquenessRatio(int uniquenessRatio) = 0;
-
-    CV_WRAP virtual int getP1() const = 0;
-    CV_WRAP virtual void setP1(int P1) = 0;
-
-    CV_WRAP virtual int getP2() const = 0;
-    CV_WRAP virtual void setP2(int P2) = 0;
-
-    CV_WRAP virtual int getMode() const = 0;
-    CV_WRAP virtual void setMode(int mode) = 0;
-
-    /** @brief Creates StereoSGBM object
-
-    @param minDisparity Minimum possible disparity value. Normally, it is zero but sometimes
-    rectification algorithms can shift images, so this parameter needs to be adjusted accordingly.
-    @param numDisparities Maximum disparity minus minimum disparity. The value is always greater than
-    zero. In the current implementation, this parameter must be divisible by 16.
-    @param blockSize Matched block size. It must be an odd number \>=1 . Normally, it should be
-    somewhere in the 3..11 range.
-    @param P1 The first parameter controlling the disparity smoothness. See below.
-    @param P2 The second parameter controlling the disparity smoothness. The larger the values are,
-    the smoother the disparity is. P1 is the penalty on the disparity change by plus or minus 1
-    between neighbor pixels. P2 is the penalty on the disparity change by more than 1 between neighbor
-    pixels. The algorithm requires P2 \> P1 . See stereo_match.cpp sample where some reasonably good
-    P1 and P2 values are shown (like 8\*number_of_image_channels\*SADWindowSize\*SADWindowSize and
-    32\*number_of_image_channels\*SADWindowSize\*SADWindowSize , respectively).
-    @param disp12MaxDiff Maximum allowed difference (in integer pixel units) in the left-right
-    disparity check. Set it to a non-positive value to disable the check.
-    @param preFilterCap Truncation value for the prefiltered image pixels. The algorithm first
-    computes x-derivative at each pixel and clips its value by [-preFilterCap, preFilterCap] interval.
-    The result values are passed to the Birchfield-Tomasi pixel cost function.
-    @param uniquenessRatio Margin in percentage by which the best (minimum) computed cost function
-    value should "win" the second best value to consider the found match correct. Normally, a value
-    within the 5-15 range is good enough.
-    @param speckleWindowSize Maximum size of smooth disparity regions to consider their noise speckles
-    and invalidate. Set it to 0 to disable speckle filtering. Otherwise, set it somewhere in the
-    50-200 range.
-    @param speckleRange Maximum disparity variation within each connected component. If you do speckle
-    filtering, set the parameter to a positive value, it will be implicitly multiplied by 16.
-    Normally, 1 or 2 is good enough.
-    @param mode Set it to StereoSGBM::MODE_HH to run the full-scale two-pass dynamic programming
-    algorithm. It will consume O(W\*H\*numDisparities) bytes, which is large for 640x480 stereo and
-    huge for HD-size pictures. By default, it is set to false .
-
-    The first constructor initializes StereoSGBM with all the default parameters. So, you only have to
-    set StereoSGBM::numDisparities at minimum. The second constructor enables you to set each parameter
-    to a custom value.
-     */
-    CV_WRAP static Ptr<StereoBinarySGBM> create(int minDisparity, int numDisparities, int blockSize,
-                                          int P1 = 0, int P2 = 0, int disp12MaxDiff = 0,
-                                          int preFilterCap = 0, int uniquenessRatio = 0,
-                                          int speckleWindowSize = 0, int speckleRange = 0,
-                                          int mode = StereoBinarySGBM::MODE_SGBM);
-};
-
+	namespace stereo
+	{
+		    
+//! @addtogroup stereo
+//! @{
+//		 void correctMatches( InputArray F, InputArray points1, InputArray points2,
+		//	OutputArray newPoints1, OutputArray newPoints2 );
+
+		/** @brief Filters off small noise blobs (speckles) in the disparity map
+
+		@param img The input 16-bit signed disparity image
+		@param newVal The disparity value used to paint-off the speckles
+		@param maxSpeckleSize The maximum speckle size to consider it a speckle. Larger blobs are not
+		affected by the algorithm
+		@param maxDiff Maximum difference between neighbor disparity pixels to put them into the same
+		blob. Note that since StereoBM, StereoSGBM and may be other algorithms return a fixed-point
+		disparity map, where disparity values are multiplied by 16, this scale factor should be taken into
+		account when specifying this parameter value.
+		@param buf The optional temporary buffer to avoid memory allocation within the function.
+		*/
+		 void filterSpeckles( InputOutputArray img, double newVal,
+			int maxSpeckleSize, double maxDiff,
+			InputOutputArray buf = noArray() );
+
+		//! computes valid disparity ROI from the valid ROIs of the rectified images (that are returned by cv::stereoRectify())
+		 Rect getValidDisparityROI( Rect roi1, Rect roi2,
+			int minDisparity, int numberOfDisparities,
+			int SADWindowSize );
+
+		//! validates disparity using the left-right check. The matrix "cost" should be computed by the stereo correspondence algorithm
+		 void validateDisparity( InputOutputArray disparity, InputArray cost,
+			int minDisparity, int numberOfDisparities,
+			int disp12MaxDisp = 1 );
+
+		/** @brief The base class for stereo correspondence algorithms.
+		*/
+		class StereoMatcher : public Algorithm
+		{
+		public:
+			enum { DISP_SHIFT = 4,
+				DISP_SCALE = (1 << DISP_SHIFT)
+			};
+
+			/** @brief Computes disparity map for the specified stereo pair
+
+			@param left Left 8-bit single-channel image.
+			@param right Right image of the same size and the same type as the left one.
+			@param disparity Output disparity map. It has the same size as the input images. Some algorithms,
+			like StereoBM or StereoSGBM compute 16-bit fixed-point disparity map (where each disparity value
+			has 4 fractional bits), whereas other algorithms output 32-bit floating-point disparity map.
+			*/
+			 virtual void compute( InputArray left, InputArray right,
+				OutputArray disparity ) = 0;
+
+			 virtual int getMinDisparity() const = 0;
+			 virtual void setMinDisparity(int minDisparity) = 0;
+
+			 virtual int getNumDisparities() const = 0;
+			 virtual void setNumDisparities(int numDisparities) = 0;
+
+			 virtual int getBlockSize() const = 0;
+			 virtual void setBlockSize(int blockSize) = 0;
+
+			 virtual int getSpeckleWindowSize() const = 0;
+			 virtual void setSpeckleWindowSize(int speckleWindowSize) = 0;
+
+			 virtual int getSpeckleRange() const = 0;
+			 virtual void setSpeckleRange(int speckleRange) = 0;
+
+			 virtual int getDisp12MaxDiff() const = 0;
+			 virtual void setDisp12MaxDiff(int disp12MaxDiff) = 0;
+		};
+
+
+		/** @brief Class for computing stereo correspondence using the block matching algorithm, introduced and
+		contributed to OpenCV by K. Konolige.
+		*/
+		class StereoBinaryBM : public StereoMatcher
+		{
+		public:
+			enum { PREFILTER_NORMALIZED_RESPONSE = 0,
+				PREFILTER_XSOBEL              = 1
+			};
+
+			 virtual int getPreFilterType() const = 0;
+			 virtual void setPreFilterType(int preFilterType) = 0;
+
+			 virtual int getPreFilterSize() const = 0;
+			 virtual void setPreFilterSize(int preFilterSize) = 0;
+
+			 virtual int getPreFilterCap() const = 0;
+			 virtual void setPreFilterCap(int preFilterCap) = 0;
+
+			 virtual int getTextureThreshold() const = 0;
+			 virtual void setTextureThreshold(int textureThreshold) = 0;
+
+			 virtual int getUniquenessRatio() const = 0;
+			 virtual void setUniquenessRatio(int uniquenessRatio) = 0;
+
+			 virtual int getSmallerBlockSize() const = 0;
+			 virtual void setSmallerBlockSize(int blockSize) = 0;
+
+			 virtual Rect getROI1() const = 0;
+			 virtual void setROI1(Rect roi1) = 0;
+
+			 virtual Rect getROI2() const = 0;
+			 virtual void setROI2(Rect roi2) = 0;
+
+			/** @brief Creates StereoBM object
+
+			@param numDisparities the disparity search range. For each pixel algorithm will find the best
+			disparity from 0 (default minimum disparity) to numDisparities. The search range can then be
+			shifted by changing the minimum disparity.
+			@param blockSize the linear size of the blocks compared by the algorithm. The size should be odd
+			(as the block is centered at the current pixel). Larger block size implies smoother, though less
+			accurate disparity map. Smaller block size gives more detailed disparity map, but there is higher
+			chance for algorithm to find a wrong correspondence.
+
+			The function create StereoBM object. You can then call StereoBM::compute() to compute disparity for
+			a specific stereo pair.
+			*/
+			CV_EXPORTS static Ptr< cv::stereo::StereoBinaryBM > create(int numDisparities = 0, int blockSize = 21);
+		};
+
+		/** @brief The class implements the modified H. Hirschmuller algorithm @cite HH08 that differs from the original
+		one as follows:
+
+		-   By default, the algorithm is single-pass, which means that you consider only 5 directions
+		instead of 8. Set mode=StereoSGBM::MODE_HH in createStereoSGBM to run the full variant of the
+		algorithm but beware that it may consume a lot of memory.
+		-   The algorithm matches blocks, not individual pixels. Though, setting blockSize=1 reduces the
+		blocks to single pixels.
+		-   Mutual information cost function is not implemented. Instead, a simpler Birchfield-Tomasi
+		sub-pixel metric from @cite BT98 is used. Though, the color images are supported as well.
+		-   Some pre- and post- processing steps from K. Konolige algorithm StereoBM are included, for
+		example: pre-filtering (StereoBM::PREFILTER_XSOBEL type) and post-filtering (uniqueness
+		check, quadratic interpolation and speckle filtering).
+
+		@note
+		-   (Python) An example illustrating the use of the StereoSGBM matching algorithm can be found
+		at opencv_source_code/samples/python2/stereo_match.py
+		*/
+		class StereoBinarySGBM : public StereoMatcher
+		{
+		public:
+			enum
+			{
+				MODE_SGBM = 0,
+				MODE_HH   = 1
+			};
+
+			 virtual int getPreFilterCap() const = 0;
+			 virtual void setPreFilterCap(int preFilterCap) = 0;
+
+			 virtual int getUniquenessRatio() const = 0;
+			 virtual void setUniquenessRatio(int uniquenessRatio) = 0;
+
+			 virtual int getP1() const = 0;
+			 virtual void setP1(int P1) = 0;
+
+			 virtual int getP2() const = 0;
+			 virtual void setP2(int P2) = 0;
+
+			 virtual int getMode() const = 0;
+			 virtual void setMode(int mode) = 0;
+
+			/** @brief Creates StereoSGBM object
+
+			@param minDisparity Minimum possible disparity value. Normally, it is zero but sometimes
+			rectification algorithms can shift images, so this parameter needs to be adjusted accordingly.
+			@param numDisparities Maximum disparity minus minimum disparity. The value is always greater than
+			zero. In the current implementation, this parameter must be divisible by 16.
+			@param blockSize Matched block size. It must be an odd number \>=1 . Normally, it should be
+			somewhere in the 3..11 range.
+			@param P1 The first parameter controlling the disparity smoothness. See below.
+			@param P2 The second parameter controlling the disparity smoothness. The larger the values are,
+			the smoother the disparity is. P1 is the penalty on the disparity change by plus or minus 1
+			between neighbor pixels. P2 is the penalty on the disparity change by more than 1 between neighbor
+			pixels. The algorithm requires P2 \> P1 . See stereo_match.cpp sample where some reasonably good
+			P1 and P2 values are shown (like 8\*number_of_image_channels\*SADWindowSize\*SADWindowSize and
+			32\*number_of_image_channels\*SADWindowSize\*SADWindowSize , respectively).
+			@param disp12MaxDiff Maximum allowed difference (in integer pixel units) in the left-right
+			disparity check. Set it to a non-positive value to disable the check.
+			@param preFilterCap Truncation value for the prefiltered image pixels. The algorithm first
+			computes x-derivative at each pixel and clips its value by [-preFilterCap, preFilterCap] interval.
+			The result values are passed to the Birchfield-Tomasi pixel cost function.
+			@param uniquenessRatio Margin in percentage by which the best (minimum) computed cost function
+			value should "win" the second best value to consider the found match correct. Normally, a value
+			within the 5-15 range is good enough.
+			@param speckleWindowSize Maximum size of smooth disparity regions to consider their noise speckles
+			and invalidate. Set it to 0 to disable speckle filtering. Otherwise, set it somewhere in the
+			50-200 range.
+			@param speckleRange Maximum disparity variation within each connected component. If you do speckle
+			filtering, set the parameter to a positive value, it will be implicitly multiplied by 16.
+			Normally, 1 or 2 is good enough.
+			@param mode Set it to StereoSGBM::MODE_HH to run the full-scale two-pass dynamic programming
+			algorithm. It will consume O(W\*H\*numDisparities) bytes, which is large for 640x480 stereo and
+			huge for HD-size pictures. By default, it is set to false .
+
+			The first constructor initializes StereoSGBM with all the default parameters. So, you only have to
+			set StereoSGBM::numDisparities at minimum. The second constructor enables you to set each parameter
+			to a custom value.
+			*/
+			 CV_EXPORTS static Ptr<cv::stereo::StereoBinarySGBM> create(int minDisparity, int numDisparities, int blockSize,
+				int P1 = 0, int P2 = 0, int disp12MaxDiff = 0,
+				int preFilterCap = 0, int uniquenessRatio = 0,
+				int speckleWindowSize = 0, int speckleRange = 0,
+				int mode = StereoBinarySGBM::MODE_SGBM);
+		};
+		//! @}
+	}//sterep
 } // cv

 #ifndef DISABLE_OPENCV_24_COMPATIBILITY
-#include "opencv2/calib3d/calib3d_c.h"
+#include "opencv2/stereo/stereo_c.h"
 #endif

 #endif
+
--- a/modules/stereo/include/opencv2/stereo/stereo.hpp
+++ b/modules/stereo/include/opencv2/stereo/stereo.hpp
@@ -46,3 +46,4 @@
 #endif

 #include "opencv2/stereo.hpp"
+
--- a/modules/stereo/include/opencv2/stereo/stereo_c.h
+++ b/modules/stereo/include/opencv2/stereo/stereo_c.h
@@ -52,17 +52,14 @@ extern "C" {

 /** @addtogroup stereo_c
  @{
-  */
+  **/

-/****************************************************************************************\
-*									 Stereo										         *
-\****************************************************************************************/
-/* stereo correspondence parameters and functions */
+	//! stereo correspondence parameters and functions 

 #define CV_STEREO_BM_NORMALIZED_RESPONSE  0
 #define CV_STEREO_BM_XSOBEL               1

-/* Block matching algorithm structure */
+//! Block matching algorithm structure 
 typedef struct CvStereoBinaryBMState
 {
    // pre-filtering (normalization of input images)
@@ -109,14 +106,15 @@ CVAPI(void) cvReleaseStereoBinaryBMState( CvStereoBinaryBMState** state );
 CVAPI(void) cvFindStereoCorrespondenceBinaryBM( const CvArr* left, const CvArr* right,
                                          CvArr* disparity, CvStereoBinaryBMState* state );

-CVAPI(CvRect) cvGetValidDisparityROI( CvRect roi1, CvRect roi2, int minDisparity,
+CVAPI(CvRect) cvStereoBinaryGetValidDisparityROI( CvRect roi1, CvRect roi2, int minDisparity,
                                      int numberOfDisparities, int SADWindowSize );

-CVAPI(void) cvValidateDisparity( CvArr* disparity, const CvArr* cost,
+CVAPI(void) cvStereoBinaryValidateDisparity( CvArr* disparity, const CvArr* cost,
                                 int minDisparity, int numberOfDisparities,
                                 int disp12MaxDiff CV_DEFAULT(1) );
-
+/** @} stereo_c */
 #ifdef __cplusplus
 } // extern "C"
 #endif
 #endif /* __OPENCV_STEREO_C_H__ */
+
--- a/modules/stereo/src/compat_binary_stereo.cpp
+++ b/modules/stereo/src/compat_binary_stereo.cpp
@@ -92,7 +92,7 @@ void cvFindStereoCorrespondenceBinaryBM( const CvArr* leftarr, const CvArr* righ

    CV_Assert( state != 0 );

-    cv::Ptr<cv::StereoBinaryBM> sm = cv::StereoBinaryBM::create(state->numberOfDisparities,
+	cv::Ptr<cv::stereo::StereoBinaryBM> sm = cv::stereo::StereoBinaryBM::create(state->numberOfDisparities,
                                                       state->SADWindowSize);
    sm->setPreFilterType(state->preFilterType);
    sm->setPreFilterSize(state->preFilterSize);
@@ -108,17 +108,18 @@ void cvFindStereoCorrespondenceBinaryBM( const CvArr* leftarr, const CvArr* righ
    sm->compute(left, right, disp);
 }

-CvRect cvGetValidDisparityROI( CvRect roi1, CvRect roi2, int minDisparity,
+CvRect cvStereoBinaryGetValidDisparityROI( CvRect roi1, CvRect roi2, int minDisparity,
                              int numberOfDisparities, int SADWindowSize )
 {
-    return (CvRect)cv::getValidDisparityROI( roi1, roi2, minDisparity,
+	return (CvRect)cv::stereo::getValidDisparityROI( roi1, roi2, minDisparity,
                                            numberOfDisparities, SADWindowSize );
 }

-void cvValidateDisparity( CvArr* _disp, const CvArr* _cost, int minDisparity,
+void cvStereoBinaryValidateDisparity( CvArr* _disp, const CvArr* _cost, int minDisparity,
                         int numberOfDisparities, int disp12MaxDiff )
 {
    cv::Mat disp = cv::cvarrToMat(_disp), cost = cv::cvarrToMat(_cost);
-    cv::validateDisparity( disp, cost, minDisparity, numberOfDisparities, disp12MaxDiff );
+    cv::stereo::validateDisparity( disp, cost, minDisparity, numberOfDisparities, disp12MaxDiff );
 }

+
--- a/modules/stereo/src/precomp.hpp
+++ b/modules/stereo/src/precomp.hpp
@@ -45,9 +45,10 @@
 #include "opencv2/stereo.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/features2d.hpp"
+#include "opencv2/core.hpp"
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/private.hpp"
-#include "opencv2/core.hpp"
+#include "opencv2/core/cvdef.h"
 #include "opencv2/highgui.hpp"

 #include <algorithm>

--- a/modules/stereo/src/stereo_binary_bm.cpp
+++ b/modules/stereo/src/stereo_binary_bm.cpp
@@ -51,686 +51,687 @@

 namespace cv
 {
-
-	struct StereoBinaryBMParams
+	namespace stereo
 	{
-		StereoBinaryBMParams(int _numDisparities = 64, int _SADWindowSize = 9)
+		struct StereoBinaryBMParams
 		{
-			preFilterType = StereoBinaryBM::PREFILTER_XSOBEL;
-			preFilterSize = 9;
-			preFilterCap = 31;
-			SADWindowSize = _SADWindowSize;
-			minDisparity = 0;
-			numDisparities = _numDisparities > 0 ? _numDisparities : 64;
-			textureThreshold = 10;
-			uniquenessRatio = 15;
-			speckleRange = speckleWindowSize = 0;
-			roi1 = roi2 = Rect(0, 0, 0, 0);
-			disp12MaxDiff = -1;
-			dispType = CV_16S;
-		}
-
-		int preFilterType;
-		int preFilterSize;
-		int preFilterCap;
-		int SADWindowSize;
-		int minDisparity;
-		int numDisparities;
-		int textureThreshold;
-		int uniquenessRatio;
-		int speckleRange;
-		int speckleWindowSize;
-		Rect roi1, roi2;
-		int disp12MaxDiff;
-		int dispType;
-	};
-
-	static void prefilterNorm(const Mat& src, Mat& dst, int winsize, int ftzero, uchar* buf)
-	{
-		int x, y, wsz2 = winsize / 2;
-		int* vsum = (int*)alignPtr(buf + (wsz2 + 1)*sizeof(vsum[0]), 32);
-		int scale_g = winsize*winsize / 8, scale_s = (1024 + scale_g) / (scale_g * 2);
-		const int OFS = 256 * 5, TABSZ = OFS * 2 + 256;
-		uchar tab[TABSZ];
-		const uchar* sptr = src.ptr();
-		int srcstep = (int)src.step;
-		Size size = src.size();
-
-		scale_g *= scale_s;
-
-		for (x = 0; x < TABSZ; x++)
-			tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero * 2 : x - OFS + ftzero);
-
-		for (x = 0; x < size.width; x++)
-			vsum[x] = (ushort)(sptr[x] * (wsz2 + 2));
+			StereoBinaryBMParams(int _numDisparities = 64, int _SADWindowSize = 9)
+			{
+				preFilterType = StereoBinaryBM::PREFILTER_XSOBEL;
+				preFilterSize = 9;
+				preFilterCap = 31;
+				SADWindowSize = _SADWindowSize;
+				minDisparity = 0;
+				numDisparities = _numDisparities > 0 ? _numDisparities : 64;
+				textureThreshold = 10;
+				uniquenessRatio = 15;
+				speckleRange = speckleWindowSize = 0;
+				roi1 = roi2 = Rect(0, 0, 0, 0);
+				disp12MaxDiff = -1;
+				dispType = CV_16S;
+			}

-		for (y = 1; y < wsz2; y++)
+			int preFilterType;
+			int preFilterSize;
+			int preFilterCap;
+			int SADWindowSize;
+			int minDisparity;
+			int numDisparities;
+			int textureThreshold;
+			int uniquenessRatio;
+			int speckleRange;
+			int speckleWindowSize;
+			Rect roi1, roi2;
+			int disp12MaxDiff;
+			int dispType;
+		};
+
+		static void prefilterNorm(const Mat& src, Mat& dst, int winsize, int ftzero, uchar* buf)
 		{
-			for (x = 0; x < size.width; x++)
-				vsum[x] = (ushort)(vsum[x] + sptr[srcstep*y + x]);
-		}
+			int x, y, wsz2 = winsize / 2;
+			int* vsum = (int*)alignPtr(buf + (wsz2 + 1)*sizeof(vsum[0]), 32);
+			int scale_g = winsize*winsize / 8, scale_s = (1024 + scale_g) / (scale_g * 2);
+			const int OFS = 256 * 5, TABSZ = OFS * 2 + 256;
+			uchar tab[TABSZ];
+			const uchar* sptr = src.ptr();
+			int srcstep = (int)src.step;
+			Size size = src.size();

-		for (y = 0; y < size.height; y++)
-		{
-			const uchar* top = sptr + srcstep*MAX(y - wsz2 - 1, 0);
-			const uchar* bottom = sptr + srcstep*MIN(y + wsz2, size.height - 1);
-			const uchar* prev = sptr + srcstep*MAX(y - 1, 0);
-			const uchar* curr = sptr + srcstep*y;
-			const uchar* next = sptr + srcstep*MIN(y + 1, size.height - 1);
-			uchar* dptr = dst.ptr<uchar>(y);
+			scale_g *= scale_s;
+
+			for (x = 0; x < TABSZ; x++)
+				tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero * 2 : x - OFS + ftzero);

 			for (x = 0; x < size.width; x++)
-				vsum[x] = (ushort)(vsum[x] + bottom[x] - top[x]);
+				vsum[x] = (ushort)(sptr[x] * (wsz2 + 2));

-			for (x = 0; x <= wsz2; x++)
+			for (y = 1; y < wsz2; y++)
 			{
-				vsum[-x - 1] = vsum[0];
-				vsum[size.width + x] = vsum[size.width - 1];
+				for (x = 0; x < size.width; x++)
+					vsum[x] = (ushort)(vsum[x] + sptr[srcstep*y + x]);
 			}

-			int sum = vsum[0] * (wsz2 + 1);
-			for (x = 1; x <= wsz2; x++)
-				sum += vsum[x];
+			for (y = 0; y < size.height; y++)
+			{
+				const uchar* top = sptr + srcstep*MAX(y - wsz2 - 1, 0);
+				const uchar* bottom = sptr + srcstep*MIN(y + wsz2, size.height - 1);
+				const uchar* prev = sptr + srcstep*MAX(y - 1, 0);
+				const uchar* curr = sptr + srcstep*y;
+				const uchar* next = sptr + srcstep*MIN(y + 1, size.height - 1);
+				uchar* dptr = dst.ptr<uchar>(y);

-			int val = ((curr[0] * 5 + curr[1] + prev[0] + next[0])*scale_g - sum*scale_s) >> 10;
-			dptr[0] = tab[val + OFS];
+				for (x = 0; x < size.width; x++)
+					vsum[x] = (ushort)(vsum[x] + bottom[x] - top[x]);
+
+				for (x = 0; x <= wsz2; x++)
+				{
+					vsum[-x - 1] = vsum[0];
+					vsum[size.width + x] = vsum[size.width - 1];
+				}
+
+				int sum = vsum[0] * (wsz2 + 1);
+				for (x = 1; x <= wsz2; x++)
+					sum += vsum[x];
+
+				int val = ((curr[0] * 5 + curr[1] + prev[0] + next[0])*scale_g - sum*scale_s) >> 10;
+				dptr[0] = tab[val + OFS];
+
+				for (x = 1; x < size.width - 1; x++)
+				{
+					sum += vsum[x + wsz2] - vsum[x - wsz2 - 1];
+					val = ((curr[x] * 4 + curr[x - 1] + curr[x + 1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
+					dptr[x] = tab[val + OFS];
+				}

-			for (x = 1; x < size.width - 1; x++)
-			{
 				sum += vsum[x + wsz2] - vsum[x - wsz2 - 1];
-				val = ((curr[x] * 4 + curr[x - 1] + curr[x + 1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
+				val = ((curr[x] * 5 + curr[x - 1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
 				dptr[x] = tab[val + OFS];
 			}
-
-			sum += vsum[x + wsz2] - vsum[x - wsz2 - 1];
-			val = ((curr[x] * 5 + curr[x - 1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
-			dptr[x] = tab[val + OFS];
 		}
-	}

-	static void
-		prefilterXSobel(const Mat& src, Mat& dst, int ftzero)
-	{
-		int x, y;
-		const int OFS = 256 * 4, TABSZ = OFS * 2 + 256;
-		uchar tab[TABSZ];
-		Size size = src.size();
+		static void
+			prefilterXSobel(const Mat& src, Mat& dst, int ftzero)
+		{
+			int x, y;
+			const int OFS = 256 * 4, TABSZ = OFS * 2 + 256;
+			uchar tab[TABSZ];
+			Size size = src.size();

-		for (x = 0; x < TABSZ; x++)
-			tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero * 2 : x - OFS + ftzero);
-		uchar val0 = tab[0 + OFS];
+			for (x = 0; x < TABSZ; x++)
+				tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero * 2 : x - OFS + ftzero);
+			uchar val0 = tab[0 + OFS];

 #if CV_SSE2
-		volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
+			volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
 #endif

-		for (y = 0; y < size.height - 1; y += 2)
-		{
-			const uchar* srow1 = src.ptr<uchar>(y);
-			const uchar* srow0 = y > 0 ? srow1 - src.step : size.height > 1 ? srow1 + src.step : srow1;
-			const uchar* srow2 = y < size.height - 1 ? srow1 + src.step : size.height > 1 ? srow1 - src.step : srow1;
-			const uchar* srow3 = y < size.height - 2 ? srow1 + src.step * 2 : srow1;
-			uchar* dptr0 = dst.ptr<uchar>(y);
-			uchar* dptr1 = dptr0 + dst.step;
+			for (y = 0; y < size.height - 1; y += 2)
+			{
+				const uchar* srow1 = src.ptr<uchar>(y);
+				const uchar* srow0 = y > 0 ? srow1 - src.step : size.height > 1 ? srow1 + src.step : srow1;
+				const uchar* srow2 = y < size.height - 1 ? srow1 + src.step : size.height > 1 ? srow1 - src.step : srow1;
+				const uchar* srow3 = y < size.height - 2 ? srow1 + src.step * 2 : srow1;
+				uchar* dptr0 = dst.ptr<uchar>(y);
+				uchar* dptr1 = dptr0 + dst.step;

-			dptr0[0] = dptr0[size.width - 1] = dptr1[0] = dptr1[size.width - 1] = val0;
-			x = 1;
+				dptr0[0] = dptr0[size.width - 1] = dptr1[0] = dptr1[size.width - 1] = val0;
+				x = 1;

 #if CV_SSE2
-			if (useSIMD)
-			{
-				__m128i z = _mm_setzero_si128(), ftz = _mm_set1_epi16((short)ftzero),
-					ftz2 = _mm_set1_epi8(cv::saturate_cast<uchar>(ftzero * 2));
-				for (; x <= size.width - 9; x += 8)
+				if (useSIMD)
 				{
-					__m128i c0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x - 1)), z);
-					__m128i c1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x - 1)), z);
-					__m128i d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x + 1)), z);
-					__m128i d1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x + 1)), z);
+					__m128i z = _mm_setzero_si128(), ftz = _mm_set1_epi16((short)ftzero),
+						ftz2 = _mm_set1_epi8(cv::saturate_cast<uchar>(ftzero * 2));
+					for (; x <= size.width - 9; x += 8)
+					{
+						__m128i c0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x - 1)), z);
+						__m128i c1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x - 1)), z);
+						__m128i d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x + 1)), z);
+						__m128i d1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x + 1)), z);

-					d0 = _mm_sub_epi16(d0, c0);
-					d1 = _mm_sub_epi16(d1, c1);
+						d0 = _mm_sub_epi16(d0, c0);
+						d1 = _mm_sub_epi16(d1, c1);

-					__m128i c2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z);
-					__m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x - 1)), z);
-					__m128i d2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z);
-					__m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x + 1)), z);
+						__m128i c2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z);
+						__m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x - 1)), z);
+						__m128i d2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z);
+						__m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x + 1)), z);

-					d2 = _mm_sub_epi16(d2, c2);
-					d3 = _mm_sub_epi16(d3, c3);
+						d2 = _mm_sub_epi16(d2, c2);
+						d3 = _mm_sub_epi16(d3, c3);

-					__m128i v0 = _mm_add_epi16(d0, _mm_add_epi16(d2, _mm_add_epi16(d1, d1)));
-					__m128i v1 = _mm_add_epi16(d1, _mm_add_epi16(d3, _mm_add_epi16(d2, d2)));
-					v0 = _mm_packus_epi16(_mm_add_epi16(v0, ftz), _mm_add_epi16(v1, ftz));
-					v0 = _mm_min_epu8(v0, ftz2);
+						__m128i v0 = _mm_add_epi16(d0, _mm_add_epi16(d2, _mm_add_epi16(d1, d1)));
+						__m128i v1 = _mm_add_epi16(d1, _mm_add_epi16(d3, _mm_add_epi16(d2, d2)));
+						v0 = _mm_packus_epi16(_mm_add_epi16(v0, ftz), _mm_add_epi16(v1, ftz));
+						v0 = _mm_min_epu8(v0, ftz2);

-					_mm_storel_epi64((__m128i*)(dptr0 + x), v0);
-					_mm_storel_epi64((__m128i*)(dptr1 + x), _mm_unpackhi_epi64(v0, v0));
+						_mm_storel_epi64((__m128i*)(dptr0 + x), v0);
+						_mm_storel_epi64((__m128i*)(dptr1 + x), _mm_unpackhi_epi64(v0, v0));
+					}
 				}
-			}
 #endif

-			for (; x < size.width - 1; x++)
-			{
-				int d0 = srow0[x + 1] - srow0[x - 1], d1 = srow1[x + 1] - srow1[x - 1],
-					d2 = srow2[x + 1] - srow2[x - 1], d3 = srow3[x + 1] - srow3[x - 1];
-				int v0 = tab[d0 + d1 * 2 + d2 + OFS];
-				int v1 = tab[d1 + d2 * 2 + d3 + OFS];
-				dptr0[x] = (uchar)v0;
-				dptr1[x] = (uchar)v1;
+				for (; x < size.width - 1; x++)
+				{
+					int d0 = srow0[x + 1] - srow0[x - 1], d1 = srow1[x + 1] - srow1[x - 1],
+						d2 = srow2[x + 1] - srow2[x - 1], d3 = srow3[x + 1] - srow3[x - 1];
+					int v0 = tab[d0 + d1 * 2 + d2 + OFS];
+					int v1 = tab[d1 + d2 * 2 + d3 + OFS];
+					dptr0[x] = (uchar)v0;
+					dptr1[x] = (uchar)v1;
+				}
 			}
-		}

-		for (; y < size.height; y++)
-		{
-			uchar* dptr = dst.ptr<uchar>(y);
-			for (x = 0; x < size.width; x++)
-				dptr[x] = val0;
-		}
-	}
-
-	static const int DISPARITY_SHIFT = 4;
-
-	static void
-		findStereoCorrespondenceBM(const Mat& left, const Mat& right,
-		Mat& disp, Mat& cost, const StereoBinaryBMParams& state,
-		uchar* buf, int _dy0, int _dy1)
-	{
-		const int ALIGN = 16;
-		int x, y, d;
-		int wsz = state.SADWindowSize, wsz2 = wsz / 2;
-		int dy0 = MIN(_dy0, wsz2 + 1), dy1 = MIN(_dy1, wsz2 + 1);
-		int ndisp = state.numDisparities;
-		int mindisp = state.minDisparity;
-		int lofs = MAX(ndisp - 1 + mindisp, 0);
-		int rofs = -MIN(ndisp - 1 + mindisp, 0);
-		int width = left.cols, height = left.rows;
-		int width1 = width - rofs - ndisp + 1;
-		int ftzero = state.preFilterCap;
-		int textureThreshold = state.textureThreshold;
-		int uniquenessRatio = state.uniquenessRatio;
-		short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);
-
-		int *sad, *hsad0, *hsad, *hsad_sub, *htext;
-		uchar *cbuf0, *cbuf;
-		const uchar* lptr0 = left.ptr() + lofs;
-		const uchar* rptr0 = right.ptr() + rofs;
-		const uchar *lptr, *lptr_sub, *rptr;
-		short* dptr = disp.ptr<short>();
-		int sstep = (int)left.step;
-		int dstep = (int)(disp.step / sizeof(dptr[0]));
-		int cstep = (height + dy0 + dy1)*ndisp;
-		int costbuf = 0;
-		int coststep = cost.data ? (int)(cost.step / sizeof(costbuf)) : 0;
-		const int TABSZ = 256;
-		uchar tab[TABSZ];
-
-		sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
-		hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
-		htext = (int*)alignPtr((int*)(hsad0 + (height + dy1)*ndisp) + wsz2 + 2, ALIGN);
-		cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);
-
-		for (x = 0; x < TABSZ; x++)
-			tab[x] = (uchar)std::abs(x - ftzero);
-
-		// initialize buffers
-		memset(hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]));
-		memset(htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]));
-
-		for (x = -wsz2 - 1; x < wsz2; x++)
-		{
-			hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
-			lptr = lptr0 + std::min(std::max(x, -lofs), width - lofs - 1) - dy0*sstep;
-			rptr = rptr0 + std::min(std::max(x, -rofs), width - rofs - 1) - dy0*sstep;
-			for (y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep)
+			for (; y < size.height; y++)
 			{
-				int lval = lptr[0];
-				for (d = 0; d < ndisp; d++)
-				{
-					int diff = std::abs(lval - rptr[d]);
-					cbuf[d] = (uchar)diff;
-					hsad[d] = (int)(hsad[d] + diff);
-				}
-				htext[y] += tab[lval];
+				uchar* dptr = dst.ptr<uchar>(y);
+				for (x = 0; x < size.width; x++)
+					dptr[x] = val0;
 			}
 		}

-		// initialize the left and right borders of the disparity map
-		for (y = 0; y < height; y++)
-		{
-			for (x = 0; x < lofs; x++)
-				dptr[y*dstep + x] = FILTERED;
-			for (x = lofs + width1; x < width; x++)
-				dptr[y*dstep + x] = FILTERED;
-		}
-		dptr += lofs;
+		static const int DISPARITY_SHIFT = 4;

-		for (x = 0; x < width1; x++, dptr++)
+		static void
+			findStereoCorrespondenceBM(const Mat& left, const Mat& right,
+			Mat& disp, Mat& cost, const StereoBinaryBMParams& state,
+			uchar* buf, int _dy0, int _dy1)
 		{
-			int* costptr = cost.data ? cost.ptr<int>() + lofs + x : &costbuf;
-			int x0 = x - wsz2 - 1, x1 = x + wsz2;
-			const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
-			cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
-			hsad = hsad0 - dy0*ndisp;
-			lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width - 1 - lofs) - dy0*sstep;
-			lptr = lptr0 + MIN(MAX(x1, -lofs), width - 1 - lofs) - dy0*sstep;
-			rptr = rptr0 + MIN(MAX(x1, -rofs), width - 1 - rofs) - dy0*sstep;
-
-			for (y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
-				hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep)
+			const int ALIGN = 16;
+			int x, y, d;
+			int wsz = state.SADWindowSize, wsz2 = wsz / 2;
+			int dy0 = MIN(_dy0, wsz2 + 1), dy1 = MIN(_dy1, wsz2 + 1);
+			int ndisp = state.numDisparities;
+			int mindisp = state.minDisparity;
+			int lofs = MAX(ndisp - 1 + mindisp, 0);
+			int rofs = -MIN(ndisp - 1 + mindisp, 0);
+			int width = left.cols, height = left.rows;
+			int width1 = width - rofs - ndisp + 1;
+			int ftzero = state.preFilterCap;
+			int textureThreshold = state.textureThreshold;
+			int uniquenessRatio = state.uniquenessRatio;
+			short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);
+
+			int *sad, *hsad0, *hsad, *hsad_sub, *htext;
+			uchar *cbuf0, *cbuf;
+			const uchar* lptr0 = left.ptr() + lofs;
+			const uchar* rptr0 = right.ptr() + rofs;
+			const uchar *lptr, *lptr_sub, *rptr;
+			short* dptr = disp.ptr<short>();
+			int sstep = (int)left.step;
+			int dstep = (int)(disp.step / sizeof(dptr[0]));
+			int cstep = (height + dy0 + dy1)*ndisp;
+			int costbuf = 0;
+			int coststep = cost.data ? (int)(cost.step / sizeof(costbuf)) : 0;
+			const int TABSZ = 256;
+			uchar tab[TABSZ];
+
+			sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
+			hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
+			htext = (int*)alignPtr((int*)(hsad0 + (height + dy1)*ndisp) + wsz2 + 2, ALIGN);
+			cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);
+
+			for (x = 0; x < TABSZ; x++)
+				tab[x] = (uchar)std::abs(x - ftzero);
+
+			// initialize buffers
+			memset(hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]));
+			memset(htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]));
+
+			for (x = -wsz2 - 1; x < wsz2; x++)
 			{
-				int lval = lptr[0];
-				for (d = 0; d < ndisp; d++)
+				hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
+				lptr = lptr0 + std::min(std::max(x, -lofs), width - lofs - 1) - dy0*sstep;
+				rptr = rptr0 + std::min(std::max(x, -rofs), width - rofs - 1) - dy0*sstep;
+				for (y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep)
 				{
-					int diff = std::abs(lval - rptr[d]);
-					cbuf[d] = (uchar)diff;
-					hsad[d] = hsad[d] + diff - cbuf_sub[d];
+					int lval = lptr[0];
+					for (d = 0; d < ndisp; d++)
+					{
+						int diff = std::abs(lval - rptr[d]);
+						cbuf[d] = (uchar)diff;
+						hsad[d] = (int)(hsad[d] + diff);
+					}
+					htext[y] += tab[lval];
 				}
-				htext[y] += tab[lval] - tab[lptr_sub[0]];
 			}

-			// fill borders
-			for (y = dy1; y <= wsz2; y++)
-				htext[height + y] = htext[height + dy1 - 1];
-			for (y = -wsz2 - 1; y < -dy0; y++)
-				htext[y] = htext[-dy0];
-
-			// initialize sums
-			int tsum = 0;
+			// initialize the left and right borders of the disparity map
+			for (y = 0; y < height; y++)
 			{
-				for (d = 0; d < ndisp; d++)
-					sad[d] = (int)(hsad0[d - ndisp*dy0] * (wsz2 + 2 - dy0));
-
-				hsad = hsad0 + (1 - dy0)*ndisp;
-				for (y = 1 - dy0; y < wsz2; y++, hsad += ndisp)
-					for (d = 0; d < ndisp; d++)
-						sad[d] = (int)(sad[d] + hsad[d]);
-				
-				for (y = -wsz2 - 1; y < wsz2; y++)
-					tsum += htext[y];
+				for (x = 0; x < lofs; x++)
+					dptr[y*dstep + x] = FILTERED;
+				for (x = lofs + width1; x < width; x++)
+					dptr[y*dstep + x] = FILTERED;
 			}
-			// finally, start the real processing
+			dptr += lofs;
+
+			for (x = 0; x < width1; x++, dptr++)
 			{
-				for (y = 0; y < height; y++)
+				int* costptr = cost.data ? cost.ptr<int>() + lofs + x : &costbuf;
+				int x0 = x - wsz2 - 1, x1 = x + wsz2;
+				const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
+				cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
+				hsad = hsad0 - dy0*ndisp;
+				lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width - 1 - lofs) - dy0*sstep;
+				lptr = lptr0 + MIN(MAX(x1, -lofs), width - 1 - lofs) - dy0*sstep;
+				rptr = rptr0 + MIN(MAX(x1, -rofs), width - 1 - rofs) - dy0*sstep;
+
+				for (y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
+					hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep)
 				{
-					int minsad = INT_MAX, mind = -1;
-					hsad = hsad0 + MIN(y + wsz2, height + dy1 - 1)*ndisp;
-					hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
-
+					int lval = lptr[0];
 					for (d = 0; d < ndisp; d++)
 					{
-						int currsad = sad[d] + hsad[d] - hsad_sub[d];
-						sad[d] = currsad;
-						if (currsad < minsad)
-						{
-							minsad = currsad;
-							mind = d;
-						}
+						int diff = std::abs(lval - rptr[d]);
+						cbuf[d] = (uchar)diff;
+						hsad[d] = hsad[d] + diff - cbuf_sub[d];
 					}
+					htext[y] += tab[lval] - tab[lptr_sub[0]];
+				}

-					tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
-					if (tsum < textureThreshold)
-					{
-						dptr[y*dstep] = FILTERED;
-						continue;
-					}
+				// fill borders
+				for (y = dy1; y <= wsz2; y++)
+					htext[height + y] = htext[height + dy1 - 1];
+				for (y = -wsz2 - 1; y < -dy0; y++)
+					htext[y] = htext[-dy0];

-					if (uniquenessRatio > 0)
+				// initialize sums
+				int tsum = 0;
+				{
+					for (d = 0; d < ndisp; d++)
+						sad[d] = (int)(hsad0[d - ndisp*dy0] * (wsz2 + 2 - dy0));
+
+					hsad = hsad0 + (1 - dy0)*ndisp;
+					for (y = 1 - dy0; y < wsz2; y++, hsad += ndisp)
+						for (d = 0; d < ndisp; d++)
+							sad[d] = (int)(sad[d] + hsad[d]);
+
+					for (y = -wsz2 - 1; y < wsz2; y++)
+						tsum += htext[y];
+				}
+				// finally, start the real processing
+				{
+					for (y = 0; y < height; y++)
 					{
-						int thresh = minsad + (minsad * uniquenessRatio / 100);
+						int minsad = INT_MAX, mind = -1;
+						hsad = hsad0 + MIN(y + wsz2, height + dy1 - 1)*ndisp;
+						hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
+
 						for (d = 0; d < ndisp; d++)
 						{
-							if ((d < mind - 1 || d > mind + 1) && sad[d] <= thresh)
-								break;
+							int currsad = sad[d] + hsad[d] - hsad_sub[d];
+							sad[d] = currsad;
+							if (currsad < minsad)
+							{
+								minsad = currsad;
+								mind = d;
+							}
 						}
-						if (d < ndisp)
+
+						tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
+						if (tsum < textureThreshold)
 						{
 							dptr[y*dstep] = FILTERED;
 							continue;
 						}
-					}

-			{
-				sad[-1] = sad[1];
-				sad[ndisp] = sad[ndisp - 2];
-				int p = sad[mind + 1], n = sad[mind - 1];
-				d = p + n - 2 * sad[mind] + std::abs(p - n);
-				dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp) * 256 + (d != 0 ? (p - n) * 256 / d : 0) + 15) >> 4);
-				costptr[y*coststep] = sad[mind];
-			}
+						if (uniquenessRatio > 0)
+						{
+							int thresh = minsad + (minsad * uniquenessRatio / 100);
+							for (d = 0; d < ndisp; d++)
+							{
+								if ((d < mind - 1 || d > mind + 1) && sad[d] <= thresh)
+									break;
+							}
+							if (d < ndisp)
+							{
+								dptr[y*dstep] = FILTERED;
+								continue;
+							}
+						}
+
+						{
+							sad[-1] = sad[1];
+							sad[ndisp] = sad[ndisp - 2];
+							int p = sad[mind + 1], n = sad[mind - 1];
+							d = p + n - 2 * sad[mind] + std::abs(p - n);
+							dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp) * 256 + (d != 0 ? (p - n) * 256 / d : 0) + 15) >> 4);
+							costptr[y*coststep] = sad[mind];
+						}
+					}
 				}
 			}
 		}
-	}


-	struct PrefilterInvoker : public ParallelLoopBody
-	{
-		PrefilterInvoker(const Mat& left0, const Mat& right0, Mat& left, Mat& right,
-			uchar* buf0, uchar* buf1, StereoBinaryBMParams* _state)
+		struct PrefilterInvoker : public ParallelLoopBody
 		{
-			imgs0[0] = &left0; imgs0[1] = &right0;
-			imgs[0] = &left; imgs[1] = &right;
-			buf[0] = buf0; buf[1] = buf1;
-			state = _state;
-		}
-
-		void operator()(const Range& range) const
-		{
-			for (int i = range.start; i < range.end; i++)
+			PrefilterInvoker(const Mat& left0, const Mat& right0, Mat& left, Mat& right,
+				uchar* buf0, uchar* buf1, StereoBinaryBMParams* _state)
 			{
-				if (state->preFilterType == StereoBinaryBM::PREFILTER_NORMALIZED_RESPONSE)
-					prefilterNorm(*imgs0[i], *imgs[i], state->preFilterSize, state->preFilterCap, buf[i]);
-				else
-					prefilterXSobel(*imgs0[i], *imgs[i], state->preFilterCap);
+				imgs0[0] = &left0; imgs0[1] = &right0;
+				imgs[0] = &left; imgs[1] = &right;
+				buf[0] = buf0; buf[1] = buf1;
+				state = _state;
 			}
-		}

-		const Mat* imgs0[2];
-		Mat* imgs[2];
-		uchar* buf[2];
-		StereoBinaryBMParams* state;
-	};
+			void operator()(const Range& range) const
+			{
+				for (int i = range.start; i < range.end; i++)
+				{
+					if (state->preFilterType == StereoBinaryBM::PREFILTER_NORMALIZED_RESPONSE)
+						prefilterNorm(*imgs0[i], *imgs[i], state->preFilterSize, state->preFilterCap, buf[i]);
+					else
+						prefilterXSobel(*imgs0[i], *imgs[i], state->preFilterCap);
+				}
+			}

-	struct FindStereoCorrespInvoker : public ParallelLoopBody
-	{
-		FindStereoCorrespInvoker(const Mat& _left, const Mat& _right,
-			Mat& _disp, StereoBinaryBMParams* _state,
-			int _nstripes, size_t _stripeBufSize,
-			bool _useShorts, Rect _validDisparityRect,
-			Mat& _slidingSumBuf, Mat& _cost)
-		{
-			left = &_left; right = &_right;
-			disp = &_disp; state = _state;
-			nstripes = _nstripes; stripeBufSize = _stripeBufSize;
-			useShorts = _useShorts;
-			validDisparityRect = _validDisparityRect;
-			slidingSumBuf = &_slidingSumBuf;
-			cost = &_cost;
-		}
+			const Mat* imgs0[2];
+			Mat* imgs[2];
+			uchar* buf[2];
+			StereoBinaryBMParams* state;
+		};

-		void operator()(const Range& range) const
+		struct FindStereoCorrespInvoker : public ParallelLoopBody
 		{
-			int cols = left->cols, rows = left->rows;
-			int _row0 = std::min(cvRound(range.start * rows / nstripes), rows);
-			int _row1 = std::min(cvRound(range.end * rows / nstripes), rows);
-			uchar *ptr = slidingSumBuf->ptr() + range.start * stripeBufSize;
-			int FILTERED = (state->minDisparity - 1) * 16;
-
-			Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1 - _row0);
-			if (roi.height == 0)
-				return;
-			int row0 = roi.y;
-			int row1 = roi.y + roi.height;
-
-			Mat part;
-			if (row0 > _row0)
+			FindStereoCorrespInvoker(const Mat& _left, const Mat& _right,
+				Mat& _disp, StereoBinaryBMParams* _state,
+				int _nstripes, size_t _stripeBufSize,
+				bool _useShorts, Rect _validDisparityRect,
+				Mat& _slidingSumBuf, Mat& _cost)
 			{
-				part = disp->rowRange(_row0, row0);
-				part = Scalar::all(FILTERED);
+				left = &_left; right = &_right;
+				disp = &_disp; state = _state;
+				nstripes = _nstripes; stripeBufSize = _stripeBufSize;
+				useShorts = _useShorts;
+				validDisparityRect = _validDisparityRect;
+				slidingSumBuf = &_slidingSumBuf;
+				cost = &_cost;
 			}
-			if (_row1 > row1)
+
+			void operator()(const Range& range) const
 			{
-				part = disp->rowRange(row1, _row1);
-				part = Scalar::all(FILTERED);
-			}
+				int cols = left->cols, rows = left->rows;
+				int _row0 = std::min(cvRound(range.start * rows / nstripes), rows);
+				int _row1 = std::min(cvRound(range.end * rows / nstripes), rows);
+				uchar *ptr = slidingSumBuf->ptr() + range.start * stripeBufSize;
+				int FILTERED = (state->minDisparity - 1) * 16;
+
+				Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1 - _row0);
+				if (roi.height == 0)
+					return;
+				int row0 = roi.y;
+				int row1 = roi.y + roi.height;
+
+				Mat part;
+				if (row0 > _row0)
+				{
+					part = disp->rowRange(_row0, row0);
+					part = Scalar::all(FILTERED);
+				}
+				if (_row1 > row1)
+				{
+					part = disp->rowRange(row1, _row1);
+					part = Scalar::all(FILTERED);
+				}

-			Mat left_i = left->rowRange(row0, row1);
-			Mat right_i = right->rowRange(row0, row1);
-			Mat disp_i = disp->rowRange(row0, row1);
-			Mat cost_i = state->disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
+				Mat left_i = left->rowRange(row0, row1);
+				Mat right_i = right->rowRange(row0, row1);
+				Mat disp_i = disp->rowRange(row0, row1);
+				Mat cost_i = state->disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();

-			findStereoCorrespondenceBM(left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1);
+				findStereoCorrespondenceBM(left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1);

-			if (state->disp12MaxDiff >= 0)
-				validateDisparity(disp_i, cost_i, state->minDisparity, state->numDisparities, state->disp12MaxDiff);
+				if (state->disp12MaxDiff >= 0)
+					validateDisparity(disp_i, cost_i, state->minDisparity, state->numDisparities, state->disp12MaxDiff);

-			if (roi.x > 0)
-			{
-				part = disp_i.colRange(0, roi.x);
-				part = Scalar::all(FILTERED);
-			}
-			if (roi.x + roi.width < cols)
-			{
-				part = disp_i.colRange(roi.x + roi.width, cols);
-				part = Scalar::all(FILTERED);
+				if (roi.x > 0)
+				{
+					part = disp_i.colRange(0, roi.x);
+					part = Scalar::all(FILTERED);
+				}
+				if (roi.x + roi.width < cols)
+				{
+					part = disp_i.colRange(roi.x + roi.width, cols);
+					part = Scalar::all(FILTERED);
+				}
 			}
-		}

-	protected:
-		const Mat *left, *right;
-		Mat* disp, *slidingSumBuf, *cost;
-		StereoBinaryBMParams *state;
+		protected:
+			const Mat *left, *right;
+			Mat* disp, *slidingSumBuf, *cost;
+			StereoBinaryBMParams *state;

-		int nstripes;
-		size_t stripeBufSize;
-		bool useShorts;
-		Rect validDisparityRect;
-	};
+			int nstripes;
+			size_t stripeBufSize;
+			bool useShorts;
+			Rect validDisparityRect;
+		};

-	class StereoBinaryBMImpl : public StereoBinaryBM
-	{
-	public:
-		StereoBinaryBMImpl()
+		class StereoBinaryBMImpl : public StereoBinaryBM
 		{
-			params = StereoBinaryBMParams();
-		}
+		public:
+			StereoBinaryBMImpl()
+			{
+				params = StereoBinaryBMParams();
+			}

-		StereoBinaryBMImpl(int _numDisparities, int _SADWindowSize)
-		{
-			params = StereoBinaryBMParams(_numDisparities, _SADWindowSize);
-		}
+			StereoBinaryBMImpl(int _numDisparities, int _SADWindowSize)
+			{
+				params = StereoBinaryBMParams(_numDisparities, _SADWindowSize);
+			}

-		void compute(InputArray leftarr, InputArray rightarr, OutputArray disparr)
-		{
-			int dtype = disparr.fixedType() ? disparr.type() : params.dispType;
-			Size leftsize = leftarr.size();
+			void compute(InputArray leftarr, InputArray rightarr, OutputArray disparr)
+			{
+				int dtype = disparr.fixedType() ? disparr.type() : params.dispType;
+				Size leftsize = leftarr.size();

-			if (leftarr.size() != rightarr.size())
-				CV_Error(Error::StsUnmatchedSizes, "All the images must have the same size");
+				if (leftarr.size() != rightarr.size())
+					CV_Error(Error::StsUnmatchedSizes, "All the images must have the same size");

-			if (leftarr.type() != CV_8UC1 || rightarr.type() != CV_8UC1)
-				CV_Error(Error::StsUnsupportedFormat, "Both input images must have CV_8UC1");
+				if (leftarr.type() != CV_8UC1 || rightarr.type() != CV_8UC1)
+					CV_Error(Error::StsUnsupportedFormat, "Both input images must have CV_8UC1");

-			if (dtype != CV_16SC1 && dtype != CV_32FC1)
-				CV_Error(Error::StsUnsupportedFormat, "Disparity image must have CV_16SC1 or CV_32FC1 format");
+				if (dtype != CV_16SC1 && dtype != CV_32FC1)
+					CV_Error(Error::StsUnsupportedFormat, "Disparity image must have CV_16SC1 or CV_32FC1 format");

-			if (params.preFilterType != PREFILTER_NORMALIZED_RESPONSE &&
-				params.preFilterType != PREFILTER_XSOBEL)
-				CV_Error(Error::StsOutOfRange, "preFilterType must be = CV_STEREO_BM_NORMALIZED_RESPONSE");
+				if (params.preFilterType != PREFILTER_NORMALIZED_RESPONSE &&
+					params.preFilterType != PREFILTER_XSOBEL)
+					CV_Error(Error::StsOutOfRange, "preFilterType must be = CV_STEREO_BM_NORMALIZED_RESPONSE");

-			if (params.preFilterSize < 5 || params.preFilterSize > 255 || params.preFilterSize % 2 == 0)
-				CV_Error(Error::StsOutOfRange, "preFilterSize must be odd and be within 5..255");
+				if (params.preFilterSize < 5 || params.preFilterSize > 255 || params.preFilterSize % 2 == 0)
+					CV_Error(Error::StsOutOfRange, "preFilterSize must be odd and be within 5..255");

-			if (params.preFilterCap < 1 || params.preFilterCap > 63)
-				CV_Error(Error::StsOutOfRange, "preFilterCap must be within 1..63");
+				if (params.preFilterCap < 1 || params.preFilterCap > 63)
+					CV_Error(Error::StsOutOfRange, "preFilterCap must be within 1..63");

-			if (params.SADWindowSize < 5 || params.SADWindowSize > 255 || params.SADWindowSize % 2 == 0 ||
-				params.SADWindowSize >= std::min(leftsize.width, leftsize.height))
-				CV_Error(Error::StsOutOfRange, "SADWindowSize must be odd, be within 5..255 and be not larger than image width or height");
+				if (params.SADWindowSize < 5 || params.SADWindowSize > 255 || params.SADWindowSize % 2 == 0 ||
+					params.SADWindowSize >= std::min(leftsize.width, leftsize.height))
+					CV_Error(Error::StsOutOfRange, "SADWindowSize must be odd, be within 5..255 and be not larger than image width or height");

-			if (params.numDisparities <= 0 || params.numDisparities % 16 != 0)
-				CV_Error(Error::StsOutOfRange, "numDisparities must be positive and divisble by 16");
+				if (params.numDisparities <= 0 || params.numDisparities % 16 != 0)
+					CV_Error(Error::StsOutOfRange, "numDisparities must be positive and divisble by 16");

-			if (params.textureThreshold < 0)
-				CV_Error(Error::StsOutOfRange, "texture threshold must be non-negative");
+				if (params.textureThreshold < 0)
+					CV_Error(Error::StsOutOfRange, "texture threshold must be non-negative");

-			if (params.uniquenessRatio < 0)
-				CV_Error(Error::StsOutOfRange, "uniqueness ratio must be non-negative");
+				if (params.uniquenessRatio < 0)
+					CV_Error(Error::StsOutOfRange, "uniqueness ratio must be non-negative");

-			int FILTERED = (params.minDisparity - 1) << DISPARITY_SHIFT;
+				int FILTERED = (params.minDisparity - 1) << DISPARITY_SHIFT;


-			Mat left0 = leftarr.getMat(), right0 = rightarr.getMat();
-			disparr.create(left0.size(), dtype);
-			Mat disp0 = disparr.getMat();
+				Mat left0 = leftarr.getMat(), right0 = rightarr.getMat();
+				disparr.create(left0.size(), dtype);
+				Mat disp0 = disparr.getMat();

-			preFilteredImg0.create(left0.size(), CV_8U);
-			preFilteredImg1.create(left0.size(), CV_8U);
-			cost.create(left0.size(), CV_16S);
+				preFilteredImg0.create(left0.size(), CV_8U);
+				preFilteredImg1.create(left0.size(), CV_8U);
+				cost.create(left0.size(), CV_16S);

-			Mat left = preFilteredImg0, right = preFilteredImg1;
+				Mat left = preFilteredImg0, right = preFilteredImg1;

-			int mindisp = params.minDisparity;
-			int ndisp = params.numDisparities;
+				int mindisp = params.minDisparity;
+				int ndisp = params.numDisparities;

-			int width = left0.cols;
-			int height = left0.rows;
-			int lofs = std::max(ndisp - 1 + mindisp, 0);
-			int rofs = -std::min(ndisp - 1 + mindisp, 0);
-			int width1 = width - rofs - ndisp + 1;
+				int width = left0.cols;
+				int height = left0.rows;
+				int lofs = std::max(ndisp - 1 + mindisp, 0);
+				int rofs = -std::min(ndisp - 1 + mindisp, 0);
+				int width1 = width - rofs - ndisp + 1;

-			if (lofs >= width || rofs >= width || width1 < 1)
-			{
-				disp0 = Scalar::all(FILTERED * (disp0.type() < CV_32F ? 1 : 1. / (1 << DISPARITY_SHIFT)));
-				return;
-			}
+				if (lofs >= width || rofs >= width || width1 < 1)
+				{
+					disp0 = Scalar::all(FILTERED * (disp0.type() < CV_32F ? 1 : 1. / (1 << DISPARITY_SHIFT)));
+					return;
+				}

-			Mat disp = disp0;
-			if (dtype == CV_32F)
-			{
-				dispbuf.create(disp0.size(), CV_16S);
-				disp = dispbuf;
-			}
+				Mat disp = disp0;
+				if (dtype == CV_32F)
+				{
+					dispbuf.create(disp0.size(), CV_16S);
+					disp = dispbuf;
+				}

-			int wsz = params.SADWindowSize;
-			int bufSize0 = (int)((ndisp + 2)*sizeof(int));
-			bufSize0 += (int)((height + wsz + 2)*ndisp*sizeof(int));
-			bufSize0 += (int)((height + wsz + 2)*sizeof(int));
-			bufSize0 += (int)((height + wsz + 2)*ndisp*(wsz + 2)*sizeof(uchar) + 256);
+				int wsz = params.SADWindowSize;
+				int bufSize0 = (int)((ndisp + 2)*sizeof(int));
+				bufSize0 += (int)((height + wsz + 2)*ndisp*sizeof(int));
+				bufSize0 += (int)((height + wsz + 2)*sizeof(int));
+				bufSize0 += (int)((height + wsz + 2)*ndisp*(wsz + 2)*sizeof(uchar) + 256);

-			int bufSize1 = (int)((width + params.preFilterSize + 2) * sizeof(int) + 256);
-			int bufSize2 = 0;
-			if (params.speckleRange >= 0 && params.speckleWindowSize > 0)
-				bufSize2 = width*height*(sizeof(Point_<short>) + sizeof(int) + sizeof(uchar));
+				int bufSize1 = (int)((width + params.preFilterSize + 2) * sizeof(int) + 256);
+				int bufSize2 = 0;
+				if (params.speckleRange >= 0 && params.speckleWindowSize > 0)
+					bufSize2 = width*height*(sizeof(Point_<short>) + sizeof(int) + sizeof(uchar));

 #if CV_SSE2
-			bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2);
+				bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2);
 #else
-			const bool useShorts = false;
+				const bool useShorts = false;
 #endif

-			const double SAD_overhead_coeff = 10.0;
-			double N0 = 8000000 / (useShorts ? 1 : 4);  // approx tbb's min number instructions reasonable for one thread
-			double maxStripeSize = std::min(std::max(N0 / (width * ndisp), (wsz - 1) * SAD_overhead_coeff), (double)height);
-			int nstripes = cvCeil(height / maxStripeSize);
-			int bufSize = std::max(bufSize0 * nstripes, std::max(bufSize1 * 2, bufSize2));
+				const double SAD_overhead_coeff = 10.0;
+				double N0 = 8000000 / (useShorts ? 1 : 4);  // approx tbb's min number instructions reasonable for one thread
+				double maxStripeSize = std::min(std::max(N0 / (width * ndisp), (wsz - 1) * SAD_overhead_coeff), (double)height);
+				int nstripes = cvCeil(height / maxStripeSize);
+				int bufSize = std::max(bufSize0 * nstripes, std::max(bufSize1 * 2, bufSize2));

-			if (slidingSumBuf.cols < bufSize)
-				slidingSumBuf.create(1, bufSize, CV_8U);
+				if (slidingSumBuf.cols < bufSize)
+					slidingSumBuf.create(1, bufSize, CV_8U);

-			uchar *_buf = slidingSumBuf.ptr();
+				uchar *_buf = slidingSumBuf.ptr();

-			parallel_for_(Range(0, 2), PrefilterInvoker(left0, right0, left, right, _buf, _buf + bufSize1, &params), 1);
+				parallel_for_(Range(0, 2), PrefilterInvoker(left0, right0, left, right, _buf, _buf + bufSize1, &params), 1);

-			Rect validDisparityRect(0, 0, width, height), R1 = params.roi1, R2 = params.roi2;
-			validDisparityRect = getValidDisparityROI(R1.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
-				R2.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
-				params.minDisparity, params.numDisparities,
-				params.SADWindowSize);
+				Rect validDisparityRect(0, 0, width, height), R1 = params.roi1, R2 = params.roi2;
+				validDisparityRect = getValidDisparityROI(R1.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
+					R2.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
+					params.minDisparity, params.numDisparities,
+					params.SADWindowSize);

-			parallel_for_(Range(0, nstripes),
-				FindStereoCorrespInvoker(left, right, disp, &params, nstripes,
-				bufSize0, useShorts, validDisparityRect,
-				slidingSumBuf, cost));
+				parallel_for_(Range(0, nstripes),
+					FindStereoCorrespInvoker(left, right, disp, &params, nstripes,
+					bufSize0, useShorts, validDisparityRect,
+					slidingSumBuf, cost));

-			if (params.speckleRange >= 0 && params.speckleWindowSize > 0)
-				filterSpeckles(disp, FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
+				if (params.speckleRange >= 0 && params.speckleWindowSize > 0)
+					filterSpeckles(disp, FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);

-			if (disp0.data != disp.data)
-				disp.convertTo(disp0, disp0.type(), 1. / (1 << DISPARITY_SHIFT), 0);
-		}
+				if (disp0.data != disp.data)
+					disp.convertTo(disp0, disp0.type(), 1. / (1 << DISPARITY_SHIFT), 0);
+			}

-		int getMinDisparity() const { return params.minDisparity; }
-		void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
+			int getMinDisparity() const { return params.minDisparity; }
+			void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }

-		int getNumDisparities() const { return params.numDisparities; }
-		void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
+			int getNumDisparities() const { return params.numDisparities; }
+			void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }

-		int getBlockSize() const { return params.SADWindowSize; }
-		void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
+			int getBlockSize() const { return params.SADWindowSize; }
+			void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }

-		int getSpeckleWindowSize() const { return params.speckleWindowSize; }
-		void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
+			int getSpeckleWindowSize() const { return params.speckleWindowSize; }
+			void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }

-		int getSpeckleRange() const { return params.speckleRange; }
-		void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
+			int getSpeckleRange() const { return params.speckleRange; }
+			void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }

-		int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
-		void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
+			int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
+			void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }

-		int getPreFilterType() const { return params.preFilterType; }
-		void setPreFilterType(int preFilterType) { params.preFilterType = preFilterType; }
+			int getPreFilterType() const { return params.preFilterType; }
+			void setPreFilterType(int preFilterType) { params.preFilterType = preFilterType; }

-		int getPreFilterSize() const { return params.preFilterSize; }
-		void setPreFilterSize(int preFilterSize) { params.preFilterSize = preFilterSize; }
+			int getPreFilterSize() const { return params.preFilterSize; }
+			void setPreFilterSize(int preFilterSize) { params.preFilterSize = preFilterSize; }

-		int getPreFilterCap() const { return params.preFilterCap; }
-		void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
+			int getPreFilterCap() const { return params.preFilterCap; }
+			void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }

-		int getTextureThreshold() const { return params.textureThreshold; }
-		void setTextureThreshold(int textureThreshold) { params.textureThreshold = textureThreshold; }
+			int getTextureThreshold() const { return params.textureThreshold; }
+			void setTextureThreshold(int textureThreshold) { params.textureThreshold = textureThreshold; }

-		int getUniquenessRatio() const { return params.uniquenessRatio; }
-		void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
+			int getUniquenessRatio() const { return params.uniquenessRatio; }
+			void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }

-		int getSmallerBlockSize() const { return 0; }
-		void setSmallerBlockSize(int) {}
+			int getSmallerBlockSize() const { return 0; }
+			void setSmallerBlockSize(int) {}

-		Rect getROI1() const { return params.roi1; }
-		void setROI1(Rect roi1) { params.roi1 = roi1; }
+			Rect getROI1() const { return params.roi1; }
+			void setROI1(Rect roi1) { params.roi1 = roi1; }

-		Rect getROI2() const { return params.roi2; }
-		void setROI2(Rect roi2) { params.roi2 = roi2; }
+			Rect getROI2() const { return params.roi2; }
+			void setROI2(Rect roi2) { params.roi2 = roi2; }

-		void write(FileStorage& fs) const
-		{
-			fs << "name" << name_
-				<< "minDisparity" << params.minDisparity
-				<< "numDisparities" << params.numDisparities
-				<< "blockSize" << params.SADWindowSize
-				<< "speckleWindowSize" << params.speckleWindowSize
-				<< "speckleRange" << params.speckleRange
-				<< "disp12MaxDiff" << params.disp12MaxDiff
-				<< "preFilterType" << params.preFilterType
-				<< "preFilterSize" << params.preFilterSize
-				<< "preFilterCap" << params.preFilterCap
-				<< "textureThreshold" << params.textureThreshold
-				<< "uniquenessRatio" << params.uniquenessRatio;
-		}
+			void write(FileStorage& fs) const
+			{
+				fs << "name" << name_
+					<< "minDisparity" << params.minDisparity
+					<< "numDisparities" << params.numDisparities
+					<< "blockSize" << params.SADWindowSize
+					<< "speckleWindowSize" << params.speckleWindowSize
+					<< "speckleRange" << params.speckleRange
+					<< "disp12MaxDiff" << params.disp12MaxDiff
+					<< "preFilterType" << params.preFilterType
+					<< "preFilterSize" << params.preFilterSize
+					<< "preFilterCap" << params.preFilterCap
+					<< "textureThreshold" << params.textureThreshold
+					<< "uniquenessRatio" << params.uniquenessRatio;
+			}

-		void read(const FileNode& fn)
-		{
-			FileNode n = fn["name"];
-			CV_Assert(n.isString() && String(n) == name_);
-			params.minDisparity = (int)fn["minDisparity"];
-			params.numDisparities = (int)fn["numDisparities"];
-			params.SADWindowSize = (int)fn["blockSize"];
-			params.speckleWindowSize = (int)fn["speckleWindowSize"];
-			params.speckleRange = (int)fn["speckleRange"];
-			params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
-			params.preFilterType = (int)fn["preFilterType"];
-			params.preFilterSize = (int)fn["preFilterSize"];
-			params.preFilterCap = (int)fn["preFilterCap"];
-			params.textureThreshold = (int)fn["textureThreshold"];
-			params.uniquenessRatio = (int)fn["uniquenessRatio"];
-			params.roi1 = params.roi2 = Rect();
-		}
+			void read(const FileNode& fn)
+			{
+				FileNode n = fn["name"];
+				CV_Assert(n.isString() && String(n) == name_);
+				params.minDisparity = (int)fn["minDisparity"];
+				params.numDisparities = (int)fn["numDisparities"];
+				params.SADWindowSize = (int)fn["blockSize"];
+				params.speckleWindowSize = (int)fn["speckleWindowSize"];
+				params.speckleRange = (int)fn["speckleRange"];
+				params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
+				params.preFilterType = (int)fn["preFilterType"];
+				params.preFilterSize = (int)fn["preFilterSize"];
+				params.preFilterCap = (int)fn["preFilterCap"];
+				params.textureThreshold = (int)fn["textureThreshold"];
+				params.uniquenessRatio = (int)fn["uniquenessRatio"];
+				params.roi1 = params.roi2 = Rect();
+			}

-		StereoBinaryBMParams params;
-		Mat preFilteredImg0, preFilteredImg1, cost, dispbuf;
-		Mat slidingSumBuf;
+			StereoBinaryBMParams params;
+			Mat preFilteredImg0, preFilteredImg1, cost, dispbuf;
+			Mat slidingSumBuf;

-		static const char* name_;
-	};
+			static const char* name_;
+		};

-	const char* StereoBinaryBMImpl::name_ = "StereoMatcher.BM";
+		const char* StereoBinaryBMImpl::name_ = "StereoMatcher.BM";

-	Ptr<StereoBinaryBM> StereoBinaryBM::create(int _numDisparities, int _SADWindowSize)
-	{
-		return makePtr<StereoBinaryBMImpl>(_numDisparities, _SADWindowSize);
+		Ptr<StereoBinaryBM> StereoBinaryBM::create(int _numDisparities, int _SADWindowSize)
+		{
+			return makePtr<StereoBinaryBMImpl>(_numDisparities, _SADWindowSize);
+		}
 	}
-
 }

 /* End of file. */

--- a/modules/stereo/src/stereo_binary_sgbm.cpp
+++ b/modules/stereo/src/stereo_binary_sgbm.cpp
@@ -42,1162 +42,1163 @@
 //M*/

 /*
- This is a variation of
- "Stereo Processing by Semiglobal Matching and Mutual Information"
- by Heiko Hirschmuller.
+This is a variation of
+"Stereo Processing by Semiglobal Matching and Mutual Information"
+by Heiko Hirschmuller.

- We match blocks rather than individual pixels, thus the algorithm is called
- SGBM (Semi-global block matching)
- */
+We match blocks rather than individual pixels, thus the algorithm is called
+SGBM (Semi-global block matching)
+*/

 #include "precomp.hpp"
 #include <limits.h>

 namespace cv
 {
-
-typedef uchar PixType;
-typedef short CostType;
-typedef short DispType;
-
-enum { NR = 16, NR2 = NR/2 };
-
-
-struct StereoBinarySGBMParams
-{
-    StereoBinarySGBMParams()
-    {
-        minDisparity = numDisparities = 0;
-        SADWindowSize = 0;
-        P1 = P2 = 0;
-        disp12MaxDiff = 0;
-        preFilterCap = 0;
-        uniquenessRatio = 0;
-        speckleWindowSize = 0;
-        speckleRange = 0;
-        mode = StereoBinarySGBM::MODE_SGBM;
-    }
-
-    StereoBinarySGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
-                      int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
-                      int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
-                      int _mode )
-    {
-        minDisparity = _minDisparity;
-        numDisparities = _numDisparities;
-        SADWindowSize = _SADWindowSize;
-        P1 = _P1;
-        P2 = _P2;
-        disp12MaxDiff = _disp12MaxDiff;
-        preFilterCap = _preFilterCap;
-        uniquenessRatio = _uniquenessRatio;
-        speckleWindowSize = _speckleWindowSize;
-        speckleRange = _speckleRange;
-        mode = _mode;
-    }
-
-    int minDisparity;
-    int numDisparities;
-    int SADWindowSize;
-    int preFilterCap;
-    int uniquenessRatio;
-    int P1;
-    int P2;
-    int speckleWindowSize;
-    int speckleRange;
-    int disp12MaxDiff;
-    int mode;
-};
-
-/*
- For each pixel row1[x], max(-maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
- and for each disparity minD<=d<maxD the function
- computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
- row1[x] and row2[x-d]. The subpixel algorithm from
- "Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
- is used, hence the suffix BT.
-
- the temporary buffer should contain width2*2 elements
- */
-static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
-                            int minD, int maxD, CostType* cost,
-                            PixType* buffer, const PixType* tab,
-                            int tabOfs, int )
-{
-    int x, c, width = img1.cols, cn = img1.channels();
-    int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
-    int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
-    int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
-    const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
-    PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
-
-    tab += tabOfs;
-
-    for( c = 0; c < cn*2; c++ )
-    {
-        prow1[width*c] = prow1[width*c + width-1] =
-        prow2[width*c] = prow2[width*c + width-1] = tab[0];
-    }
-
-    int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
-    int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
-
-    if( cn == 1 )
-    {
-        for( x = 1; x < width-1; x++ )
-        {
-            prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
-            prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
-
-            prow1[x+width] = row1[x];
-            prow2[width-1-x+width] = row2[x];
-        }
-    }
-    else
-    {
-        for( x = 1; x < width-1; x++ )
-        {
-            prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
-            prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
-            prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
-
-            prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
-            prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
-            prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
-
-            prow1[x+width*3] = row1[x*3];
-            prow1[x+width*4] = row1[x*3+1];
-            prow1[x+width*5] = row1[x*3+2];
-
-            prow2[width-1-x+width*3] = row2[x*3];
-            prow2[width-1-x+width*4] = row2[x*3+1];
-            prow2[width-1-x+width*5] = row2[x*3+2];
-        }
-    }
-
-    memset( cost, 0, width1*D*sizeof(cost[0]) );
-
-    buffer -= minX2;
-    cost -= minX1*D + minD; // simplify the cost indices inside the loop
+	namespace stereo
+	{
+		typedef uchar PixType;
+		typedef short CostType;
+		typedef short DispType;
+
+		enum { NR = 16, NR2 = NR/2 };
+
+
+		struct StereoBinarySGBMParams
+		{
+			StereoBinarySGBMParams()
+			{
+				minDisparity = numDisparities = 0;
+				SADWindowSize = 0;
+				P1 = P2 = 0;
+				disp12MaxDiff = 0;
+				preFilterCap = 0;
+				uniquenessRatio = 0;
+				speckleWindowSize = 0;
+				speckleRange = 0;
+				mode = StereoBinarySGBM::MODE_SGBM;
+			}
+
+			StereoBinarySGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
+				int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
+				int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
+				int _mode )
+			{
+				minDisparity = _minDisparity;
+				numDisparities = _numDisparities;
+				SADWindowSize = _SADWindowSize;
+				P1 = _P1;
+				P2 = _P2;
+				disp12MaxDiff = _disp12MaxDiff;
+				preFilterCap = _preFilterCap;
+				uniquenessRatio = _uniquenessRatio;
+				speckleWindowSize = _speckleWindowSize;
+				speckleRange = _speckleRange;
+				mode = _mode;
+			}
+
+			int minDisparity;
+			int numDisparities;
+			int SADWindowSize;
+			int preFilterCap;
+			int uniquenessRatio;
+			int P1;
+			int P2;
+			int speckleWindowSize;
+			int speckleRange;
+			int disp12MaxDiff;
+			int mode;
+		};
+
+		/*
+		For each pixel row1[x], max(-maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
+		and for each disparity minD<=d<maxD the function
+		computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
+		row1[x] and row2[x-d]. The subpixel algorithm from
+		"Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
+		is used, hence the suffix BT.
+
+		the temporary buffer should contain width2*2 elements
+		*/
+		static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
+			int minD, int maxD, CostType* cost,
+			PixType* buffer, const PixType* tab,
+			int tabOfs, int )
+		{
+			int x, c, width = img1.cols, cn = img1.channels();
+			int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
+			int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
+			int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
+			const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
+			PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
+
+			tab += tabOfs;
+
+			for( c = 0; c < cn*2; c++ )
+			{
+				prow1[width*c] = prow1[width*c + width-1] =
+					prow2[width*c] = prow2[width*c + width-1] = tab[0];
+			}
+
+			int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
+			int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
+
+			if( cn == 1 )
+			{
+				for( x = 1; x < width-1; x++ )
+				{
+					prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
+					prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
+
+					prow1[x+width] = row1[x];
+					prow2[width-1-x+width] = row2[x];
+				}
+			}
+			else
+			{
+				for( x = 1; x < width-1; x++ )
+				{
+					prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
+					prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
+					prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
+
+					prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
+					prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
+					prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
+
+					prow1[x+width*3] = row1[x*3];
+					prow1[x+width*4] = row1[x*3+1];
+					prow1[x+width*5] = row1[x*3+2];
+
+					prow2[width-1-x+width*3] = row2[x*3];
+					prow2[width-1-x+width*4] = row2[x*3+1];
+					prow2[width-1-x+width*5] = row2[x*3+2];
+				}
+			}
+
+			memset( cost, 0, width1*D*sizeof(cost[0]) );
+
+			buffer -= minX2;
+			cost -= minX1*D + minD; // simplify the cost indices inside the loop

 #if CV_SSE2
-    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
+			volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
 #endif

 #if 1
-    for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
-    {
-        int diff_scale = c < cn ? 0 : 2;
-
-        // precompute
-        //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
-        //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
-        for( x = minX2; x < maxX2; x++ )
-        {
-            int v = prow2[x];
-            int vl = x > 0 ? (v + prow2[x-1])/2 : v;
-            int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
-            int v0 = std::min(vl, vr); v0 = std::min(v0, v);
-            int v1 = std::max(vl, vr); v1 = std::max(v1, v);
-            buffer[x] = (PixType)v0;
-            buffer[x + width2] = (PixType)v1;
-        }
-
-        for( x = minX1; x < maxX1; x++ )
-        {
-            int u = prow1[x];
-            int ul = x > 0 ? (u + prow1[x-1])/2 : u;
-            int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
-            int u0 = std::min(ul, ur); u0 = std::min(u0, u);
-            int u1 = std::max(ul, ur); u1 = std::max(u1, u);
-
-        #if CV_SSE2
-            if( useSIMD )
-            {
-                __m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0);
-                __m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128();
-                __m128i ds = _mm_cvtsi32_si128(diff_scale);
-
-                for( int d = minD; d < maxD; d += 16 )
-                {
-                    __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d));
-                    __m128i _v0 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d));
-                    __m128i _v1 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d + width2));
-                    __m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u));
-                    __m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v));
-                    __m128i diff = _mm_min_epu8(c0, c1);
-
-                    c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
-                    c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
-
-                    _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_srl_epi16(_mm_unpacklo_epi8(diff,z), ds)));
-                    _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_srl_epi16(_mm_unpackhi_epi8(diff,z), ds)));
-                }
-            }
-            else
-        #endif
-            {
-                for( int d = minD; d < maxD; d++ )
-                {
-                    int v = prow2[width-x-1 + d];
-                    int v0 = buffer[width-x-1 + d];
-                    int v1 = buffer[width-x-1 + d + width2];
-                    int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
-                    int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);
-
-                    cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
-                }
-            }
-        }
-    }
+			for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
+			{
+				int diff_scale = c < cn ? 0 : 2;
+
+				// precompute
+				//   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
+				//   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
+				for( x = minX2; x < maxX2; x++ )
+				{
+					int v = prow2[x];
+					int vl = x > 0 ? (v + prow2[x-1])/2 : v;
+					int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
+					int v0 = std::min(vl, vr); v0 = std::min(v0, v);
+					int v1 = std::max(vl, vr); v1 = std::max(v1, v);
+					buffer[x] = (PixType)v0;
+					buffer[x + width2] = (PixType)v1;
+				}
+
+				for( x = minX1; x < maxX1; x++ )
+				{
+					int u = prow1[x];
+					int ul = x > 0 ? (u + prow1[x-1])/2 : u;
+					int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
+					int u0 = std::min(ul, ur); u0 = std::min(u0, u);
+					int u1 = std::max(ul, ur); u1 = std::max(u1, u);
+
+#if CV_SSE2
+					if( useSIMD )
+					{
+						__m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0);
+						__m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128();
+						__m128i ds = _mm_cvtsi32_si128(diff_scale);
+
+						for( int d = minD; d < maxD; d += 16 )
+						{
+							__m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d));
+							__m128i _v0 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d));
+							__m128i _v1 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d + width2));
+							__m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u));
+							__m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v));
+							__m128i diff = _mm_min_epu8(c0, c1);
+
+							c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
+							c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
+
+							_mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_srl_epi16(_mm_unpacklo_epi8(diff,z), ds)));
+							_mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_srl_epi16(_mm_unpackhi_epi8(diff,z), ds)));
+						}
+					}
+					else
+#endif
+					{
+						for( int d = minD; d < maxD; d++ )
+						{
+							int v = prow2[width-x-1 + d];
+							int v0 = buffer[width-x-1 + d];
+							int v1 = buffer[width-x-1 + d + width2];
+							int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
+							int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);
+
+							cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
+						}
+					}
+				}
+			}
 #else
-    for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
-    {
-        for( x = minX1; x < maxX1; x++ )
-        {
-            int u = prow1[x];
-        #if CV_SSE2
-            if( useSIMD )
-            {
-                __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
-
-                for( int d = minD; d < maxD; d += 16 )
-                {
-                    __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
-                    __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
-                    __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
-                    __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
-
-                    _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
-                    _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
-                }
-            }
-            else
-        #endif
-            {
-                for( int d = minD; d < maxD; d++ )
-                {
-                    int v = prow2[width-1-x + d];
-                    cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
-                }
-            }
-        }
-    }
+			for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
+			{
+				for( x = minX1; x < maxX1; x++ )
+				{
+					int u = prow1[x];
+#if CV_SSE2
+					if( useSIMD )
+					{
+						__m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
+
+						for( int d = minD; d < maxD; d += 16 )
+						{
+							__m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
+							__m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
+							__m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
+							__m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
+
+							_mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
+							_mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
+						}
+					}
+					else
+#endif
+					{
+						for( int d = minD; d < maxD; d++ )
+						{
+							int v = prow2[width-1-x + d];
+							cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
+						}
+					}
+				}
+			}
+#endif
+		}
+
+
+		/*
+		computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
+		that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
+		minD <= d < maxD.
+		disp2full is the reverse disparity map, that is:
+		disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
+
+		note that disp1buf will have the same size as the roi and
+		disp2full will have the same size as img1 (or img2).
+		On exit disp2buf is not the final disparity, it is an intermediate result that becomes
+		final after all the tiles are processed.
+
+		the disparity in disp1buf is written with sub-pixel accuracy
+		(4 fractional bits, see StereoSGBM::DISP_SCALE),
+		using quadratic interpolation, while the disparity in disp2buf
+		is written as is, without interpolation.
+
+		disp2cost also has the same size as img1 (or img2).
+		It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
+		*/
+		static void computeDisparityBinarySGBM( const Mat& img1, const Mat& img2,
+			Mat& disp1, const StereoBinarySGBMParams& params,
+			Mat& buffer )
+		{
+#if CV_SSE2
+			static const uchar LSBTab[] =
+			{
+				0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+				5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+				6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+				5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+				7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+				5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+				6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+				5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
+			};
+
+			volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
 #endif
-}

+			const int ALIGN = 16;
+			const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
+			const int DISP_SCALE = (1 << DISP_SHIFT);
+			const CostType MAX_COST = SHRT_MAX;
+
+			int minD = params.minDisparity, maxD = minD + params.numDisparities;
+			Size SADWindowSize;
+			SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
+			int ftzero = std::max(params.preFilterCap, 15) | 1;
+			int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
+			int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
+			int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
+			int k, width = disp1.cols, height = disp1.rows;
+			int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
+			int D = maxD - minD, width1 = maxX1 - minX1;
+			int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
+			int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
+			bool fullDP = params.mode == StereoBinarySGBM::MODE_HH;
+			int npasses = fullDP ? 2 : 1;
+			const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
+			PixType clipTab[TAB_SIZE];
+
+			for( k = 0; k < TAB_SIZE; k++ )
+				clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
+
+			if( minX1 >= maxX1 )
+			{
+				disp1 = Scalar::all(INVALID_DISP_SCALED);
+				return;
+			}
+
+			CV_Assert( D % 16 == 0 );
+
+			// NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
+			// if you change NR, please, modify the loop as well.
+			int D2 = D+16, NRD2 = NR2*D2;
+
+			// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
+			// for 8-way dynamic programming we need the current row and
+			// the previous row, i.e. 2 rows in total
+			const int NLR = 2;
+			const int LrBorder = NLR - 1;
+
+			// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
+			// we keep pixel difference cost (C) and the summary cost over NR directions (S).
+			// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
+			size_t costBufSize = width1*D;
+			size_t CSBufSize = costBufSize*(fullDP ? height : 1);
+			size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
+			int hsumBufNRows = SH2*2 + 2;
+			size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
+				costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
+				CSBufSize*2*sizeof(CostType) + // C, S
+				width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
+				width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
+
+			if( buffer.empty() || !buffer.isContinuous() ||
+				buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
+				buffer.create(1, (int)totalBufSize, CV_8U);
+
+			// summary cost over different (nDirs) directions
+			CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
+			CostType* Sbuf = Cbuf + CSBufSize;
+			CostType* hsumBuf = Sbuf + CSBufSize;
+			CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
+
+			CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
+			DispType* disp2ptr = (DispType*)(disp2cost + width);
+			PixType* tempBuf = (PixType*)(disp2ptr + width);
+
+			// add P2 to every C(x,y). it saves a few operations in the inner loops
+			for( k = 0; k < width1*D; k++ )
+				Cbuf[k] = (CostType)P2;
+
+			for( int pass = 1; pass <= npasses; pass++ )
+			{
+				int x1, y1, x2, y2, dx, dy;
+
+				if( pass == 1 )
+				{
+					y1 = 0; y2 = height; dy = 1;
+					x1 = 0; x2 = width1; dx = 1;
+				}
+				else
+				{
+					y1 = height-1; y2 = -1; dy = -1;
+					x1 = width1-1; x2 = -1; dx = -1;
+				}
+
+				CostType *Lr[NLR]={0}, *minLr[NLR]={0};
+
+				for( k = 0; k < NLR; k++ )
+				{
+					// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
+					// and will occasionally use negative indices with the arrays
+					// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
+					// however, then the alignment will be imperfect, i.e. bad for SSE,
+					// thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
+					Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
+					memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
+					minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
+					memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
+				}
+
+				for( int y = y1; y != y2; y += dy )
+				{
+					int x, d;
+					DispType* disp1ptr = disp1.ptr<DispType>(y);
+					CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
+					CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
+
+					if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
+					{
+						int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
+
+						for( k = dy1; k <= dy2; k++ )
+						{
+							CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
+
+							if( k < height )
+							{
+								calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
+
+								memset(hsumAdd, 0, D*sizeof(CostType));
+								for( x = 0; x <= SW2*D; x += D )
+								{
+									int scale = x == 0 ? SW2 + 1 : 1;
+									for( d = 0; d < D; d++ )
+										hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
+								}
+
+								if( y > 0 )
+								{
+									const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
+									const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
+
+									for( x = D; x < width1*D; x += D )
+									{
+										const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
+										const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

-/*
- computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
- that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
- minD <= d < maxD.
- disp2full is the reverse disparity map, that is:
- disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
-
- note that disp1buf will have the same size as the roi and
- disp2full will have the same size as img1 (or img2).
- On exit disp2buf is not the final disparity, it is an intermediate result that becomes
- final after all the tiles are processed.
-
- the disparity in disp1buf is written with sub-pixel accuracy
- (4 fractional bits, see StereoSGBM::DISP_SCALE),
- using quadratic interpolation, while the disparity in disp2buf
- is written as is, without interpolation.
-
- disp2cost also has the same size as img1 (or img2).
- It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
- */
-static void computeDisparityBinarySGBM( const Mat& img1, const Mat& img2,
-                                 Mat& disp1, const StereoBinarySGBMParams& params,
-                                 Mat& buffer )
-{
 #if CV_SSE2
-    static const uchar LSBTab[] =
-    {
-        0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-        7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
-    };
-
-    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
+										if( useSIMD )
+										{
+											for( d = 0; d < D; d += 8 )
+											{
+												__m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
+												__m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
+												hv = _mm_adds_epi16(_mm_subs_epi16(hv,
+													_mm_load_si128((const __m128i*)(pixSub + d))),
+													_mm_load_si128((const __m128i*)(pixAdd + d)));
+												Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
+													_mm_load_si128((const __m128i*)(hsumSub + x + d))),
+													hv);
+												_mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
+												_mm_store_si128((__m128i*)(C + x + d), Cx);
+											}
+										}
+										else
 #endif
+										{
+											for( d = 0; d < D; d++ )
+											{
+												int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
+												C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
+											}
+										}
+									}
+								}
+								else
+								{
+									for( x = D; x < width1*D; x += D )
+									{
+										const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
+										const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
+
+										for( d = 0; d < D; d++ )
+											hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
+									}
+								}
+							}
+
+							if( y == 0 )
+							{
+								int scale = k == 0 ? SH2 + 1 : 1;
+								for( x = 0; x < width1*D; x++ )
+									C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
+							}
+						}
+
+						// also, clear the S buffer
+						for( k = 0; k < width1*D; k++ )
+							S[k] = 0;
+					}
+
+					// clear the left and the right borders
+					memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
+					memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
+					memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
+					memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
+
+					/*
+					[formula 13 in the paper]
+					compute L_r(p, d) = C(p, d) +
+					min(L_r(p-r, d),
+					L_r(p-r, d-1) + P1,
+					L_r(p-r, d+1) + P1,
+					min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
+					where p = (x,y), r is one of the directions.
+					we process all the directions at once:
+					0: r=(-dx, 0)
+					1: r=(-1, -dy)
+					2: r=(0, -dy)
+					3: r=(1, -dy)
+					4: r=(-2, -dy)
+					5: r=(-1, -dy*2)
+					6: r=(1, -dy*2)
+					7: r=(2, -dy)
+					*/
+					for( x = x1; x != x2; x += dx )
+					{
+						int xm = x*NR2, xd = xm*D2;
+
+						int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
+						int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
+
+						CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
+						CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
+						CostType* Lr_p2 = Lr[1] + xd + D2*2;
+						CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
+
+						Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
+							Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
+
+						CostType* Lr_p = Lr[0] + xd;
+						const CostType* Cp = C + x*D;
+						CostType* Sp = S + x*D;

-    const int ALIGN = 16;
-    const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
-    const int DISP_SCALE = (1 << DISP_SHIFT);
-    const CostType MAX_COST = SHRT_MAX;
-
-    int minD = params.minDisparity, maxD = minD + params.numDisparities;
-    Size SADWindowSize;
-    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
-    int ftzero = std::max(params.preFilterCap, 15) | 1;
-    int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
-    int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
-    int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
-    int k, width = disp1.cols, height = disp1.rows;
-    int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
-    int D = maxD - minD, width1 = maxX1 - minX1;
-    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
-    int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
-    bool fullDP = params.mode == StereoBinarySGBM::MODE_HH;
-    int npasses = fullDP ? 2 : 1;
-    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
-    PixType clipTab[TAB_SIZE];
-
-    for( k = 0; k < TAB_SIZE; k++ )
-        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
-
-    if( minX1 >= maxX1 )
-    {
-        disp1 = Scalar::all(INVALID_DISP_SCALED);
-        return;
-    }
-
-    CV_Assert( D % 16 == 0 );
-
-    // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
-    // if you change NR, please, modify the loop as well.
-    int D2 = D+16, NRD2 = NR2*D2;
-
-    // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
-    // for 8-way dynamic programming we need the current row and
-    // the previous row, i.e. 2 rows in total
-    const int NLR = 2;
-    const int LrBorder = NLR - 1;
-
-    // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
-    // we keep pixel difference cost (C) and the summary cost over NR directions (S).
-    // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
-    size_t costBufSize = width1*D;
-    size_t CSBufSize = costBufSize*(fullDP ? height : 1);
-    size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
-    int hsumBufNRows = SH2*2 + 2;
-    size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
-    costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
-    CSBufSize*2*sizeof(CostType) + // C, S
-    width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
-    width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
-
-    if( buffer.empty() || !buffer.isContinuous() ||
-        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
-        buffer.create(1, (int)totalBufSize, CV_8U);
-
-    // summary cost over different (nDirs) directions
-    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
-    CostType* Sbuf = Cbuf + CSBufSize;
-    CostType* hsumBuf = Sbuf + CSBufSize;
-    CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
-
-    CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
-    DispType* disp2ptr = (DispType*)(disp2cost + width);
-    PixType* tempBuf = (PixType*)(disp2ptr + width);
-
-    // add P2 to every C(x,y). it saves a few operations in the inner loops
-    for( k = 0; k < width1*D; k++ )
-        Cbuf[k] = (CostType)P2;
-
-    for( int pass = 1; pass <= npasses; pass++ )
-    {
-        int x1, y1, x2, y2, dx, dy;
-
-        if( pass == 1 )
-        {
-            y1 = 0; y2 = height; dy = 1;
-            x1 = 0; x2 = width1; dx = 1;
-        }
-        else
-        {
-            y1 = height-1; y2 = -1; dy = -1;
-            x1 = width1-1; x2 = -1; dx = -1;
-        }
-
-        CostType *Lr[NLR]={0}, *minLr[NLR]={0};
-
-        for( k = 0; k < NLR; k++ )
-        {
-            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
-            // and will occasionally use negative indices with the arrays
-            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
-            // however, then the alignment will be imperfect, i.e. bad for SSE,
-            // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
-            Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
-            memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
-            minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
-            memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
-        }
-
-        for( int y = y1; y != y2; y += dy )
-        {
-            int x, d;
-            DispType* disp1ptr = disp1.ptr<DispType>(y);
-            CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
-            CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);
-
-            if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
-            {
-                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
-
-                for( k = dy1; k <= dy2; k++ )
-                {
-                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
-
-                    if( k < height )
-                    {
-                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
-
-                        memset(hsumAdd, 0, D*sizeof(CostType));
-                        for( x = 0; x <= SW2*D; x += D )
-                        {
-                            int scale = x == 0 ? SW2 + 1 : 1;
-                            for( d = 0; d < D; d++ )
-                                hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
-                        }
-
-                        if( y > 0 )
-                        {
-                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;
-
-                            for( x = D; x < width1*D; x += D )
-                            {
-                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
-                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
-
-                            #if CV_SSE2
-                                if( useSIMD )
-                                {
-                                    for( d = 0; d < D; d += 8 )
-                                    {
-                                        __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
-                                        __m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
-                                        hv = _mm_adds_epi16(_mm_subs_epi16(hv,
-                                                                           _mm_load_si128((const __m128i*)(pixSub + d))),
-                                                            _mm_load_si128((const __m128i*)(pixAdd + d)));
-                                        Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
-                                                                           _mm_load_si128((const __m128i*)(hsumSub + x + d))),
-                                                            hv);
-                                        _mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
-                                        _mm_store_si128((__m128i*)(C + x + d), Cx);
-                                    }
-                                }
-                                else
-                            #endif
-                                {
-                                    for( d = 0; d < D; d++ )
-                                    {
-                                        int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
-                                        C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
-                                    }
-                                }
-                            }
-                        }
-                        else
-                        {
-                            for( x = D; x < width1*D; x += D )
-                            {
-                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
-                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
-
-                                for( d = 0; d < D; d++ )
-                                    hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
-                            }
-                        }
-                    }
-
-                    if( y == 0 )
-                    {
-                        int scale = k == 0 ? SH2 + 1 : 1;
-                        for( x = 0; x < width1*D; x++ )
-                            C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
-                    }
-                }
-
-                // also, clear the S buffer
-                for( k = 0; k < width1*D; k++ )
-                    S[k] = 0;
-            }
-
-            // clear the left and the right borders
-            memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
-            memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
-            memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
-            memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
-
-            /*
-             [formula 13 in the paper]
-             compute L_r(p, d) = C(p, d) +
-             min(L_r(p-r, d),
-             L_r(p-r, d-1) + P1,
-             L_r(p-r, d+1) + P1,
-             min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
-             where p = (x,y), r is one of the directions.
-             we process all the directions at once:
-             0: r=(-dx, 0)
-             1: r=(-1, -dy)
-             2: r=(0, -dy)
-             3: r=(1, -dy)
-             4: r=(-2, -dy)
-             5: r=(-1, -dy*2)
-             6: r=(1, -dy*2)
-             7: r=(2, -dy)
-             */
-            for( x = x1; x != x2; x += dx )
-            {
-                int xm = x*NR2, xd = xm*D2;
-
-                int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
-                int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
-
-                CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
-                CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
-                CostType* Lr_p2 = Lr[1] + xd + D2*2;
-                CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
-
-                Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
-                Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
-
-                CostType* Lr_p = Lr[0] + xd;
-                const CostType* Cp = C + x*D;
-                CostType* Sp = S + x*D;
-
-            #if CV_SSE2
-                if( useSIMD )
-                {
-                    __m128i _P1 = _mm_set1_epi16((short)P1);
-
-                    __m128i _delta0 = _mm_set1_epi16((short)delta0);
-                    __m128i _delta1 = _mm_set1_epi16((short)delta1);
-                    __m128i _delta2 = _mm_set1_epi16((short)delta2);
-                    __m128i _delta3 = _mm_set1_epi16((short)delta3);
-                    __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
-
-                    for( d = 0; d < D; d += 8 )
-                    {
-                        __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
-                        __m128i L0, L1, L2, L3;
-
-                        L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
-                        L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
-                        L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
-                        L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
-
-                        L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
-                        L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
-
-                        L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
-                        L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
-
-                        L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
-                        L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
-
-                        L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
-                        L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
-
-                        L0 = _mm_min_epi16(L0, _delta0);
-                        L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
-
-                        L1 = _mm_min_epi16(L1, _delta1);
-                        L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
-
-                        L2 = _mm_min_epi16(L2, _delta2);
-                        L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
-
-                        L3 = _mm_min_epi16(L3, _delta3);
-                        L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
-
-                        _mm_store_si128( (__m128i*)(Lr_p + d), L0);
-                        _mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
-                        _mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
-                        _mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
-
-                        __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
-                        __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
-                        t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
-                        _minL0 = _mm_min_epi16(_minL0, t0);
-
-                        __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
-
-                        L0 = _mm_adds_epi16(L0, L1);
-                        L2 = _mm_adds_epi16(L2, L3);
-                        Sval = _mm_adds_epi16(Sval, L0);
-                        Sval = _mm_adds_epi16(Sval, L2);
-
-                        _mm_store_si128((__m128i*)(Sp + d), Sval);
-                    }
-
-                    _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
-                    _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
-                }
-                else
-            #endif
-                {
-                    int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
-
-                    for( d = 0; d < D; d++ )
-                    {
-                        int Cpd = Cp[d], L0, L1, L2, L3;
-
-                        L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
-                        L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
-                        L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2;
-                        L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3;
-
-                        Lr_p[d] = (CostType)L0;
-                        minL0 = std::min(minL0, L0);
-
-                        Lr_p[d + D2] = (CostType)L1;
-                        minL1 = std::min(minL1, L1);
-
-                        Lr_p[d + D2*2] = (CostType)L2;
-                        minL2 = std::min(minL2, L2);
-
-                        Lr_p[d + D2*3] = (CostType)L3;
-                        minL3 = std::min(minL3, L3);
-
-                        Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
-                    }
-                    minLr[0][xm] = (CostType)minL0;
-                    minLr[0][xm+1] = (CostType)minL1;
-                    minLr[0][xm+2] = (CostType)minL2;
-                    minLr[0][xm+3] = (CostType)minL3;
-                }
-            }
-
-            if( pass == npasses )
-            {
-                for( x = 0; x < width; x++ )
-                {
-                    disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
-                    disp2cost[x] = MAX_COST;
-                }
-
-                for( x = width1 - 1; x >= 0; x-- )
-                {
-                    CostType* Sp = S + x*D;
-                    int minS = MAX_COST, bestDisp = -1;
-
-                    if( npasses == 1 )
-                    {
-                        int xm = x*NR2, xd = xm*D2;
-
-                        int minL0 = MAX_COST;
-                        int delta0 = minLr[0][xm + NR2] + P2;
-                        CostType* Lr_p0 = Lr[0] + xd + NRD2;
-                        Lr_p0[-1] = Lr_p0[D] = MAX_COST;
-                        CostType* Lr_p = Lr[0] + xd;
-
-                        const CostType* Cp = C + x*D;
-
-                    #if CV_SSE2
-                        if( useSIMD )
-                        {
-                            __m128i _P1 = _mm_set1_epi16((short)P1);
-                            __m128i _delta0 = _mm_set1_epi16((short)delta0);
-
-                            __m128i _minL0 = _mm_set1_epi16((short)minL0);
-                            __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
-                            __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
-
-                            for( d = 0; d < D; d += 8 )
-                            {
-                                __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
-
-                                L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
-                                L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
-                                L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
-                                L0 = _mm_min_epi16(L0, _delta0);
-                                L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
-
-                                _mm_store_si128((__m128i*)(Lr_p + d), L0);
-                                _minL0 = _mm_min_epi16(_minL0, L0);
-                                L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
-                                _mm_store_si128((__m128i*)(Sp + d), L0);
-
-                                __m128i mask = _mm_cmpgt_epi16(_minS, L0);
-                                _minS = _mm_min_epi16(_minS, L0);
-                                _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
-                                _d8 = _mm_adds_epi16(_d8, _8);
-                            }
-
-                            short CV_DECL_ALIGNED(16) bestDispBuf[8];
-                            _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
-
-                            _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
-                            _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
-                            _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
-
-                            __m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
-                            qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
-                            qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
-
-                            minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
-                            minS = (CostType)_mm_cvtsi128_si32(qS);
-
-                            qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
-                            qS = _mm_cmpeq_epi16(_minS, qS);
-                            int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
-
-                            bestDisp = bestDispBuf[LSBTab[idx]];
-                        }
-                        else
-                    #endif
-                        {
-                            for( d = 0; d < D; d++ )
-                            {
-                                int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
-
-                                Lr_p[d] = (CostType)L0;
-                                minL0 = std::min(minL0, L0);
-
-                                int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
-                                if( Sval < minS )
-                                {
-                                    minS = Sval;
-                                    bestDisp = d;
-                                }
-                            }
-                            minLr[0][xm] = (CostType)minL0;
-                        }
-                    }
-                    else
-                    {
-                        for( d = 0; d < D; d++ )
-                        {
-                            int Sval = Sp[d];
-                            if( Sval < minS )
-                            {
-                                minS = Sval;
-                                bestDisp = d;
-                            }
-                        }
-                    }
-
-                    for( d = 0; d < D; d++ )
-                    {
-                        if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
-                            break;
-                    }
-                    if( d < D )
-                        continue;
-                    d = bestDisp;
-                    int _x2 = x + minX1 - d - minD;
-                    if( disp2cost[_x2] > minS )
-                    {
-                        disp2cost[_x2] = (CostType)minS;
-                        disp2ptr[_x2] = (DispType)(d + minD);
-                    }
-
-                    if( 0 < d && d < D-1 )
-                    {
-                        // do subpixel quadratic interpolation:
-                        //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
-                        //   then find minimum of the parabola.
-                        int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
-                        d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
-                    }
-                    else
-                        d *= DISP_SCALE;
-                    disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
-                }
-
-                for( x = minX1; x < maxX1; x++ )
-                {
-                    // we round the computed disparity both towards -inf and +inf and check
-                    // if either of the corresponding disparities in disp2 is consistent.
-                    // This is to give the computed disparity a chance to look valid if it is.
-                    int d1 = disp1ptr[x];
-                    if( d1 == INVALID_DISP_SCALED )
-                        continue;
-                    int _d = d1 >> DISP_SHIFT;
-                    int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
-                    int _x = x - _d, x_ = x - d_;
-                    if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
-                       0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
-                        disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
-                }
-            }
-
-            // now shift the cyclic buffers
-            std::swap( Lr[0], Lr[1] );
-            std::swap( minLr[0], minLr[1] );
-        }
-    }
-}
+#if CV_SSE2
+						if( useSIMD )
+						{
+							__m128i _P1 = _mm_set1_epi16((short)P1);

-class StereoBinarySGBMImpl : public StereoBinarySGBM
-{
-public:
-    StereoBinarySGBMImpl()
-    {
-        params = StereoBinarySGBMParams();
-    }
-
-    StereoBinarySGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
-                    int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
-                    int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
-                    int _mode )
-    {
-        params = StereoBinarySGBMParams( _minDisparity, _numDisparities, _SADWindowSize,
-                                   _P1, _P2, _disp12MaxDiff, _preFilterCap,
-                                   _uniquenessRatio, _speckleWindowSize, _speckleRange,
-                                   _mode );
-    }
-
-    void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr )
-    {
-        Mat left = leftarr.getMat(), right = rightarr.getMat();
-        CV_Assert( left.size() == right.size() && left.type() == right.type() &&
-                   left.depth() == CV_8U );
-
-        disparr.create( left.size(), CV_16S );
-        Mat disp = disparr.getMat();
-
-        computeDisparityBinarySGBM( left, right, disp, params, buffer );
-        medianBlur(disp, disp, 3);
-
-        if( params.speckleWindowSize > 0 )
-            filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
-                           StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
-    }
-
-    int getMinDisparity() const { return params.minDisparity; }
-    void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
-
-    int getNumDisparities() const { return params.numDisparities; }
-    void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
-
-    int getBlockSize() const { return params.SADWindowSize; }
-    void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
-
-    int getSpeckleWindowSize() const { return params.speckleWindowSize; }
-    void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
-
-    int getSpeckleRange() const { return params.speckleRange; }
-    void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
-
-    int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
-    void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
-
-    int getPreFilterCap() const { return params.preFilterCap; }
-    void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
-
-    int getUniquenessRatio() const { return params.uniquenessRatio; }
-    void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
-
-    int getP1() const { return params.P1; }
-    void setP1(int P1) { params.P1 = P1; }
-
-    int getP2() const { return params.P2; }
-    void setP2(int P2) { params.P2 = P2; }
-
-    int getMode() const { return params.mode; }
-    void setMode(int mode) { params.mode = mode; }
-
-    void write(FileStorage& fs) const
-    {
-        fs << "name" << name_
-        << "minDisparity" << params.minDisparity
-        << "numDisparities" << params.numDisparities
-        << "blockSize" << params.SADWindowSize
-        << "speckleWindowSize" << params.speckleWindowSize
-        << "speckleRange" << params.speckleRange
-        << "disp12MaxDiff" << params.disp12MaxDiff
-        << "preFilterCap" << params.preFilterCap
-        << "uniquenessRatio" << params.uniquenessRatio
-        << "P1" << params.P1
-        << "P2" << params.P2
-        << "mode" << params.mode;
-    }
-
-    void read(const FileNode& fn)
-    {
-        FileNode n = fn["name"];
-        CV_Assert( n.isString() && String(n) == name_ );
-        params.minDisparity = (int)fn["minDisparity"];
-        params.numDisparities = (int)fn["numDisparities"];
-        params.SADWindowSize = (int)fn["blockSize"];
-        params.speckleWindowSize = (int)fn["speckleWindowSize"];
-        params.speckleRange = (int)fn["speckleRange"];
-        params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
-        params.preFilterCap = (int)fn["preFilterCap"];
-        params.uniquenessRatio = (int)fn["uniquenessRatio"];
-        params.P1 = (int)fn["P1"];
-        params.P2 = (int)fn["P2"];
-        params.mode = (int)fn["mode"];
-    }
-
-    StereoBinarySGBMParams params;
-    Mat buffer;
-    static const char* name_;
-};
-
-const char* StereoBinarySGBMImpl::name_ = "StereoMatcher.SGBM";
-
-
-Ptr<StereoBinarySGBM> StereoBinarySGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
-                                 int P1, int P2, int disp12MaxDiff,
-                                 int preFilterCap, int uniquenessRatio,
-                                 int speckleWindowSize, int speckleRange,
-                                 int mode)
-{
-    return Ptr<StereoBinarySGBM>(
-        new StereoBinarySGBMImpl(minDisparity, numDisparities, SADWindowSize,
-                           P1, P2, disp12MaxDiff,
-                           preFilterCap, uniquenessRatio,
-                           speckleWindowSize, speckleRange,
-                           mode));
-}
+							__m128i _delta0 = _mm_set1_epi16((short)delta0);
+							__m128i _delta1 = _mm_set1_epi16((short)delta1);
+							__m128i _delta2 = _mm_set1_epi16((short)delta2);
+							__m128i _delta3 = _mm_set1_epi16((short)delta3);
+							__m128i _minL0 = _mm_set1_epi16((short)MAX_COST);

-Rect getValidDisparityROI( Rect roi1, Rect roi2,
-                          int minDisparity,
-                          int numberOfDisparities,
-                          int SADWindowSize )
-{
-    int SW2 = SADWindowSize/2;
-    int minD = minDisparity, maxD = minDisparity + numberOfDisparities - 1;
+							for( d = 0; d < D; d += 8 )
+							{
+								__m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
+								__m128i L0, L1, L2, L3;

-    int xmin = std::max(roi1.x, roi2.x + maxD) + SW2;
-    int xmax = std::min(roi1.x + roi1.width, roi2.x + roi2.width - minD) - SW2;
-    int ymin = std::max(roi1.y, roi2.y) + SW2;
-    int ymax = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
+								L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
+								L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
+								L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
+								L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));

-    Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
+								L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
+								L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));

-    return r.width > 0 && r.height > 0 ? r : Rect();
-}
+								L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
+								L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));

-typedef cv::Point_<short> Point2s;
+								L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
+								L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));

-template <typename T>
-void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
-{
-    using namespace cv;
-
-    int width = img.cols, height = img.rows, npixels = width*height;
-    size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
-    if( !_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
-        _buf.create(1, (int)bufSize, CV_8U);
-
-    uchar* buf = _buf.ptr();
-    int i, j, dstep = (int)(img.step/sizeof(T));
-    int* labels = (int*)buf;
-    buf += npixels*sizeof(labels[0]);
-    Point2s* wbuf = (Point2s*)buf;
-    buf += npixels*sizeof(wbuf[0]);
-    uchar* rtype = (uchar*)buf;
-    int curlabel = 0;
-
-    // clear out label assignments
-    memset(labels, 0, npixels*sizeof(labels[0]));
-
-    for( i = 0; i < height; i++ )
-    {
-        T* ds = img.ptr<T>(i);
-        int* ls = labels + width*i;
-
-        for( j = 0; j < width; j++ )
-        {
-            if( ds[j] != newVal )   // not a bad disparity
-            {
-                if( ls[j] )     // has a label, check for bad label
-                {
-                    if( rtype[ls[j]] ) // small region, zero out disparity
-                        ds[j] = (T)newVal;
-                }
-                // no label, assign and propagate
-                else
-                {
-                    Point2s* ws = wbuf; // initialize wavefront
-                    Point2s p((short)j, (short)i);  // current pixel
-                    curlabel++; // next label
-                    int count = 0;  // current region size
-                    ls[j] = curlabel;
-
-                    // wavefront propagation
-                    while( ws >= wbuf ) // wavefront not empty
-                    {
-                        count++;
-                        // put neighbors onto wavefront
-                        T* dpp = &img.at<T>(p.y, p.x);
-                        T dp = *dpp;
-                        int* lpp = labels + width*p.y + p.x;
-
-                        if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
-                        {
-                            lpp[+width] = curlabel;
-                            *ws++ = Point2s(p.x, p.y+1);
-                        }
-
-                        if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
-                        {
-                            lpp[-width] = curlabel;
-                            *ws++ = Point2s(p.x, p.y-1);
-                        }
-
-                        if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
-                        {
-                            lpp[+1] = curlabel;
-                            *ws++ = Point2s(p.x+1, p.y);
-                        }
-
-                        if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
-                        {
-                            lpp[-1] = curlabel;
-                            *ws++ = Point2s(p.x-1, p.y);
-                        }
-
-                        // pop most recent and propagate
-                        // NB: could try least recent, maybe better convergence
-                        p = *--ws;
-                    }
-
-                    // assign label type
-                    if( count <= maxSpeckleSize )   // speckle region
-                    {
-                        rtype[ls[j]] = 1;   // small region label
-                        ds[j] = (T)newVal;
-                    }
-                    else
-                        rtype[ls[j]] = 0;   // large region label
-                }
-            }
-        }
-    }
-}
+								L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
+								L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
+
+								L0 = _mm_min_epi16(L0, _delta0);
+								L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
+
+								L1 = _mm_min_epi16(L1, _delta1);
+								L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
+
+								L2 = _mm_min_epi16(L2, _delta2);
+								L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);

+								L3 = _mm_min_epi16(L3, _delta3);
+								L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
+
+								_mm_store_si128( (__m128i*)(Lr_p + d), L0);
+								_mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
+								_mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
+								_mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
+
+								__m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
+								__m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
+								t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
+								_minL0 = _mm_min_epi16(_minL0, t0);
+
+								__m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
+
+								L0 = _mm_adds_epi16(L0, L1);
+								L2 = _mm_adds_epi16(L2, L3);
+								Sval = _mm_adds_epi16(Sval, L0);
+								Sval = _mm_adds_epi16(Sval, L2);
+
+								_mm_store_si128((__m128i*)(Sp + d), Sval);
+							}
+
+							_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
+							_mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
+						}
+						else
+#endif
+						{
+							int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
+
+							for( d = 0; d < D; d++ )
+							{
+								int Cpd = Cp[d], L0, L1, L2, L3;
+
+								L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
+								L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
+								L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2;
+								L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3;
+
+								Lr_p[d] = (CostType)L0;
+								minL0 = std::min(minL0, L0);
+
+								Lr_p[d + D2] = (CostType)L1;
+								minL1 = std::min(minL1, L1);
+
+								Lr_p[d + D2*2] = (CostType)L2;
+								minL2 = std::min(minL2, L2);
+
+								Lr_p[d + D2*3] = (CostType)L3;
+								minL3 = std::min(minL3, L3);
+
+								Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
+							}
+							minLr[0][xm] = (CostType)minL0;
+							minLr[0][xm+1] = (CostType)minL1;
+							minLr[0][xm+2] = (CostType)minL2;
+							minLr[0][xm+3] = (CostType)minL3;
+						}
+					}
+
+					if( pass == npasses )
+					{
+						for( x = 0; x < width; x++ )
+						{
+							disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
+							disp2cost[x] = MAX_COST;
+						}
+
+						for( x = width1 - 1; x >= 0; x-- )
+						{
+							CostType* Sp = S + x*D;
+							int minS = MAX_COST, bestDisp = -1;
+
+							if( npasses == 1 )
+							{
+								int xm = x*NR2, xd = xm*D2;
+
+								int minL0 = MAX_COST;
+								int delta0 = minLr[0][xm + NR2] + P2;
+								CostType* Lr_p0 = Lr[0] + xd + NRD2;
+								Lr_p0[-1] = Lr_p0[D] = MAX_COST;
+								CostType* Lr_p = Lr[0] + xd;
+
+								const CostType* Cp = C + x*D;
+
+#if CV_SSE2
+								if( useSIMD )
+								{
+									__m128i _P1 = _mm_set1_epi16((short)P1);
+									__m128i _delta0 = _mm_set1_epi16((short)delta0);
+
+									__m128i _minL0 = _mm_set1_epi16((short)minL0);
+									__m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
+									__m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
+
+									for( d = 0; d < D; d += 8 )
+									{
+										__m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
+
+										L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
+										L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
+										L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
+										L0 = _mm_min_epi16(L0, _delta0);
+										L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
+
+										_mm_store_si128((__m128i*)(Lr_p + d), L0);
+										_minL0 = _mm_min_epi16(_minL0, L0);
+										L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
+										_mm_store_si128((__m128i*)(Sp + d), L0);
+
+										__m128i mask = _mm_cmpgt_epi16(_minS, L0);
+										_minS = _mm_min_epi16(_minS, L0);
+										_bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
+										_d8 = _mm_adds_epi16(_d8, _8);
+									}
+
+									short CV_DECL_ALIGNED(16) bestDispBuf[8];
+									_mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
+
+									_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
+									_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
+									_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
+
+									__m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
+									qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
+									qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
+
+									minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
+									minS = (CostType)_mm_cvtsi128_si32(qS);
+
+									qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
+									qS = _mm_cmpeq_epi16(_minS, qS);
+									int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
+
+									bestDisp = bestDispBuf[LSBTab[idx]];
+								}
+								else
+#endif
+								{
+									for( d = 0; d < D; d++ )
+									{
+										int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
+
+										Lr_p[d] = (CostType)L0;
+										minL0 = std::min(minL0, L0);
+
+										int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
+										if( Sval < minS )
+										{
+											minS = Sval;
+											bestDisp = d;
+										}
+									}
+									minLr[0][xm] = (CostType)minL0;
+								}
+							}
+							else
+							{
+								for( d = 0; d < D; d++ )
+								{
+									int Sval = Sp[d];
+									if( Sval < minS )
+									{
+										minS = Sval;
+										bestDisp = d;
+									}
+								}
+							}
+
+							for( d = 0; d < D; d++ )
+							{
+								if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
+									break;
+							}
+							if( d < D )
+								continue;
+							d = bestDisp;
+							int _x2 = x + minX1 - d - minD;
+							if( disp2cost[_x2] > minS )
+							{
+								disp2cost[_x2] = (CostType)minS;
+								disp2ptr[_x2] = (DispType)(d + minD);
+							}
+
+							if( 0 < d && d < D-1 )
+							{
+								// do subpixel quadratic interpolation:
+								//   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
+								//   then find minimum of the parabola.
+								int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
+								d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
+							}
+							else
+								d *= DISP_SCALE;
+							disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
+						}
+
+						for( x = minX1; x < maxX1; x++ )
+						{
+							// we round the computed disparity both towards -inf and +inf and check
+							// if either of the corresponding disparities in disp2 is consistent.
+							// This is to give the computed disparity a chance to look valid if it is.
+							int d1 = disp1ptr[x];
+							if( d1 == INVALID_DISP_SCALED )
+								continue;
+							int _d = d1 >> DISP_SHIFT;
+							int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
+							int _x = x - _d, x_ = x - d_;
+							if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
+								0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
+								disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
+						}
+					}
+
+					// now shift the cyclic buffers
+					std::swap( Lr[0], Lr[1] );
+					std::swap( minLr[0], minLr[1] );
+				}
+			}
+		}
+
+		class StereoBinarySGBMImpl : public StereoBinarySGBM
+		{
+		public:
+			StereoBinarySGBMImpl()
+			{
+				params = StereoBinarySGBMParams();
+			}
+
+			StereoBinarySGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
+				int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
+				int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
+				int _mode )
+			{
+				params = StereoBinarySGBMParams( _minDisparity, _numDisparities, _SADWindowSize,
+					_P1, _P2, _disp12MaxDiff, _preFilterCap,
+					_uniquenessRatio, _speckleWindowSize, _speckleRange,
+					_mode );
+			}
+
+			void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr )
+			{
+				Mat left = leftarr.getMat(), right = rightarr.getMat();
+				CV_Assert( left.size() == right.size() && left.type() == right.type() &&
+					left.depth() == CV_8U );
+
+				disparr.create( left.size(), CV_16S );
+				Mat disp = disparr.getMat();
+
+				computeDisparityBinarySGBM( left, right, disp, params, buffer );
+				medianBlur(disp, disp, 3);
+
+				if( params.speckleWindowSize > 0 )
+					filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
+					StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
+			}
+
+			int getMinDisparity() const { return params.minDisparity; }
+			void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
+
+			int getNumDisparities() const { return params.numDisparities; }
+			void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
+
+			int getBlockSize() const { return params.SADWindowSize; }
+			void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
+
+			int getSpeckleWindowSize() const { return params.speckleWindowSize; }
+			void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
+
+			int getSpeckleRange() const { return params.speckleRange; }
+			void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
+
+			int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
+			void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
+
+			int getPreFilterCap() const { return params.preFilterCap; }
+			void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
+
+			int getUniquenessRatio() const { return params.uniquenessRatio; }
+			void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
+
+			int getP1() const { return params.P1; }
+			void setP1(int P1) { params.P1 = P1; }
+
+			int getP2() const { return params.P2; }
+			void setP2(int P2) { params.P2 = P2; }
+
+			int getMode() const { return params.mode; }
+			void setMode(int mode) { params.mode = mode; }
+
+			void write(FileStorage& fs) const
+			{
+				fs << "name" << name_
+					<< "minDisparity" << params.minDisparity
+					<< "numDisparities" << params.numDisparities
+					<< "blockSize" << params.SADWindowSize
+					<< "speckleWindowSize" << params.speckleWindowSize
+					<< "speckleRange" << params.speckleRange
+					<< "disp12MaxDiff" << params.disp12MaxDiff
+					<< "preFilterCap" << params.preFilterCap
+					<< "uniquenessRatio" << params.uniquenessRatio
+					<< "P1" << params.P1
+					<< "P2" << params.P2
+					<< "mode" << params.mode;
+			}
+
+			void read(const FileNode& fn)
+			{
+				FileNode n = fn["name"];
+				CV_Assert( n.isString() && String(n) == name_ );
+				params.minDisparity = (int)fn["minDisparity"];
+				params.numDisparities = (int)fn["numDisparities"];
+				params.SADWindowSize = (int)fn["blockSize"];
+				params.speckleWindowSize = (int)fn["speckleWindowSize"];
+				params.speckleRange = (int)fn["speckleRange"];
+				params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
+				params.preFilterCap = (int)fn["preFilterCap"];
+				params.uniquenessRatio = (int)fn["uniquenessRatio"];
+				params.P1 = (int)fn["P1"];
+				params.P2 = (int)fn["P2"];
+				params.mode = (int)fn["mode"];
+			}
+
+			StereoBinarySGBMParams params;
+			Mat buffer;
+			static const char* name_;
+		};
+
+		const char* StereoBinarySGBMImpl::name_ = "StereoMatcher.SGBM";
+
+
+		Ptr<StereoBinarySGBM> StereoBinarySGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
+			int P1, int P2, int disp12MaxDiff,
+			int preFilterCap, int uniquenessRatio,
+			int speckleWindowSize, int speckleRange,
+			int mode)
+		{
+			return Ptr<StereoBinarySGBM>(
+				new StereoBinarySGBMImpl(minDisparity, numDisparities, SADWindowSize,
+				P1, P2, disp12MaxDiff,
+				preFilterCap, uniquenessRatio,
+				speckleWindowSize, speckleRange,
+				mode));
+		}
+
+		Rect getValidDisparityROI( Rect roi1, Rect roi2,
+			int minDisparity,
+			int numberOfDisparities,
+			int SADWindowSize )
+		{
+			int SW2 = SADWindowSize/2;
+			int minD = minDisparity, maxD = minDisparity + numberOfDisparities - 1;
+
+			int xmin = std::max(roi1.x, roi2.x + maxD) + SW2;
+			int xmax = std::min(roi1.x + roi1.width, roi2.x + roi2.width - minD) - SW2;
+			int ymin = std::max(roi1.y, roi2.y) + SW2;
+			int ymax = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
+
+			Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
+
+			return r.width > 0 && r.height > 0 ? r : Rect();
+		}
+
+		typedef cv::Point_<short> Point2s;
+
+		template <typename T>
+		void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
+		{
+			using namespace cv;
+
+			int width = img.cols, height = img.rows, npixels = width*height;
+			size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
+			if( !_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
+				_buf.create(1, (int)bufSize, CV_8U);
+
+			uchar* buf = _buf.ptr();
+			int i, j, dstep = (int)(img.step/sizeof(T));
+			int* labels = (int*)buf;
+			buf += npixels*sizeof(labels[0]);
+			Point2s* wbuf = (Point2s*)buf;
+			buf += npixels*sizeof(wbuf[0]);
+			uchar* rtype = (uchar*)buf;
+			int curlabel = 0;
+
+			// clear out label assignments
+			memset(labels, 0, npixels*sizeof(labels[0]));
+
+			for( i = 0; i < height; i++ )
+			{
+				T* ds = img.ptr<T>(i);
+				int* ls = labels + width*i;
+
+				for( j = 0; j < width; j++ )
+				{
+					if( ds[j] != newVal )   // not a bad disparity
+					{
+						if( ls[j] )     // has a label, check for bad label
+						{
+							if( rtype[ls[j]] ) // small region, zero out disparity
+								ds[j] = (T)newVal;
+						}
+						// no label, assign and propagate
+						else
+						{
+							Point2s* ws = wbuf; // initialize wavefront
+							Point2s p((short)j, (short)i);  // current pixel
+							curlabel++; // next label
+							int count = 0;  // current region size
+							ls[j] = curlabel;
+
+							// wavefront propagation
+							while( ws >= wbuf ) // wavefront not empty
+							{
+								count++;
+								// put neighbors onto wavefront
+								T* dpp = &img.at<T>(p.y, p.x);
+								T dp = *dpp;
+								int* lpp = labels + width*p.y + p.x;
+
+								if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
+								{
+									lpp[+width] = curlabel;
+									*ws++ = Point2s(p.x, p.y+1);
+								}
+
+								if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
+								{
+									lpp[-width] = curlabel;
+									*ws++ = Point2s(p.x, p.y-1);
+								}
+
+								if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
+								{
+									lpp[+1] = curlabel;
+									*ws++ = Point2s(p.x+1, p.y);
+								}
+
+								if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
+								{
+									lpp[-1] = curlabel;
+									*ws++ = Point2s(p.x-1, p.y);
+								}
+
+								// pop most recent and propagate
+								// NB: could try least recent, maybe better convergence
+								p = *--ws;
+							}
+
+							// assign label type
+							if( count <= maxSpeckleSize )   // speckle region
+							{
+								rtype[ls[j]] = 1;   // small region label
+								ds[j] = (T)newVal;
+							}
+							else
+								rtype[ls[j]] = 0;   // large region label
+						}
+					}
+				}
+			}
+		}
+	}
 }

-void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
-                         double _maxDiff, InputOutputArray __buf )
+void cv::stereo::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
+								double _maxDiff, InputOutputArray __buf )
 {
-    Mat img = _img.getMat();
-    int type = img.type();
-    Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;
-    CV_Assert( type == CV_8UC1 || type == CV_16SC1 );
+	Mat img = _img.getMat();
+	int type = img.type();
+	Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;
+	CV_Assert( type == CV_8UC1 || type == CV_16SC1 );

-    int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);
+	int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);

 #if IPP_VERSION_X100 >= 801
-    CV_IPP_CHECK()
-    {
-        Ipp32s bufsize = 0;
-        IppiSize roisize = { img.cols, img.rows };
-        IppDataType datatype = type == CV_8UC1 ? ipp8u : ipp16s;
-
-        if (!__buf.needed() && (type == CV_8UC1 || type == CV_16SC1))
-        {
-            IppStatus status = ippiMarkSpecklesGetBufferSize(roisize, datatype, CV_MAT_CN(type), &bufsize);
-            Ipp8u * buffer = ippsMalloc_8u(bufsize);
-
-            if ((int)status >= 0)
-            {
-                if (type == CV_8UC1)
-                    status = ippiMarkSpeckles_8u_C1IR(img.ptr<Ipp8u>(), (int)img.step, roisize,
-                                                      (Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, buffer);
-                else
-                    status = ippiMarkSpeckles_16s_C1IR(img.ptr<Ipp16s>(), (int)img.step, roisize,
-                                                       (Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, buffer);
-            }
-
-            if (status >= 0)
-            {
-                CV_IMPL_ADD(CV_IMPL_IPP);
-                return;
-            }
-            setIppErrorStatus();
-        }
-    }
+	CV_IPP_CHECK()
+	{
+		Ipp32s bufsize = 0;
+		IppiSize roisize = { img.cols, img.rows };
+		IppDataType datatype = type == CV_8UC1 ? ipp8u : ipp16s;
+
+		if (!__buf.needed() && (type == CV_8UC1 || type == CV_16SC1))
+		{
+			IppStatus status = ippiMarkSpecklesGetBufferSize(roisize, datatype, CV_MAT_CN(type), &bufsize);
+			Ipp8u * buffer = ippsMalloc_8u(bufsize);
+
+			if ((int)status >= 0)
+			{
+				if (type == CV_8UC1)
+					status = ippiMarkSpeckles_8u_C1IR(img.ptr<Ipp8u>(), (int)img.step, roisize,
+					(Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, buffer);
+				else
+					status = ippiMarkSpeckles_16s_C1IR(img.ptr<Ipp16s>(), (int)img.step, roisize,
+					(Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, buffer);
+			}
+
+			if (status >= 0)
+			{
+				CV_IMPL_ADD(CV_IMPL_IPP);
+				return;
+			}
+			setIppErrorStatus();
+		}
+	}
 #endif

-    if (type == CV_8UC1)
-        filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);
-    else
-        filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);
+	if (type == CV_8UC1)
+		filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);
+	else
+		filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);
 }

-void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
-                            int numberOfDisparities, int disp12MaxDiff )
+void cv::stereo::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
+								   int numberOfDisparities, int disp12MaxDiff )
 {
-    Mat disp = _disp.getMat(), cost = _cost.getMat();
-    int cols = disp.cols, rows = disp.rows;
-    int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
-    int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);
-    AutoBuffer<int> _disp2buf(cols*2);
-    int* disp2buf = _disp2buf;
-    int* disp2cost = disp2buf + cols;
-    const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
-    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
-    int costType = cost.type();
-
-    disp12MaxDiff *= DISP_SCALE;
-
-    CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
-              (costType == CV_16S || costType == CV_32S) &&
-              disp.size() == cost.size() );
-
-    for( int y = 0; y < rows; y++ )
-    {
-        short* dptr = disp.ptr<short>(y);
-
-        for( x = 0; x < cols; x++ )
-        {
-            disp2buf[x] = INVALID_DISP_SCALED;
-            disp2cost[x] = INT_MAX;
-        }
-
-        if( costType == CV_16S )
-        {
-            const short* cptr = cost.ptr<short>(y);
-
-            for( x = minX1; x < maxX1; x++ )
-            {
-                int d = dptr[x], c = cptr[x];
-                int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
-
-                if( disp2cost[x2] > c )
-                {
-                    disp2cost[x2] = c;
-                    disp2buf[x2] = d;
-                }
-            }
-        }
-        else
-        {
-            const int* cptr = cost.ptr<int>(y);
-
-            for( x = minX1; x < maxX1; x++ )
-            {
-                int d = dptr[x], c = cptr[x];
-                int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
-
-                if( disp2cost[x2] < c )
-                {
-                    disp2cost[x2] = c;
-                    disp2buf[x2] = d;
-                }
-            }
-        }
-
-        for( x = minX1; x < maxX1; x++ )
-        {
-            // we round the computed disparity both towards -inf and +inf and check
-            // if either of the corresponding disparities in disp2 is consistent.
-            // This is to give the computed disparity a chance to look valid if it is.
-            int d = dptr[x];
-            if( d == INVALID_DISP_SCALED )
-                continue;
-            int d0 = d >> DISP_SHIFT;
-            int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
-            int x0 = x - d0, x1 = x - d1;
-            if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
-                (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
-                dptr[x] = (short)INVALID_DISP_SCALED;
-        }
-    }
+	Mat disp = _disp.getMat(), cost = _cost.getMat();
+	int cols = disp.cols, rows = disp.rows;
+	int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
+	int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);
+	AutoBuffer<int> _disp2buf(cols*2);
+	int* disp2buf = _disp2buf;
+	int* disp2cost = disp2buf + cols;
+	const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
+	int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
+	int costType = cost.type();
+
+	disp12MaxDiff *= DISP_SCALE;
+
+	CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
+		(costType == CV_16S || costType == CV_32S) &&
+		disp.size() == cost.size() );
+
+	for( int y = 0; y < rows; y++ )
+	{
+		short* dptr = disp.ptr<short>(y);
+
+		for( x = 0; x < cols; x++ )
+		{
+			disp2buf[x] = INVALID_DISP_SCALED;
+			disp2cost[x] = INT_MAX;
+		}
+
+		if( costType == CV_16S )
+		{
+			const short* cptr = cost.ptr<short>(y);
+
+			for( x = minX1; x < maxX1; x++ )
+			{
+				int d = dptr[x], c = cptr[x];
+				int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
+
+				if( disp2cost[x2] > c )
+				{
+					disp2cost[x2] = c;
+					disp2buf[x2] = d;
+				}
+			}
+		}
+		else
+		{
+			const int* cptr = cost.ptr<int>(y);
+
+			for( x = minX1; x < maxX1; x++ )
+			{
+				int d = dptr[x], c = cptr[x];
+				int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
+
+				if( disp2cost[x2] < c )
+				{
+					disp2cost[x2] = c;
+					disp2buf[x2] = d;
+				}
+			}
+		}
+
+		for( x = minX1; x < maxX1; x++ )
+		{
+			// we round the computed disparity both towards -inf and +inf and check
+			// if either of the corresponding disparities in disp2 is consistent.
+			// This is to give the computed disparity a chance to look valid if it is.
+			int d = dptr[x];
+			if( d == INVALID_DISP_SCALED )
+				continue;
+			int d0 = d >> DISP_SHIFT;
+			int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
+			int x0 = x - d0, x1 = x - d1;
+			if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
+				(0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
+				dptr[x] = (short)INVALID_DISP_SCALED;
+		}
+	}
 }

--- a/modules/stereo/test/test_precomp.hpp
+++ b/modules/stereo/test/test_precomp.hpp
@@ -11,10 +11,21 @@

 #include <iostream>
 #include "opencv2/ts.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/stereo.hpp"
 #include "opencv2/imgcodecs.hpp"
+
+
+#include "opencv2/stereo.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/features2d.hpp"
+#include "opencv2/core/utility.hpp"
+#include "opencv2/core/private.hpp"
+#include "opencv2/core/cvdef.h"
 #include "opencv2/core.hpp"
 #include "opencv2/highgui.hpp"

+#include <algorithm>
+#include <cmath>
+
+
+
 #endif