update some of the functions in ocl module to the latest version

0fdb55a5 · yao · 3fb3851c · 0fdb55a5 · 0fdb55a5 · 0fdb55a5
Commit 0fdb55a5 authored Sep 03, 2012 by yao
24 changed files
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -901,7 +901,7 @@ namespace cv
 			oclMat dx_buf, dy_buf;
 			oclMat edgeBuf;
 			oclMat trackBuf1, trackBuf2;
-			oclMat counter;
+			void * counter;
 			Ptr<FilterEngine_GPU> filterDX, filterDY;
 		};
@@ -981,6 +981,9 @@ namespace cv
            int nlevels;
        protected:
+            // initialize buffers; only need to do once in case of multiscale detection
+            void init_buffer(const oclMat& img, Size win_stride);
            void computeBlockHistograms(const oclMat& img);
            void computeGradient(const oclMat& img, oclMat& grad, oclMat& qangle);
@@ -1004,7 +1007,11 @@ namespace cv
            // Gradients conputation results
            oclMat grad, qangle;
-            std::vector<oclMat> image_scales;
+            // scaled image
+            oclMat image_scale;
+            // effective size of the input image (may differ from the original size after scaling)
+            Size effect_size;
        };
        //! Speeded up robust features, port from GPU module.

--- a/modules/ocl/perf/test_blend.cpp
+++ b/modules/ocl/perf/test_blend.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+#include <iomanip>
+#ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+// Test fixture for the Blend performance test.
+// Parameter 0 (MatType) is stored in `type` and parameter 1 (int) in `channels`;
+// the test combines them with CV_MAKETYPE(type, channels).
+PARAM_TEST_CASE(Blend, MatType, int)
+{
+	int type;
+	int channels;
+	// NOTE(review): never filled in by the visible code (the getDevice call below is commented out).
+	std::vector<cv::ocl::Info> oclinfo;
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+		channels = GET_PARAM(1);
+		//int devnums = getDevice(oclinfo);
+		//CV_Assert(devnums > 0);
+		//cv::ocl::setBinpath(CLBINPATH);
+	}
+};
+// Measures the average runtime of cv::ocl::blendLinear over LOOP_TIMES iterations.
+// Two figures are printed: the whole round trip (four uploads, blendLinear, download)
+// and the blendLinear call alone. Iteration 0 is excluded from both averages,
+// presumably to keep one-time start-up cost out of the result.
+TEST_P(Blend, Performance)
+{
+	const cv::Size size(MWIDTH, MHEIGHT);
+	const int img_type = CV_MAKETYPE(type, channels);
+	const double max_val = (type == CV_8U) ? 255.0 : 1.0;
+	cv::Mat img1_host = randomMat(size, img_type, 0, max_val);
+	cv::Mat img2_host = randomMat(size, img_type, 0, max_val);
+	cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
+	cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
+	// Only the destination lives across iterations; the inputs are re-uploaded every
+	// pass so that the transfer cost is part of the "total" figure.
+	cv::ocl::oclMat gdst(size, img_type);
+	double total_ticks_all = 0;
+	double total_ticks_kernel = 0;
+	for (int j = 0; j < LOOP_TIMES + 1; j++) //LOOP_TIMES=100
+	{
+		double t_all = (double)cvGetTickCount();
+		cv::ocl::oclMat gimg1 = cv::ocl::oclMat(img1_host);
+		cv::ocl::oclMat gimg2 = cv::ocl::oclMat(img2_host);
+		cv::ocl::oclMat gweights1 = cv::ocl::oclMat(weights1);
+		// The second weight map must come from weights2 (it used to be a copy of weights1).
+		cv::ocl::oclMat gweights2 = cv::ocl::oclMat(weights2);
+		double t_kernel = (double)cvGetTickCount();
+		cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, gdst);
+		t_kernel = (double)cvGetTickCount() - t_kernel;
+		cv::Mat m;
+		gdst.download(m);
+		t_all = (double)cvGetTickCount() - t_all;
+		if (j == 0)
+		{
+			continue; // first pass is not counted
+		}
+		total_ticks_all += t_all;
+		total_ticks_kernel += t_kernel;
+	}
+	// Ticks -> milliseconds, averaged over the LOOP_TIMES counted iterations.
+	cout << "average gpu total  runtime is  " << total_ticks_all / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+	cout << "average gpu runtime without data transfering  is  " << total_ticks_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+}
+// Instantiates Blend for every combination of depth {CV_8U, CV_32F} and channel count {1, 4}.
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine(
+                            Values(CV_8U, CV_32F), Values(1, 4)));
+#endif
\ No newline at end of file
--- a/modules/ocl/perf/test_canny.cpp
+++ b/modules/ocl/perf/test_canny.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+#include <iomanip>
+#ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+// Relative path of the input image handed to readImage() by the Canny test.
+#define FILTER_IMAGE "../../../samples/gpu/road.png"
+// Shared test utilities; guarded so they are defined only once.
+#ifndef MWC_TEST_UTILITY
+#define MWC_TEST_UTILITY
+// Param class
+// IMPLEMENT_PARAM_CLASS(name, type) defines a wrapper class `name` around a single
+// `type` value (implicitly constructible from it and implicitly convertible back),
+// plus a PrintTo overload that prints the parameter as name(value).
+// Channels is only defined here when IMPLEMENT_PARAM_CLASS was not already defined.
+#ifndef IMPLEMENT_PARAM_CLASS
+#define IMPLEMENT_PARAM_CLASS(name, type) \
+class name \
+	{ \
+	public: \
+	name ( type arg = type ()) : val_(arg) {} \
+	operator type () const {return val_;} \
+	private: \
+	type val_; \
+	}; \
+	inline void PrintTo( name param, std::ostream* os) \
+	{ \
+	*os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
+	}
+IMPLEMENT_PARAM_CLASS(Channels, int)
+#endif // IMPLEMENT_PARAM_CLASS
+#endif // MWC_TEST_UTILITY
+////////////////////////////////////////////////////////
+// Canny1
+// Parameter wrappers for the two test parameters: the aperture size and the
+// L2-gradient flag that the test forwards to cv::ocl::Canny.
+IMPLEMENT_PARAM_CLASS(AppertureSize, int);
+IMPLEMENT_PARAM_CLASS(L2gradient, bool);
+// Test fixture: unpacks parameter 0 into `apperture_size` and parameter 1 into `useL2gradient`.
+PARAM_TEST_CASE(Canny1, AppertureSize, L2gradient)
+{
+	int apperture_size;
+	bool useL2gradient;
+	//std::vector<cv::ocl::Info> oclinfo;
+	virtual void SetUp()
+	{
+		apperture_size = GET_PARAM(0);
+		useL2gradient = GET_PARAM(1);
+		//int devnums = getDevice(oclinfo);
+		//CV_Assert(devnums > 0);
+	}
+};
+// Measures the average runtime of cv::ocl::Canny on FILTER_IMAGE (loaded as grayscale).
+// Prints the mean time of the whole round trip (upload, Canny, download) and the mean
+// time of the Canny call alone. Iteration 0 is excluded from both averages,
+// presumably to keep one-time start-up cost out of the result.
+TEST_P(Canny1, Performance)
+{
+	cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE);
+	ASSERT_FALSE(img.empty());
+	const double low_thresh = 100.0;
+	const double high_thresh = 150.0;
+	cv::ocl::oclMat edges;
+	double total_ticks = 0;
+	double total_ticks_kernel = 0;
+	for (int j = 0; j < LOOP_TIMES + 1; j++)
+	{
+		double t_all = (double)cvGetTickCount(); // start of the full round trip
+		cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img); // upload
+		double t_kernel = (double)cvGetTickCount(); // start of the Canny call
+		cv::ocl::Canny(ocl_img, edges, low_thresh, high_thresh, apperture_size, useL2gradient);
+		t_kernel = (double)cvGetTickCount() - t_kernel;
+		cv::Mat cpu_dst;
+		edges.download(cpu_dst); // download
+		t_all = (double)cvGetTickCount() - t_all;
+		if (j == 0)
+			continue; // first pass is not counted
+		total_ticks += t_all;
+		total_ticks_kernel += t_kernel;
+	}
+	// Ticks -> milliseconds, averaged over the LOOP_TIMES counted iterations.
+	cout << "average gpu runtime is  " << total_ticks/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	cout << "average gpu runtime without data transfer is  " << total_ticks_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+}
+// Instantiates Canny1 for aperture sizes {3, 5} combined with L2 gradient {off, on}.
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny1, testing::Combine(
+						testing::Values(AppertureSize(3), AppertureSize(5)),
+						testing::Values(L2gradient(false), L2gradient(true))));
+#endif  //Have opencl
\ No newline at end of file
--- a/modules/ocl/perf/test_columnsum.cpp
+++ b/modules/ocl/perf/test_columnsum.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//	   Fangfang Bai fangfang@multicorewareinc.com
+//    
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+#include <iomanip>
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+///////////////////////////////////////////////////////////////////////////////
+/// ColumnSum
+#ifdef HAVE_OPENCL
+////////////////////////////////////////////////////////////////////////
+// ColumnSum
+// Test fixture for the ColumnSum performance test. It declares no test parameters and
+// SetUp() currently does nothing (the device query is commented out).
+PARAM_TEST_CASE(ColumnSum)
+{
+	// NOTE(review): not used by the test in this file, which declares its own local `src`.
+	cv::Mat src;
+	//std::vector<cv::ocl::Info> oclinfo;
+	virtual void SetUp()
+	{
+		//int devnums = getDevice(oclinfo);
+		//CV_Assert(devnums > 0);
+	}
+};
+// Times cv::ocl::columnSum on a random MWIDTH x MHEIGHT CV_32FC1 matrix.
+// Reports the mean duration of the full round trip (upload, columnSum, download) and
+// of the columnSum call alone; the first of the LOOP_TIMES+1 passes is not counted.
+TEST_F(ColumnSum, Performance)
+{
+	const cv::Size mat_size(MWIDTH, MHEIGHT);
+	cv::Mat host_src = randomMat(mat_size, CV_32FC1);
+	cv::ocl::oclMat d_dst;
+	double ticks_total = 0;
+	double ticks_kernel = 0;
+	for (int pass = 0; pass <= LOOP_TIMES; ++pass)
+	{
+		double round_trip = (double)cvGetTickCount();
+		cv::ocl::oclMat d_src(host_src); // upload
+		double kernel_only = (double)cvGetTickCount();
+		cv::ocl::columnSum(d_src, d_dst);
+		kernel_only = (double)cvGetTickCount() - kernel_only;
+		cv::Mat cpu_dst;
+		d_dst.download(cpu_dst); // download
+		round_trip = (double)cvGetTickCount() - round_trip;
+		if (pass != 0)
+		{
+			ticks_total += round_trip;
+			ticks_kernel += kernel_only;
+		}
+	}
+	// Ticks -> milliseconds, averaged over the LOOP_TIMES counted passes.
+	const double ticks_per_avg_ms = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+	cout << "average gpu runtime is  " << ticks_total/ticks_per_avg_ms << "ms" << endl;
+	cout << "average gpu runtime without data transfer is  " << ticks_kernel/ticks_per_avg_ms << "ms" << endl;
+}
+#endif 
\ No newline at end of file
--- a/modules/ocl/perf/test_fft.cpp
+++ b/modules/ocl/perf/test_fft.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfangbai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+using namespace std;
+#ifdef HAVE_CLAMDFFT
+////////////////////////////////////////////////////////////////////////////
+// Dft
+// Test fixture for the DFT tests.
+// Parameter 0 is the transform size (`dft_size`), parameter 1 selects the
+// cv::DFT_ROWS flag in the C2C test (`dft_rows`). SetUp() also queries the
+// OpenCL devices through cv::ocl::getDevice.
+PARAM_TEST_CASE(Dft, cv::Size, bool) 
+{
+	cv::Size dft_size;
+	bool	 dft_rows;
+	vector<cv::ocl::Info> info;
+	virtual void SetUp()
+	{
+		dft_size = GET_PARAM(0);
+		dft_rows = GET_PARAM(1);
+		cv::ocl::getDevice(info);
+	}
+};
+// Times a complex-to-complex cv::ocl::dft on a random CV_32FC2 matrix of dft_size.
+// Reports the mean duration of the full round trip (upload, dft, download) and of the
+// dft call alone; the first of the LOOP_TIMES+1 passes is not counted.
+TEST_P(Dft, C2C)
+{
+	cv::Mat a = randomMat(dft_size, CV_32FC2, 0.0, 10.0);
+	// Row-wise transform only when the fixture parameter asks for it.
+	const int flags = dft_rows ? cv::DFT_ROWS : 0;
+	cv::ocl::oclMat d_b;
+	double ticks_total = 0;
+	double ticks_kernel = 0;
+	for (int pass = 0; pass <= LOOP_TIMES; ++pass)
+	{
+		double round_trip = (double)cvGetTickCount();
+		cv::ocl::oclMat ga = cv::ocl::oclMat(a); // upload
+		double kernel_only = (double)cvGetTickCount();
+		cv::ocl::dft(ga, d_b, a.size(), flags);
+		kernel_only = (double)cvGetTickCount() - kernel_only;
+		cv::Mat cpu_dst;
+		d_b.download(cpu_dst); // download
+		round_trip = (double)cvGetTickCount() - round_trip;
+		if (pass != 0)
+		{
+			ticks_total += round_trip;
+			ticks_kernel += kernel_only;
+		}
+	}
+	// Ticks -> milliseconds, averaged over the LOOP_TIMES counted passes.
+	const double ticks_per_avg_ms = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+	cout << "average gpu runtime is  " << ticks_total/ticks_per_avg_ms << "ms" << endl;
+	cout << "average gpu runtime without data transfer is  " << ticks_kernel/ticks_per_avg_ms << "ms" << endl;
+}
+// Round-trip check: a forward dft of a random real CV_32FC1 matrix followed by an
+// inverse dft with real output is expected to reproduce the input within
+// area * 1e-4.
+TEST_P(Dft, R2CthenC2R)
+{
+	cv::Mat a = randomMat(dft_size, CV_32FC1, 0.0, 10.0);
+	const int forward_flags = 0;
+	//forward_flags |= dft_rows ? cv::DFT_ROWS : 0; // not supported yet
+	const int inverse_flags = forward_flags + cv::DFT_INVERSE + cv::DFT_REAL_OUTPUT;
+	cv::ocl::oclMat d_a(a);
+	cv::ocl::oclMat d_b;
+	cv::ocl::oclMat d_c;
+	cv::ocl::dft(d_a, d_b, a.size(), forward_flags);
+	cv::ocl::dft(d_b, d_c, a.size(), inverse_flags);
+	EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4, "");
+}
+//INSTANTIATE_TEST_CASE_P(ocl_DFT, Dft, testing::Combine(
+//						testing::Values(cv::Size(1280, 1024), cv::Size(1920, 1080),cv::Size(1800, 1500)),
+//						testing::Values(false, true)));
+#endif // HAVE_CLAMDFFT
--- a/modules/ocl/perf/test_gemm.cpp
+++ b/modules/ocl/perf/test_gemm.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+using namespace std;
+#ifdef HAVE_CLAMDBLAS
+////////////////////////////////////////////////////////////////////////////
+// GEMM
+// Test fixture for the GEMM performance test.
+// Parameter 0 is the matrix type, parameter 1 the matrix size and parameter 2 the
+// flags value forwarded to cv::ocl::gemm. SetUp() also queries the OpenCL devices
+// through cv::ocl::getDevice.
+PARAM_TEST_CASE(Gemm, int, cv::Size, int) 
+{
+	int      type;
+	cv::Size mat_size;
+	int		 flags;
+	vector<cv::ocl::Info> info;
+	virtual void SetUp()
+	{
+		type     = GET_PARAM(0);
+		mat_size = GET_PARAM(1);
+		flags    = GET_PARAM(2);
+		cv::ocl::getDevice(info);
+	}
+};
+// Times cv::ocl::gemm (alpha = beta = 1.0) on three random matrices of mat_size.
+// Reports the mean duration of the full round trip (three uploads, gemm, download) and
+// of the gemm call alone; the first of the LOOP_TIMES+1 passes is not counted.
+TEST_P(Gemm, Performance)
+{
+	cv::Mat a = randomMat(mat_size, type, 0.0, 10.0);
+	cv::Mat b = randomMat(mat_size, type, 0.0, 10.0);
+	cv::Mat c = randomMat(mat_size, type, 0.0, 10.0);
+	cv::ocl::oclMat ocl_dst;
+	double ticks_total = 0;
+	double ticks_kernel = 0;
+	for (int pass = 0; pass <= LOOP_TIMES; ++pass)
+	{
+		double round_trip = (double)cvGetTickCount();
+		cv::ocl::oclMat ga = cv::ocl::oclMat(a); // upload
+		cv::ocl::oclMat gb = cv::ocl::oclMat(b); // upload
+		cv::ocl::oclMat gc = cv::ocl::oclMat(c); // upload
+		double kernel_only = (double)cvGetTickCount();
+		cv::ocl::gemm(ga, gb, 1.0, gc, 1.0, ocl_dst, flags);
+		kernel_only = (double)cvGetTickCount() - kernel_only;
+		cv::Mat cpu_dst;
+		ocl_dst.download(cpu_dst); // download
+		round_trip = (double)cvGetTickCount() - round_trip;
+		if (pass != 0)
+		{
+			ticks_total += round_trip;
+			ticks_kernel += kernel_only;
+		}
+	}
+	// Ticks -> milliseconds, averaged over the LOOP_TIMES counted passes.
+	const double ticks_per_avg_ms = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+	cout << "average gpu runtime is  " << ticks_total/ticks_per_avg_ms << "ms" << endl;
+	cout << "average gpu runtime without data transfer is  " << ticks_kernel/ticks_per_avg_ms << "ms" << endl;
+}
+// Instantiates Gemm for types {CV_32FC1, CV_32FC2} (the 64-bit types are commented out),
+// sizes {512x512, 1024x1024} and all four combinations of the GEMM_1_T / GEMM_2_T flags.
+INSTANTIATE_TEST_CASE_P(ocl_gemm, Gemm, testing::Combine(
+						testing::Values(CV_32FC1, CV_32FC2/* , CV_64FC1, CV_64FC2*/),
+						testing::Values(cv::Size(512, 512), cv::Size(1024, 1024)),
+						testing::Values(0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_1_T + cv::GEMM_2_T)));
+#endif
\ No newline at end of file
--- a/modules/ocl/perf/test_hog.cpp
+++ b/modules/ocl/perf/test_hog.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//		Fangfang BAI, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+#include "opencv2/core/core.hpp"
+#include <iomanip>
+using namespace std;
+#ifdef HAVE_OPENCL
+// Test fixture for the HOG performance tests.
+// Parameter 0 is the detection window size (`winSize`), parameter 1 the image type
+// the tests convert the input to (`type`). SetUp() asserts that getDevice reports
+// at least one device.
+PARAM_TEST_CASE(HOG,cv::Size,int)
+{
+	cv::Size winSize;
+	int type;
+	std::vector<cv::ocl::Info> oclinfo;
+	virtual void SetUp()
+	{
+		winSize = GET_PARAM(0);
+		type = GET_PARAM(1);
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+	}
+};
+// Times cv::ocl::HOGDescriptor::getDescriptors (column-by-column format) using a
+// default-constructed descriptor with gamma correction enabled.
+// Reports the mean duration of the full round trip (upload, getDescriptors, download)
+// and of the getDescriptors call alone; the first of the LOOP_TIMES+1 passes is not counted.
+// The fixture's winSize parameter is not used by this test.
+TEST_P(HOG, GetDescriptors)
+{
+	// NOTE(review): hard-coded Windows drive-relative path; the image must be
+	// reachable under this exact name or the assertion below fails.
+	cv::Mat img_rgb = readImage("D:road.png");
+	ASSERT_FALSE(img_rgb.empty());
+	// Convert to the requested type: grayscale for CV_8UC1, BGRA for everything else.
+	cv::Mat img;
+	if (type == CV_8UC1)
+		cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
+	else
+		cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
+	cv::ocl::HOGDescriptor ocl_hog;
+	ocl_hog.gamma_correction = true;
+	cv::ocl::oclMat d_descriptors;
+	double ticks_total = 0;
+	double ticks_kernel = 0;
+	for (int pass = 0; pass <= LOOP_TIMES; ++pass)
+	{
+		double round_trip = (double)cvGetTickCount();
+		cv::ocl::oclMat d_img = cv::ocl::oclMat(img); // upload
+		double kernel_only = (double)cvGetTickCount();
+		ocl_hog.getDescriptors(d_img, ocl_hog.win_size, d_descriptors, ocl_hog.DESCR_FORMAT_COL_BY_COL);
+		kernel_only = (double)cvGetTickCount() - kernel_only;
+		cv::Mat down_descriptors;
+		d_descriptors.download(down_descriptors); // download
+		round_trip = (double)cvGetTickCount() - round_trip;
+		if (pass != 0)
+		{
+			ticks_total += round_trip;
+			ticks_kernel += kernel_only;
+		}
+	}
+	// Ticks -> milliseconds, averaged over the LOOP_TIMES counted passes.
+	const double ticks_per_avg_ms = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+	cout << "average gpu runtime is  " << ticks_total/ticks_per_avg_ms << "ms" << endl;
+	cout << "average gpu runtime without data transfer is  " << ticks_kernel/ticks_per_avg_ms << "ms" << endl;
+}
+// Times cv::ocl::HOGDescriptor::detect with the people detector matching winSize.
+// Reports the mean duration of upload + detect and of the detect call alone; the
+// first of the LOOP_TIMES+1 passes is excluded from both averages. There is no
+// download step because detect fills a host-side vector of points.
+TEST_P(HOG, Detect)
+{
+	// Load image
+	// NOTE(review): hard-coded Windows drive-relative path; the image must be
+	// reachable under this exact name or the assertion below fails.
+	cv::Mat img_rgb = readImage("D:road.png");
+	ASSERT_FALSE(img_rgb.empty());
+	// Convert to the requested type: grayscale for CV_8UC1, BGRA for everything else.
+	cv::Mat img;
+	if (type == CV_8UC1)
+		cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
+	else
+		cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
+	// Only 48x96 and 64x128 windows have a detector; any other size falls back to 64x128.
+	const bool is_48x96 = (winSize == cv::Size(48, 96));
+	if (!is_48x96 && (winSize != cv::Size(64, 128)))
+		winSize = cv::Size(64, 128);
+	cv::ocl::HOGDescriptor ocl_hog(winSize);
+	ocl_hog.gamma_correction = true;
+	// After the normalisation above exactly two cases remain, so no third branch is needed.
+	if (is_48x96)
+	{
+		// daimler's base
+		ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector48x96());
+	}
+	else
+	{
+		ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector64x128());
+	}
+	// OpenCL detection
+	std::vector<cv::Point> d_v_locations;
+	double total_ticks = 0;
+	double total_ticks_kernel = 0;
+	for (int j = 0; j < LOOP_TIMES + 1; j++)
+	{
+		double t_all = (double)cvGetTickCount(); // start of upload + detect
+		cv::ocl::oclMat d_img = cv::ocl::oclMat(img); // upload
+		double t_kernel = (double)cvGetTickCount(); // start of the detect call
+		ocl_hog.detect(d_img, d_v_locations, 0);
+		t_kernel = (double)cvGetTickCount() - t_kernel;
+		t_all = (double)cvGetTickCount() - t_all;
+		if (j == 0)
+			continue; // first pass is not counted
+		total_ticks += t_all;
+		total_ticks_kernel += t_kernel;
+	}
+	// Ticks -> milliseconds, averaged over the LOOP_TIMES counted iterations.
+	cout << "average gpu runtime is  " << total_ticks/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	cout << "average gpu runtime without data transfer is  " << total_ticks_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+}
+// Instantiates HOG for window sizes {64x128, 48x96} combined with image types {CV_8UC1, CV_8UC4}.
+INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
+						testing::Values(cv::Size(64, 128), cv::Size(48, 96)),
+						testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
+#endif //HAVE_OPENCL
--- a/modules/ocl/perf/test_match_template.cpp
+++ b/modules/ocl/perf/test_match_template.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+#include <iomanip>
+#ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+#ifndef MWC_TEST_UTILITY
+#define MWC_TEST_UTILITY
+//////// Utility
+// Drop any previous definition of DIFFERENT_SIZES, then define it as the
+// two image sizes listed below.
+#ifdef DIFFERENT_SIZES
+#  undef DIFFERENT_SIZES
+#endif
+#define DIFFERENT_SIZES \
+	testing::Values(cv::Size(256, 256), cv::Size(3000, 3000))
+// Param class
+// IMPLEMENT_PARAM_CLASS(name, type) declares a class `name` that stores a single
+// `type` value: it is implicitly constructible from `type` (default: type()) and
+// implicitly convertible back to `type`. It also declares an inline PrintTo()
+// overload that writes "name(<value>)" to the given stream.
+// NOTE: the Channels declaration sits inside the #ifndef, so it is only emitted
+// here when IMPLEMENT_PARAM_CLASS was not already defined before this point.
+#ifndef IMPLEMENT_PARAM_CLASS
+#define IMPLEMENT_PARAM_CLASS(name, type) \
+class name \
+{ \
+public: \
+	name ( type arg = type ()) : val_(arg) {} \
+	operator type () const {return val_;} \
+private: \
+	type val_; \
+}; \
+	inline void PrintTo( name param, std::ostream* os) \
+{ \
+	*os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
+}
+IMPLEMENT_PARAM_CLASS(Channels, int)
+#endif // IMPLEMENT_PARAM_CLASS
+#endif // MWC_TEST_UTILITY
+////////////////////////////////////////////////////////////////////////////////
+// MatchTemplate
+// The six template-matching methods, wrapped in testing::Values for use as a test parameter.
+#define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF_NORMED))
+// Parameter wrapper type carrying the template size.
+IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
+// Printable method names; the Performance tests index this array with the integer `method` parameter.
+// NOTE(review): the order presumably mirrors the numeric values of the cv::TM_* constants - confirm.
+const char* TEMPLATE_METHOD_NAMES[6] = {"TM_SQDIFF", "TM_SQDIFF_NORMED", "TM_CCORR", "TM_CCORR_NORMED", "TM_CCOEFF", "TM_CCOEFF_NORMED"};
+// Fixture for the MatchTemplate performance tests.
+// Test parameters, in order: image size, template size, channel count, matching method.
+PARAM_TEST_CASE(MatchTemplate, cv::Size, TemplateSize, Channels, TemplateMethod)
+{
+	// image size (parameter 0)
+	cv::Size size;
+	// template size (parameter 1)
+	cv::Size templ_size;
+	// channel count (parameter 2)
+	int cn;
+	// matching method (parameter 3); the tests also use it as an index into TEMPLATE_METHOD_NAMES
+	int method;
+	//vector<cv::ocl::Info> oclinfo;
+	// Copies the four test parameters into the members above.
+	virtual void SetUp()
+	{
+		size = GET_PARAM(0);
+		templ_size = GET_PARAM(1);
+		cn = GET_PARAM(2);
+		method = GET_PARAM(3);
+		//int devnums = getDevice(oclinfo);
+		//CV_Assert(devnums > 0);
+	}
+};
+// Performance test for cv::ocl::matchTemplate on random 8-bit unsigned images.
+// Prints the test parameters, then the average time per pass in two flavours:
+//   - t1: upload of image and template + matchTemplate call + download of the result
+//   - t2: the matchTemplate call only
+// The loop runs LOOP_TIMES + 1 passes; pass 0 is timed but excluded from the totals.
+struct MatchTemplate8U : MatchTemplate {};
+TEST_P(MatchTemplate8U, Performance)
+{
+	std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
+	std::cout << "Image Size: (" << size.width << ", " << size.height << ")"<< std::endl;
+	std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")"<< std::endl;
+	std::cout << "Channels: " << cn << std::endl;
+	cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
+	cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));
+	// NOTE: dst_gold is never used in this test.
+	cv::Mat dst_gold;
+	cv::ocl::oclMat dst;
+	// Accumulated tick counts over the measured passes (whole round trip / call only).
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t1=0;
+	double t2=0;
+	for(int j = 0; j < LOOP_TIMES+1; j ++)
+	{
+		t1 = (double)cvGetTickCount();//gpu start1
+        cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);//upload
+		cv::ocl::oclMat ocl_templ = cv::ocl::oclMat(templ);//upload
+		t2=(double)cvGetTickCount();//kernel
+		cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
+		// NOTE(review): t2 is host time around the call; whether the call waits for the
+		// device to finish is not visible here - confirm before trusting this figure.
+		t2 = (double)cvGetTickCount() - t2;//kernel
+		cv::Mat cpu_dst;
+		dst.download (cpu_dst);//download
+		t1 = (double)cvGetTickCount() - t1;//gpu end1
+		// Skip the first pass so it does not contribute to the averages.
+		if(j == 0)
+			continue;
+		totalgputick=t1+totalgputick;	
+		totalgputick_kernel=t2+totalgputick_kernel;	
+	}
+	// Totals are divided by tick frequency, pass count and 1000 and reported as "ms".
+	cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+}
+// Performance test for cv::ocl::matchTemplate on random 32-bit float images.
+// Prints the test parameters, then the average time per pass in two flavours:
+//   - t1: upload of image and template + matchTemplate call + download of the result
+//   - t2: the matchTemplate call only
+// The loop runs LOOP_TIMES + 1 passes and discards pass 0, matching the warm-up
+// handling of the MatchTemplate8U, PyrDown and PyrUp tests so that the reported
+// averages are comparable across tests.
+struct MatchTemplate32F : MatchTemplate {};
+TEST_P(MatchTemplate32F, Performance)
+{
+	std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
+	std::cout << "Image Size: (" << size.width << ", " << size.height << ")"<< std::endl;
+	std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")"<< std::endl;
+	std::cout << "Channels: " << cn << std::endl;
+	cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn));
+	cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn));
+	cv::ocl::oclMat dst;
+	// Accumulated tick counts over the measured passes (whole round trip / call only).
+	double totalgputick = 0;
+	double totalgputick_kernel = 0;
+	double t1 = 0;
+	double t2 = 0;
+	for (int j = 0; j < LOOP_TIMES + 1; j ++)
+	{
+		t1 = (double)cvGetTickCount();//gpu start1
+		cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);//upload
+		cv::ocl::oclMat ocl_templ = cv::ocl::oclMat(templ);//upload
+		t2 = (double)cvGetTickCount();//kernel
+		cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
+		t2 = (double)cvGetTickCount() - t2;//kernel
+		cv::Mat cpu_dst;
+		dst.download (cpu_dst);//download
+		t1 = (double)cvGetTickCount() - t1;//gpu end1
+		// Pass 0 is a warm-up: timed, but kept out of the averages.
+		if (j == 0)
+		{
+			continue;
+		}
+		totalgputick = t1 + totalgputick;
+		totalgputick_kernel = t2 + totalgputick_kernel;
+	}
+	// Totals are divided by tick frequency, pass count and 1000 and reported as "ms".
+	cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+}
+// 8U: every combination of three image sizes, two template sizes (5x5, 16x16),
+// 1 or 4 channels, and all six matching methods.
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, 
+	testing::Combine(
+    testing::Values(cv::Size(1280, 1024), cv::Size(MWIDTH, MHEIGHT),cv::Size(1800, 1500)),
+    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
+    testing::Values(Channels(1), Channels(4)/*, Channels(3)*/),
+	ALL_TEMPLATE_METHODS
+	)
+);
+// 32F: same sizes and channel counts as above, but only TM_SQDIFF and TM_CCORR are instantiated.
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
+    testing::Values(cv::Size(1280, 1024), cv::Size(MWIDTH, MHEIGHT),cv::Size(1800, 1500)),
+    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
+    testing::Values(Channels(1), Channels(4) /*, Channels(3)*/),
+    testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
+#endif //HAVE_OPENCL
\ No newline at end of file
--- a/modules/ocl/perf/test_pyrdown.cpp
+++ b/modules/ocl/perf/test_pyrdown.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    fangfang bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+#include <iomanip>
+#ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+// Fixture for the PyrDown performance test.
+// Test parameters, in order: matrix depth (MatType), channel count.
+PARAM_TEST_CASE(PyrDown, MatType, int)
+{
+	// matrix depth (parameter 0)
+	int type;
+	// channel count (parameter 1)
+	int channels;
+	//src mat
+	cv::Mat mat1;
+	// NOTE: dst is not referenced by the test in this file.
+	cv::Mat dst;
+	//std::vector<cv::ocl::Info> oclinfo;
+	//ocl dst mat for testing
+	// NOTE: TEST_P(PyrDown, Mat) declares locals named gmat1 and gdst, which hide these members.
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gdst;
+	// Copies the two test parameters into type and channels.
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+		channels = GET_PARAM(1);
+		//int devnums = getDevice(oclinfo);
+		//CV_Assert(devnums > 0);
+	}
+};
+// Expands to a std::string holding the spelling of the argument.
+// NOTE: the expansion already ends with ';', so it cannot be used inside a larger
+// expression; no use of this macro is visible in this file.
+#define VARNAME(A) string(#A);
+////////////////////////////////PyrDown/////////////////////////////////////////////////
+// Performance test for cv::ocl::pyrDown on a random MWIDTH x MHEIGHT matrix.
+// Prints the average time per pass in two flavours:
+//   - t1: upload of the source + pyrDown call + download of the result
+//   - t2: the pyrDown call only
+// The loop runs LOOP_TIMES + 1 passes; pass 0 is timed but excluded from the totals.
+TEST_P(PyrDown, Mat)
+{
+	cv::Size size(MWIDTH, MHEIGHT);
+	cv::RNG &rng = TS::ptr()->get_rng();
+	// Fills the fixture member mat1 with random data of the parameterised type.
+	mat1 = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
+	// Local destination; hides the fixture member of the same name.
+	cv::ocl::oclMat gdst;
+	// Accumulated tick counts over the measured passes (whole round trip / call only).
+	double totalgputick = 0;
+	double totalgputick_kernel = 0;
+	double t1 = 0;
+	double t2 = 0;
+	for (int j = 0; j < LOOP_TIMES + 1; j ++)
+	{
+		t1 = (double)cvGetTickCount();//gpu start1
+		// Upload; this local also hides the fixture member gmat1.
+		cv::ocl::oclMat gmat1(mat1);
+		t2 = (double)cvGetTickCount(); //kernel
+		cv::ocl::pyrDown(gmat1, gdst);
+		t2 = (double)cvGetTickCount() - t2;//kernel
+		cv::Mat cpu_dst;
+		gdst.download(cpu_dst);
+		t1 = (double)cvGetTickCount() - t1;//gpu end1
+		// Skip the first pass so it does not contribute to the averages.
+		if (j == 0)
+		{
+			continue;
+		}
+		totalgputick = t1 + totalgputick;
+		totalgputick_kernel = t2 + totalgputick_kernel;
+	}
+	// Totals are divided by tick frequency, pass count and 1000 and reported as "ms".
+	cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+	cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+}
+//********test****************
+// Runs PyrDown for every combination of depth {CV_8U, CV_32F} and channel count {1, 4}.
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine(
+                            Values(CV_8U, CV_32F), Values(1, 4)));
+#endif // HAVE_OPENCL
--- a/modules/ocl/perf/test_pyrup.cpp
+++ b/modules/ocl/perf/test_pyrup.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    fangfang bai fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "opencv2/core/core.hpp"
+#include "precomp.hpp"
+#include <iomanip>
+#ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+// Fixture for the PyrUp performance test.
+// Test parameters, in order: matrix depth (MatType), channel count.
+PARAM_TEST_CASE(PyrUp, MatType, int)
+{
+	// matrix depth (parameter 0)
+	int type;
+	// channel count (parameter 1)
+	int channels;
+	//std::vector<cv::ocl::Info> oclinfo;
+	// Copies the two test parameters into type and channels.
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+		channels = GET_PARAM(1);
+		//int devnums = getDevice(oclinfo);
+		//CV_Assert(devnums > 0);
+	}
+};
+// Performance test for cv::ocl::pyrUp on a random MWIDTH x MHEIGHT matrix.
+// Prints the average time per pass in two flavours:
+//   - t1: upload of the source + pyrUp call + download of the result
+//   - t2: the pyrUp call only
+// The loop runs LOOP_TIMES + 1 passes; pass 0 is timed but excluded from the totals.
+TEST_P(PyrUp, Performance)
+{
+	cv::Size size(MWIDTH, MHEIGHT);
+	cv::Mat src = randomMat(size, CV_MAKETYPE(type, channels));
+	// NOTE: dst_gold is never used in this test.
+	cv::Mat dst_gold;
+	cv::ocl::oclMat dst;
+	// Accumulated tick counts over the measured passes (whole round trip / call only).
+	double totalgputick = 0;
+	double totalgputick_kernel = 0;
+	double t1 = 0;
+	double t2 = 0;
+	for (int j = 0; j < LOOP_TIMES + 1; j ++)
+	{
+		t1 = (double)cvGetTickCount();//gpu start1
+		cv::ocl::oclMat srcMat = cv::ocl::oclMat(src);//upload
+		t2 = (double)cvGetTickCount(); //kernel
+		cv::ocl::pyrUp(srcMat, dst);
+		t2 = (double)cvGetTickCount() - t2;//kernel
+		cv::Mat cpu_dst;
+		dst.download(cpu_dst); //download
+		t1 = (double)cvGetTickCount() - t1;//gpu end1
+		// Skip the first pass so it does not contribute to the averages.
+		if (j == 0)
+		{
+			continue;
+		}
+		totalgputick = t1 + totalgputick;
+		totalgputick_kernel = t2 + totalgputick_kernel;
+	}
+	// Totals are divided by tick frequency, pass count and 1000 and reported as "ms".
+	cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+	cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+}
+// Runs PyrUp for every combination of depth {CV_8U, CV_32F} and channel count {1, 4}.
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, Combine(
+                            Values(CV_8U, CV_32F), Values(1, 4)));
+#endif // HAVE_OPENCL
\ No newline at end of file
--- a/modules/ocl/src/canny.cpp
+++ b/modules/ocl/src/canny.cpp
@@ -88,11 +88,11 @@ void cv::ocl::CannyBuf::create(const Size& image_size, int apperture_size)
        Mat kx, ky;
        if (!filterDX)
        {
-			filterDX = createDerivFilter_GPU(CV_32F, CV_32F, 1, 0, apperture_size, BORDER_REPLICATE);
+            filterDX = createDerivFilter_GPU(CV_8U, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
        }
        if (!filterDY)
        {
-            filterDY = createDerivFilter_GPU(CV_32F, CV_32F, 0, 1, apperture_size, BORDER_REPLICATE);
+            filterDY = createDerivFilter_GPU(CV_8U, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
        }
    }
    edgeBuf.create(image_size.height + 2, image_size.width + 2, CV_32FC1);
@@ -100,7 +100,10 @@ void cv::ocl::CannyBuf::create(const Size& image_size, int apperture_size)
    trackBuf1.create(1, image_size.width * image_size.height, CV_16UC2);
    trackBuf2.create(1, image_size.width * image_size.height, CV_16UC2);
-	counter.create(1,1, CV_32SC1);
+    float counter_f [1] = { 0 };
+    int err = 0;
+    counter = clCreateBuffer( Context::getContext()->impl->clContext, CL_MEM_COPY_HOST_PTR, sizeof(float), counter_f, &err );
+    openCLSafeCall(err);
 }
 void cv::ocl::CannyBuf::release()
@@ -112,7 +115,7 @@ void cv::ocl::CannyBuf::release()
    edgeBuf.release();
    trackBuf1.release();
    trackBuf2.release();
-	counter.release();
+    openCLFree(counter);
 }
 namespace cv { namespace ocl {
@@ -125,9 +128,9 @@ namespace cv { namespace ocl {
        void calcMap_gpu(oclMat& dx, oclMat& dy, oclMat& mag, oclMat& map, int rows, int cols, float low_thresh, float high_thresh);
-        void edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, oclMat& counter, int rows, int cols);
+        void edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, void * counter, int rows, int cols);
-        void edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, oclMat& counter, int rows, int cols);
+        void edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, void * counter, int rows, int cols);
        void getEdges_gpu(oclMat& map, oclMat& dst, int rows, int cols);
    }
@@ -164,11 +167,10 @@ void cv::ocl::Canny(const oclMat& src, CannyBuf& buf, oclMat& dst, double low_th
        std::swap( low_thresh, high_thresh );
    dst.create(src.size(), CV_8U);
-    dst.setTo(Scalar::all(0));
+    //dst.setTo(Scalar::all(0));
    buf.create(src.size(), apperture_size);
-    buf.edgeBuf.setTo(Scalar::all(0));
+    //buf.edgeBuf.setTo(Scalar::all(0));
-	buf.counter.setTo(Scalar::all(0));
    if (apperture_size == 3)
    {
@@ -178,17 +180,8 @@ void cv::ocl::Canny(const oclMat& src, CannyBuf& buf, oclMat& dst, double low_th
    }
    else
    {
-		// FIXME:
+        buf.filterDX->apply(src, buf.dx);
-		// current ocl implementation requires the src and dst having same type
+        buf.filterDY->apply(src, buf.dy);
-		// convertTo is time consuming so this may be optimized later.
-		oclMat src_omat32f = src;
-		src.convertTo(src_omat32f, CV_32F); // FIXME
-        buf.filterDX->apply(src_omat32f, buf.dx);
-        buf.filterDY->apply(src_omat32f, buf.dy);
-		buf.dx.convertTo(buf.dx, CV_32S); // FIXME
-		buf.dy.convertTo(buf.dy, CV_32S); // FIXME
        calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
    }
@@ -210,12 +203,11 @@ void cv::ocl::Canny(const oclMat& dx, const oclMat& dy, CannyBuf& buf, oclMat& d
        std::swap( low_thresh, high_thresh);
    dst.create(dx.size(), CV_8U);
-    dst.setTo(Scalar::all(0));
+    //dst.setTo(Scalar::all(0));
    buf.dx = dx; buf.dy = dy;
    buf.create(dx.size(), -1);
-    buf.edgeBuf.setTo(Scalar::all(0));
+    //buf.edgeBuf.setTo(Scalar::all(0));
-	buf.counter.setTo(Scalar::all(0));
    calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);
    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
@@ -342,7 +334,7 @@ void canny::calcMap_gpu(oclMat& dx, oclMat& dy, oclMat& mag, oclMat& map, int ro
    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }
-void canny::edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, oclMat& counter, int rows, int cols)
+void canny::edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, void * counter, int rows, int cols)
 {
    Context *clCxt = map.clCxt;
    string kernelName = "edgesHysteresisLocal";
@@ -350,7 +342,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, oclMat& counter,
    args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
-	args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&counter));
    args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
    args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
    args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
@@ -362,10 +354,10 @@ void canny::edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, oclMat& counter,
    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }
-void canny::edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, oclMat& counter, int rows, int cols)
+void canny::edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, void * counter, int rows, int cols)
 {
-	unsigned int count = Mat(counter).at<unsigned int>(0);
+    unsigned int count;
+    openCLSafeCall(clEnqueueReadBuffer(Context::getContext()->impl->clCmdQueue, (cl_mem)counter, 1, 0, sizeof(float), &count, NULL, NULL, NULL));
    Context *clCxt = map.clCxt;
    string kernelName = "edgesHysteresisGlobal";
    vector< pair<size_t, const void *> > args;
@@ -375,13 +367,13 @@ void canny::edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, ocl
    while(count > 0)
    {
-		counter.setTo(0);
+        //counter.setTo(0);
        args.clear();
        size_t globalThreads[3] = {std::min(count, 65535u) * 128, DIVUP(count, 65535), 1};
        args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
        args.push_back( make_pair( sizeof(cl_mem), (void *)&st2.data));
-		args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&counter));
        args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
        args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
        args.push_back( make_pair( sizeof(cl_int), (void *)&count));
@@ -389,7 +381,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, ocl
        args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
        openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
-		count = Mat(counter).at<unsigned int>(0);
+        openCLSafeCall(clEnqueueReadBuffer(Context::getContext()->impl->clCmdQueue, (cl_mem)counter, 1, 0, sizeof(float), &count, NULL, NULL, NULL));
        std::swap(st1, st2);
    }
 #undef DIVUP

--- a/modules/ocl/src/columnsum.cpp
+++ b/modules/ocl/src/columnsum.cpp
@@ -67,7 +67,9 @@ namespace cv
 void cv::ocl::columnSum(const oclMat& src,oclMat& dst)
 {
-	CV_Assert(src.type() == CV_32FC1 && dst.type() == CV_32FC1 && src.size() == dst.size());
+	CV_Assert(src.type() == CV_32FC1);
+	dst.create(src.size(), src.type());
 	Context *clCxt = src.clCxt;                                        

--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
--- a/modules/ocl/src/kernels/blend_linear.cl
+++ b/modules/ocl/src/kernels/blend_linear.cl
@@ -67,32 +67,6 @@ __kernel void BlendLinear_C1_D0(
 	}
 }
-__kernel void BlendLinear_C3_D0(
-	__global uchar *dst,
-	__global uchar *img1,
-	__global uchar *img2,
-	__global float *weight1,
-	__global float *weight2,
-	int rows,
-	int cols,
-	int istep,
-	int wstep
-	)
-{
-	int idx = get_global_id(0);
-	int idy = get_global_id(1);
-	int x = idx / 3;
-	int y = idy;
-	if (x < cols && y < rows)
-	{
-		int pos = idy * istep + idx;
-		int wpos = idy * (wstep /sizeof(float)) + x;
-		float w1 = weight1[wpos];
-		float w2 = weight2[wpos];
-		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
-	}
-}
 __kernel void BlendLinear_C4_D0(
 	__global uchar *dst,
 	__global uchar *img1,
@@ -143,32 +117,6 @@ __kernel void BlendLinear_C1_D5(
 	}
 }
-__kernel void BlendLinear_C3_D5(
-	__global float *dst,
-	__global float *img1,
-	__global float *img2,
-	__global float *weight1,
-	__global float *weight2,
-	int rows,
-	int cols,
-	int istep,
-	int wstep
-	)
-{
-	int idx = get_global_id(0);
-	int idy = get_global_id(1);
-	int x = idx / 3;
-	int y = idy;
-	if (x < cols && y < rows)
-	{
-		int pos = idy * (istep / sizeof(float)) + idx;
-		int wpos = idy * (wstep /sizeof(float)) + x;
-		float w1 = weight1[wpos];
-		float w2 = weight2[wpos];
-		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
-	}
-}
 __kernel void BlendLinear_C4_D5(
 	__global float *dst,
 	__global float *img1,
@@ -194,3 +142,4 @@ __kernel void BlendLinear_C4_D5(
 		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
 	}
 }
--- a/modules/ocl/src/kernels/imgproc_canny.cl
+++ b/modules/ocl/src/kernels/imgproc_canny.cl
@@ -70,8 +70,8 @@ inline float calc(int x, int y)
 // dx_buf	output dx buffer
 // dy_buf	output dy buffer
 __kernel 
-void calcSobelRowPass
+    void calcSobelRowPass
-(
+    (
    __global const uchar * src,
    __global int * dx_buf,
    __global int * dy_buf,
@@ -83,7 +83,7 @@ void calcSobelRowPass
    int dx_buf_offset,
    int dy_buf_step,
    int dy_buf_offset
-)
+    )
 {
    //src_step   /= sizeof(*src);
    //src_offset /= sizeof(*src);
@@ -128,8 +128,8 @@ void calcSobelRowPass
 // dy			direvitive in y direction output
 // mag			magnitude direvitive of xy output
 __kernel
-void calcMagnitude_buf
+    void calcMagnitude_buf
-(
+    (
    __global const int * dx_buf,
    __global const int * dy_buf,
    __global int * dx,
@@ -147,7 +147,7 @@ void calcMagnitude_buf
    int dy_offset,
    int mag_step,
    int mag_offset
-)
+    )
 {
    dx_buf_step    /= sizeof(*dx_buf);
    dx_buf_offset  /= sizeof(*dx_buf);
@@ -205,8 +205,8 @@ void calcMagnitude_buf
 // dy			direvitive in y direction output
 // mag			magnitude direvitive of xy output
 __kernel
-void calcMagnitude
+    void calcMagnitude
-(
+    (
    __global const int * dx,
    __global const int * dy,
    __global float * mag,
@@ -218,7 +218,7 @@ void calcMagnitude
    int dy_offset,
    int mag_step,
    int mag_offset
-)
+    )
 {
    dx_step    /= sizeof(*dx);
    dx_offset  /= sizeof(*dx);
@@ -261,8 +261,8 @@ void calcMagnitude
 // mag			magnitudes calculated from calcMagnitude function
 // map			output containing raw edge types
 __kernel
-void calcMap
+    void calcMap
-(
+    (
    __global const int * dx,
    __global const int * dy, 
    __global const float * mag,
@@ -279,7 +279,7 @@ void calcMap
    int mag_offset,
    int map_step,
    int map_offset
-)
+    )
 {
    dx_step    /= sizeof(*dx);
    dx_offset  /= sizeof(*dx);
@@ -361,8 +361,8 @@ void calcMap
 // non local memory version
 __kernel
-void calcMap_2 
+    void calcMap_2 
-(
+    (
    __global const int * dx,
    __global const int * dy, 
    __global const float * mag,
@@ -379,7 +379,7 @@ void calcMap_2
    int mag_offset,
    int map_step,
    int map_offset
-)
+    )
 {
    dx_step    /= sizeof(*dx);
    dx_offset  /= sizeof(*dx);
@@ -440,8 +440,8 @@ void calcMap_2
 // [256, 1, 1] threaded, local memory version
 __kernel
-void calcMap_3
+    void calcMap_3
-(
+    (
    __global const int * dx,
    __global const int * dy, 
    __global const float * mag,
@@ -458,7 +458,7 @@ void calcMap_3
    int mag_offset,
    int map_step,
    int map_offset
-)
+    )
 {
    dx_step    /= sizeof(*dx);
    dx_offset  /= sizeof(*dx);
@@ -556,8 +556,8 @@ void calcMap_3
 // st		the potiential edge points found in this kernel call
 // counter	the number of potiential edge points
 __kernel
-void edgesHysteresisLocal
+    void edgesHysteresisLocal
-(
+    (
    __global int * map,
    __global ushort2 * st, 
    volatile __global unsigned int * counter,
@@ -565,7 +565,7 @@ void edgesHysteresisLocal
    int cols,
    int map_step,
    int map_offset
-)
+    )
 {
    map_step   /= sizeof(*map);
    map_offset /= sizeof(*map);
@@ -599,7 +599,7 @@ void edgesHysteresisLocal
    {
        int n;
-		#pragma unroll
+#pragma unroll
        for (int k = 0; k < 16; ++k)
        {
            n = 0;
@@ -653,8 +653,8 @@ __constant c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
 #define stack_size 512
 __kernel
-void edgesHysteresisGlobal
+    void edgesHysteresisGlobal
-(
+    (
    __global int * map,
    __global ushort2 * st1, 
    __global ushort2 * st2, 
@@ -664,7 +664,7 @@ void edgesHysteresisGlobal
    int count,
    int map_step,
    int map_offset
-)
+    )
 {
    map_step   /= sizeof(*map);
@@ -684,6 +684,12 @@ void edgesHysteresisGlobal
    __local ushort2 s_st[stack_size];
+    if(gidx + gidy == 0)
+    {
+        *counter = 0;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
    if(lidx == 0)
    {
        s_counter = 0;
@@ -770,8 +776,8 @@ void edgesHysteresisGlobal
 // map		edge type mappings
 // dst		edge output
 __kernel
-void getEdges
+    void getEdges
-(
+    (
    __global const int * map,
    __global uchar * dst,
    int rows,
@@ -780,7 +786,7 @@ void getEdges
    int map_offset,
    int dst_step,
    int dst_offset
-)
+    )
 {
    map_step   /= sizeof(*map);
    map_offset /= sizeof(*map);

--- a/modules/ocl/src/kernels/nonfree_surf.cl
+++ b/modules/ocl/src/kernels/nonfree_surf.cl
@@ -58,17 +58,9 @@
 // Image read mode
 __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+#define FLT_EPSILON (1e-15)
 #define CV_PI_F 3.14159265f
-// print greyscale image to show image layout
-__kernel void printImage(image2d_t img)
-{
-    printf("(%d, %d) - %3d \n", 
-        get_global_id(0), 
-        get_global_id(1), 
-        read_imageui(img, (int2)(get_global_id(0), get_global_id(1))).x
-        );
-}
 // Use integral image to calculate haar wavelets.
 // N = 2
@@ -444,7 +436,6 @@ __kernel
        float val0 = N9[localLin];
        if (val0 > c_hessianThreshold)
        {
-            //printf(\"(%3d, %3d) N9[%3d]=%7.1f val0=%7.1f\\n\", l_x, l_y, localLin - zoff, N9[localLin], val0);
            // Coordinates for the start of the wavelet in the sum image. There
            // is some integer division involved, so don't try to simplify this
            // (cancel out sampleStep) without checking the result is the same
@@ -726,6 +717,7 @@ __kernel
    __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
    __global float* featureDir  = keypoints + ANGLE_ROW * keypoints_step;
    volatile __local  float s_X[128];
    volatile __local  float s_Y[128];
    volatile __local  float s_angle[128];
@@ -737,6 +729,7 @@ __kernel
    and building the keypoint descriptor are defined relative to 's' */
    const float s = featureSize[get_group_id(0)] * 1.2f / 9.0f;
    /* To find the dominant orientation, the gradients in x and y are
    sampled in a circle of radius 6s using wavelets of size 4s.
    We ensure the gradient wavelet size is even to ensure the
@@ -765,9 +758,11 @@ __kernel
            Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x);
            angle = atan2(Y, X);
            if (angle < 0)
                angle += 2.0f * CV_PI_F;
            angle *= 180.0f / CV_PI_F;
        }
    }
    s_X[tid] = X;
@@ -807,7 +802,6 @@ __kernel
            sumx += s_X[get_local_id(0) + 96];
            sumy += s_Y[get_local_id(0) + 96];
        }
        reduce_32_sum(s_sumx + get_local_id(1) * 32, sumx, get_local_id(0));
        reduce_32_sum(s_sumy + get_local_id(1) * 32, sumy, get_local_id(0));
@@ -818,10 +812,8 @@ __kernel
            bestx = sumx;
            besty = sumy;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (get_local_id(0) == 0)
    {
        s_X[get_local_id(1)] = bestx;
@@ -846,6 +838,10 @@ __kernel
            kp_dir += 2.0f * CV_PI_F;
        kp_dir *= 180.0f / CV_PI_F;
+        kp_dir = 360.0f - kp_dir;
+        if (fabs(kp_dir - 360.f) < FLT_EPSILON)
+            kp_dir = 0.f;
        featureDir[get_group_id(0)] = kp_dir;
    }
 }
@@ -940,7 +936,10 @@ void calc_dx_dy(
    const float centerX = featureX[get_group_id(0)];
    const float centerY = featureY[get_group_id(0)];
    const float size = featureSize[get_group_id(0)];
-    const float descriptor_dir = featureDir[get_group_id(0)] * (float)(CV_PI_F / 180.0f);
+    float descriptor_dir = 360.0f - featureDir[get_group_id(0)];
+    if (fabs(descriptor_dir - 360.f) < FLT_EPSILON)
+        descriptor_dir = 0.f;
+    descriptor_dir *= (float)(CV_PI_F / 180.0f);
    /* The sampling intervals and wavelet sized for selecting an orientation
    and building the keypoint descriptor are defined relative to 's' */

--- a/modules/ocl/src/kernels/objdetect_hog.cl
+++ b/modules/ocl/src/kernels/objdetect_hog.cl
@@ -448,3 +448,42 @@ __kernel void compute_gradients_8UC1_kernel(const int height, const int width, c
        grad[ ((gidY * grad_quadstep + x) << 1) + 1 ]   = mag * ang;
    }
 }
+//----------------------------------------------------------------------------
+// Resize: nearest-neighbour scaling kernels, one work-item per destination pixel
+__kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
+                                 int dst_offset, int src_offset, int dst_step, int src_step, 
+                                 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
+{ // Nearest-neighbour resize of 4-channel 8-bit pixels: each work-item copies one source pixel into dst.
+    int dx = get_global_id(0); // destination column handled by this work-item
+    int dy = get_global_id(1); // destination row handled by this work-item
+    int sx = (int)floor(dx*ifx+0.5f); // nearest source column: scale by ifx, then round half up
+    int sy = (int)floor(dy*ify+0.5f); // nearest source row: scale by ify, then round half up
+    sx = min(sx, src_cols-1); // clamp so rounding cannot step past the last source column
+    sy = min(sy, src_rows-1); // clamp so rounding cannot step past the last source row
+    int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx; // offset/step presumably arrive in bytes; >>2 turns them into uchar4 element units (TODO confirm with host code)
+    int spos = (src_offset>>2) + sy * (src_step>>2) + sx; // same conversion for the source index
+    if(dx<dst_cols && dy<dst_rows) // work-items outside the destination extent write nothing
+        dst[dpos] = src[spos];
+}
+__kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
+                                 int dst_offset, int src_offset, int dst_step, int src_step, 
+                                 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
+{ // Nearest-neighbour resize of single-channel 8-bit pixels: each work-item copies one source pixel into dst.
+    int dx = get_global_id(0); // destination column handled by this work-item
+    int dy = get_global_id(1); // destination row handled by this work-item
+    int sx = (int)floor(dx*ifx+0.5f); // nearest source column: scale by ifx, then round half up
+    int sy = (int)floor(dy*ify+0.5f); // nearest source row: scale by ify, then round half up
+    sx = min(sx, src_cols-1); // clamp so rounding cannot step past the last source column
+    sy = min(sy, src_rows-1); // clamp so rounding cannot step past the last source row
+    int dpos = dst_offset + dy * dst_step + dx; // one-byte pixels: offset and step are used unscaled
+    int spos = src_offset + sy * src_step + sx; // source index built the same way
+    if(dx<dst_cols && dy<dst_rows) // work-items outside the destination extent write nothing
+        dst[dpos] = src[spos];
+}
\ No newline at end of file
--- a/modules/ocl/src/match_template.cpp
+++ b/modules/ocl/src/match_template.cpp
@@ -51,8 +51,6 @@ using namespace cv;
 using namespace cv::ocl;
 using namespace std;
-#define EXT_FP64 0
 #if !defined (HAVE_OPENCL)
 void cv::ocl::matchTemplate(const oclMat&, const oclMat&, oclMat&) { throw_nogpu(); }
 #else
@@ -113,7 +111,6 @@ namespace cv { namespace ocl
        return 0;
    }
    //////////////////////////////////////////////////////////////////////
    // SQDIFF
    void matchTemplate_SQDIFF(
@@ -137,11 +134,11 @@ namespace cv { namespace ocl
    {
        matchTemplate_CCORR(image,templ,result,buf);
        buf.image_sums.resize(1);
-		buf.image_sqsums.resize(1);
-		integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
-#if EXT_FP64 && SQRSUM_FIXED
+        integral(image.reshape(1), buf.image_sums[0]);
+#if SQRSUM_FIXED
        unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 #else
        Mat sqr_mat = templ.reshape(1);
@@ -237,16 +234,12 @@ namespace cv { namespace ocl
        buf.image_sqsums.resize(1);
        integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
-#if EXT_FP64 && SQRSUM_FIXED
+#if SQRSUM_FIXED
        unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
-#elif EXT_FP64
+#else
        oclMat templ_c1 = templ.reshape(1);
        multiply(templ_c1, templ_c1, templ_c1);
        unsigned long long templ_sqsum = (unsigned long long)sum(templ_c1)[0];
-#else
-		Mat m_templ_c1 = templ.reshape(1);
-		multiply(m_templ_c1, m_templ_c1, m_templ_c1);
-		unsigned long long templ_sqsum = (unsigned long long)sum(m_templ_c1)[0];
 #endif
        Context *clCxt = image.clCxt;
        string kernelName = "normalizeKernel";
@@ -332,17 +325,10 @@ namespace cv { namespace ocl
        if(image.channels() == 1)
        {
            buf.image_sums.resize(1);
-			// FIXME: temp fix for incorrect integral kernel
+            integral(image, buf.image_sums[0]);
-			oclMat tmp_oclmat;
-			integral(image, buf.image_sums[0], tmp_oclmat);
            float templ_sum = 0;
-#if EXT_FP64
            templ_sum = (float)sum(templ)[0] / templ.size().area();
-#else
-			Mat o_templ = templ;
-			templ_sum = (float)sum(o_templ)[0] / o_templ.size().area(); // temp fix for non-double supported machine
-#endif
            args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
            args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
            args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
@@ -351,29 +337,13 @@ namespace cv { namespace ocl
        else
        {
            Vec4f templ_sum = Vec4f::all(0);
-#if EXT_FP64
            split(image,buf.images);
            templ_sum = sum(templ) / templ.size().area();
-#else 
-			// temp fix for non-double supported machine
-			Mat o_templ = templ, o_image = image;
-			vector<Mat> o_mat_vector;
-			o_mat_vector.resize(image.channels());
-			buf.images.resize(image.channels());
-			split(o_image, o_mat_vector);
-			for(int i = 0; i < o_mat_vector.size(); i ++)
-			{
-				buf.images[i] = oclMat(o_mat_vector[i]);
-			}
-			templ_sum = sum(o_templ) / templ.size().area();
-#endif
            buf.image_sums.resize(buf.images.size());
            for(int i = 0; i < image.channels(); i ++)
            {
-				// FIXME: temp fix for incorrect integral kernel
+                integral(buf.images[i], buf.image_sums[i]);
-				oclMat omat_temp;
-				integral(buf.images[i], buf.image_sums[i], omat_temp);
            }
            switch(image.channels())
            {
@@ -432,10 +402,9 @@ namespace cv { namespace ocl
            integral(image, buf.image_sums[0], buf.image_sqsums[0]);
            float templ_sum = 0;
            float templ_sqsum = 0;
-#if EXT_FP64
            templ_sum   = (float)sum(templ)[0];
 #if SQRSUM_FIXED
-			templ_sqsum = sqrSum(templ);
+            templ_sqsum = sqrSum(templ)[0];
 #else
            oclMat templ_sqr = templ;
            multiply(templ,templ, templ_sqr);
@@ -443,13 +412,7 @@ namespace cv { namespace ocl
 #endif //SQRSUM_FIXED
            templ_sqsum -= scale * templ_sum * templ_sum;
            templ_sum   *= scale;
-#else
-			// temp fix for non-double supported machine
-			Mat o_templ = templ;
-			templ_sum   = (float)sum(o_templ)[0]; 
-			templ_sqsum = sum(o_templ.mul(o_templ))[0] - scale * templ_sum * templ_sum;
-			templ_sum  *= scale;
-#endif
            args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
            args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
            args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
@@ -463,7 +426,7 @@ namespace cv { namespace ocl
        {
            Vec4f templ_sum   = Vec4f::all(0);
            Vec4f templ_sqsum = Vec4f::all(0);
-#if EXT_FP64
            split(image,buf.images);
            templ_sum   = sum(templ);
 #if SQRSUM_FIXED
@@ -475,21 +438,6 @@ namespace cv { namespace ocl
 #endif //SQRSUM_FIXED
            templ_sqsum -= scale * templ_sum * templ_sum;
-#else 
-			// temp fix for non-double supported machine
-			Mat o_templ = templ, o_image = image;
-			vector<Mat> o_mat_vector;
-			o_mat_vector.resize(image.channels());
-			buf.images.resize(image.channels());
-			split(o_image, o_mat_vector);
-			for(int i = 0; i < o_mat_vector.size(); i ++)
-			{
-				buf.images[i] = oclMat(o_mat_vector[i]);
-			}
-			templ_sum    = sum(o_templ);
-			templ_sqsum  = sum(o_templ.mul(o_templ));
-#endif
            float templ_sqsum_sum = 0;
            for(int i = 0; i < image.channels(); i ++)
            {

--- a/modules/ocl/src/pyrdown.cpp
+++ b/modules/ocl/src/pyrdown.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//		Dachuan Zhao, dachuan@multicorewareinc.com
+//		Yao Wang, yao@multicorewareinc.com
+//    
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
 #include "precomp.hpp"
 using namespace cv;
@@ -24,7 +69,6 @@ namespace cv
 template<typename T>
 void pyrdown_run(const oclMat &src, const oclMat &dst)
 {
-    CV_Assert(src.cols / 2 == dst.cols && src.rows / 2 == dst.rows);
    CV_Assert(src.type() == dst.type());
    CV_Assert(src.depth() != CV_8S);
@@ -108,7 +152,7 @@ void cv::ocl::pyrDown(const oclMat& src, oclMat& dst)
    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
-	//dst.step = dst.rows;
+	dst.download_channels = src.download_channels;
    pyrdown_run(src, dst);
 }

--- a/modules/ocl/src/pyrup.cpp
+++ b/modules/ocl/src/pyrup.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //		Zhang Chunpeng chunpeng@multicorewareinc.com
+//		Yao Wang, yao@multicorewareinc.com
 //    
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -63,6 +64,7 @@ namespace cv { namespace ocl
 	void pyrUp(const cv::ocl::oclMat& src,cv::ocl::oclMat& dst)
 	{		
 		dst.create(src.rows * 2, src.cols * 2, src.type());
+		dst.download_channels=src.download_channels;
 		Context *clCxt = src.clCxt;
 		const std::string kernelName = "pyrUp";

--- a/modules/ocl/src/surf.cpp
+++ b/modules/ocl/src/surf.cpp
@@ -149,8 +149,7 @@ namespace
            //loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
            bindImgTex(img);
-            oclMat integral_sqsum;
+            integral(img, surf_.sum); // the two argumented integral version is incorrect
-            integral(img, surf_.sum, integral_sqsum); // the two argumented integral version is incorrect
            bindSumTex(surf_.sum);
            maskSumTex = 0;

--- a/modules/ocl/test/test_columnsum.cpp
+++ b/modules/ocl/test/test_columnsum.cpp
@@ -74,12 +74,10 @@ PARAM_TEST_CASE(ColumnSum, cv::Size, bool )
 TEST_P(ColumnSum, Accuracy)
 {
    cv::Mat src = randomMat(size, CV_32FC1);
-	//cv::Mat src(size,CV_32FC1);
+	cv::ocl::oclMat d_dst;
+	cv::ocl::oclMat d_src(src);	
-	//cv::ocl::oclMat d_dst = ::createMat(size,src.type(),useRoi);
+    cv::ocl::columnSum(d_src,d_dst);
-	cv::ocl::oclMat d_dst = loadMat(src,useRoi);
-    cv::ocl::columnSum(loadMat(src,useRoi),d_dst);
    cv::Mat dst(d_dst);

--- a/modules/ocl/test/test_pyrdown.cpp
+++ b/modules/ocl/test/test_pyrdown.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Dachuan Zhao, dachuan@multicorewareinc.com
+//    Yao Wang yao@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -43,9 +44,6 @@
 //
 //M*/
-//#define PRINT_CPU_TIME 1000
-//#define PRINT_TIME
 #include "precomp.hpp"
 #include <iomanip>
@@ -58,66 +56,15 @@ using namespace cvtest;
 using namespace testing;
 using namespace std;
-PARAM_TEST_CASE(PyrDown, MatType, bool)
+PARAM_TEST_CASE(PyrDown, MatType, int)
 {
 	int type;
-    cv::Scalar val;
+	int channels;
-    //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mask;
-    cv::Mat dst;
-    cv::Mat dst1; //bak, for two outputs
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    cv::Mat dst1_roi; //bak
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-    cv::ocl::oclMat gdst1_whole; //bak
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gdst1;   //bak
-    cv::ocl::oclMat gmask;
    virtual void SetUp()
    {
        type = GET_PARAM(0);
+		channels = GET_PARAM(1);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        mat2 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        dst1  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
        //int devnums = getDevice(oclinfo);
        //CV_Assert(devnums > 0);
@@ -127,169 +74,36 @@ PARAM_TEST_CASE(PyrDown, MatType, bool)
 	void Cleanup()
 	{
-		mat1.release();
-		mat2.release();
-		mask.release();
-		dst.release();
-		dst1.release();
-		mat1_roi.release();
-		mat2_roi.release();
-		mask_roi.release();
-		dst_roi.release();
-		dst1_roi.release();
-		gdst_whole.release();
-		gdst1_whole.release();
-		gmat1.release();
-		gmat2.release();
-		gdst.release();
-		gdst1.release();
-		gmask.release();
-	}
-    void random_roi()
-    {
-        cv::RNG &rng = TS::ptr()->get_rng();
-#ifdef RANDOMROI
-        //randomize ROI
-        roicols = rng.uniform(1, mat1.cols);
-        roirows = rng.uniform(1, mat1.rows);
-        src1x   = rng.uniform(0, mat1.cols - roicols);
-        src1y   = rng.uniform(0, mat1.rows - roirows);
-        dstx    = rng.uniform(0, dst.cols  - roicols);
-        dsty    = rng.uniform(0, dst.rows  - roirows);
-#else
-        roicols = mat1.cols;
-        roirows = mat1.rows;
-        src1x = 0;
-        src1y = 0;
-        dstx = 0;
-        dsty = 0;
-#endif
-        maskx   = rng.uniform(0, mask.cols - roicols);
-        masky   = rng.uniform(0, mask.rows - roirows);
-        src2x   = rng.uniform(0, mat2.cols - roicols);
-        src2y   = rng.uniform(0, mat2.rows - roirows);
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-        dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi; //end
 	}
 };
-#define VARNAME(A) string(#A);
-void PrePrint()
-{
-		//for(int i = 0; i < MHEIGHT; i++)
-		//{
-		//	printf("(%d) ", i);
-		//	for(int k = 0; k < MWIDTH; k++)
-		//	{
-		//		printf("%d ", mat1_roi.data[i * MHEIGHT + k]);
-		//	}
-		//	printf("\n");
-		//}
-}
-void PostPrint()
-{
-		//dst_roi.convertTo(dst_roi,CV_32S);
-		//cpu_dst.convertTo(cpu_dst,CV_32S);
-		//dst_roi -= cpu_dst;
-		//cpu_dst -= dst_roi;
-		//for(int i = 0; i < MHEIGHT / 2; i++)
-		//{
-		//	printf("(%d) ", i);
-		//	for(int k = 0; k < MWIDTH / 2; k++)
-		//	{
-		//		if(gmat1.depth() == 0)
-		//		{
-		//			if(gmat1.channels() == 1)
-		//			{
-		//				printf("%d ", dst_roi.data[i * MHEIGHT / 2 + k]);
-		//			}
-		//			else
-		//			{
-		//				printf("%d ", ((unsigned*)dst_roi.data)[i * MHEIGHT / 2 + k]);
-		//			}
-		//		}
-		//		else if(gmat1.depth() == 5)
-		//		{
-		//			printf("%.6f ", ((float*)dst_roi.data)[i * MHEIGHT / 2 + k]);
-		//		}
-		//	}
-		//	printf("\n");
-		//}
-		//for(int i = 0; i < MHEIGHT / 2; i++)
-		//{
-		//	printf("(%d) ", i);
-		//	for(int k = 0; k < MWIDTH / 2; k++)
-		//	{
-		//		if(gmat1.depth() == 0)
-		//		{
-		//			if(gmat1.channels() == 1)
-		//			{
-		//				printf("%d ", cpu_dst.data[i * MHEIGHT / 2 + k]);
-		//			}
-		//			else
-		//			{
-		//				printf("%d ", ((unsigned*)cpu_dst.data)[i * MHEIGHT / 2 + k]);
-		//			}
-		//		}
-		//		else if(gmat1.depth() == 5)
-		//		{
-		//			printf("%.6f ", ((float*)cpu_dst.data)[i * MHEIGHT / 2 + k]);
-		//		}
-		//	}
-		//	printf("\n");
-		//}
-}
-////////////////////////////////PyrDown/////////////////////////////////////////////////
-//struct PyrDown : ArithmTestBase {};
 TEST_P(PyrDown, Mat)
 {
    for(int j = 0; j < LOOP_TIMES; j++)
    {
-        random_roi();
+        cv::Size size(MWIDTH, MHEIGHT);
+		cv::RNG &rng = TS::ptr()->get_rng();
+		cv::Mat src=randomMat(rng, size, CV_MAKETYPE(type, channels), 0, 100, false);
-		cv::pyrDown(mat1_roi, dst_roi);
+		cv::ocl::oclMat gsrc(src), gdst;
-		cv::ocl::pyrDown(gmat1, gdst);
+		cv::Mat dst_cpu;
+		cv::pyrDown(src, dst_cpu);
+		cv::ocl::pyrDown(gsrc, gdst);
-        cv::Mat cpu_dst;
+        cv::Mat dst;
-        gdst.download(cpu_dst);
+        gdst.download(dst);
-        char s[1024];
+		char s[1024]={0};
-        sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
-		EXPECT_MAT_NEAR(dst_roi, cpu_dst, dst_roi.depth() == CV_32F ? 1e-5f : 1.0f, s);
+		EXPECT_MAT_NEAR(dst, dst_cpu, dst.depth() == CV_32F ? 1e-4f : 1.0f, s);
 		Cleanup();
    }
 }
-//********test****************
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+                            Values(CV_8U, CV_32F), Values(1, 3, 4)));
-                            Values(false))); // Values(false) is the reserved parameter
 #endif // HAVE_OPENCL
--- a/modules/ocl/test/test_pyrup.cpp
+++ b/modules/ocl/test/test_pyrup.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Zhang Chunpeng chunpeng@multicorewareinc.com
+//    Yao Wang yao@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -48,44 +49,49 @@
 #ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
-PARAM_TEST_CASE(PyrUp,cv::Size,int)
+PARAM_TEST_CASE(PyrUp, MatType, int)
 {
-	cv::Size size;
 	int type;
+	int channels;
 	//std::vector<cv::ocl::Info> oclinfo;
 	virtual void SetUp()
 	{
 		//int devnums = cv::ocl::getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
 		//CV_Assert(devnums > 0);
-		size = GET_PARAM(0);
+		type = GET_PARAM(0);
-		type = GET_PARAM(1);
+		channels = GET_PARAM(1);
 	}
 };
 TEST_P(PyrUp,Accuracy)
 {
-	cv::Mat src = randomMat(size,type);
+	for(int j = 0; j < LOOP_TIMES; j++)
+    {
+		Size size(MWIDTH, MHEIGHT);
-	cv::Mat dst_gold;
+		Mat src = randomMat(size,CV_MAKETYPE(type, channels));	
-	cv::pyrUp(src,dst_gold);
+		Mat dst_gold;
+		pyrUp(src,dst_gold);
-	cv::ocl::oclMat dst;
+		ocl::oclMat dst;
-	cv::ocl::oclMat srcMat(src);
+		ocl::oclMat srcMat(src);
-	cv::ocl::pyrUp(srcMat,dst);
+		ocl::pyrUp(srcMat,dst);
+		Mat cpu_dst;
+		dst.download(cpu_dst);
 		char s[100]={0};
-	EXPECT_MAT_NEAR(dst_gold, dst, (src.depth() == CV_32F ? 1e-4f : 1.0),s);	
+		EXPECT_MAT_NEAR(dst_gold, cpu_dst, (src.depth() == CV_32F ? 1e-4f : 1.0),s);	
+	}
 }
-#if 1
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
-    testing::Values(cv::Size(32, 32)),
+                            Values(CV_8U, CV_32F), Values(1, 3, 4)));
-    testing::Values(MatType(CV_8UC1),MatType(CV_16UC1),MatType(CV_32FC1),MatType(CV_8UC4),
-	MatType(CV_16UC4),MatType(CV_32FC4))));
-#endif
 #endif // HAVE_OPENCL
\ No newline at end of file