Merge pull request #288 from Auron-X/TLD_OpenCL_Support

TLD Open CL Support

Merge pull request #288 from Auron-X/TLD_OpenCL_Support
TLD Open CL Support
f9634fd8 · Vadim Pisarevsky · f86a703b · 6b377061 · f9634fd8 · f9634fd8
Commit f9634fd8 authored Jul 08, 2015 by Vadim Pisarevsky
11 changed files
--- a/modules/tracking/samples/tld_test.cpp
+++ b/modules/tracking/samples/tld_test.cpp
@@ -48,8 +48,8 @@
 using namespace std;
 using namespace cv;

-#define NUM_TEST_FRAMES 500
-#define TEST_VIDEO_INDEX 1		//TLD Dataset Video Index from 1-10
+#define NUM_TEST_FRAMES 100
+#define TEST_VIDEO_INDEX 7		//TLD Dataset Video Index from 1-10
 //#define RECORD_VIDEO_FLG

 static Mat image;

--- a/modules/tracking/src/opencl/tldDetector.cl
+++ b/modules/tracking/src/opencl/tldDetector.cl
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+
+
+// Layout constants shared with the host side of the detector.
+#define TLD_MAX_EXAMPLES 500	// slots reserved per sample set (positive or negative)
+#define TLD_PATCH_AREA 225		// bytes per sample / patch, stored contiguously
+
+// Normalized cross-correlation of one model sample with one patch
+// (TLD_PATCH_AREA contiguous bytes each).
+//
+// With N = TLD_PATCH_AREA, s = sum, n = sum of squares and prod = sum of products:
+//   NCC = (N*prod - s1*s2) / sqrt((N*n1 - s1*s1) * (N*n2 - s2*s2))
+// The three terms are formed exactly in 64-bit integers: s1*s2 can reach about 3.3e9,
+// which overflows a 32-bit int, and staying integral avoids double arithmetic, which
+// is only available with the optional cl_khr_fp64 extension.
+//
+// Degenerate inputs:
+//   - flat sample (zero variance): the quotient would be 0/0, so -1 is returned; a
+//     consumer that takes max(0.5 * (ncc + 1)) over a maximum starting at 0 is unaffected;
+//   - flat patch with a non-flat sample: 1 is returned.
+float tldPatchNCC(__global const uchar *sample, __global const uchar *patch)
+{
+	int s1 = 0, s2 = 0, n1 = 0, n2 = 0, prod = 0;
+	for (int i = 0; i < TLD_PATCH_AREA; i++)
+	{
+		int a = sample[i];
+		int b = patch[i];
+		s1 += a;
+		s2 += b;
+		n1 += a * a;
+		n2 += b * b;
+		prod += a * b;
+	}
+
+	long var1 = (long)TLD_PATCH_AREA * n1 - (long)s1 * s1;
+	long var2 = (long)TLD_PATCH_AREA * n2 - (long)s2 * s2;
+	long cov = (long)TLD_PATCH_AREA * prod - (long)s1 * s2;
+
+	if (var1 <= 0)
+		return -1.0f;
+	if (var2 <= 0)
+		return 1.0f;
+	return (float)cov / (sqrt((float)var1) * sqrt((float)var2));
+}
+
+// Correlates a single patch with every stored model sample.
+// Work items [0, 500) handle positive samples, [500, 1000) negative ones; item k writes
+// ncc[k]. Slots at or beyond posNum / negNum are left untouched.
+__kernel void NCC(__global const uchar *patch,
+				 __global const uchar *positiveSamples,
+				 __global const uchar *negativeSamples,
+				 __global float *ncc,
+				 int posNum,
+				 int negNum)
+{
+	int id = get_global_id(0);
+	if (id >= 2 * TLD_MAX_EXAMPLES) return;
+
+	if (id < TLD_MAX_EXAMPLES)
+	{
+		//NCC with positive sample
+		if (id < posNum)
+			ncc[id] = tldPatchNCC(positiveSamples + id * TLD_PATCH_AREA, patch);
+	}
+	else
+	{
+		//NCC with negative sample
+		int negId = id - TLD_MAX_EXAMPLES;
+		if (negId < negNum)
+			ncc[id] = tldPatchNCC(negativeSamples + negId * TLD_PATCH_AREA, patch);
+	}
+}
+
+// Correlates patchNum patches with every stored model sample.
+// The first 500*patchNum work items handle positive samples, the next 500*patchNum
+// negative ones. Within each half, item k covers patch k / 500 and sample k % 500 and
+// writes posNcc[k] or negNcc[k]. Slots at or beyond posNum / negNum are left untouched.
+__kernel void batchNCC(__global const uchar *patches,
+	__global const uchar *positiveSamples,
+	__global const uchar *negativeSamples,
+	__global float *posNcc,
+	__global float *negNcc,
+	int posNum,
+	int negNum,
+	int patchNum)
+{
+	int id = get_global_id(0);
+	int perSet = TLD_MAX_EXAMPLES * patchNum;
+	// Guard against a global size larger than the two halves.
+	if (id >= 2 * perSet) return;
+
+	bool positive = id < perSet;
+	if (!positive)
+		id -= perSet;	//Negative index
+
+	int modelSampleID = id % TLD_MAX_EXAMPLES;
+	int patchID = id / TLD_MAX_EXAMPLES;
+	__global const uchar *patch = patches + patchID * TLD_PATCH_AREA;
+
+	if (positive)
+	{
+		//NCC with positive sample
+		if (modelSampleID < posNum)
+			posNcc[id] = tldPatchNCC(positiveSamples + modelSampleID * TLD_PATCH_AREA, patch);
+	}
+	else
+	{
+		//NCC with negative sample
+		if (modelSampleID < negNum)
+			negNcc[id] = tldPatchNCC(negativeSamples + modelSampleID * TLD_PATCH_AREA, patch);
+	}
+}
--- a/modules/tracking/src/precomp.hpp
+++ b/modules/tracking/src/precomp.hpp
@@ -44,10 +44,11 @@

 #include "opencv2/tracking.hpp"
 #include "opencv2/core/utility.hpp"
+#include "opencv2/core/ocl.hpp"

 namespace cv
 {
-    extern const double ColorNames[][10];
+	extern const double ColorNames[][10];
 }

 #endif
--- a/modules/tracking/src/tldDetector.cpp
+++ b/modules/tracking/src/tldDetector.cpp
@@ -65,19 +65,232 @@ namespace cv
 		// Calculate Relative similarity of the patch (NN-Model)
 		double TLDDetector::Sr(const Mat_<uchar>& patch)
 		{
+			/*
+			Relative similarity of the patch to the NN model: the best positive match divided by
+			the sum of the best positive and the best negative match, where each match is
+			0.5 * (NCC + 1). Returns 0 when both best matches are 0.
+
+			Previous implementation over the example vectors, with timing output, kept for reference:
+			int64 e1, e2;
+			float t;
+			e1 = getTickCount();
 			double splus = 0.0, sminus = 0.0;
 			for (int i = 0; i < (int)(*positiveExamples).size(); i++)
 				splus = std::max(splus, 0.5 * (NCC((*positiveExamples)[i], patch) + 1.0));
 			for (int i = 0; i < (int)(*negativeExamples).size(); i++)
 				sminus = std::max(sminus, 0.5 * (NCC((*negativeExamples)[i], patch) + 1.0));
+			e2 = getTickCount();
+			t = (e2 - e1) / getTickFrequency()*1000.0;
+			printf("Sr: %f\n", t);
 			if (splus + sminus == 0.0)
 				return 0.0;
 			return splus / (sminus + splus);
+			*/
+			//int64 e1, e2;
+			//float t;
+			//e1 = getTickCount();
+			double splus = 0.0, sminus = 0.0;
+			Mat_<uchar> modelSample(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);	// header whose data pointer is redirected to each stored sample below
+			for (int i = 0; i < *posNum; i++)
+			{
+				modelSample.data = &(posExp->data[i * 225]);	// positive samples are packed back to back, 225 bytes apart
+				splus = std::max(splus, 0.5 * (NCC(modelSample, patch) + 1.0));
+			}
+			for (int i = 0; i < *negNum; i++)
+			{
+				modelSample.data = &(negExp->data[i * 225]);	// same packing for the negative samples
+				sminus = std::max(sminus, 0.5 * (NCC(modelSample, patch) + 1.0));
+			}
+			//e2 = getTickCount();
+			//t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("Sr CPU: %f\n", t);
+			if (splus + sminus == 0.0)	// no match at all: avoid 0/0
+				return 0.0;
+			return splus / (sminus + splus);
+		}
+
+		// OpenCL variant of Sr(): relative similarity of the patch to the NN model.
+		// Runs the "NCC" kernel once to correlate the patch with every stored sample, then
+		// reduces on the host and returns S+ / (S+ + S-), or 0 when both maxima are 0.
+		// Falls back to the CPU implementation when the kernel cannot be created or run.
+		double TLDDetector::ocl_Sr(const Mat_<uchar>& patch)
+		{
+			ocl::Kernel k;
+			ocl::ProgramSource src = ocl::tracking::tldDetector_oclsrc;
+			String error;
+			ocl::Program prog(src, NULL, error);
+			k.create("NCC", prog);
+			if (k.empty())
+			{
+				printf("Kernel create failed!!!\n");
+				return Sr(patch);
+			}
+
+			double splus = 0.0, sminus = 0.0;
+			bool computed = false;
+			{
+				// Device views live only inside this scope, so they are released before any CPU fallback.
+				UMat devPatch = patch.getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
+				UMat devPositiveSamples = posExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
+				UMat devNegativeSamples = negExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
+				UMat devNCC(1, 2*MAX_EXAMPLES_IN_MODEL, CV_32FC1, ACCESS_RW, USAGE_ALLOCATE_DEVICE_MEMORY);
+
+				// The kernel declares both counts as int, so pass the values, not the pointers.
+				k.args(
+					ocl::KernelArg::PtrReadOnly(devPatch),
+					ocl::KernelArg::PtrReadOnly(devPositiveSamples),
+					ocl::KernelArg::PtrReadOnly(devNegativeSamples),
+					ocl::KernelArg::PtrWriteOnly(devNCC),
+					*posNum,
+					*negNum);
+
+				// One work item per slot: 500 positive slots followed by 500 negative ones.
+				size_t globSize = 1000;
+				if (!k.run(1, &globSize, NULL, false))
+				{
+					printf("Kernel Run Error!!!");
+				}
+				else
+				{
+					Mat resNCC = devNCC.getMat(ACCESS_READ);
+
+					for (int i = 0; i < *posNum; i++)
+						splus = std::max(splus, 0.5 * (resNCC.at<float>(i) + 1.0));
+
+					// Negative results start at slot 500 of the output buffer.
+					for (int i = 0; i < *negNum; i++)
+						sminus = std::max(sminus, 0.5 * (resNCC.at<float>(i+500) +1.0));
+
+					computed = true;
+				}
+			}
+
+			if (!computed)
+				return Sr(patch);
+
+			if (splus + sminus == 0.0)
+				return 0.0;
+			return splus / (sminus + splus);
+		}
+
+		// Computes relative (Sr) and conservative (Sc) similarity for a batch of patches.
+		//   patches      - numOfPatches rows, one 225-byte standard patch per row
+		//   resultSr/Sc  - caller-provided arrays with at least numOfPatches entries
+		// Runs the "batchNCC" kernel once for the whole batch and reduces on the host:
+		//   Sr uses the best match over all positive samples,
+		//   Sc uses the best match over positive samples whose time stamp is <= the median;
+		// both use the best match over all negative samples.
+		// Falls back to the per-patch CPU implementation when the kernel cannot be created or run.
+		void TLDDetector::ocl_batchSrSc(const Mat_<uchar>& patches, double *resultSr, double *resultSc, int numOfPatches)
+		{
+			// Nothing to classify; also avoids launching a kernel over an empty range.
+			if (numOfPatches <= 0)
+				return;
+
+			const int patchArea = 225;		// bytes per patch, as hard-coded in the kernel
+			const int kernelStride = 500;	// results per patch in each output buffer, as hard-coded in the kernel
+
+			ocl::Kernel k;
+			ocl::ProgramSource src = ocl::tracking::tldDetector_oclsrc;
+			String error;
+			ocl::Program prog(src, NULL, error);
+			k.create("batchNCC", prog);
+
+			bool computed = false;
+			if (k.empty())
+			{
+				printf("Kernel create failed!!!\n");
+			}
+			else
+			{
+				// Device views live only inside this scope, so they are released before any CPU fallback.
+				UMat devPatches = patches.getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
+				UMat devPositiveSamples = posExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
+				UMat devNegativeSamples = negExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
+				UMat devPosNCC(MAX_EXAMPLES_IN_MODEL, numOfPatches, CV_32FC1, ACCESS_RW, USAGE_ALLOCATE_DEVICE_MEMORY);
+				UMat devNegNCC(MAX_EXAMPLES_IN_MODEL, numOfPatches, CV_32FC1, ACCESS_RW, USAGE_ALLOCATE_DEVICE_MEMORY);
+
+				// The kernel declares the counts as int, so pass the values, not the pointers.
+				k.args(
+					ocl::KernelArg::PtrReadOnly(devPatches),
+					ocl::KernelArg::PtrReadOnly(devPositiveSamples),
+					ocl::KernelArg::PtrReadOnly(devNegativeSamples),
+					ocl::KernelArg::PtrWriteOnly(devPosNCC),
+					ocl::KernelArg::PtrWriteOnly(devNegNCC),
+					*posNum,
+					*negNum,
+					numOfPatches);
+
+				// 2 -> Pos&Neg
+				size_t globSize = 2 * (size_t)numOfPatches * MAX_EXAMPLES_IN_MODEL;
+				if (!k.run(1, &globSize, NULL, false))
+				{
+					printf("Kernel Run Error!!!");
+				}
+				else
+				{
+					Mat posNCC = devPosNCC.getMat(ACCESS_READ);
+					Mat negNCC = devNegNCC.getMat(ACCESS_READ);
+
+					// The median time stamp does not depend on the patch: compute it once.
+					const int med = getMedian((*timeStampsPositive));
+
+					for (int id = 0; id < numOfPatches; id++)
+					{
+						const int base = id * kernelStride;
+						double spr = 0.0, smr = 0.0, spc = 0.0;
+						for (int i = 0; i < *posNum; i++)
+						{
+							const double sim = 0.5 * (posNCC.at<float>(base + i) + 1.0);
+							spr = std::max(spr, sim);
+							// Conservative similarity keeps its own running maximum.
+							if ((int)(*timeStampsPositive)[i] <= med)
+								spc = std::max(spc, sim);
+						}
+						for (int i = 0; i < *negNum; i++)
+							smr = std::max(smr, 0.5 * (negNCC.at<float>(base + i) + 1.0));
+						const double smc = smr;	// both measures use every negative sample
+
+						if (spr + smr == 0.0)
+							resultSr[id] = 0.0;
+						else
+							resultSr[id] = spr / (smr + spr);
+
+						if (spc + smc == 0.0)
+							resultSc[id] = 0.0;
+						else
+							resultSc[id] = spc / (smc + spc);
+					}
+					computed = true;
+				}
+			}
+
+			if (!computed)
+			{
+				// CPU fallback: wrap each row in a non-owning patch header.
+				for (int id = 0; id < numOfPatches; id++)
+				{
+					Mat_<uchar> patch(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE, patches.data + id * patchArea);
+					resultSr[id] = Sr(patch);
+					resultSc[id] = Sc(patch);
+				}
+			}
+		}

 		// Calculate Conservative similarity of the patch (NN-Model)
 		double TLDDetector::Sc(const Mat_<uchar>& patch)
 		{
+			/*
+			int64 e1, e2;
+			float t;
+			e1 = getTickCount();
 			double splus = 0.0, sminus = 0.0;
 			int med = getMedian((*timeStampsPositive));
 			for (int i = 0; i < (int)(*positiveExamples).size(); i++)
@@ -87,6 +300,117 @@ namespace cv
 			}
 			for (int i = 0; i < (int)(*negativeExamples).size(); i++)
 				sminus = std::max(sminus, 0.5 * (NCC((*negativeExamples)[i], patch) + 1.0));
+			e2 = getTickCount();
+			t = (e2 - e1) / getTickFrequency()*1000.0;
+			printf("Sc: %f\n", t);
+			if (splus + sminus == 0.0)
+				return 0.0;
+
+			return splus / (sminus + splus);
+			*/
+
+			//int64 e1, e2;
+			//double t;
+			//e1 = getTickCount();
+			double splus = 0.0, sminus = 0.0;
+			Mat_<uchar> modelSample(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
+			int med = getMedian((*timeStampsPositive));
+			for (int i = 0; i < *posNum; i++)
+			{
+				if ((int)(*timeStampsPositive)[i] <= med)
+				{
+					modelSample.data = &(posExp->data[i * 225]);
+					splus = std::max(splus, 0.5 * (NCC(modelSample, patch) + 1.0));
+				}
+			}
+			for (int i = 0; i < *negNum; i++)
+			{
+				modelSample.data = &(negExp->data[i * 225]);
+				sminus = std::max(sminus, 0.5 * (NCC(modelSample, patch) + 1.0));
+			}
+			//e2 = getTickCount();
+			//t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("Sc: %f\n", t);
+			if (splus + sminus == 0.0)
+				return 0.0;
+
+			return splus / (sminus + splus);
+		}
+
+		double TLDDetector::ocl_Sc(const Mat_<uchar>& patch)
+		{
+			//int64 e1, e2, e3, e4;
+			//float t;
+			//e1 = getTickCount();
+			double splus = 0.0, sminus = 0.0;
+
+			//e3 = getTickCount();
+
+			UMat devPatch = patch.getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
+			UMat devPositiveSamples = posExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
+			UMat devNegativeSamples = negExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
+			UMat devNCC(1, 2 * MAX_EXAMPLES_IN_MODEL, CV_32FC1, ACCESS_RW, USAGE_ALLOCATE_DEVICE_MEMORY);
+
+
+			ocl::Kernel k;
+			ocl::ProgramSource src = ocl::tracking::tldDetector_oclsrc;
+			String error;
+			ocl::Program prog(src, NULL, error);
+			k.create("NCC", prog);
+			if (k.empty())
+				printf("Kernel create failed!!!\n");
+			k.args(
+				ocl::KernelArg::PtrReadOnly(devPatch),
+				ocl::KernelArg::PtrReadOnly(devPositiveSamples),
+				ocl::KernelArg::PtrReadOnly(devNegativeSamples),
+				ocl::KernelArg::PtrWriteOnly(devNCC),
+				posNum,
+				negNum);
+
+			//e4 = getTickCount();
+			//t = (e4 - e3) / getTickFrequency()*1000.0;
+			//printf("Mem Cpy GPU: %f\n", t);
+
+			size_t globSize = 1000;
+			//e3 = getTickCount();
+			if (!k.run(1, &globSize, NULL, false))
+				printf("Kernel Run Error!!!");
+			//e4 = getTickCount();
+			//t = (e4 - e3) / getTickFrequency()*1000.0;
+			//printf("Kernel Run GPU: %f\n", t);
+
+			//e3 = getTickCount();
+			Mat resNCC = devNCC.getMat(ACCESS_READ);
+			//e4 = getTickCount();
+			//t = (e4 - e3) / getTickFrequency()*1000.0;
+			//printf("Read Mem GPU: %f\n", t);
+
+			////Compare
+			//Mat_<uchar> modelSample(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
+			//for (int i = 0; i < 200; i+=17)
+			//{
+			//	modelSample.data = &(posExp->data[i * 225]);
+			//	printf("%f\t%f\n\n", resNCC.at<float>(i), NCC(modelSample, patch));
+			//}
+
+			//for (int i = 0; i < 200; i+=23)
+			//{
+			//	modelSample.data = &(negExp->data[i * 225]);
+			//	printf("%f\t%f\n", resNCC.at<float>(500+i), NCC(modelSample, patch));
+			//}
+
+			int med = getMedian((*timeStampsPositive));
+			for (int i = 0; i < *posNum; i++)
+				if ((int)(*timeStampsPositive)[i] <= med)
+					splus = std::max(splus, 0.5 * (resNCC.at<float>(i) +1.0));
+
+			for (int i = 0; i < *negNum; i++)
+				sminus = std::max(sminus, 0.5 * (resNCC.at<float>(i + 500) + 1.0));
+
+			//e2 = getTickCount();
+			//t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("Sc GPU: %f\n\n", t);
+
 			if (splus + sminus == 0.0)
 				return 0.0;
 			return splus / (sminus + splus);
@@ -129,76 +453,243 @@ namespace cv
 		}

 		//Detection - returns most probable new target location (Max Sc)
+
 		bool TLDDetector::detect(const Mat& img, const Mat& imgBlurred, Rect2d& res, std::vector<LabeledPatch>& patches, Size initSize)
 		{
 			patches.clear();
-
-			Mat resized_img, blurred_img;
 			Mat_<uchar> standardPatch(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
-			img.copyTo(resized_img);
-			imgBlurred.copyTo(blurred_img);
+			Mat tmp;
 			int dx = initSize.width / 10, dy = initSize.height / 10;
 			Size2d size = img.size();
 			double scale = 1.0;
-			int total = 0, pass = 0;
 			int npos = 0, nneg = 0;
-			double tmp = 0, maxSc = -5.0;
+			double maxSc = -5.0;
 			Rect2d maxScRect;
+			int scaleID;
+			std::vector <Mat> resized_imgs, blurred_imgs;
+			std::vector <Point> varBuffer, ensBuffer;
+			std::vector <int> varScaleIDs, ensScaleIDs;
+			//int64 e1, e2;
+			//double t;
+
+			//e1 = getTickCount();

 			//Detection part
+			//Generate windows and filter by variance
+			scaleID = 0;
+			resized_imgs.push_back(img);
+			blurred_imgs.push_back(imgBlurred);
 			do
 			{
 				Mat_<double> intImgP, intImgP2;
-				computeIntegralImages(resized_img, intImgP, intImgP2);
-
-				prepareClassifiers((int)blurred_img.step[0]);
-				for (int i = 0, imax = cvFloor((0.0 + resized_img.cols - initSize.width) / dx); i < imax; i++)
+				computeIntegralImages(resized_imgs[scaleID], intImgP, intImgP2);
+				for (int i = 0, imax = cvFloor((0.0 + resized_imgs[scaleID].cols - initSize.width) / dx); i < imax; i++)
 				{
-					for (int j = 0, jmax = cvFloor((0.0 + resized_img.rows - initSize.height) / dy); j < jmax; j++)
+					for (int j = 0, jmax = cvFloor((0.0 + resized_imgs[scaleID].rows - initSize.height) / dy); j < jmax; j++)
 					{
-						LabeledPatch labPatch;
-						total++;
 						if (!patchVariance(intImgP, intImgP2, originalVariancePtr, Point(dx * i, dy * j), initSize))
 							continue;
-						if (ensembleClassifierNum(&blurred_img.at<uchar>(dy * j, dx * i)) <= ENSEMBLE_THRESHOLD)
-							continue;
-						pass++;
+						varBuffer.push_back(Point(dx * i, dy * j));
+						varScaleIDs.push_back(scaleID);
+					}
+				}
+				scaleID++;
+				size.width /= SCALE_STEP;
+				size.height /= SCALE_STEP;
+				scale *= SCALE_STEP;
+				resize(img, tmp, size, 0, 0, DOWNSCALE_MODE);
+				resized_imgs.push_back(tmp);
+				GaussianBlur(resized_imgs[scaleID], tmp, GaussBlurKernelSize, 0.0f);
+				blurred_imgs.push_back(tmp);
+			} while (size.width >= initSize.width && size.height >= initSize.height);
+			//e2 = getTickCount();
+			//t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("Variance: %d\t%f\n", varBuffer.size(), t);
+
+			//Encsemble classification
+			//e1 = getTickCount();
+			for (int i = 0; i < (int)varBuffer.size(); i++)
+			{
+				prepareClassifiers(static_cast<int> (blurred_imgs[varScaleIDs[i]].step[0]));
+				if (ensembleClassifierNum(&blurred_imgs[varScaleIDs[i]].at<uchar>(varBuffer[i].y, varBuffer[i].x)) <= ENSEMBLE_THRESHOLD)
+					continue;
+				ensBuffer.push_back(varBuffer[i]);
+				ensScaleIDs.push_back(varScaleIDs[i]);
+			}
+			//e2 = getTickCount();
+			//t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("Ensemble: %d\t%f\n", ensBuffer.size(), t);

-						labPatch.rect = Rect2d(dx * i * scale, dy * j * scale, initSize.width * scale, initSize.height * scale);
-						resample(resized_img, Rect2d(Point(dx * i, dy * j), initSize), standardPatch);
-						tmp = Sr(standardPatch);
+			//NN classification
+			//e1 = getTickCount();
+			for (int i = 0; i < (int)ensBuffer.size(); i++)
+			{
+				LabeledPatch labPatch;
+				double curScale = pow(SCALE_STEP, ensScaleIDs[i]);
+				labPatch.rect = Rect2d(ensBuffer[i].x*curScale, ensBuffer[i].y*curScale, initSize.width * curScale, initSize.height * curScale);
+				resample(resized_imgs[ensScaleIDs[i]], Rect2d(ensBuffer[i], initSize), standardPatch);

-						////To fix: Check the paper, probably this cause wrong learning
-						//
-						labPatch.isObject = tmp > THETA_NN;
-						labPatch.shouldBeIntegrated = abs(tmp - THETA_NN) < 0.1;
-						patches.push_back(labPatch);
-						//
+				double srValue, scValue;
+				srValue = Sr(standardPatch);

-						if (!labPatch.isObject)
-						{
-							nneg++;
+				////To fix: Check the paper, probably this cause wrong learning
+				//
+				labPatch.isObject = srValue > THETA_NN;
+				labPatch.shouldBeIntegrated = abs(srValue - THETA_NN) < 0.1;
+				patches.push_back(labPatch);
+				//
+
+				if (!labPatch.isObject)
+				{
+					nneg++;
+					continue;
+				}
+				else
+				{
+					npos++;
+				}
+				scValue = Sc(standardPatch);
+				if (scValue > maxSc)
+				{
+					maxSc = scValue;
+					maxScRect = labPatch.rect;
+				}
+			}
+			//e2 = getTickCount();
+			//t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("NN: %d\t%f\n", patches.size(), t);
+
+			if (maxSc < 0)
+				return false;
+			res = maxScRect;
+			return true;
+		}
+
+		bool TLDDetector::ocl_detect(const Mat& img, const Mat& imgBlurred, Rect2d& res, std::vector<LabeledPatch>& patches, Size initSize)
+		{
+			patches.clear();
+			Mat_<uchar> standardPatch(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
+			Mat tmp;
+			int dx = initSize.width / 10, dy = initSize.height / 10;
+			Size2d size = img.size();
+			double scale = 1.0;
+			int npos = 0, nneg = 0;
+			double maxSc = -5.0;
+			Rect2d maxScRect;
+			int scaleID;
+			std::vector <Mat> resized_imgs, blurred_imgs;
+			std::vector <Point> varBuffer, ensBuffer;
+			std::vector <int> varScaleIDs, ensScaleIDs;
+			//int64 e1, e2;
+			//double t;
+
+			//e1 = getTickCount();
+			//Detection part
+			//Generate windows and filter by variance
+			scaleID = 0;
+			resized_imgs.push_back(img);
+			blurred_imgs.push_back(imgBlurred);
+			do
+			{
+				Mat_<double> intImgP, intImgP2;
+				computeIntegralImages(resized_imgs[scaleID], intImgP, intImgP2);
+				for (int i = 0, imax = cvFloor((0.0 + resized_imgs[scaleID].cols - initSize.width) / dx); i < imax; i++)
+				{
+					for (int j = 0, jmax = cvFloor((0.0 + resized_imgs[scaleID].rows - initSize.height) / dy); j < jmax; j++)
+					{
+						if (!patchVariance(intImgP, intImgP2, originalVariancePtr, Point(dx * i, dy * j), initSize))
 							continue;
-						}
-						else
-						{
-							npos++;
-						}
-						tmp = Sc(standardPatch);
-						if (tmp > maxSc)
-						{
-							maxSc = tmp;
-							maxScRect = labPatch.rect;
-						}
+						varBuffer.push_back(Point(dx * i, dy * j));
+						varScaleIDs.push_back(scaleID);
 					}
 				}
-
+				scaleID++;
 				size.width /= SCALE_STEP;
 				size.height /= SCALE_STEP;
 				scale *= SCALE_STEP;
-				resize(img, resized_img, size, 0, 0, DOWNSCALE_MODE);
-				GaussianBlur(resized_img, blurred_img, GaussBlurKernelSize, 0.0f);
+				resize(img, tmp, size, 0, 0, DOWNSCALE_MODE);
+				resized_imgs.push_back(tmp);
+				GaussianBlur(resized_imgs[scaleID], tmp, GaussBlurKernelSize, 0.0f);
+				blurred_imgs.push_back(tmp);
 			} while (size.width >= initSize.width && size.height >= initSize.height);
+			//e2 = getTickCount();
+			//t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("Variance: %d\t%f\n", varBuffer.size(), t);
+
+			//Ensemble classification
+			//e1 = getTickCount();
+			for (int i = 0; i < (int)varBuffer.size(); i++)
+			{
+				prepareClassifiers((int)blurred_imgs[varScaleIDs[i]].step[0]);
+				if (ensembleClassifierNum(&blurred_imgs[varScaleIDs[i]].at<uchar>(varBuffer[i].y, varBuffer[i].x)) <= ENSEMBLE_THRESHOLD)
+					continue;
+				ensBuffer.push_back(varBuffer[i]);
+				ensScaleIDs.push_back(varScaleIDs[i]);
+			}
+			//e2 = getTickCount();
+			//t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("Ensemble: %d\t%f\n", ensBuffer.size(), t);
+
+			//NN classification
+			//e1 = getTickCount();
+			//Prepare batch of patches
+			int numOfPatches = (int)ensBuffer.size();
+			Mat_<uchar> stdPatches(numOfPatches, 225);
+			double *resultSr = new double[numOfPatches];
+			double *resultSc = new double[numOfPatches];
+
+			uchar *patchesData = stdPatches.data;
+			for (int i = 0; i < (int)ensBuffer.size(); i++)
+			{
+				resample(resized_imgs[ensScaleIDs[i]], Rect2d(ensBuffer[i], initSize), standardPatch);
+				uchar *stdPatchData = standardPatch.data;
+				for (int j = 0; j < 225; j++)
+					patchesData[225*i+j] = stdPatchData[j];
+			}
+			//Calculate Sr and Sc batches
+			ocl_batchSrSc(stdPatches, resultSr, resultSc, numOfPatches);
+
+
+			for (int i = 0; i < (int)ensBuffer.size(); i++)
+			{
+				LabeledPatch labPatch;
+				standardPatch.data = &stdPatches.data[225 * i];
+				double curScale = pow(SCALE_STEP, ensScaleIDs[i]);
+				labPatch.rect = Rect2d(ensBuffer[i].x*curScale, ensBuffer[i].y*curScale, initSize.width * curScale, initSize.height * curScale);
+
+				double srValue, scValue;
+
+				srValue = resultSr[i];
+
+				//srValue = Sr(standardPatch);
+				//printf("%f\t%f\t\n", srValue, resultSr[i]);
+
+				////To fix: Check the paper, probably this cause wrong learning
+				//
+				labPatch.isObject = srValue > THETA_NN;
+				labPatch.shouldBeIntegrated = abs(srValue - THETA_NN) < 0.1;
+				patches.push_back(labPatch);
+				//
+
+				if (!labPatch.isObject)
+				{
+					nneg++;
+					continue;
+				}
+				else
+				{
+					npos++;
+				}
+				scValue = resultSc[i];
+				if (scValue > maxSc)
+				{
+					maxSc = scValue;
+					maxScRect = labPatch.rect;
+				}
+			}
+			//e2 = getTickCount();
+			//t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("NN: %d\t%f\n", patches.size(), t);

 			if (maxSc < 0)
 				return false;

--- a/modules/tracking/src/tldDetector.hpp
+++ b/modules/tracking/src/tldDetector.hpp
@@ -43,6 +43,7 @@
 #define OPENCV_TLD_DETECTOR

 #include "precomp.hpp"
+#include "opencl_kernels_tracking.hpp"
 #include "tldEnsembleClassifier.hpp"
 #include "tldUtils.hpp"

@@ -73,9 +74,14 @@ namespace cv
 			inline double ensembleClassifierNum(const uchar* data);
 			inline void prepareClassifiers(int rowstep);
 			double Sr(const Mat_<uchar>& patch);
+			double ocl_Sr(const Mat_<uchar>& patch);
 			double Sc(const Mat_<uchar>& patch);
+			double ocl_Sc(const Mat_<uchar>& patch);
+			void ocl_batchSrSc(const Mat_<uchar>& patches, double *resultSr, double *resultSc, int numOfPatches);

 			std::vector<TLDEnsembleClassifier> classifiers;
+			Mat *posExp, *negExp;
+			int *posNum, *negNum;
 			std::vector<Mat_<uchar> > *positiveExamples, *negativeExamples;
 			std::vector<int> *timeStampsPositive, *timeStampsNegative;
 			double *originalVariancePtr;
@@ -87,6 +93,7 @@ namespace cv
 				bool isObject, shouldBeIntegrated;
 			};
 			bool detect(const Mat& img, const Mat& imgBlurred, Rect2d& res, std::vector<LabeledPatch>& patches, Size initSize);
+			bool ocl_detect(const Mat& img, const Mat& imgBlurred, Rect2d& res, std::vector<LabeledPatch>& patches, Size initSize);
 		protected:



--- a/modules/tracking/src/tldEnsembleClassifier.cpp
+++ b/modules/tracking/src/tldEnsembleClassifier.cpp
@@ -58,11 +58,11 @@ namespace cv
 		// Calculate measure locations from 15x15 grid on minSize patches
 		void TLDEnsembleClassifier::stepPrefSuff(std::vector<Vec4b>& arr, int pos, int len, int gridSize)
 		{
-#if 0
+		#if 0
 			int step = len / (gridSize - 1), pref = (len - step * (gridSize - 1)) / 2;
 			for (int i = 0; i < (int)(sizeof(x1) / sizeof(x1[0])); i++)
 				arr[i] = pref + arr[i] * step;
-#else
+		#else
 			int total = len - gridSize;
 			int quo = total / (gridSize - 1), rem = total % (gridSize - 1);
 			int smallStep = quo, bigStep = quo + 1;

--- a/modules/tracking/src/tldEnsembleClassifier.hpp
+++ b/modules/tracking/src/tldEnsembleClassifier.hpp
@@ -64,6 +64,5 @@ namespace cv
 			std::vector<Point2i> offset;
 			int lastStep_;
 		};
-
 	}
 }
\ No newline at end of file
--- a/modules/tracking/src/tldModel.cpp
+++ b/modules/tracking/src/tldModel.cpp
@@ -56,6 +56,15 @@ namespace cv
 			detector = Ptr<TLDDetector>(new TLDDetector());

 			//Propagate data to Detector
+			posNum = 0;
+			negNum = 0;
+			posExp = Mat(Size(225, 500), CV_8UC1);
+			negExp = Mat(Size(225, 500), CV_8UC1);
+			detector->posNum = &posNum;
+			detector->negNum = &negNum;
+			detector->posExp = &posExp;
+			detector->negExp = &negExp;
+
 			detector->positiveExamples = &positiveExamples;
 			detector->negativeExamples = &negativeExamples;
 			detector->timeStampsPositive = &timeStampsPositive;
@@ -69,14 +78,13 @@ namespace cv
 				scaledImg, blurredImg, GaussBlurKernelSize, SCALE_STEP);
 			GaussianBlur(image, image_blurred, GaussBlurKernelSize, 0.0);
 			TLDDetector::generateScanGrid(image.rows, image.cols, minSize_, scanGrid);
-			getClosestN(scanGrid, Rect2d(boundingBox.x / scale, boundingBox.y / scale, boundingBox.width / scale,
-				boundingBox.height / scale), 10, closest);
-
+			getClosestN(scanGrid, Rect2d(boundingBox.x / scale, boundingBox.y / scale, boundingBox.width / scale, boundingBox.height / scale), 10, closest);
 			Mat_<uchar> blurredPatch(minSize);
 			TLDEnsembleClassifier::makeClassifiers(minSize, MEASURES_PER_CLASSIFIER, GRIDSIZE, detector->classifiers);

 			//Generate initial positive samples and put them to the model
 			positiveExamples.reserve(200);
+
 			for (int i = 0; i < (int)closest.size(); i++)
 			{
 				for (int j = 0; j < 20; j++)
@@ -188,6 +196,11 @@ namespace cv
 		void TrackerTLDModel::integrateAdditional(const std::vector<Mat_<uchar> >& eForModel, const std::vector<Mat_<uchar> >& eForEnsemble, bool isPositive)
 		{
 			int positiveIntoModel = 0, negativeIntoModel = 0, positiveIntoEnsemble = 0, negativeIntoEnsemble = 0;
+			if ((int)eForModel.size() == 0) return;
+
+			//int64 e1, e2;
+			//double t;
+			//e1 = getTickCount();
 			for (int k = 0; k < (int)eForModel.size(); k++)
 			{
 				double sr = detector->Sr(eForModel[k]);
@@ -218,6 +231,79 @@ namespace cv
 						detector->classifiers[i].integrate(eForEnsemble[k], isPositive);
 				}
 			}
+			//e2 = getTickCount();
+			//t = (e2 - e1) / getTickFrequency() * 1000;
+			//printf("Integrate Additional: %fms\n", t);
+			/*
+			if( negativeIntoModel > 0 )
+			dfprintf((stdout, "negativeIntoModel = %d ", negativeIntoModel));
+			if( positiveIntoModel > 0 )
+			dfprintf((stdout, "positiveIntoModel = %d ", positiveIntoModel));
+			if( negativeIntoEnsemble > 0 )
+			dfprintf((stdout, "negativeIntoEnsemble = %d ", negativeIntoEnsemble));
+			if( positiveIntoEnsemble > 0 )
+			dfprintf((stdout, "positiveIntoEnsemble = %d ", positiveIntoEnsemble));
+			dfprintf((stdout, "\n"));*/
+		}
+
+		void TrackerTLDModel::ocl_integrateAdditional(const std::vector<Mat_<uchar> >& eForModel, const std::vector<Mat_<uchar> >& eForEnsemble, bool isPositive)
+		{
+			int positiveIntoModel = 0, negativeIntoModel = 0, positiveIntoEnsemble = 0, negativeIntoEnsemble = 0;
+			if ((int)eForModel.size() == 0) return;
+
+			//int64 e1, e2;
+			//double t;
+			//e1 = getTickCount();
+
+			//Prepare batch of patches
+			int numOfPatches = (int)eForModel.size();
+			Mat_<uchar> stdPatches(numOfPatches, 225);
+			double *resultSr = new double[numOfPatches];
+			double *resultSc = new double[numOfPatches];
+			uchar *patchesData = stdPatches.data;
+			for (int i = 0; i < numOfPatches; i++)
+			{
+				uchar *stdPatchData = eForModel[i].data;
+				for (int j = 0; j < 225; j++)
+					patchesData[225 * i + j] = stdPatchData[j];
+			}
+
+			//Calculate Sr and Sc batches
+			detector->ocl_batchSrSc(stdPatches, resultSr, resultSc, numOfPatches);
+
+			for (int k = 0; k < (int)eForModel.size(); k++)
+			{
+				double sr = resultSr[k];
+				if ((sr > THETA_NN) != isPositive)
+				{
+					if (isPositive)
+					{
+						positiveIntoModel++;
+						pushIntoModel(eForModel[k], true);
+					}
+					else
+					{
+						negativeIntoModel++;
+						pushIntoModel(eForModel[k], false);
+					}
+				}
+				double p = 0;
+				for (int i = 0; i < (int)detector->classifiers.size(); i++)
+					p += detector->classifiers[i].posteriorProbability(eForEnsemble[k].data, (int)eForEnsemble[k].step[0]);
+				p /= detector->classifiers.size();
+				if ((p > ENSEMBLE_THRESHOLD) != isPositive)
+				{
+					if (isPositive)
+						positiveIntoEnsemble++;
+					else
+						negativeIntoEnsemble++;
+					for (int i = 0; i < (int)detector->classifiers.size(); i++)
+						detector->classifiers[i].integrate(eForEnsemble[k], isPositive);
+				}
+			}
+			//e2 = getTickCount();
+			//t = (e2 - e1) / getTickFrequency() * 1000;
+			//printf("Integrate Additional OCL: %fms\n", t);
 			/*
 			if( negativeIntoModel > 0 )
 			dfprintf((stdout, "negativeIntoModel = %d ", negativeIntoModel));
@@ -238,12 +324,30 @@ namespace cv
 			std::vector<int>* proxyT;
 			if (positive)
 			{
+				if (posNum < 500)
+				{
+					uchar *patchPtr = example.data;
+					uchar *modelPtr = posExp.data;
+					for (int i = 0; i < STANDARD_PATCH_SIZE*STANDARD_PATCH_SIZE; i++)
+						modelPtr[posNum*STANDARD_PATCH_SIZE*STANDARD_PATCH_SIZE + i] = patchPtr[i];
+					posNum++;
+				}
+
 				proxyV = &positiveExamples;
 				proxyN = &timeStampPositiveNext;
 				proxyT = &timeStampsPositive;
 			}
 			else
 			{
+				if (negNum < 500)
+				{
+					uchar *patchPtr = example.data;
+					uchar *modelPtr = negExp.data;
+					for (int i = 0; i < STANDARD_PATCH_SIZE*STANDARD_PATCH_SIZE; i++)
+						modelPtr[negNum*STANDARD_PATCH_SIZE*STANDARD_PATCH_SIZE + i] = patchPtr[i];
+					negNum++;
+				}
+
 				proxyV = &negativeExamples;
 				proxyN = &timeStampNegativeNext;
 				proxyT = &timeStampsNegative;
@@ -268,9 +372,5 @@ namespace cv
 			dfprintf((port, "\tpositiveExamples.size() = %d\n", (int)positiveExamples.size()));
 			dfprintf((port, "\tnegativeExamples.size() = %d\n", (int)negativeExamples.size()));
 		}
-
-
-
-
 	}
 }
\ No newline at end of file
--- a/modules/tracking/src/tldModel.hpp
+++ b/modules/tracking/src/tldModel.hpp
@@ -50,9 +50,6 @@ namespace cv
 {
 	namespace tld
 	{
-
-
-
 		class TrackerTLDModel : public TrackerModel
 		{
 		public:
@@ -61,11 +58,14 @@ namespace cv
 			void setBoudingBox(Rect2d boundingBox){ boundingBox_ = boundingBox; }
 			void integrateRelabeled(Mat& img, Mat& imgBlurred, const std::vector<TLDDetector::LabeledPatch>& patches);
 			void integrateAdditional(const std::vector<Mat_<uchar> >& eForModel, const std::vector<Mat_<uchar> >& eForEnsemble, bool isPositive);
+			void ocl_integrateAdditional(const std::vector<Mat_<uchar> >& eForModel, const std::vector<Mat_<uchar> >& eForEnsemble, bool isPositive);
 			Size getMinSize(){ return minSize_; }
 			void printme(FILE* port = stdout);
 			Ptr<TLDDetector> detector;

 			std::vector<Mat_<uchar> > positiveExamples, negativeExamples;
+			Mat posExp, negExp;
+			int posNum, negNum;
 			std::vector<int> timeStampsPositive, timeStampsNegative;
 			int timeStampPositiveNext, timeStampNegativeNext;
 			double originalVariance_;
@@ -80,7 +80,6 @@ namespace cv
 			void modelUpdateImpl(){}
 			Rect2d boundingBox_;
 			RNG rng;
-
 		};

 	}

--- a/modules/tracking/src/tldTracker.cpp
+++ b/modules/tracking/src/tldTracker.cpp
@@ -116,11 +116,20 @@ bool TrackerTLDImpl::updateImpl(const Mat& image, Rect2d& boundingBox)
    std::vector<Rect2d> candidates;
    std::vector<double> candidatesRes;
    bool trackerNeedsReInit = false;
+	bool DETECT_FLG = false;
    for( int i = 0; i < 2; i++ )
    {
        Rect2d tmpCandid = boundingBox;
-        if( ( (i == 0) && !data->failedLastTime && trackerProxy->update(image, tmpCandid) ) ||
-			((i == 1) && (tldModel->detector->detect(imageForDetector, image_blurred, tmpCandid, detectorResults, tldModel->getMinSize()))))
+
+		if (i == 1)
+		{
+			if (ocl::haveOpenCL())
+				DETECT_FLG = tldModel->detector->ocl_detect(imageForDetector, image_blurred, tmpCandid, detectorResults, tldModel->getMinSize());
+			else
+				DETECT_FLG = tldModel->detector->detect(imageForDetector, image_blurred, tmpCandid, detectorResults, tldModel->getMinSize());
+		}
+
+        if( ( (i == 0) && !data->failedLastTime && trackerProxy->update(image, tmpCandid) ) || ( DETECT_FLG))
        {
            candidates.push_back(tmpCandid);
            if( i == 0 )
@@ -202,10 +211,17 @@ bool TrackerTLDImpl::updateImpl(const Mat& image, Rect2d& boundingBox)
        tldModel->integrateRelabeled(imageForDetector, image_blurred, detectorResults);
        //dprintf(("%d relabeled by nExpert\n", negRelabeled));
        pExpert.additionalExamples(examplesForModel, examplesForEnsemble);
-        tldModel->integrateAdditional(examplesForModel, examplesForEnsemble, true);
+		if (ocl::haveOpenCL())
+			tldModel->ocl_integrateAdditional(examplesForModel, examplesForEnsemble, true);
+		else
+			tldModel->integrateAdditional(examplesForModel, examplesForEnsemble, true);
        examplesForModel.clear(); examplesForEnsemble.clear();
        nExpert.additionalExamples(examplesForModel, examplesForEnsemble);
-        tldModel->integrateAdditional(examplesForModel, examplesForEnsemble, false);
+
+		if (ocl::haveOpenCL())
+			tldModel->ocl_integrateAdditional(examplesForModel, examplesForEnsemble, false);
+		else
+			tldModel->integrateAdditional(examplesForModel, examplesForEnsemble, false);
    }
    else
    {

--- a/modules/tracking/src/tldTracker.hpp
+++ b/modules/tracking/src/tldTracker.hpp
@@ -60,7 +60,6 @@ void TrackerTLD::Params::write(cv::FileStorage& /*fs*/) const {}

 namespace tld
 {
-
 class TrackerProxy
 {
 public: