#include <algorithm>
#include <cmath>
#include <cstdio>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include <opencv2/core/utility.hpp>
#include "opencv2/imgproc.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/video.hpp"
#include "opencv2/nonfree.hpp"
#include "opencv2/objdetect.hpp"
#include "opencv2/features2d.hpp"
#define USE_OPENCL
#ifdef USE_OPENCL
#include "opencv2/ocl.hpp"
#include "opencv2/nonfree/ocl.hpp"
#endif

#define TAB "    "

using namespace std;
using namespace cv;

// This program tests most of the functions in the ocl module and generates a data matrix of x-factors in .csv files.
// All images needed by this test are in the samples/gpu folder.
// For the Haar test, haarcascade_frontalface_alt.xml should be in the working directory.

// Fills `mat` (rows x cols, element type `type`) with uniformly distributed
// random values bounded by `low` and `high`; defined further down in this file.
void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high);
// Prepends the harness working directory to `relpath`; defined further down.
string abspath(const string &relpath);
// OpenCV error callback that prints the error message through TestSystem;
// defined further down.
int CV_CDECL cvErrorCallback(int, const char *, const char *, const char *, int, void *);
// Pair of 16-bit x/y coordinates; it is the return type of do_meanShift below.
typedef struct
{
    short x;
    short y;
} COOR;
// Mean-shift helpers. Their definitions are not part of this section of the
// file, so the exact parameter semantics should be checked there.
COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep,
                  cv::Size size, int sp, int sr, int maxIter, float eps, int *tab);
void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi,
                    int sp, int sr, cv::TermCriteria crit);

// Base class for everything the benchmark harness can execute by name.
// The GLOBAL_INIT and TEST macros derive from it and implement run().
class Runnable
{
public:
    // Stores a copy of `runname` as the name reported by name().
    explicit Runnable(const std::string &runname)
        : label_(runname)
    {
    }

    virtual ~Runnable()
    {
    }

    // Returns the name this object was constructed with.
    const std::string &name() const { return label_; }

    // Executes the runnable; every derived class provides the body.
    virtual void run() = 0;

private:
    std::string label_;
};

class TestSystem
{
public:
    static TestSystem &instance()
    {
        static TestSystem me;
        return me;
    }

    void setWorkingDir(const std::string &val)
    {
        working_dir_ = val;
    }
    const std::string &workingDir() const
    {
        return working_dir_;
    }

    void setTestFilter(const std::string &val)
    {
        test_filter_ = val;
    }
    const std::string &testFilter() const
    {
        return test_filter_;
    }

    void setNumIters(int num_iters)
    {
        num_iters_ = num_iters;
    }
    void setGPUWarmupIters(int num_iters)
    {
        gpu_warmup_iters_ = num_iters;
    }
    void setCPUIters(int num_iters)
    {
        cpu_num_iters_ = num_iters;
    }

    void setTopThreshold(double top)
    {
        top_ = top;
    }
    void setBottomThreshold(double bottom)
    {
        bottom_ = bottom;
    }

    void addInit(Runnable *init)
    {
        inits_.push_back(init);
    }
    void addTest(Runnable *test)
    {
        tests_.push_back(test);
    }
    void run();

    // It's public because OpenCV callback uses it
    void printError(const std::string &msg);

    std::stringstream &startNewSubtest()
    {
        finishCurrentSubtest();
        return cur_subtest_description_;
    }

    bool stop() const
    {
        return cur_iter_idx_ >= num_iters_;
    }

    bool cpu_stop() const
    {
        return cur_iter_idx_ >= cpu_num_iters_;
    }

    bool warmupStop()
    {
        return cur_warmup_idx_++ >= gpu_warmup_iters_;
    }

    void warmupComplete()
    {
        cur_warmup_idx_ = 0;
    }

    void cpuOn()
    {
        cpu_started_ = cv::getTickCount();
    }
    void cpuOff()
    {
        int64 delta = cv::getTickCount() - cpu_started_;
        cpu_times_.push_back(delta);
        ++cur_iter_idx_;
    }
    void cpuComplete()
    {
        cpu_elapsed_ += meanTime(cpu_times_);
        cur_subtest_is_empty_ = false;
        cur_iter_idx_ = 0;
    }

    void gpuOn()
    {
        gpu_started_ = cv::getTickCount();
    }
    void gpuOff()
    {
        int64 delta = cv::getTickCount() - gpu_started_;
        gpu_times_.push_back(delta);
        ++cur_iter_idx_;
    }
    void gpuComplete()
    {
        gpu_elapsed_ += meanTime(gpu_times_);
        cur_subtest_is_empty_ = false;
        cur_iter_idx_ = 0;
    }

    void gpufullOn()
    {
        gpu_full_started_ = cv::getTickCount();
    }
    void gpufullOff()
    {
        int64 delta = cv::getTickCount() - gpu_full_started_;
        gpu_full_times_.push_back(delta);
        ++cur_iter_idx_;
    }
    void gpufullComplete()
    {
        gpu_full_elapsed_ += meanTime(gpu_full_times_);
        cur_subtest_is_empty_ = false;
        cur_iter_idx_ = 0;
    }

    bool isListMode() const
    {
        return is_list_mode_;
    }
    void setListMode(bool value)
    {
        is_list_mode_ = value;
    }

    void setRecordName(const std::string &name)
    {
        recordname_ = name;
    }

    void setCurrentTest(const std::string &name)
    {
        itname_ = name;
        itname_changed_ = true;
    }

private:
    TestSystem():
        cur_subtest_is_empty_(true), cpu_elapsed_(0),
        gpu_elapsed_(0), gpu_full_elapsed_(0), speedup_total_(0.0),
        num_subtests_called_(0),
        speedup_faster_count_(0), speedup_slower_count_(0), speedup_equal_count_(0),
        speedup_full_faster_count_(0), speedup_full_slower_count_(0), speedup_full_equal_count_(0), is_list_mode_(false),
        num_iters_(10), cpu_num_iters_(2),
        gpu_warmup_iters_(1), cur_iter_idx_(0), cur_warmup_idx_(0),
        record_(0), recordname_("performance"), itname_changed_(true)
    {
        cpu_times_.reserve(num_iters_);
        gpu_times_.reserve(num_iters_);
        gpu_full_times_.reserve(num_iters_);
    }

    void finishCurrentSubtest();
    void resetCurrentSubtest()
    {
        cpu_elapsed_ = 0;
        gpu_elapsed_ = 0;
        gpu_full_elapsed_ = 0;
        cur_subtest_description_.str("");
        cur_subtest_is_empty_ = true;
        cur_iter_idx_ = 0;
        cpu_times_.clear();
        gpu_times_.clear();
        gpu_full_times_.clear();
    }

    double meanTime(const std::vector<int64> &samples);

    void printHeading();
    void printSummary();
    void printMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup);

    void writeHeading();
    void writeSummary();
    void writeMetrics(double cpu_time, double gpu_time, double gpu_full_time,
                      double speedup, double fullspeedup,
                      double gpu_min, double gpu_max, double std_dev);

    std::string working_dir_;
    std::string test_filter_;

    std::vector<Runnable *> inits_;
    std::vector<Runnable *> tests_;

    std::stringstream cur_subtest_description_;
    bool cur_subtest_is_empty_;

    int64 cpu_started_;
    int64 gpu_started_;
    int64 gpu_full_started_;
    double cpu_elapsed_;
    double gpu_elapsed_;
    double gpu_full_elapsed_;

    double speedup_total_;
    double speedup_full_total_;
    int num_subtests_called_;

    int speedup_faster_count_;
    int speedup_slower_count_;
    int speedup_equal_count_;

    int speedup_full_faster_count_;
    int speedup_full_slower_count_;
    int speedup_full_equal_count_;

    bool is_list_mode_;

    double top_;
    double bottom_;

    int num_iters_;
    int cpu_num_iters_;		//there's no need to set cpu running same times with gpu
    int gpu_warmup_iters_;	//gpu warm up times, default is 1
    int cur_iter_idx_;
    int cur_warmup_idx_;	//current gpu warm up times
    std::vector<int64> cpu_times_;
    std::vector<int64> gpu_times_;
    std::vector<int64> gpu_full_times_;

    FILE *record_;
    std::string recordname_;
    std::string itname_;
    bool itname_changed_;
};


// Declares a Runnable subclass named <name>_init together with one global
// instance of it; the instance's constructor registers it through
// TestSystem::addInit(). The macro ends with the header of run(), so it must be
// followed directly by the function body in braces.
#define GLOBAL_INIT(name) \
    struct name##_init: Runnable { \
        name##_init(): Runnable(#name) { \
            TestSystem::instance().addInit(this); \
        } \
        void run(); \
    } name##_init_instance; \
    void name##_init::run()


// Declares a Runnable subclass named <name>_test together with one global
// instance of it; the instance's constructor registers it through
// TestSystem::addTest(). The macro ends with the header of run(), so it must be
// followed directly by the test body in braces.
#define TEST(name) \
    struct name##_test: Runnable { \
        name##_test(): Runnable(#name) { \
            TestSystem::instance().addTest(this); \
        } \
        void run(); \
    } name##_test_instance; \
    void name##_test::run()

// Finishes the previous subtest and yields the stream that receives the new
// subtest's description (use as: SUBTEST << "text";).
#define SUBTEST TestSystem::instance().startNewSubtest()

// CPU_ON opens a while loop that repeats until cpu_stop() is true and starts
// the CPU timer; CPU_OFF stops the timer, closes the loop and records the mean.
// The two macros must always be used as a pair around the measured statements.
#define CPU_ON \
    while (!TestSystem::instance().cpu_stop()) { \
        TestSystem::instance().cpuOn()
#define CPU_OFF \
        TestSystem::instance().cpuOff(); \
    } TestSystem::instance().cpuComplete()

// Same pairing for the GPU measurement; the loop repeats until stop() is true.
#define GPU_ON \
    while (!TestSystem::instance().stop()) { \
        TestSystem::instance().gpuOn()
#define GPU_OFF \
        TestSystem::instance().gpuOff(); \
    } TestSystem::instance().gpuComplete()

// Same pairing for the GPU-full measurement (the tests place the uploads and
// downloads inside this pair).
#define GPU_FULL_ON \
    while (!TestSystem::instance().stop()) { \
        TestSystem::instance().gpufullOn()
#define GPU_FULL_OFF \
        TestSystem::instance().gpufullOff(); \
    } TestSystem::instance().gpufullComplete()

// Untimed warm-up loop: repeats the enclosed statements until warmupStop() is
// true, then resets the warm-up counter.
#define WARMUP_ON \
    while (!TestSystem::instance().warmupStop()) {
#define WARMUP_OFF \
    } TestSystem::instance().warmupComplete()

void TestSystem::run()
{
    if (is_list_mode_)
    {
        for (vector<Runnable *>::iterator it = tests_.begin(); it != tests_.end(); ++it)
        {
            cout << (*it)->name() << endl;
        }

        return;
    }

    // Run test initializers
    for (vector<Runnable *>::iterator it = inits_.begin(); it != inits_.end(); ++it)
    {
        if ((*it)->name().find(test_filter_, 0) != string::npos)
        {
            (*it)->run();
        }
    }

    printHeading();
    writeHeading();

    // Run tests
    for (vector<Runnable *>::iterator it = tests_.begin(); it != tests_.end(); ++it)
    {
        try
        {
            if ((*it)->name().find(test_filter_, 0) != string::npos)
            {
                cout << endl << (*it)->name() << ":\n";

                setCurrentTest((*it)->name());
                //fprintf(record_,"%s\n",(*it)->name().c_str());

                (*it)->run();
                finishCurrentSubtest();
            }
        }
        catch (const Exception &)
        {
            // Message is printed via callback
            resetCurrentSubtest();
        }
        catch (const runtime_error &e)
        {
            printError(e.what());
            resetCurrentSubtest();
        }
    }

#ifdef USE_OPENCL
    printSummary();
    writeSummary();
#endif
}


// Reports the subtest that has just been measured and resets the per-subtest
// state. Does nothing when no timing section completed since the last reset.
//
// The elapsed tick means are converted to milliseconds, the GPU and GPUTOTAL
// speedups are added to the running totals and classified against the
// top_/bottom_ thresholds, and one line is printed to stdout and written to
// the .csv record together with GPU min / max / standard deviation.
void TestSystem::finishCurrentSubtest()
{
    if (cur_subtest_is_empty_)
        // There is no need to print subtest statistics
    {
        return;
    }

    double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
    double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
    double gpu_full_time = gpu_full_elapsed_ / getTickFrequency() * 1000.0;

    // The denominator is clamped to one tick so that a missing GPU measurement
    // cannot cause a division by zero.
    double speedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
    speedup_total_ += speedup;

    double fullspeedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_full_elapsed_);
    speedup_full_total_ += fullspeedup;

    if (speedup > top_)
    {
        speedup_faster_count_++;
    }
    else if (speedup < bottom_)
    {
        speedup_slower_count_++;
    }
    else
    {
        speedup_equal_count_++;
    }

    if (fullspeedup > top_)
    {
        speedup_full_faster_count_++;
    }
    else if (fullspeedup < bottom_)
    {
        speedup_full_slower_count_++;
    }
    else
    {
        speedup_full_equal_count_++;
    }

    // Compute min, max and standard deviation of the GPU samples. The vector is
    // empty when the subtest never ran a GPU section, and front()/back() on an
    // empty vector is undefined, so all three values stay at zero in that case.
    double gpu_min = 0;
    double gpu_max = 0;
    double deviation = 0;

    if (!gpu_times_.empty())
    {
        std::sort(gpu_times_.begin(), gpu_times_.end());
        gpu_min = gpu_times_.front() / getTickFrequency() * 1000.0;
        gpu_max = gpu_times_.back() / getTickFrequency() * 1000.0;
    }

    if (gpu_times_.size() > 1)
    {
        double sum = 0;

        for (size_t i = 0; i < gpu_times_.size(); i++)
        {
            // Keep the mean as a double; truncating it to whole ticks would
            // bias every difference.
            double diff = static_cast<double>(gpu_times_[i]) - gpu_elapsed_;
            double diff_time = diff * 1000 / getTickFrequency();
            sum += diff_time * diff_time;
        }

        deviation = std::sqrt(sum / gpu_times_.size());
    }

    printMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
    writeMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);

    num_subtests_called_++;
    resetCurrentSubtest();
}


// Returns the arithmetic mean of `samples` (tick counts) as a double.
// An empty vector yields 0 instead of the NaN that 0/0 would produce, so a
// section that recorded no iterations cannot poison the elapsed accumulators.
double TestSystem::meanTime(const vector<int64> &samples)
{
    if (samples.empty())
    {
        return 0.0;
    }

    double sum = accumulate(samples.begin(), samples.end(), 0.);
    return sum / samples.size();
}


// Prints the left-aligned column titles of the console report; the column
// widths match the ones used by printMetrics().
void TestSystem::printHeading()
{
    cout << endl;
    cout.setf(ios_base::left);

    cout << TAB << setw(10);
#ifdef USE_OPENCL
    cout << "CPU, ms";
    cout << setw(10) << "GPU, ms";
    cout << setw(14) << "SPEEDUP";
    cout << setw(14) << "GPUTOTAL, ms";
    cout << setw(14) << "TOTALSPEEDUP";
    cout << "DESCRIPTION\n";
#else
    cout << "CPU, ms\n";
#endif

    cout.unsetf(ios_base::left);
}

void TestSystem::writeHeading()
{
    if (!record_)
    {
#ifdef USE_OPENCL
        recordname_ += "_OCL.csv";
#else
        recordname_ += "_CPU.csv";
#endif
        record_ = fopen(recordname_.c_str(), "w");
    }

#ifdef USE_OPENCL
    fprintf(record_, "NAME,DESCRIPTION,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
#else
    fprintf(record_, "NAME,DESCRIPTION,CPU (ms)\n");
#endif
    fflush(record_);
}

// Prints one section of the console summary: the average speedup followed by
// the exceeded / passed / failed counts and their percentages. `label` is the
// prefix used in every line ("GPU" or "GPUTOTAL"); `subtests` is clamped to at
// least 1 so that an empty run does not divide by zero.
static void printSpeedupSection(const char *label, double speedup_sum,
                                int faster, int equal, int slower, int subtests)
{
    const int total = std::max(1, subtests);

    cout << "\naverage " << label << " speedup: x"
         << setprecision(3) << speedup_sum / total
         << endl;

    cout << "\n" << label << " exceeded: " << faster
         << "\n" << label << " passed: " << equal
         << "\n" << label << " failed: " << slower
         << endl;

    cout << "\n" << label << " exceeded rate: "
         << static_cast<float>(faster) / total * 100 << "%"
         << "\n" << label << " passed rate: "
         << static_cast<float>(equal) / total * 100 << "%"
         << "\n" << label << " failed rate: "
         << static_cast<float>(slower) / total * 100 << "%"
         << endl;
}

// Prints the overall statistics to stdout in fixed notation with three
// decimals: first for the GPU measurement, then for GPUTOTAL.
void TestSystem::printSummary()
{
    cout.setf(ios_base::fixed);

    printSpeedupSection("GPU", speedup_total_,
                        speedup_faster_count_, speedup_equal_count_, speedup_slower_count_,
                        num_subtests_called_);
    printSpeedupSection("GPUTOTAL", speedup_full_total_,
                        speedup_full_faster_count_, speedup_full_equal_count_, speedup_full_slower_count_,
                        num_subtests_called_);

    cout.unsetf(ios_base::fixed);
}


void TestSystem::printMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup)
{
    cout << TAB << setiosflags(ios_base::left);
    stringstream stream;

    stream << cpu_time;
    cout << setw(10) << stream.str();
#ifdef USE_OPENCL
    stream.str("");
    stream << gpu_time;
    cout << setw(10) << stream.str();

    stream.str("");
    stream << "x" << setprecision(3) << speedup;
    cout << setw(14) << stream.str();

    stream.str("");
    stream << gpu_full_time;
    cout << setw(14) << stream.str();

    stream.str("");
    stream << "x" << setprecision(3) << fullspeedup;
    cout << setw(14) << stream.str();
#endif
    cout << cur_subtest_description_.str();
    cout << resetiosflags(ios_base::left) << endl;
}

void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
{
    if (!record_)
    {
        recordname_ += ".csv";
        record_ = fopen(recordname_.c_str(), "w");
    }

#ifdef USE_OPENCL
    fprintf(record_, "%s,%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", itname_changed_ ? itname_.c_str() : "",
            cur_subtest_description_.str().c_str(),
            cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
            gpu_min, gpu_max, std_dev);
#else
    fprintf(record_, "%s,%s,%.3f\n",
            itname_changed_ ? itname_.c_str() : "", cur_subtest_description_.str().c_str(), cpu_time);
#endif

    if (itname_changed_)
    {
        itname_changed_ = false;
    }

    fflush(record_);
}

void TestSystem::writeSummary()
{
    if (!record_)
    {
        recordname_ += ".csv";
        record_ = fopen(recordname_.c_str(), "w");
    }

    fprintf(record_, "\nAverage GPU speedup: %.3f\n"
            "exceeded: %d (%.3f%%)\n"
            "passed: %d (%.3f%%)\n"
            "failed: %d (%.3f%%)\n"
            "\nAverage GPUTOTAL speedup: %.3f\n"
            "exceeded: %d (%.3f%%)\n"
            "passed: %d (%.3f%%)\n"
            "failed: %d (%.3f%%)\n",
            speedup_total_ / std::max(1, num_subtests_called_),
            speedup_faster_count_, (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100,
            speedup_equal_count_, (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100,
            speedup_slower_count_, (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100,
            speedup_full_total_ / std::max(1, num_subtests_called_),
            speedup_full_faster_count_, (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100,
            speedup_full_equal_count_, (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100,
            speedup_full_slower_count_, (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
           );
    fflush(record_);
}

void TestSystem::printError(const std::string &msg)
{
    cout << TAB << "[error: " << msg << "] " << cur_subtest_description_.str() << endl;
}

// (Re)allocates `mat` as rows x cols of `type` and fills it with uniformly
// distributed random values bounded by `low` and `high`. The generator is
// always constructed from the same seed value (0).
void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
{
    RNG generator(0);
    mat.create(rows, cols, type);
    generator.fill(mat, RNG::UNIFORM, low, high);
}


// Returns `relpath` prefixed with the harness working directory. The two
// strings are concatenated as-is; no path separator is inserted.
string abspath(const string &relpath)
{
    string full_path(TestSystem::instance().workingDir());
    full_path += relpath;
    return full_path;
}


// OpenCV error callback: forwards the error message to the harness so that it
// is printed next to the active subtest description. All other arguments are
// ignored and 0 is always returned.
int CV_CDECL cvErrorCallback(int /*status*/, const char * /*func_name*/,
                             const char *err_msg, const char * /*file_name*/,
                             int /*line*/, void * /*userdata*/)
{
    TestSystem &harness = TestSystem::instance();
    harness.printError(err_msg);
    return 0;
}

/////////// matchTemplate ////////////////////////
//void InitMatchTemplate()
//{
//	Mat src; gen(src, 500, 500, CV_32F, 0, 1);
//	Mat templ; gen(templ, 500, 500, CV_32F, 0, 1);
//#ifdef USE_OPENCL
//	ocl::oclMat d_src(src), d_templ(templ), d_dst;
//	ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
//#endif
//}
// Benchmarks matchTemplate on square random images of 1000, 2000 and 4000
// pixels per side with a 5x5 template: CV_TM_CCORR on CV_32FC1 / CV_32FC4
// data, then CV_TM_CCORR_NORMED on CV_8UC1 data. Each subtest times the CPU
// version and, when USE_OPENCL is defined, the ocl version with and without
// the upload/download transfers.
TEST(matchTemplate)
{
    //InitMatchTemplate();

    Mat src, templ, dst;
    int templ_size = 5;


    for (int size = 1000; size <= 4000; size *= 2)
    {
        int all_type[] = {CV_32FC1, CV_32FC4};
        std::string type_name[] = {"CV_32FC1", "CV_32FC4"};

        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
        {
            // The loop body runs exactly once (templ_size == 5): the next value
            // would be 25, which fails the <= 5 condition.
            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
            {
                gen(src, size, size, all_type[j], 0, 1);

                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR";

                gen(templ, templ_size, templ_size, all_type[j], 0, 1);

                // Untimed call before the measured CPU loop.
                matchTemplate(src, templ, dst, CV_TM_CCORR);

                CPU_ON;
                matchTemplate(src, templ, dst, CV_TM_CCORR);
                CPU_OFF;

#ifdef USE_OPENCL
                ocl::oclMat d_src(src), d_templ, d_dst;

                d_templ.upload(templ);

                WARMUP_ON;
                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
                WARMUP_OFF;

                // Kernel only: the inputs are already on the device.
                GPU_ON;
                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
                GPU_OFF;

                // Upload + kernel + download.
                GPU_FULL_ON;
                d_src.upload(src);
                d_templ.upload(templ);
                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
                d_dst.download(dst);
                GPU_FULL_OFF;
#endif
            }
        }

        int all_type_8U[] = {CV_8UC1};
        std::string type_name_8U[] = {"CV_8UC1"};

        for (size_t j = 0; j < sizeof(all_type_8U) / sizeof(int); j++)
        {
            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
            {
                // src is regenerated only after this line; at this point it
                // still holds the matrix from the float loop above, which was
                // created with the same size x size dimensions.
                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name_8U[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR_NORMED";

                gen(src, size, size, all_type_8U[j], 0, 255);

                gen(templ, templ_size, templ_size, all_type_8U[j], 0, 255);

                // Untimed call before the measured CPU loop.
                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);

                CPU_ON;
                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
                CPU_OFF;

#ifdef USE_OPENCL
                ocl::oclMat d_src(src);
                ocl::oclMat d_templ(templ), d_dst;

                WARMUP_ON;
                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
                WARMUP_OFF;

                GPU_ON;
                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
                GPU_OFF;

                GPU_FULL_ON;
                d_src.upload(src);
                d_templ.upload(templ);
                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
                d_dst.download(dst);
                GPU_FULL_OFF;
#endif
            }
        }
    }
}

///////////// PyrLKOpticalFlow ////////////////////////
// Benchmarks sparse pyramidal Lucas-Kanade optical flow on two image pairs
// (the first pair is loaded in color, the second in grayscale) for 1000, 2000
// and 4000 requested feature points. Throws runtime_error when an image cannot
// be loaded.
TEST(PyrLKOpticalFlow)
{
    std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"};
    std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"};

    for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
    {
        Mat frame0 = imread(abspath(images1[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);

        if (frame0.empty())
        {
            std::string errstr = "can't open " + images1[i];
            throw runtime_error(errstr);
        }

        Mat frame1 = imread(abspath(images2[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);

        if (frame1.empty())
        {
            std::string errstr = "can't open " + images2[i];
            throw runtime_error(errstr);
        }

        Mat gray_frame;

        // The color pair gets a grayscale copy of the first frame, which is
        // used only for feature detection below.
        if (i == 0)
        {
            cvtColor(frame0, gray_frame, COLOR_BGR2GRAY);
        }

        for (int points = 1000; points <= 4000; points *= 2)
        {
            if (i == 0)
                SUBTEST << frame0.cols << "x" << frame0.rows << "; color; " << points << " points";
            else
                SUBTEST << frame0.cols << "x" << frame0.rows << "; gray; " << points << " points";
            Mat nextPts_cpu;
            Mat status_cpu;

            vector<Point2f> pts;
            goodFeaturesToTrack(i == 0 ? gray_frame : frame0, pts, points, 0.01, 0.0);

            vector<Point2f> nextPts;
            vector<unsigned char> status;

            vector<float> err;

            // Untimed call before the measured CPU loop.
            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);

            CPU_ON;
            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
            CPU_OFF;

#ifdef USE_OPENCL
            ocl::PyrLKOpticalFlow d_pyrLK;

            ocl::oclMat d_frame0(frame0);
            ocl::oclMat d_frame1(frame1);

            ocl::oclMat d_pts;
            // Wraps the point vector's storage in a 1 x N CV_32FC2 header (no
            // copy). NOTE(review): &pts[0] requires pts to be non-empty.
            Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void *)&pts[0]);
            d_pts.upload(pts_mat);

            ocl::oclMat d_nextPts;
            ocl::oclMat d_status;
            ocl::oclMat d_err;

            WARMUP_ON;
            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
            WARMUP_OFF;

            // Kernel only: the inputs are already on the device.
            GPU_ON;
            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
            GPU_OFF;

            // Upload + kernel + download of the results that are not empty.
            GPU_FULL_ON;
            d_frame0.upload(frame0);
            d_frame1.upload(frame1);
            d_pts.upload(pts_mat);
            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);

            if (!d_nextPts.empty())
            {
                d_nextPts.download(nextPts_cpu);
            }

            if (!d_status.empty())
            {
                d_status.download(status_cpu);
            }

            GPU_FULL_OFF;
#endif
        }

    }
}


///////////// pyrDown //////////////////////
// Benchmarks pyrDown on square random CV_8UC1 / CV_8UC4 images of 1000, 2000
// and 4000 pixels per side, on the CPU and (with USE_OPENCL) through ocl.
TEST(pyrDown)
{
    const int depths[] = {CV_8UC1, CV_8UC4};
    const std::string depth_names[] = {"CV_8UC1", "CV_8UC4"};
    const size_t depth_count = sizeof(depths) / sizeof(depths[0]);

    Mat host_src, host_dst;

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < depth_count; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << depth_names[t];

            gen(host_src, side, side, depths[t], 0, 256);

            // Untimed call before the measured CPU loop.
            pyrDown(host_src, host_dst);

            CPU_ON;
            pyrDown(host_src, host_dst);
            CPU_OFF;

#ifdef USE_OPENCL
            ocl::oclMat dev_src(host_src);
            ocl::oclMat dev_dst;

            WARMUP_ON;
            ocl::pyrDown(dev_src, dev_dst);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::pyrDown(dev_src, dev_dst);
            GPU_OFF;

            // Upload + kernel + download.
            GPU_FULL_ON;
            dev_src.upload(host_src);
            ocl::pyrDown(dev_src, dev_dst);
            dev_dst.download(host_dst);
            GPU_FULL_OFF;
#endif
        }
    }
}

///////////// pyrUp ////////////////////////
// Benchmarks pyrUp on square random CV_8UC1 / CV_8UC4 images of 500, 1000 and
// 2000 pixels per side, on the CPU and (with USE_OPENCL) through ocl.
TEST(pyrUp)
{
    const int depths[] = {CV_8UC1, CV_8UC4};
    const std::string depth_names[] = {"CV_8UC1", "CV_8UC4"};
    const size_t depth_count = sizeof(depths) / sizeof(depths[0]);

    Mat host_src, host_dst;

    for (int side = 500; side <= 2000; side *= 2)
    {
        for (size_t t = 0; t < depth_count; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << depth_names[t];

            gen(host_src, side, side, depths[t], 0, 256);

            // Untimed call before the measured CPU loop.
            pyrUp(host_src, host_dst);

            CPU_ON;
            pyrUp(host_src, host_dst);
            CPU_OFF;

#ifdef USE_OPENCL
            ocl::oclMat dev_src(host_src);
            ocl::oclMat dev_dst;

            WARMUP_ON;
            ocl::pyrUp(dev_src, dev_dst);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::pyrUp(dev_src, dev_dst);
            GPU_OFF;

            // Upload + kernel + download.
            GPU_FULL_ON;
            dev_src.upload(host_src);
            ocl::pyrUp(dev_src, dev_dst);
            dev_dst.download(host_dst);
            GPU_FULL_OFF;
#endif
        }
    }
}

///////////// Canny ////////////////////////
// Benchmarks Canny edge detection (thresholds 50 and 100) on aloeL.jpg loaded
// as grayscale. Throws runtime_error when the image cannot be loaded.
TEST(Canny)
{
    const double low_threshold = 50.0;
    const double high_threshold = 100.0;

    Mat gray = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE);

    if (gray.empty())
    {
        throw runtime_error("can't open aloeL.jpg");
    }

    SUBTEST << gray.cols << 'x' << gray.rows << "; aloeL.jpg; edges; CV_8UC1";

    Mat edge_map(gray.size(), CV_8UC1);

    CPU_ON;
    Canny(gray, edge_map, low_threshold, high_threshold);
    CPU_OFF;

#ifdef USE_OPENCL
    ocl::oclMat dev_gray(gray);
    ocl::oclMat dev_edges;
    ocl::CannyBuf dev_buffers;

    WARMUP_ON;
    ocl::Canny(dev_gray, dev_buffers, dev_edges, low_threshold, high_threshold);
    WARMUP_OFF;

    // Kernel only.
    GPU_ON;
    ocl::Canny(dev_gray, dev_buffers, dev_edges, low_threshold, high_threshold);
    GPU_OFF;

    // Upload + kernel + download.
    GPU_FULL_ON;
    dev_gray.upload(gray);
    ocl::Canny(dev_gray, dev_buffers, dev_edges, low_threshold, high_threshold);
    dev_edges.download(edge_map);
    GPU_FULL_OFF;
#endif
}

///////////// Haar ////////////////////////
#ifdef USE_OPENCL
namespace cv
{
namespace ocl
{

// Functor that extracts the rectangle stored in a CvAvgComp.
struct getRect
{
    Rect operator()(const CvAvgComp &e) const
    {
        return e.rect;
    }
};

// Thin adapter around OclCascadeClassifier that exposes a detectMultiScale()
// shaped like the CPU classifier's, so the Haar test can call both the same way.
class CascadeClassifier_GPU : public OclCascadeClassifier
{
public:
    // Runs oclHaarDetectObjects on `image` and stores the rectangle of every
    // returned component in `faces` (previous contents are replaced).
    // `maxSize` is accepted for signature compatibility but ignored.
    void detectMultiScale(oclMat &image,
                          std::vector<cv::Rect>& faces,
                          double scaleFactor = 1.1,
                          int minNeighbors = 3, int flags = 0,
                          Size minSize = Size(),
                          Size maxSize = Size())
    {
        (void)maxSize;

        MemStorage storage(cvCreateMemStorage(0));
        CvSeq *detected = oclHaarDetectObjects(image, storage, scaleFactor, minNeighbors, flags, minSize);

        vector<CvAvgComp> components;
        Seq<CvAvgComp>(detected).copyTo(components);

        const size_t count = components.size();
        faces.resize(count);

        for (size_t i = 0; i < count; ++i)
        {
            faces[i] = getRect()(components[i]);
        }
    }

};

}
}
#endif
// Benchmarks Haar cascade face detection on basketball1.png (grayscale) with
// haarcascade_frontalface_alt.xml. Throws runtime_error when the image or the
// cascade file cannot be loaded.
TEST(Haar)
{
    // Detection parameters shared by the CPU and the ocl classifier.
    const double scale_factor = 1.1;
    const int min_neighbors = 2;
    const int detect_flags = 0 | CV_HAAR_SCALE_IMAGE;
    const Size min_face(30, 30);

    Mat gray = imread(abspath("basketball1.png"), CV_LOAD_IMAGE_GRAYSCALE);

    if (gray.empty())
    {
        throw runtime_error("can't open basketball1.png");
    }

    const string cascade_file = abspath("haarcascade_frontalface_alt.xml");

    CascadeClassifier cpu_cascade;

    if (!cpu_cascade.load(cascade_file))
    {
        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
    }

    vector<Rect> detections;

    SUBTEST << gray.cols << "x" << gray.rows << "; scale image";
    CPU_ON;
    cpu_cascade.detectMultiScale(gray, detections,
                                 scale_factor, min_neighbors, detect_flags, min_face);
    CPU_OFF;

#ifdef USE_OPENCL
    ocl::CascadeClassifier_GPU ocl_cascade;

    if (!ocl_cascade.load(cascade_file))
    {
        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
    }

    ocl::oclMat dev_gray(gray);

    detections.clear();

    WARMUP_ON;
    ocl_cascade.detectMultiScale(dev_gray, detections,
                                 scale_factor, min_neighbors, detect_flags, min_face);
    WARMUP_OFF;

    detections.clear();

    // Detection only: the image is already on the device.
    GPU_ON;
    ocl_cascade.detectMultiScale(dev_gray, detections,
                                 scale_factor, min_neighbors, detect_flags, min_face);
    GPU_OFF;

    // Upload + detection.
    GPU_FULL_ON;
    dev_gray.upload(gray);
    ocl_cascade.detectMultiScale(dev_gray, detections,
                                 scale_factor, min_neighbors, detect_flags, min_face);
    GPU_FULL_OFF;
#endif
}

///////////// blend ////////////////////////
// CPU reference for linear blending. For every channel value of every pixel:
//   result = (img1 * w1 + img2 * w2) / (w1 + w2 + 1e-5f)
// where w1 / w2 are that pixel's entries in weights1 / weights2, which are read
// as one float per pixel. T is the channel element type of the images; the
// result is (re)allocated with the size and type of img1.
template <typename T>
void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold)
{
    result_gold.create(img1.size(), img1.type());

    const int channels = img1.channels();

    for (int row = 0; row < img1.rows; ++row)
    {
        const float *w1_row = weights1.ptr<float>(row);
        const float *w2_row = weights2.ptr<float>(row);
        const T *a_row = img1.ptr<T>(row);
        const T *b_row = img2.ptr<T>(row);
        T *out_row = result_gold.ptr<T>(row);

        for (int col = 0; col < img1.cols; ++col)
        {
            // One weight pair per pixel, shared by all of its channels.
            const float w1 = w1_row[col];
            const float w2 = w2_row[col];

            for (int ch = 0; ch < channels; ++ch)
            {
                const int idx = col * channels + ch;
                out_row[idx] = static_cast<T>((a_row[idx] * w1 + b_row[idx] * w2) / (w1 + w2 + 1e-5f));
            }
        }
    }
}
// Benchmarks linear blending of two random CV_8UC1 / CV_8UC4 images with
// CV_32FC1 weight maps, for square sizes of 1000, 2000 and 4000 pixels per
// side. The CPU side is the blendLinearGold reference implementation.
TEST(blend)
{
    const int depths[] = {CV_8UC1, CV_8UC4};
    const std::string depth_names[] = {"CV_8UC1", "CV_8UC4"};
    const size_t depth_count = sizeof(depths) / sizeof(depths[0]);

    Mat first, second, first_weights, second_weights, blended;
#ifdef USE_OPENCL
    ocl::oclMat dev_first, dev_second, dev_first_weights, dev_second_weights, dev_blended;
#endif

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < depth_count; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << depth_names[t] << " and CV_32FC1";

            gen(first, side, side, depths[t], 0, 256);
            gen(second, side, side, depths[t], 0, 256);
            gen(first_weights, side, side, CV_32FC1, 0, 1);
            gen(second_weights, side, side, CV_32FC1, 0, 1);

            // Untimed call before the measured CPU loop.
            blendLinearGold<uchar>(first, second, first_weights, second_weights, blended);

            CPU_ON;
            blendLinearGold<uchar>(first, second, first_weights, second_weights, blended);
            CPU_OFF;

#ifdef USE_OPENCL
            dev_first.upload(first);
            dev_second.upload(second);
            dev_first_weights.upload(first_weights);
            dev_second_weights.upload(second_weights);

            WARMUP_ON;
            ocl::blendLinear(dev_first, dev_second, dev_first_weights, dev_second_weights, dev_blended);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::blendLinear(dev_first, dev_second, dev_first_weights, dev_second_weights, dev_blended);
            GPU_OFF;

            // Upload + kernel + download.
            GPU_FULL_ON;
            dev_first.upload(first);
            dev_second.upload(second);
            dev_first_weights.upload(first_weights);
            dev_second_weights.upload(second_weights);
            ocl::blendLinear(dev_first, dev_second, dev_first_weights, dev_second_weights, dev_blended);
            dev_blended.download(blended);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// columnSum////////////////////////
TEST(columnSum)
{
    // Column-wise running sum on CV_32FC1 matrices: a hand-written host
    // reference versus ocl::columnSum.
    Mat src, dst;
#ifdef USE_OPENCL
    ocl::oclMat d_src, d_dst;
#endif

    for (int size = 1000; size <= 4000; size *= 2)
    {
        SUBTEST << size << 'x' << size << "; CV_32FC1";

        gen(src, size, size, CV_32FC1, 0, 256);

        CPU_ON;
        dst.create(src.size(), src.type());

        // The first row is its own running sum; it must be written explicitly
        // because the accumulation loop below starts at row 1.
        for (int j = 0; j < src.cols; ++j)
        {
            dst.at<float>(0, j) = src.at<float>(0, j);
        }

        // Accumulate into dst only. The source must stay untouched because the
        // very same matrix is uploaded to the device afterwards.
        for (int i = 1; i < src.rows; ++i)
        {
            for (int j = 0; j < src.cols; ++j)
            {
                dst.at<float>(i, j) = dst.at<float>(i - 1, j) + src.at<float>(i, j);
            }
        }

        CPU_OFF;

#ifdef USE_OPENCL
        d_src.upload(src);
        WARMUP_ON;
        ocl::columnSum(d_src, d_dst);
        WARMUP_OFF;

        // Kernel only: the input is already on the device.
        GPU_ON;
        ocl::columnSum(d_src, d_dst);
        GPU_OFF;

        // Full round trip: upload, kernel, download.
        GPU_FULL_ON;
        d_src.upload(src);
        ocl::columnSum(d_src, d_dst);
        d_dst.download(dst);
        GPU_FULL_OFF;
#endif
    }
}

///////////// HOG////////////////////////
TEST(HOG)
{
    // Multi-scale HOG people detection on road.png: cv::HOGDescriptor versus
    // cv::ocl::HOGDescriptor, both with their default people detector.
    Mat src = imread(abspath("road.png"), cv::IMREAD_GRAYSCALE);

    if (src.empty())
    {
        throw runtime_error("can't open road.png");
    }


    cv::HOGDescriptor hog;
    hog.setSVMDetector(hog.getDefaultPeopleDetector());
    std::vector<cv::Rect> found_locations;

    // Report the size of the image that was actually loaded rather than a
    // hard-coded one, so the label stays correct if the sample file changes.
    SUBTEST << src.cols << 'x' << src.rows << "; road.png";

    // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
    hog.detectMultiScale(src, found_locations);

    CPU_ON;
    hog.detectMultiScale(src, found_locations);
    CPU_OFF;

#ifdef USE_OPENCL
    cv::ocl::HOGDescriptor ocl_hog;
    ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector());
    ocl::oclMat d_src;
    d_src.upload(src);

    WARMUP_ON;
    ocl_hog.detectMultiScale(d_src, found_locations);
    WARMUP_OFF;

    // Detection only: the image is already on the device.
    GPU_ON;
    ocl_hog.detectMultiScale(d_src, found_locations);
    GPU_OFF;

    // Upload plus detection; the detections land in a host-side vector.
    GPU_FULL_ON;
    d_src.upload(src);
    ocl_hog.detectMultiScale(d_src, found_locations);
    GPU_FULL_OFF;
#endif
}

///////////// SURF ////////////////////////

TEST(SURF)
{
    // SURF keypoint detection and description on aloeL.jpg:
    // cv::SURF versus ocl::SURF_OCL.
    // Use the C++ imread flag, as TEST(HOG) does, instead of the legacy
    // CV_LOAD_IMAGE_GRAYSCALE constant.
    Mat src = imread(abspath("aloeL.jpg"), cv::IMREAD_GRAYSCALE);

    if (src.empty())
    {
        throw runtime_error("can't open aloeL.jpg");
    }

    SUBTEST << src.cols << "x" << src.rows << "; aloeL.jpg";
    SURF surf;
    vector<KeyPoint> keypoints;
    Mat descriptors;

    // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
    surf(src, Mat(), keypoints, descriptors);

    CPU_ON;
    keypoints.clear();
    surf(src, Mat(), keypoints, descriptors);
    CPU_OFF;

#ifdef USE_OPENCL
    // Host-side destinations for the device results; only needed by the
    // OpenCL path, so they live inside the guard.
    Mat keypoints_cpu;
    Mat descriptors_cpu;

    ocl::SURF_OCL d_surf;
    ocl::oclMat d_src(src);
    ocl::oclMat d_keypoints;
    ocl::oclMat d_descriptors;

    WARMUP_ON;
    d_surf(d_src, ocl::oclMat(), d_keypoints, d_descriptors);
    WARMUP_OFF;

    // Detection/description only: the image is already on the device.
    GPU_ON;
    d_surf(d_src, ocl::oclMat(), d_keypoints, d_descriptors);
    GPU_OFF;

    // Full round trip: upload, compute, download whatever was produced.
    GPU_FULL_ON;
    d_src.upload(src);
    d_surf(d_src, ocl::oclMat(), d_keypoints, d_descriptors);

    // Empty device matrices are skipped rather than downloaded.
    if (!d_keypoints.empty())
    {
        d_keypoints.download(keypoints_cpu);
    }

    if (!d_descriptors.empty())
    {
        d_descriptors.download(descriptors_cpu);
    }

    GPU_FULL_OFF;
#endif
}
//////////////////// BruteForceMatch /////////////////
TEST(BruteForceMatcher)
{
    // Brute-force L2 descriptor matching: cv::BFMatcher versus
    // ocl::BruteForceMatcher_OCL_base, in three stages per size
    // (match, knnMatch with k = 2, radiusMatch).
    //
    // In every stage the GPU_ON region calls the *Single variant, whose
    // results stay in oclMat buffers, while the GPU_FULL_ON region re-uploads
    // the inputs and calls the overload that fills the host-side DMatch vectors.
    for (int size = 1000; size <= 4000; size *= 2)
    {
        // Init CPU matcher
        int desc_len = 64;

        BFMatcher matcher(NORM_L2);

        // `size` descriptors of `desc_len` floats each, for query and train.
        Mat query;
        gen(query, size, desc_len, CV_32F, 0, 1);

        Mat train;
        gen(train, size, desc_len, CV_32F, 0, 1);
        // Output
        vector< vector<DMatch> > matches(2);
#ifdef USE_OPENCL
        // Init GPU matcher
        ocl::BruteForceMatcher_OCL_base d_matcher(ocl::BruteForceMatcher_OCL_base::L2Dist);

        ocl::oclMat d_query(query);
        ocl::oclMat d_train(train);

        ocl::oclMat d_trainIdx, d_distance, d_allDist, d_nMatches;
#endif
        SUBTEST << size << "; match";

        matcher.match(query, train, matches[0]);

        CPU_ON;
        matcher.match(query, train, matches[0]);
        CPU_OFF;

#ifdef USE_OPENCL
        WARMUP_ON;
        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
        WARMUP_OFF;

        GPU_ON;
        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
        GPU_OFF;

        GPU_FULL_ON;
        d_query.upload(query);
        d_train.upload(train);
        d_matcher.match(d_query, d_train, matches[0]);
        GPU_FULL_OFF;
#endif

        SUBTEST << size << "; knnMatch";

        matcher.knnMatch(query, train, matches, 2);

        CPU_ON;
        matcher.knnMatch(query, train, matches, 2);
        CPU_OFF;

#ifdef USE_OPENCL
        WARMUP_ON;
        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
        WARMUP_OFF;

        GPU_ON;
        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
        GPU_OFF;

        GPU_FULL_ON;
        d_query.upload(query);
        d_train.upload(train);
        d_matcher.knnMatch(d_query, d_train, matches, 2);
        GPU_FULL_OFF;
#endif
        SUBTEST << size << "; radiusMatch";

        float max_distance = 2.0f;

        matcher.radiusMatch(query, train, matches, max_distance);

        CPU_ON;
        matcher.radiusMatch(query, train, matches, max_distance);
        CPU_OFF;

#ifdef USE_OPENCL
        // Discard the index buffer left over from the knn stage.
        // NOTE(review): presumably radiusMatchSingle wants an empty or
        // correctly pre-sized trainIdx -- confirm against the ocl matcher.
        d_trainIdx.release();

        WARMUP_ON;
        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
        WARMUP_OFF;

        GPU_ON;
        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
        GPU_OFF;

        GPU_FULL_ON;
        d_query.upload(query);
        d_train.upload(train);
        d_matcher.radiusMatch(d_query, d_train, matches, max_distance);
        GPU_FULL_OFF;
#endif
    }
}
///////////// Lut ////////////////////////
TEST(lut)
{
    // Look-up-table transform through a 1x256 CV_8UC1 table:
    // cv::LUT versus ocl::LUT.
    Mat image, table, mapped;
#ifdef USE_OPENCL
    ocl::oclMat d_image, d_table, d_mapped;
#endif
    int pixel_types[] = {CV_8UC1, CV_8UC3};
    std::string pixel_type_names[] = {"CV_8UC1", "CV_8UC3"};
    const size_t type_count = sizeof(pixel_types) / sizeof(pixel_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int pixel_type = pixel_types[t];

            SUBTEST << side << 'x' << side << "; " << pixel_type_names[t];

            gen(image, side, side, pixel_type, 0, 256);
            gen(table, 1, 256, CV_8UC1, 0, 1);
            gen(mapped, side, side, pixel_type, 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            LUT(image, table, mapped);

            CPU_ON;
            LUT(image, table, mapped);
            CPU_OFF;

#ifdef USE_OPENCL
            d_image.upload(image);
            d_table.upload(table);

            WARMUP_ON;
            ocl::LUT(d_image, d_table, d_mapped);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::LUT(d_image, d_table, d_mapped);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_image.upload(image);
            d_table.upload(table);
            ocl::LUT(d_image, d_table, d_mapped);
            d_mapped.download(mapped);
            GPU_FULL_OFF;
#endif
        }

    }
}
///////////// Exp ////////////////////////
TEST(Exp)
{
    // Element-wise exponential on CV_32FC1 matrices: cv::exp versus ocl::exp.
    Mat input, output;
#ifdef USE_OPENCL
    ocl::oclMat d_input, d_output;
#endif

    int side = 1000;

    while (side <= 4000)
    {
        SUBTEST << side << 'x' << side << "; CV_32FC1";

        gen(input, side, side, CV_32FC1, 0, 256);
        gen(output, side, side, CV_32FC1, 0, 256);

        // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
        exp(input, output);

        CPU_ON;
        exp(input, output);
        CPU_OFF;
#ifdef USE_OPENCL
        d_input.upload(input);

        WARMUP_ON;
        ocl::exp(d_input, d_output);
        WARMUP_OFF;

        // Kernel only.
        GPU_ON;
        ocl::exp(d_input, d_output);
        GPU_OFF;

        // Upload, kernel, download.
        GPU_FULL_ON;
        d_input.upload(input);
        ocl::exp(d_input, d_output);
        d_output.download(output);
        GPU_FULL_OFF;
#endif
        side *= 2;
    }
}

///////////// LOG ////////////////////////
TEST(Log)
{
    // Element-wise natural logarithm on CV_32F matrices: cv::log versus ocl::log.
    Mat input, output;
#ifdef USE_OPENCL
    ocl::oclMat d_input, d_output;
#endif

    int side = 1000;

    while (side <= 4000)
    {
        SUBTEST << side << 'x' << side << "; 32F";

        // Input bounds passed to gen() are 1 and 10.
        gen(input, side, side, CV_32F, 1, 10);

        // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
        log(input, output);

        CPU_ON;
        log(input, output);
        CPU_OFF;
#ifdef USE_OPENCL
        d_input.upload(input);

        WARMUP_ON;
        ocl::log(d_input, d_output);
        WARMUP_OFF;

        // Kernel only.
        GPU_ON;
        ocl::log(d_input, d_output);
        GPU_OFF;

        // Upload, kernel, download.
        GPU_FULL_ON;
        d_input.upload(input);
        ocl::log(d_input, d_output);
        d_output.download(output);
        GPU_FULL_OFF;
#endif
        side *= 2;
    }
}

///////////// Add ////////////////////////

TEST(Add)
{
    // Element-wise addition of two matrices: cv::add versus ocl::add.
    Mat lhs, rhs, total;
#ifdef USE_OPENCL
    ocl::oclMat d_lhs, d_rhs, d_total;
#endif
    int mat_types[] = {CV_8UC1, CV_32FC1};
    std::string mat_type_names[] = {"CV_8UC1", "CV_32FC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t];

            gen(lhs, side, side, mat_type, 0, 1);
            gen(rhs, side, side, mat_type, 0, 1);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            add(lhs, rhs, total);

            CPU_ON;
            add(lhs, rhs, total);
            CPU_OFF;
#ifdef USE_OPENCL
            d_lhs.upload(lhs);
            d_rhs.upload(rhs);

            WARMUP_ON;
            ocl::add(d_lhs, d_rhs, d_total);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::add(d_lhs, d_rhs, d_total);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_lhs.upload(lhs);
            d_rhs.upload(rhs);
            ocl::add(d_lhs, d_rhs, d_total);
            d_total.download(total);
            GPU_FULL_OFF;
#endif
        }

    }
}
///////////// Mul ////////////////////////
TEST(Mul)
{
    // Element-wise multiplication: cv::multiply versus ocl::multiply.
    Mat lhs, rhs, product;
#ifdef USE_OPENCL
    ocl::oclMat d_lhs, d_rhs, d_product;
#endif
    int mat_types[] = {CV_8UC1, CV_8UC4};
    std::string mat_type_names[] = {"CV_8UC1", "CV_8UC4"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t] ;

            gen(lhs, side, side, mat_type, 0, 256);
            gen(rhs, side, side, mat_type, 0, 256);
            gen(product, side, side, mat_type, 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            multiply(lhs, rhs, product);

            CPU_ON;
            multiply(lhs, rhs, product);
            CPU_OFF;
#ifdef USE_OPENCL
            d_lhs.upload(lhs);
            d_rhs.upload(rhs);

            WARMUP_ON;
            ocl::multiply(d_lhs, d_rhs, d_product);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::multiply(d_lhs, d_rhs, d_product);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_lhs.upload(lhs);
            d_rhs.upload(rhs);
            ocl::multiply(d_lhs, d_rhs, d_product);
            d_product.download(product);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// Div ////////////////////////
TEST(Div)
{
    // Element-wise division: cv::divide versus ocl::divide.
    Mat numer, denom, quotient;
#ifdef USE_OPENCL
    ocl::oclMat d_numer, d_denom, d_quotient;
#endif
    int mat_types[] = {CV_8UC1, CV_8UC4};
    std::string mat_type_names[] = {"CV_8UC1", "CV_8UC4"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t];

            gen(numer, side, side, mat_type, 0, 256);
            gen(denom, side, side, mat_type, 0, 256);
            gen(quotient, side, side, mat_type, 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            divide(numer, denom, quotient);

            CPU_ON;
            divide(numer, denom, quotient);
            CPU_OFF;
#ifdef USE_OPENCL
            d_numer.upload(numer);
            d_denom.upload(denom);

            WARMUP_ON;
            ocl::divide(d_numer, d_denom, d_quotient);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::divide(d_numer, d_denom, d_quotient);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_numer.upload(numer);
            d_denom.upload(denom);
            ocl::divide(d_numer, d_denom, d_quotient);
            d_quotient.download(quotient);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// Absdiff ////////////////////////
TEST(Absdiff)
{
    // Element-wise absolute difference: cv::absdiff versus ocl::absdiff.
    Mat lhs, rhs, delta;
#ifdef USE_OPENCL
    ocl::oclMat d_lhs, d_rhs, d_delta;
#endif
    int mat_types[] = {CV_8UC1, CV_8UC4};
    std::string mat_type_names[] = {"CV_8UC1", "CV_8UC4"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t] ;

            gen(lhs, side, side, mat_type, 0, 256);
            gen(rhs, side, side, mat_type, 0, 256);
            gen(delta, side, side, mat_type, 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            absdiff(lhs, rhs, delta);

            CPU_ON;
            absdiff(lhs, rhs, delta);
            CPU_OFF;
#ifdef USE_OPENCL
            d_lhs.upload(lhs);
            d_rhs.upload(rhs);

            WARMUP_ON;
            ocl::absdiff(d_lhs, d_rhs, d_delta);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::absdiff(d_lhs, d_rhs, d_delta);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_lhs.upload(lhs);
            d_rhs.upload(rhs);
            ocl::absdiff(d_lhs, d_rhs, d_delta);
            d_delta.download(delta);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// CartToPolar ////////////////////////
TEST(CartToPolar)
{
    // Cartesian-to-polar conversion with two inputs and two outputs:
    // cv::cartToPolar versus ocl::cartToPolar, last argument passed as 1.
    Mat xs, ys, mag_out, ang_out;
#ifdef USE_OPENCL
    ocl::oclMat d_xs, d_ys, d_mag_out, d_ang_out;
#endif
    int mat_types[] = {CV_32FC1};
    std::string mat_type_names[] = {"CV_32FC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t];

            gen(xs, side, side, mat_type, 0, 256);
            gen(ys, side, side, mat_type, 0, 256);
            gen(mag_out, side, side, mat_type, 0, 256);
            gen(ang_out, side, side, mat_type, 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            cartToPolar(xs, ys, mag_out, ang_out, 1);

            CPU_ON;
            cartToPolar(xs, ys, mag_out, ang_out, 1);
            CPU_OFF;
#ifdef USE_OPENCL
            d_xs.upload(xs);
            d_ys.upload(ys);

            WARMUP_ON;
            ocl::cartToPolar(d_xs, d_ys, d_mag_out, d_ang_out, 1);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::cartToPolar(d_xs, d_ys, d_mag_out, d_ang_out, 1);
            GPU_OFF;

            // Upload both inputs, run, download both outputs.
            GPU_FULL_ON;
            d_xs.upload(xs);
            d_ys.upload(ys);
            ocl::cartToPolar(d_xs, d_ys, d_mag_out, d_ang_out, 1);
            d_mag_out.download(mag_out);
            d_ang_out.download(ang_out);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// PolarToCart ////////////////////////
TEST(PolarToCart)
{
    // Polar-to-Cartesian conversion with two inputs and two outputs:
    // cv::polarToCart versus ocl::polarToCart, last argument passed as 1.
    Mat mags, angles, x_out, y_out;
#ifdef USE_OPENCL
    ocl::oclMat d_mags, d_angles, d_x_out, d_y_out;
#endif
    int mat_types[] = {CV_32FC1};
    std::string mat_type_names[] = {"CV_32FC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t] ;

            gen(mags, side, side, mat_type, 0, 256);
            gen(angles, side, side, mat_type, 0, 256);
            gen(x_out, side, side, mat_type, 0, 256);
            gen(y_out, side, side, mat_type, 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            polarToCart(mags, angles, x_out, y_out, 1);

            CPU_ON;
            polarToCart(mags, angles, x_out, y_out, 1);
            CPU_OFF;
#ifdef USE_OPENCL
            d_mags.upload(mags);
            d_angles.upload(angles);

            WARMUP_ON;
            ocl::polarToCart(d_mags, d_angles, d_x_out, d_y_out, 1);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::polarToCart(d_mags, d_angles, d_x_out, d_y_out, 1);
            GPU_OFF;

            // Upload both inputs, run, download both outputs.
            GPU_FULL_ON;
            d_mags.upload(mags);
            d_angles.upload(angles);
            ocl::polarToCart(d_mags, d_angles, d_x_out, d_y_out, 1);
            d_x_out.download(x_out);
            d_y_out.download(y_out);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// Magnitude ////////////////////////
TEST(magnitude)
{
    // Magnitude of two component matrices: cv::magnitude versus ocl::magnitude.
    Mat xs, ys, norms;
#ifdef USE_OPENCL
    ocl::oclMat d_xs, d_ys, d_norms;
#endif
    int mat_types[] = {CV_32FC1};
    std::string mat_type_names[] = {"CV_32FC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t];

            gen(xs, side, side, mat_type, 0, 1);
            gen(ys, side, side, mat_type, 0, 1);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            magnitude(xs, ys, norms);

            CPU_ON;
            magnitude(xs, ys, norms);
            CPU_OFF;
#ifdef USE_OPENCL
            d_xs.upload(xs);
            d_ys.upload(ys);

            WARMUP_ON;
            ocl::magnitude(d_xs, d_ys, d_norms);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::magnitude(d_xs, d_ys, d_norms);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_xs.upload(xs);
            d_ys.upload(ys);
            ocl::magnitude(d_xs, d_ys, d_norms);
            d_norms.download(norms);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// Transpose ////////////////////////
TEST(Transpose)
{
    // Matrix transposition: cv::transpose versus ocl::transpose.
    Mat input, flipped_axes;
#ifdef USE_OPENCL
    ocl::oclMat d_input, d_flipped_axes;
#endif
    int mat_types[] = {CV_8UC1, CV_8UC4};
    std::string mat_type_names[] = {"CV_8UC1", "CV_8UC4"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t];

            gen(input, side, side, mat_type, 0, 256);
            gen(flipped_axes, side, side, mat_type, 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            transpose(input, flipped_axes);

            CPU_ON;
            transpose(input, flipped_axes);
            CPU_OFF;
#ifdef USE_OPENCL
            d_input.upload(input);

            WARMUP_ON;
            ocl::transpose(d_input, d_flipped_axes);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::transpose(d_input, d_flipped_axes);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_input.upload(input);
            ocl::transpose(d_input, d_flipped_axes);
            d_flipped_axes.download(flipped_axes);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// Flip ////////////////////////
TEST(Flip)
{
    // Flip around both axes: cv::flip versus ocl::flip.
    Mat src, dst;
#ifdef USE_OPENCL
    ocl::oclMat d_src, d_dst;
#endif
    int all_type[] = {CV_8UC1, CV_8UC4};
    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};

    // The subtest label advertises FLIP_BOTH. In cv::flip a flip code of 0
    // flips around the x-axis only; a negative code flips around both axes.
    // A negative code is used so the measured operation matches the label.
    const int flip_both = -1;

    for (int size = 1000; size <= 4000; size *= 2)
    {
        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
        {
            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; FLIP_BOTH";

            gen(src, size, size, all_type[j], 0, 256);
            gen(dst, size, size, all_type[j], 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            flip(src, dst, flip_both);

            CPU_ON;
            flip(src, dst, flip_both);
            CPU_OFF;
#ifdef USE_OPENCL
            d_src.upload(src);

            WARMUP_ON;
            ocl::flip(d_src, d_dst, flip_both);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::flip(d_src, d_dst, flip_both);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::flip(d_src, d_dst, flip_both);
            d_dst.download(dst);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// minMax ////////////////////////
TEST(minMax)
{
    // Global minimum/maximum value search: cv::minMaxLoc on the host versus
    // ocl::minMax on the device.
    Mat input;
#ifdef USE_OPENCL
    ocl::oclMat d_input;
#endif
    double lowest, highest;
    Point lowest_at, highest_at;
    int mat_types[] = {CV_8UC1, CV_32FC1};
    std::string mat_type_names[] = {"CV_8UC1", "CV_32FC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << mat_type_names[t];

            gen(input, side, side, mat_types[t], 0, 256);

            // The host call also fills the two location outputs.
            CPU_ON;
            minMaxLoc(input, &lowest, &highest, &lowest_at, &highest_at);
            CPU_OFF;
#ifdef USE_OPENCL
            d_input.upload(input);

            WARMUP_ON;
            ocl::minMax(d_input, &lowest, &highest);
            WARMUP_OFF;

            // Reduction only.
            GPU_ON;
            ocl::minMax(d_input, &lowest, &highest);
            GPU_OFF;

            // Upload plus reduction; the results are plain host doubles.
            GPU_FULL_ON;
            d_input.upload(input);
            ocl::minMax(d_input, &lowest, &highest);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// minMaxLoc ////////////////////////
TEST(minMaxLoc)
{
    // Global minimum/maximum search with locations:
    // cv::minMaxLoc versus ocl::minMaxLoc.
    Mat input;
#ifdef USE_OPENCL
    ocl::oclMat d_input;
#endif
    double lowest, highest;
    Point lowest_at, highest_at;
    int mat_types[] = {CV_8UC1, CV_32FC1};
    std::string mat_type_names[] = {"CV_8UC1", "CV_32FC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << mat_type_names[t] ;

            gen(input, side, side, mat_types[t], 0, 1);

            CPU_ON;
            minMaxLoc(input, &lowest, &highest, &lowest_at, &highest_at);
            CPU_OFF;
#ifdef USE_OPENCL
            d_input.upload(input);

            WARMUP_ON;
            ocl::minMaxLoc(d_input, &lowest, &highest, &lowest_at, &highest_at);
            WARMUP_OFF;

            // Reduction only.
            GPU_ON;
            ocl::minMaxLoc(d_input, &lowest, &highest, &lowest_at, &highest_at);
            GPU_OFF;

            // Upload plus reduction; the results are plain host values.
            GPU_FULL_ON;
            d_input.upload(input);
            ocl::minMaxLoc(d_input, &lowest, &highest, &lowest_at, &highest_at);
            GPU_FULL_OFF;
#endif
        }

    }
}
///////////// Sum ////////////////////////
TEST(Sum)
{
    // Sum of all elements: cv::sum versus ocl::sum.
    Mat input;
    Scalar host_total, device_total;
#ifdef USE_OPENCL
    ocl::oclMat d_input;
#endif
    int mat_types[] = {CV_8UC1, CV_32SC1};
    std::string mat_type_names[] = {"CV_8UC1", "CV_32SC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << mat_type_names[t] ;

            gen(input, side, side, mat_types[t], 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            host_total = sum(input);

            CPU_ON;
            host_total = sum(input);
            CPU_OFF;
#ifdef USE_OPENCL
            d_input.upload(input);

            WARMUP_ON;
            device_total = ocl::sum(d_input);
            WARMUP_OFF;

            // Reduction only.
            GPU_ON;
            device_total = ocl::sum(d_input);
            GPU_OFF;

            // Upload plus reduction; the result is returned as a host Scalar.
            GPU_FULL_ON;
            d_input.upload(input);
            device_total = ocl::sum(d_input);
            GPU_FULL_OFF;
#endif
        }

    }
}
///////////// countNonZero ////////////////////////
TEST(countNonZero)
{
    // Count of non-zero elements: cv::countNonZero versus ocl::countNonZero.
    // The returned counts are discarded.
    Mat input;
#ifdef USE_OPENCL
    ocl::oclMat d_input;
#endif
    int mat_types[] = {CV_8UC1, CV_32FC1};
    std::string mat_type_names[] = {"CV_8UC1", "CV_32FC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << mat_type_names[t] ;

            gen(input, side, side, mat_types[t], 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            countNonZero(input);

            CPU_ON;
            countNonZero(input);
            CPU_OFF;
#ifdef USE_OPENCL
            d_input.upload(input);

            WARMUP_ON;
            ocl::countNonZero(d_input);
            WARMUP_OFF;

            // Reduction only.
            GPU_ON;
            ocl::countNonZero(d_input);
            GPU_OFF;

            // Upload plus reduction.
            GPU_FULL_ON;
            d_input.upload(input);
            ocl::countNonZero(d_input);
            GPU_FULL_OFF;
#endif
        }

    }
}
///////////// Phase ////////////////////////
TEST(Phase)
{
    // Per-element phase of two component matrices:
    // cv::phase versus ocl::phase, last argument passed as 1.
    Mat xs, ys, angles;
#ifdef USE_OPENCL
    ocl::oclMat d_xs, d_ys, d_angles;
#endif
    int mat_types[] = {CV_32FC1};
    std::string mat_type_names[] = {"CV_32FC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t] ;

            gen(xs, side, side, mat_type, 0, 256);
            gen(ys, side, side, mat_type, 0, 256);
            gen(angles, side, side, mat_type, 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            phase(xs, ys, angles, 1);

            CPU_ON;
            phase(xs, ys, angles, 1);
            CPU_OFF;
#ifdef USE_OPENCL
            d_xs.upload(xs);
            d_ys.upload(ys);

            WARMUP_ON;
            ocl::phase(d_xs, d_ys, d_angles, 1);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::phase(d_xs, d_ys, d_angles, 1);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_xs.upload(xs);
            d_ys.upload(ys);
            ocl::phase(d_xs, d_ys, d_angles, 1);
            d_angles.download(angles);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// bitwise_and////////////////////////
TEST(bitwise_and)
{
    // Element-wise bitwise AND: cv::bitwise_and versus ocl::bitwise_and.
    Mat lhs, rhs, bits;
#ifdef USE_OPENCL
    ocl::oclMat d_lhs, d_rhs, d_bits;
#endif
    int mat_types[] = {CV_8UC1, CV_32SC1};
    std::string mat_type_names[] = {"CV_8UC1", "CV_32SC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t] ;

            gen(lhs, side, side, mat_type, 0, 256);
            gen(rhs, side, side, mat_type, 0, 256);
            gen(bits, side, side, mat_type, 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            bitwise_and(lhs, rhs, bits);

            CPU_ON;
            bitwise_and(lhs, rhs, bits);
            CPU_OFF;
#ifdef USE_OPENCL
            d_lhs.upload(lhs);
            d_rhs.upload(rhs);

            WARMUP_ON;
            ocl::bitwise_and(d_lhs, d_rhs, d_bits);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::bitwise_and(d_lhs, d_rhs, d_bits);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_lhs.upload(lhs);
            d_rhs.upload(rhs);
            ocl::bitwise_and(d_lhs, d_rhs, d_bits);
            d_bits.download(bits);
            GPU_FULL_OFF;
#endif
        }

    }
}
///////////// bitwise_or////////////////////////
TEST(bitwise_or)
{
    // Element-wise bitwise OR: cv::bitwise_or versus ocl::bitwise_or.
    Mat lhs, rhs, bits;
#ifdef USE_OPENCL
    ocl::oclMat d_lhs, d_rhs, d_bits;
#endif
    int mat_types[] = {CV_8UC1, CV_32SC1};
    std::string mat_type_names[] = {"CV_8UC1", "CV_32SC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t];

            gen(lhs, side, side, mat_type, 0, 256);
            gen(rhs, side, side, mat_type, 0, 256);
            gen(bits, side, side, mat_type, 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            bitwise_or(lhs, rhs, bits);

            CPU_ON;
            bitwise_or(lhs, rhs, bits);
            CPU_OFF;
#ifdef USE_OPENCL
            d_lhs.upload(lhs);
            d_rhs.upload(rhs);

            WARMUP_ON;
            ocl::bitwise_or(d_lhs, d_rhs, d_bits);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::bitwise_or(d_lhs, d_rhs, d_bits);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_lhs.upload(lhs);
            d_rhs.upload(rhs);
            ocl::bitwise_or(d_lhs, d_rhs, d_bits);
            d_bits.download(bits);
            GPU_FULL_OFF;
#endif
        }

    }
}
///////////// bitwise_xor////////////////////////
TEST(bitwise_xor)
{
    // Element-wise bitwise XOR: cv::bitwise_xor versus ocl::bitwise_xor.
    Mat lhs, rhs, bits;
#ifdef USE_OPENCL
    ocl::oclMat d_lhs, d_rhs, d_bits;
#endif
    int mat_types[] = {CV_8UC1, CV_32SC1};
    std::string mat_type_names[] = {"CV_8UC1", "CV_32SC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t];

            gen(lhs, side, side, mat_type, 0, 256);
            gen(rhs, side, side, mat_type, 0, 256);
            gen(bits, side, side, mat_type, 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            bitwise_xor(lhs, rhs, bits);

            CPU_ON;
            bitwise_xor(lhs, rhs, bits);
            CPU_OFF;
#ifdef USE_OPENCL
            d_lhs.upload(lhs);
            d_rhs.upload(rhs);

            WARMUP_ON;
            ocl::bitwise_xor(d_lhs, d_rhs, d_bits);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::bitwise_xor(d_lhs, d_rhs, d_bits);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_lhs.upload(lhs);
            d_rhs.upload(rhs);
            ocl::bitwise_xor(d_lhs, d_rhs, d_bits);
            d_bits.download(bits);
            GPU_FULL_OFF;
#endif
        }

    }
}
///////////// bitwise_not////////////////////////
TEST(bitwise_not)
{
    // Element-wise bitwise inversion: cv::bitwise_not versus ocl::bitwise_not.
    Mat input, inverted;
#ifdef USE_OPENCL
    ocl::oclMat d_input, d_inverted;
#endif
    int mat_types[] = {CV_8UC1, CV_32SC1};
    std::string mat_type_names[] = {"CV_8UC1", "CV_32SC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t] ;

            gen(input, side, side, mat_type, 0, 256);
            gen(inverted, side, side, mat_type, 0, 256);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            bitwise_not(input, inverted);

            CPU_ON;
            bitwise_not(input, inverted);
            CPU_OFF;
#ifdef USE_OPENCL
            d_input.upload(input);

            WARMUP_ON;
            ocl::bitwise_not(d_input, d_inverted);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::bitwise_not(d_input, d_inverted);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_input.upload(input);
            ocl::bitwise_not(d_input, d_inverted);
            d_inverted.download(inverted);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// compare////////////////////////
TEST(compare)
{
    // Element-wise equality comparison: cv::compare versus ocl::compare.
    // The library constant cv::CMP_EQ is used directly; the previous mutable
    // local of the same name merely shadowed it.
    Mat src1, src2, dst;
#ifdef USE_OPENCL
    ocl::oclMat d_src1, d_src2, d_dst;
#endif
    int all_type[] = {CV_8UC1, CV_32FC1};
    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};

    for (int size = 1000; size <= 4000; size *= 2)
    {
        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
        {
            SUBTEST << size << 'x' << size << "; " << type_name[j] ;

            gen(src1, size, size, all_type[j], 0, 256);
            gen(src2, size, size, all_type[j], 0, 256);
            gen(dst, size, size, all_type[j], 0, 256);


            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            compare(src1, src2, dst, cv::CMP_EQ);

            CPU_ON;
            compare(src1, src2, dst, cv::CMP_EQ);
            CPU_OFF;
#ifdef USE_OPENCL
            d_src1.upload(src1);
            d_src2.upload(src2);

            WARMUP_ON;
            ocl::compare(d_src1, d_src2, d_dst, cv::CMP_EQ);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::compare(d_src1, d_src2, d_dst, cv::CMP_EQ);
            GPU_OFF;

            // Upload, kernel, download.
            GPU_FULL_ON;
            d_src1.upload(src1);
            d_src2.upload(src2);
            ocl::compare(d_src1, d_src2, d_dst, cv::CMP_EQ);
            d_dst.download(dst);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// pow ////////////////////////
TEST(pow)
{
    // Element-wise power with exponent -2.0: cv::pow versus ocl::pow.
    Mat input, powered;
#ifdef USE_OPENCL
    ocl::oclMat d_input, d_powered;
#endif
    int mat_types[] = {CV_32FC1};
    std::string mat_type_names[] = {"CV_32FC1"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            const int mat_type = mat_types[t];

            SUBTEST << side << 'x' << side << "; " << mat_type_names[t] ;

            gen(input, side, side, mat_type, 0, 100);
            gen(powered, side, side, mat_type, 0, 100);

            // One run outside the CPU_ON/CPU_OFF region, then the bracketed run.
            pow(input, -2.0, powered);

            CPU_ON;
            pow(input, -2.0, powered);
            CPU_OFF;
#ifdef USE_OPENCL
            // Both the input and the pre-filled output are uploaded here.
            d_input.upload(input);
            d_powered.upload(powered);

            WARMUP_ON;
            ocl::pow(d_input, -2.0, d_powered);
            WARMUP_OFF;

            // Kernel only.
            GPU_ON;
            ocl::pow(d_input, -2.0, d_powered);
            GPU_OFF;

            // Upload of the input, kernel, download of the result.
            GPU_FULL_ON;
            d_input.upload(input);
            ocl::pow(d_input, -2.0, d_powered);
            d_powered.download(powered);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// MagnitudeSqr////////////////////////
TEST(MagnitudeSqr)
{
    // CPU side is a hand-written per-element a*a + b*b loop; the OpenCL side
    // calls ocl::magnitudeSqr on the same two inputs.
    Mat src1, src2, dst;
#ifdef USE_OPENCL
    ocl::oclMat d_src1, d_src2, d_dst;
#endif
    const int all_type[] = {CV_32FC1};
    const std::string type_name[] = {"CV_32FC1"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            const int matType = all_type[t];

            SUBTEST << side << 'x' << side << "; " << type_name[t];

            gen(src1, side, side, matType, 0, 256);
            gen(src2, side, side, matType, 0, 256);
            gen(dst, side, side, matType, 0, 256);

            // One full pass outside the CPU_ON/CPU_OFF section.
            for (int row = 0; row < src1.rows; ++row)
            {
                for (int col = 0; col < src1.cols; ++col)
                {
                    const float a = src1.at<float>(row, col);
                    const float b = src2.at<float>(row, col);

                    // dst.step / 4: row stride counted in 4-byte floats.
                    ((float *)(dst.data))[row * dst.step / 4 + col] = a * a + b * b;
                }
            }

            CPU_ON;

            for (int row = 0; row < src1.rows; ++row)
            {
                for (int col = 0; col < src1.cols; ++col)
                {
                    const float a = src1.at<float>(row, col);
                    const float b = src2.at<float>(row, col);

                    ((float *)(dst.data))[row * dst.step / 4 + col] = a * a + b * b;
                }
            }

            CPU_OFF;
#ifdef USE_OPENCL
            d_src1.upload(src1);
            d_src2.upload(src2);

            WARMUP_ON;
            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
            WARMUP_OFF;

            GPU_ON;
            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
            GPU_OFF;

            // GPU_FULL section also covers both uploads and the download.
            GPU_FULL_ON;
            d_src1.upload(src1);
            d_src2.upload(src2);
            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
            d_dst.download(dst);
            GPU_FULL_OFF;
#endif
        }
    }
}

///////////// AddWeighted////////////////////////
TEST(AddWeighted)
{
    // Runs cv::addWeighted and ocl::addWeighted with fixed weights
    // (alpha = 2, beta = 1, gama = 3) on 8-bit and float single-channel data.
    Mat hostA, hostB, hostOut;
#ifdef USE_OPENCL
    ocl::oclMat devA, devB, devOut;
#endif
    const double alpha = 2.0;
    const double beta = 1.0;
    const double gama = 3.0;
    const int all_type[] = {CV_8UC1, CV_32FC1};
    const std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            const int matType = all_type[t];

            SUBTEST << side << 'x' << side << "; " << type_name[t];

            gen(hostA, side, side, matType, 0, 256);
            gen(hostB, side, side, matType, 0, 256);
            gen(hostOut, side, side, matType, 0, 256);

            // One call outside the CPU_ON/CPU_OFF section.
            addWeighted(hostA, alpha, hostB, beta, gama, hostOut);

            CPU_ON;
            addWeighted(hostA, alpha, hostB, beta, gama, hostOut);
            CPU_OFF;
#ifdef USE_OPENCL
            devA.upload(hostA);
            devB.upload(hostB);

            WARMUP_ON;
            ocl::addWeighted(devA, alpha, devB, beta, gama, devOut);
            WARMUP_OFF;

            GPU_ON;
            ocl::addWeighted(devA, alpha, devB, beta, gama, devOut);
            GPU_OFF;

            // GPU_FULL section also covers both uploads and the download.
            GPU_FULL_ON;
            devA.upload(hostA);
            devB.upload(hostB);
            ocl::addWeighted(devA, alpha, devB, beta, gama, devOut);
            devOut.download(hostOut);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// Blur////////////////////////
TEST(Blur)
{
    // Runs cv::blur and ocl::blur with a 3x3 kernel, default anchor and
    // BORDER_CONSTANT on 1- and 4-channel 8-bit images.
    Mat hostSrc, hostDst;
#ifdef USE_OPENCL
    ocl::oclMat devSrc, devDst;
#endif
    const Size ksize(3, 3);
    const Point anchor(-1, -1);
    const int bordertype = BORDER_CONSTANT;
    const int all_type[] = {CV_8UC1, CV_8UC4};
    const std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            const int matType = all_type[t];

            SUBTEST << side << 'x' << side << "; " << type_name[t];

            gen(hostSrc, side, side, matType, 0, 256);
            gen(hostDst, side, side, matType, 0, 256);

            // One call outside the CPU_ON/CPU_OFF section.
            blur(hostSrc, hostDst, ksize, anchor, bordertype);

            CPU_ON;
            blur(hostSrc, hostDst, ksize, anchor, bordertype);
            CPU_OFF;
#ifdef USE_OPENCL
            devSrc.upload(hostSrc);

            WARMUP_ON;
            ocl::blur(devSrc, devDst, ksize, anchor, bordertype);
            WARMUP_OFF;

            GPU_ON;
            ocl::blur(devSrc, devDst, ksize, anchor, bordertype);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            devSrc.upload(hostSrc);
            ocl::blur(devSrc, devDst, ksize, anchor, bordertype);
            devDst.download(hostDst);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// Laplacian////////////////////////
TEST(Laplacian)
{
    // Runs cv::Laplacian and ocl::Laplacian (ddepth -1, ksize 3, scale 1)
    // on 1- and 4-channel 8-bit images.
    Mat hostSrc, hostDst;
#ifdef USE_OPENCL
    ocl::oclMat devSrc, devDst;
#endif
    const int ddepth = -1;
    const int ksize = 3;
    const double scale = 1;
    const int all_type[] = {CV_8UC1, CV_8UC4};
    const std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            const int matType = all_type[t];

            SUBTEST << side << 'x' << side << "; " << type_name[t];

            gen(hostSrc, side, side, matType, 0, 256);
            gen(hostDst, side, side, matType, 0, 256);

            // One call outside the CPU_ON/CPU_OFF section.
            Laplacian(hostSrc, hostDst, ddepth, ksize, scale);

            CPU_ON;
            Laplacian(hostSrc, hostDst, ddepth, ksize, scale);
            CPU_OFF;
#ifdef USE_OPENCL
            devSrc.upload(hostSrc);

            WARMUP_ON;
            ocl::Laplacian(devSrc, devDst, ddepth, ksize, scale);
            WARMUP_OFF;

            GPU_ON;
            ocl::Laplacian(devSrc, devDst, ddepth, ksize, scale);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            devSrc.upload(hostSrc);
            ocl::Laplacian(devSrc, devDst, ddepth, ksize, scale);
            devDst.download(hostDst);
            GPU_FULL_OFF;
#endif
        }
    }
}

///////////// Erode ////////////////////
TEST(Erode)
{
    // Runs cv::erode and ocl::erode with a 3x3 rectangular structuring
    // element on 8-bit and float images with 1 and 4 channels.
    Mat hostSrc, hostDst, kernel;
#ifdef USE_OPENCL
    ocl::oclMat devSrc, devDst;
#endif
    const int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
    const std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << type_name[t];

            gen(hostSrc, side, side, all_type[t], Scalar::all(0), Scalar::all(256));
            kernel = getStructuringElement(MORPH_RECT, Size(3, 3));

            // One call outside the CPU_ON/CPU_OFF section.
            erode(hostSrc, hostDst, kernel);

            CPU_ON;
            erode(hostSrc, hostDst, kernel);
            CPU_OFF;
#ifdef USE_OPENCL
            devSrc.upload(hostSrc);

            WARMUP_ON;
            ocl::erode(devSrc, devDst, kernel);
            WARMUP_OFF;

            GPU_ON;
            ocl::erode(devSrc, devDst, kernel);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            devSrc.upload(hostSrc);
            ocl::erode(devSrc, devDst, kernel);
            devDst.download(hostDst);
            GPU_FULL_OFF;
#endif
        }
    }
}

///////////// Sobel ////////////////////////
TEST(Sobel)
{
    // Runs cv::Sobel and ocl::Sobel with dx = dy = 1 and ddepth -1 on
    // 1- and 4-channel 8-bit images.
    Mat hostSrc, hostDst;
#ifdef USE_OPENCL
    ocl::oclMat devSrc, devDst;
#endif
    const int ddepth = -1;
    const int dx = 1;
    const int dy = 1;
    const int all_type[] = {CV_8UC1, CV_8UC4};
    const std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << type_name[t];

            gen(hostSrc, side, side, all_type[t], 0, 256);

            // One call outside the CPU_ON/CPU_OFF section.
            Sobel(hostSrc, hostDst, ddepth, dx, dy);

            CPU_ON;
            Sobel(hostSrc, hostDst, ddepth, dx, dy);
            CPU_OFF;
#ifdef USE_OPENCL
            devSrc.upload(hostSrc);

            WARMUP_ON;
            ocl::Sobel(devSrc, devDst, ddepth, dx, dy);
            WARMUP_OFF;

            GPU_ON;
            ocl::Sobel(devSrc, devDst, ddepth, dx, dy);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            devSrc.upload(hostSrc);
            ocl::Sobel(devSrc, devDst, ddepth, dx, dy);
            devDst.download(hostDst);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// Scharr ////////////////////////
TEST(Scharr)
{
    // Runs cv::Scharr and ocl::Scharr with dx = 1, dy = 0 and ddepth -1 on
    // 1- and 4-channel 8-bit images.
    Mat hostSrc, hostDst;
#ifdef USE_OPENCL
    ocl::oclMat devSrc, devDst;
#endif
    const int ddepth = -1;
    const int dx = 1;
    const int dy = 0;
    const int all_type[] = {CV_8UC1, CV_8UC4};
    const std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << type_name[t];

            gen(hostSrc, side, side, all_type[t], 0, 256);

            // One call outside the CPU_ON/CPU_OFF section.
            Scharr(hostSrc, hostDst, ddepth, dx, dy);

            CPU_ON;
            Scharr(hostSrc, hostDst, ddepth, dx, dy);
            CPU_OFF;
#ifdef USE_OPENCL
            devSrc.upload(hostSrc);

            WARMUP_ON;
            ocl::Scharr(devSrc, devDst, ddepth, dx, dy);
            WARMUP_OFF;

            GPU_ON;
            ocl::Scharr(devSrc, devDst, ddepth, dx, dy);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            devSrc.upload(hostSrc);
            ocl::Scharr(devSrc, devDst, ddepth, dx, dy);
            devDst.download(hostDst);
            GPU_FULL_OFF;
#endif
        }
    }
}

///////////// GaussianBlur ////////////////////////
TEST(GaussianBlur)
{
    // Runs cv::GaussianBlur and ocl::GaussianBlur with a 9x9 kernel and
    // sigma 0 on 8-bit and float images with 1 and 4 channels.
    Mat src, dst;
    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
    std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};

    for (int size = 1000; size <= 4000; size *= 2)
    {
        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
        {
            SUBTEST << size << 'x' << size << "; " << type_name[j] ;

            gen(src, size, size, all_type[j], 0, 256);

            // One call outside the CPU_ON/CPU_OFF section.
            GaussianBlur(src, dst, Size(9, 9), 0);

            CPU_ON;
            GaussianBlur(src, dst, Size(9, 9), 0);
            CPU_OFF;
#ifdef USE_OPENCL
            // Device matrices are created per subtest; d_src is constructed
            // from the host image and d_dst is sized to match it.
            // (An unused scratch oclMat that used to be declared here was
            // removed.)
            ocl::oclMat d_src(src);
            ocl::oclMat d_dst(src.size(), src.type());

            WARMUP_ON;
            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
            WARMUP_OFF;

            GPU_ON;
            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
            d_dst.download(dst);
            GPU_FULL_OFF;
#endif
        }

    }
}
///////////// equalizeHist ////////////////////////
TEST(equalizeHist)
{
    // Runs cv::equalizeHist and ocl::equalizeHist on single-channel 8-bit
    // images.
    Mat src, dst;
    int all_type[] = {CV_8UC1};
    std::string type_name[] = {"CV_8UC1"};

    for (int size = 1000; size <= 4000; size *= 2)
    {
        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
        {
            SUBTEST << size << 'x' << size << "; " << type_name[j] ;

            gen(src, size, size, all_type[j], 0, 256);

            // One call outside the CPU_ON/CPU_OFF section.
            equalizeHist(src, dst);

            CPU_ON;
            equalizeHist(src, dst);
            CPU_OFF;
#ifdef USE_OPENCL
            // Device matrices are created per subtest. (Two unused scratch
            // oclMat locals that used to be declared here were removed.)
            ocl::oclMat d_src(src);
            ocl::oclMat d_dst;

            WARMUP_ON;
            ocl::equalizeHist(d_src, d_dst);
            WARMUP_OFF;

            GPU_ON;
            ocl::equalizeHist(d_src, d_dst);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::equalizeHist(d_src, d_dst);
            d_dst.download(dst);
            GPU_FULL_OFF;
#endif
        }

    }
}
/////////// CopyMakeBorder //////////////////////
TEST(CopyMakeBorder)
{
    // Runs cv::copyMakeBorder and ocl::copyMakeBorder with a constant border
    // of value 1.0 and margins top 7, bottom 5, left 5, right 7.
    Mat hostSrc, hostDst;
#ifdef USE_OPENCL
    ocl::oclMat devDst;
#endif
    const int top = 7;
    const int bottom = 5;
    const int left = 5;
    const int right = 7;
    const int bordertype = BORDER_CONSTANT;
    const cv::Scalar borderValue(1.0);
    const int all_type[] = {CV_8UC1, CV_8UC4};
    const std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << type_name[t];

            gen(hostSrc, side, side, all_type[t], 0, 256);

            // One call outside the CPU_ON/CPU_OFF section.
            copyMakeBorder(hostSrc, hostDst, top, bottom, left, right, bordertype, borderValue);

            CPU_ON;
            copyMakeBorder(hostSrc, hostDst, top, bottom, left, right, bordertype, borderValue);
            CPU_OFF;
#ifdef USE_OPENCL
            // The device source is constructed from the host image per subtest.
            ocl::oclMat devSrc(hostSrc);

            WARMUP_ON;
            ocl::copyMakeBorder(devSrc, devDst, top, bottom, left, right, bordertype, borderValue);
            WARMUP_OFF;

            GPU_ON;
            ocl::copyMakeBorder(devSrc, devDst, top, bottom, left, right, bordertype, borderValue);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            devSrc.upload(hostSrc);
            ocl::copyMakeBorder(devSrc, devDst, top, bottom, left, right, bordertype, borderValue);
            devDst.download(hostDst);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// cornerMinEigenVal ////////////////////////
TEST(cornerMinEigenVal)
{
    // Runs cv::cornerMinEigenVal and ocl::cornerMinEigenVal with block size 7
    // and BORDER_REFLECT on 8-bit and float single-channel images.
    Mat hostSrc, hostDst;
#ifdef USE_OPENCL
    ocl::oclMat devDst;
#endif
    const int blockSize = 7;
    // NOTE(review): the aperture is drawn once per run from rand() and is one
    // of 1, 3, 5 or 7, so the measured workload depends on the state of the C
    // random generator when this test starts -- confirm this is intended.
    const int apertureSize = 1 + 2 * (rand() % 4);
    const int borderType = BORDER_REFLECT;
    const int all_type[] = {CV_8UC1, CV_32FC1};
    const std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << type_name[t];

            gen(hostSrc, side, side, all_type[t], 0, 256);

            // One call outside the CPU_ON/CPU_OFF section.
            cornerMinEigenVal(hostSrc, hostDst, blockSize, apertureSize, borderType);

            CPU_ON;
            cornerMinEigenVal(hostSrc, hostDst, blockSize, apertureSize, borderType);
            CPU_OFF;
#ifdef USE_OPENCL
            // The device source is constructed from the host image per subtest.
            ocl::oclMat devSrc(hostSrc);

            WARMUP_ON;
            ocl::cornerMinEigenVal(devSrc, devDst, blockSize, apertureSize, borderType);
            WARMUP_OFF;

            GPU_ON;
            ocl::cornerMinEigenVal(devSrc, devDst, blockSize, apertureSize, borderType);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            devSrc.upload(hostSrc);
            ocl::cornerMinEigenVal(devSrc, devDst, blockSize, apertureSize, borderType);
            devDst.download(hostDst);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// cornerHarris ////////////////////////
TEST(cornerHarris)
{
    // Runs cv::cornerHarris and ocl::cornerHarris (block size 5, aperture 7,
    // k = 0.1, BORDER_REFLECT) on 8-bit and float single-channel images.
    Mat hostSrc, hostDst;
#ifdef USE_OPENCL
    ocl::oclMat devSrc, devDst;
#endif
    const int blockSize = 5;
    const int apertureSize = 7;
    const double harrisK = 0.1;
    const int all_type[] = {CV_8UC1, CV_32FC1};
    const std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << type_name[t] << " ; BORDER_REFLECT";

            gen(hostSrc, side, side, all_type[t], 0, 1);

            // One call outside the CPU_ON/CPU_OFF section.
            cornerHarris(hostSrc, hostDst, blockSize, apertureSize, harrisK, BORDER_REFLECT);

            CPU_ON;
            cornerHarris(hostSrc, hostDst, blockSize, apertureSize, harrisK, BORDER_REFLECT);
            CPU_OFF;
#ifdef USE_OPENCL
            devSrc.upload(hostSrc);

            WARMUP_ON;
            ocl::cornerHarris(devSrc, devDst, blockSize, apertureSize, harrisK, BORDER_REFLECT);
            WARMUP_OFF;

            GPU_ON;
            ocl::cornerHarris(devSrc, devDst, blockSize, apertureSize, harrisK, BORDER_REFLECT);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            devSrc.upload(hostSrc);
            ocl::cornerHarris(devSrc, devDst, blockSize, apertureSize, harrisK, BORDER_REFLECT);
            devDst.download(hostDst);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// integral ////////////////////////
TEST(integral)
{
    // Runs cv::integral and ocl::integral on single-channel 8-bit images.
    Mat src, sum;
#ifdef USE_OPENCL
    // (An unused scratch oclMat that used to be declared here was removed.)
    ocl::oclMat d_src, d_sum;
#endif
    int all_type[] = {CV_8UC1};
    std::string type_name[] = {"CV_8UC1"};

    for (int size = 1000; size <= 4000; size *= 2)
    {
        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
        {
            SUBTEST << size << 'x' << size << "; " << type_name[j]  ;

            gen(src, size, size, all_type[j], 0, 256);

            // One call outside the CPU_ON/CPU_OFF section.
            integral(src, sum);

            CPU_ON;
            integral(src, sum);
            CPU_OFF;
#ifdef USE_OPENCL
            d_src.upload(src);

            WARMUP_ON;
            ocl::integral(d_src, d_sum);
            WARMUP_OFF;

            GPU_ON;
            ocl::integral(d_src, d_sum);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::integral(d_src, d_sum);
            d_sum.download(sum);
            GPU_FULL_OFF;
#endif
        }

    }
}
///////////// WarpAffine ////////////////////////
TEST(WarpAffine)
{
    // Runs cv::warpAffine and ocl::warpAffine with nearest-neighbour
    // interpolation. The 2x3 transform is a rotation by 3.14 / 6 radians
    // combined with a translation of (100, -100).
    Mat hostSrc, hostDst;
#ifdef USE_OPENCL
    ocl::oclMat devSrc, devDst;
#endif
    static const double angle = 3.14 / 6;
    static const double coeffs[2][3] =
    {
        {cos(angle), -sin(angle), 100.0},
        {sin(angle), cos(angle), -100.0}
    };
    // M wraps the static coefficient array directly (no copy is requested).
    Mat M(2, 3, CV_64F, (void *)coeffs);
    const int interpolation = INTER_NEAREST;

    const int all_type[] = {CV_8UC1, CV_8UC4};
    const std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            const int matType = all_type[t];

            SUBTEST << side << 'x' << side << "; " << type_name[t];

            gen(hostSrc, side, side, matType, 0, 256);
            gen(hostDst, side, side, matType, 0, 256);
            const Size outSize(side, side);

            // One call outside the CPU_ON/CPU_OFF section.
            warpAffine(hostSrc, hostDst, M, outSize, interpolation);

            CPU_ON;
            warpAffine(hostSrc, hostDst, M, outSize, interpolation);
            CPU_OFF;
#ifdef USE_OPENCL
            devSrc.upload(hostSrc);

            WARMUP_ON;
            ocl::warpAffine(devSrc, devDst, M, outSize, interpolation);
            WARMUP_OFF;

            GPU_ON;
            ocl::warpAffine(devSrc, devDst, M, outSize, interpolation);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            devSrc.upload(hostSrc);
            ocl::warpAffine(devSrc, devDst, M, outSize, interpolation);
            devDst.download(hostDst);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// WarpPerspective ////////////////////////
TEST(WarpPerspective)
{
    // Runs cv::warpPerspective and ocl::warpPerspective with
    // nearest-neighbour interpolation. The 3x3 transform is a rotation by
    // 3.14 / 6 radians plus a translation of (100, -100), with last row
    // (0, 0, 1).
    Mat hostSrc, hostDst;
#ifdef USE_OPENCL
    ocl::oclMat devSrc, devDst;
#endif
    static const double angle = 3.14 / 6;
    static const double coeffs[3][3] =
    {
        {cos(angle), -sin(angle), 100.0},
        {sin(angle), cos(angle), -100.0},
        {0.0, 0.0, 1.0}
    };
    // M wraps the static coefficient array directly (no copy is requested).
    Mat M(3, 3, CV_64F, (void *)coeffs);
    const int interpolation = INTER_NEAREST;

    const int all_type[] = {CV_8UC1, CV_8UC4};
    const std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            const int matType = all_type[t];

            SUBTEST << side << 'x' << side << "; " << type_name[t];

            gen(hostSrc, side, side, matType, 0, 256);
            gen(hostDst, side, side, matType, 0, 256);
            const Size outSize(side, side);

            // One call outside the CPU_ON/CPU_OFF section.
            warpPerspective(hostSrc, hostDst, M, outSize, interpolation);

            CPU_ON;
            warpPerspective(hostSrc, hostDst, M, outSize, interpolation);
            CPU_OFF;
#ifdef USE_OPENCL
            devSrc.upload(hostSrc);

            WARMUP_ON;
            ocl::warpPerspective(devSrc, devDst, M, outSize, interpolation);
            WARMUP_OFF;

            GPU_ON;
            ocl::warpPerspective(devSrc, devDst, M, outSize, interpolation);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            devSrc.upload(hostSrc);
            ocl::warpPerspective(devSrc, devDst, M, outSize, interpolation);
            devDst.download(hostDst);
            GPU_FULL_OFF;
#endif
        }
    }
}

///////////// resize ////////////////////////
TEST(resize)
{
    // Runs cv::resize and ocl::resize in two passes: first scaling every
    // size/type combination up by 2.0, then scaling every combination down
    // by 0.5. The pass order matches the subtest order of the report.
    Mat hostSrc, hostDst;
#ifdef USE_OPENCL
    ocl::oclMat devSrc, devDst;
#endif

    const int all_type[] = {CV_8UC1, CV_8UC4};
    const std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    const double factors[] = {2.0, 0.5};
    const char *const passLabels[] = {" ; up", " ; down"};
    const size_t passCount = sizeof(factors) / sizeof(factors[0]);

    for (size_t pass = 0; pass < passCount; ++pass)
    {
        const double factor = factors[pass];

        for (int side = 1000; side <= 4000; side *= 2)
        {
            for (size_t t = 0; t < typeCount; ++t)
            {
                SUBTEST << side << 'x' << side << "; " << type_name[t] << passLabels[pass];

                gen(hostSrc, side, side, all_type[t], 0, 256);

                // One call outside the CPU_ON/CPU_OFF section.
                resize(hostSrc, hostDst, Size(), factor, factor);

                CPU_ON;
                resize(hostSrc, hostDst, Size(), factor, factor);
                CPU_OFF;
#ifdef USE_OPENCL
                devSrc.upload(hostSrc);

                WARMUP_ON;
                ocl::resize(devSrc, devDst, Size(), factor, factor);
                WARMUP_OFF;

                GPU_ON;
                ocl::resize(devSrc, devDst, Size(), factor, factor);
                GPU_OFF;

                // GPU_FULL section also covers the upload and the download.
                GPU_FULL_ON;
                devSrc.upload(hostSrc);
                ocl::resize(devSrc, devDst, Size(), factor, factor);
                devDst.download(hostDst);
                GPU_FULL_OFF;
#endif
            }
        }
    }
}
///////////// threshold////////////////////////
TEST(threshold)
{
    // Runs cv::threshold and ocl::threshold (thresh 50, maxval 0) in two
    // passes: THRESH_BINARY on CV_8U data, then THRESH_TRUNC on CV_32FC1
    // data. Each pass covers all sizes before the next one starts.
    Mat hostSrc, hostDst;
#ifdef USE_OPENCL
    ocl::oclMat devSrc, devDst;
#endif

    const int matTypes[] = {CV_8U, CV_32FC1};
    const int threshTypes[] = {THRESH_BINARY, THRESH_TRUNC};
    const char *const passLabels[] = {"; 8UC1; THRESH_BINARY", "; 32FC1; THRESH_TRUNC [NPP]"};
    const size_t passCount = sizeof(matTypes) / sizeof(matTypes[0]);
    const double thresh = 50.0;
    const double maxval = 0.0;

    for (size_t pass = 0; pass < passCount; ++pass)
    {
        const int matType = matTypes[pass];
        const int threshType = threshTypes[pass];

        for (int side = 1000; side <= 4000; side *= 2)
        {
            SUBTEST << side << 'x' << side << passLabels[pass];

            gen(hostSrc, side, side, matType, 0, 100);

            // One call outside the CPU_ON/CPU_OFF section.
            threshold(hostSrc, hostDst, thresh, maxval, threshType);

            CPU_ON;
            threshold(hostSrc, hostDst, thresh, maxval, threshType);
            CPU_OFF;
#ifdef USE_OPENCL
            devSrc.upload(hostSrc);

            WARMUP_ON;
            ocl::threshold(devSrc, devDst, thresh, maxval, threshType);
            WARMUP_OFF;

            GPU_ON;
            ocl::threshold(devSrc, devDst, thresh, maxval, threshType);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            devSrc.upload(hostSrc);
            ocl::threshold(devSrc, devDst, thresh, maxval, threshType);
            devDst.download(hostDst);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// meanShiftFiltering////////////////////////
TEST(meanShiftFiltering)
{
    // CPU side runs cv::pyrMeanShiftFiltering on an 8UC3 image; the OpenCL
    // side runs ocl::meanShiftFiltering on a freshly generated 8UC4 image of
    // the same size (hence the "8UC3 vs 8UC4" label).
    const int sp = 10;
    const int sr = 10;

    Mat hostImg, hostOut;
#ifdef USE_OPENCL
    ocl::oclMat devImg, devOut;
#endif

    for (int side = 1000; side <= 4000; side *= 2)
    {
        SUBTEST << side << 'x' << side << "; 8UC3 vs 8UC4";

        gen(hostImg, side, side, CV_8UC3, Scalar::all(0), Scalar::all(256));

        // One call outside the CPU_ON/CPU_OFF section.
        pyrMeanShiftFiltering(hostImg, hostOut, sp, sr);

        CPU_ON;
        pyrMeanShiftFiltering(hostImg, hostOut, sp, sr);
        CPU_OFF;
#ifdef USE_OPENCL
        // Regenerate the input as 4-channel data for the OpenCL call.
        gen(hostImg, side, side, CV_8UC4, Scalar::all(0), Scalar::all(256));

        devImg.upload(hostImg);

        WARMUP_ON;
        ocl::meanShiftFiltering(devImg, devOut, sp, sr);
        WARMUP_OFF;

        GPU_ON;
        ocl::meanShiftFiltering(devImg, devOut, sp, sr);
        GPU_OFF;

        // GPU_FULL section also covers the upload and the download.
        GPU_FULL_ON;
        devImg.upload(hostImg);
        ocl::meanShiftFiltering(devImg, devOut, sp, sr);
        devOut.download(hostOut);
        GPU_FULL_OFF;
#endif
    }
}
///////////// meanShiftProc////////////////////////
// Runs the mean-shift iteration for a single pixel of an 8-bit image stored
// with 4 bytes per pixel.
//
//   x0, y0   coordinates of the starting pixel
//   sptr     pointer to the source pixel at (x0, y0)
//   dptr     pointer to the destination pixel that receives the result
//   sstep    distance in bytes between consecutive source rows
//   size     image size, used to clamp the search window to the image
//   sp       spatial radius; the window is [x0-sp, x0+sp] x [y0-sp, y0+sp]
//   sr       colour radius; a neighbour contributes when the sum of its
//            tab[] distances over the first three channels is <= sr * sr
//   maxIter  maximum number of iterations
//   eps      stop threshold on |dx| + |dy| + tab[] colour shift
//   tab      lookup table indexed by (channel difference + 255);
//            meanShiftProc_ fills it with squared differences
//
// Writes the final first three channel values to dptr[0..2], copies the
// fourth channel of the starting pixel to dptr[3] unchanged, and returns the
// final window centre with both coordinates cast to short.
COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size size, int sp, int sr, int maxIter, float eps, int *tab)
{
    // Bytes per pixel. Offsets are computed with a multiplication rather than
    // "<< 2" because several of the multiplied values can be negative, and
    // left-shifting a negative int is undefined before C++20.
    const int pixelBytes = 4;

    int isr2 = sr * sr;
    int c0, c1, c2, c3;
    int iter;
    uchar *ptr = NULL;
    uchar *pstart = NULL;
    int revx = 0, revy = 0;
    c0 = sptr[0];
    c1 = sptr[1];
    c2 = sptr[2];
    c3 = sptr[3];

    // iterate meanshift procedure
    for (iter = 0; iter < maxIter; iter++)
    {
        int count = 0;
        int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;

        //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
        int minx = x0 - sp;
        int miny = y0 - sp;
        int maxx = x0 + sp;
        int maxy = y0 + sp;

        //deal with the image boundary
        if (minx < 0)
        {
            minx = 0;
        }

        if (miny < 0)
        {
            miny = 0;
        }

        if (maxx >= size.width)
        {
            maxx = size.width - 1;
        }

        if (maxy >= size.height)
        {
            maxy = size.height - 1;
        }

        if (iter == 0)
        {
            pstart = sptr;
        }
        else
        {
            // Follow the centre shift computed by the previous iteration.
            pstart = pstart + revy * sstep + revx * pixelBytes; //point to the new position
        }

        ptr = pstart;
        // (miny - y0) and (minx - x0) are <= 0: step back to the window's
        // top-left pixel.
        ptr = ptr + (miny - y0) * sstep + (minx - x0) * pixelBytes; //point to the start in the row

        // After each row ptr has advanced by the window width, so the row
        // increment rewinds that width and adds one row stride.
        for (int y = miny; y <= maxy; y++, ptr += sstep - (maxx - minx + 1) * pixelBytes)
        {
            int rowCount = 0;
            int x = minx;
#if CV_ENABLE_UNROLLED

            // Four pixels (16 bytes) per iteration; the scalar loop below
            // handles whatever remains.
            for (; x + 4 <= maxx; x += 4, ptr += 16)
            {
                int t0, t1, t2;
                t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];

                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
                {
                    s0 += t0;
                    s1 += t1;
                    s2 += t2;
                    sx += x;
                    rowCount++;
                }

                t0 = ptr[4], t1 = ptr[5], t2 = ptr[6];

                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
                {
                    s0 += t0;
                    s1 += t1;
                    s2 += t2;
                    sx += x + 1;
                    rowCount++;
                }

                t0 = ptr[8], t1 = ptr[9], t2 = ptr[10];

                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
                {
                    s0 += t0;
                    s1 += t1;
                    s2 += t2;
                    sx += x + 2;
                    rowCount++;
                }

                t0 = ptr[12], t1 = ptr[13], t2 = ptr[14];

                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
                {
                    s0 += t0;
                    s1 += t1;
                    s2 += t2;
                    sx += x + 3;
                    rowCount++;
                }
            }

#endif

            for (; x <= maxx; x++, ptr += 4)
            {
                int t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];

                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
                {
                    s0 += t0;
                    s1 += t1;
                    s2 += t2;
                    sx += x;
                    rowCount++;
                }
            }

            if (rowCount == 0)
            {
                continue;
            }

            count += rowCount;
            sy += y * rowCount;
        }

        // No pixel inside the colour radius: keep the current estimate.
        if (count == 0)
        {
            break;
        }

        // Integer means of the accepted pixels' positions and colours.
        int x1 = sx / count;
        int y1 = sy / count;
        s0 = s0 / count;
        s1 = s1 / count;
        s2 = s2 / count;

        bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1 - x0) + abs(y1 - y0) +
                        tab[s0 - c0 + 255] + tab[s1 - c1 + 255] + tab[s2 - c2 + 255] <= eps);

        //revise the pointer corresponding to the new (y0,x0)
        revx = x1 - x0;
        revy = y1 - y0;

        x0 = x1;
        y0 = y1;
        c0 = s0;
        c1 = s1;
        c2 = s2;

        if (stopFlag)
        {
            break;
        }
    } //for iter

    dptr[0] = (uchar)c0;
    dptr[1] = (uchar)c1;
    dptr[2] = (uchar)c2;
    dptr[3] = (uchar)c3;

    COOR coor;
    coor.x = static_cast<short>(x0);
    coor.y = static_cast<short>(y0);
    return coor;
}

void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp, int sr, cv::TermCriteria crit)
{

    if (src_roi.empty())
    {
        CV_Error(CV_StsBadArg, "The input image is empty");
    }

    if (src_roi.depth() != CV_8U || src_roi.channels() != 4)
    {
        CV_Error(CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported");
    }

    CV_Assert((src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) &&
              (src_roi.cols == dstCoor_roi.cols) && (src_roi.rows == dstCoor_roi.rows));
    CV_Assert(!(dstCoor_roi.step & 0x3));

    if (!(crit.type & cv::TermCriteria::MAX_ITER))
    {
        crit.maxCount = 5;
    }

    int maxIter = std::min(std::max(crit.maxCount, 1), 100);
    float eps;

    if (!(crit.type & cv::TermCriteria::EPS))
    {
        eps = 1.f;
    }

    eps = (float)std::max(crit.epsilon, 0.0);

    int tab[512];

    for (int i = 0; i < 512; i++)
    {
        tab[i] = (i - 255) * (i - 255);
    }

    uchar *sptr = src_roi.data;
    uchar *dptr = dst_roi.data;
    short *dCoorptr = (short *)dstCoor_roi.data;
    int sstep = (int)src_roi.step;
    int dstep = (int)dst_roi.step;
    int dCoorstep = (int)dstCoor_roi.step >> 1;
    cv::Size size = src_roi.size();

    for (int i = 0; i < size.height; i++, sptr += sstep - (size.width << 2),
            dptr += dstep - (size.width << 2), dCoorptr += dCoorstep - (size.width << 1))
    {
        for (int j = 0; j < size.width; j++, sptr += 4, dptr += 4, dCoorptr += 2)
        {
            *((COOR *)dCoorptr) = do_meanShift(j, i, sptr, dptr, sstep, size, sp, sr, maxIter, eps, tab);
        }
    }

}
TEST(meanShiftProc)
{
    // Runs the CPU reference meanShiftProc_ and ocl::meanShiftProc with
    // sp = 5, sr = 6 and criteria (COUNT + EPS, 5 iterations, eps 1) on
    // 8UC4 images with CV_16SC2 coordinate output.
    Mat hostSrc, hostDst, hostCoords;
#ifdef USE_OPENCL
    ocl::oclMat devSrc, devDst, devCoords;
#endif
    const int sp = 5;
    const int sr = 6;
    TermCriteria crit(TermCriteria::COUNT + TermCriteria::EPS, 5, 1);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        SUBTEST << side << 'x' << side << "; 8UC4 and CV_16SC2 ";

        gen(hostSrc, side, side, CV_8UC4, Scalar::all(0), Scalar::all(256));
        gen(hostDst, side, side, CV_8UC4, Scalar::all(0), Scalar::all(256));
        gen(hostCoords, side, side, CV_16SC2, Scalar::all(0), Scalar::all(256));

        // One call outside the CPU_ON/CPU_OFF section.
        meanShiftProc_(hostSrc, hostDst, hostCoords, sp, sr, crit);

        CPU_ON;
        meanShiftProc_(hostSrc, hostDst, hostCoords, sp, sr, crit);
        CPU_OFF;
#ifdef USE_OPENCL
        devSrc.upload(hostSrc);

        WARMUP_ON;
        ocl::meanShiftProc(devSrc, devDst, devCoords, sp, sr, crit);
        WARMUP_OFF;

        GPU_ON;
        ocl::meanShiftProc(devSrc, devDst, devCoords, sp, sr, crit);
        GPU_OFF;

        // GPU_FULL section also covers the upload and both downloads.
        GPU_FULL_ON;
        devSrc.upload(hostSrc);
        ocl::meanShiftProc(devSrc, devDst, devCoords, sp, sr, crit);
        devDst.download(hostDst);
        devCoords.download(hostCoords);
        GPU_FULL_OFF;
#endif
    }
}
///////////// ConvertTo////////////////////////
TEST(ConvertTo)
{
    // Runs Mat::convertTo and oclMat::convertTo with target type CV_32FC1
    // on 1- and 4-channel 8-bit sources.
    Mat hostSrc, hostDst;
#ifdef USE_OPENCL
    ocl::oclMat devSrc, devDst;
#endif
    const int targetType = CV_32FC1;
    const int all_type[] = {CV_8UC1, CV_8UC4};
    const std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
    const size_t typeCount = sizeof(all_type) / sizeof(all_type[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < typeCount; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << type_name[t] << " to 32FC1";

            // Only the source is generated; the destination is neither
            // pre-generated nor uploaded before the calls below.
            gen(hostSrc, side, side, all_type[t], 0, 256);

            // One call outside the CPU_ON/CPU_OFF section.
            hostSrc.convertTo(hostDst, targetType);

            CPU_ON;
            hostSrc.convertTo(hostDst, targetType);
            CPU_OFF;
#ifdef USE_OPENCL
            devSrc.upload(hostSrc);

            WARMUP_ON;
            devSrc.convertTo(devDst, targetType);
            WARMUP_OFF;

            GPU_ON;
            devSrc.convertTo(devDst, targetType);
            GPU_OFF;

            // GPU_FULL section also covers the upload and the download.
            GPU_FULL_ON;
            devSrc.upload(hostSrc);
            devSrc.convertTo(devDst, targetType);
            devDst.download(hostDst);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// copyTo////////////////////////
// Benchmark: Mat::copyTo versus oclMat::copyTo for 8-bit sources with
// 1 and 4 channels and square sizes 1000, 2000 and 4000.
TEST(copyTo)
{
    const int mat_types[] = {CV_8UC1, CV_8UC4};
    const std::string mat_type_names[] = {"CV_8UC1", "CV_8UC4"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    Mat src, dst;
#ifdef USE_OPENCL
    ocl::oclMat d_src, d_dst;
#endif

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << mat_type_names[t] ;

            gen(src, side, side, mat_types[t], 0, 256);

            // One copy ahead of the CPU_ON/CPU_OFF section, so dst already
            // exists when that section starts.
            src.copyTo(dst);

            CPU_ON;
            src.copyTo(dst);
            CPU_OFF;

#ifdef USE_OPENCL
            d_src.upload(src);

            WARMUP_ON;
            d_src.copyTo(d_dst);
            WARMUP_OFF;

            // Device-to-device copy only.
            GPU_ON;
            d_src.copyTo(d_dst);
            GPU_OFF;

            // Copy together with the upload and the download.
            GPU_FULL_ON;
            d_src.upload(src);
            d_src.copyTo(d_dst);
            d_dst.download(dst);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// setTo////////////////////////
// Benchmark: Mat::setTo versus oclMat::setTo with the scalar (1, 2, 3, 4)
// for 8-bit matrices with 1 and 4 channels and square sizes 1000, 2000
// and 4000.
TEST(setTo)
{
    const int mat_types[] = {CV_8UC1, CV_8UC4};
    const std::string mat_type_names[] = {"CV_8UC1", "CV_8UC4"};
    const size_t type_count = sizeof(mat_types) / sizeof(mat_types[0]);

    // dst / d_dst are declared but not used by this test.
    Mat src, dst;
    Scalar fill_value(1, 2, 3, 4);
#ifdef USE_OPENCL
    ocl::oclMat d_src, d_dst;
#endif

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << mat_type_names[t] ;

            gen(src, side, side, mat_types[t], 0, 256);

            // One call ahead of the CPU_ON/CPU_OFF section.
            src.setTo(fill_value);

            CPU_ON;
            src.setTo(fill_value);
            CPU_OFF;
#ifdef USE_OPENCL
            d_src.upload(src);

            WARMUP_ON;
            d_src.setTo(fill_value);
            WARMUP_OFF;

            GPU_ON;
            d_src.setTo(fill_value);
            GPU_OFF;

            // Upload plus fill; nothing is downloaded in this section.
            GPU_FULL_ON;
            d_src.upload(src);
            d_src.setTo(fill_value);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// Merge////////////////////////
// Benchmark: cv::merge versus ocl::merge, combining four single-channel
// planes (CV_8UC1 or CV_32FC1) into one 4-channel matrix, for square
// sizes 1000, 2000 and 4000. Plane i is filled with the constant i.
TEST(Merge)
{
    Mat dst;
#ifdef USE_OPENCL
    ocl::oclMat d_dst;
#endif
    const int channels = 4;
    const int all_type[] = {CV_8UC1, CV_32FC1};
    const std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
    const size_t type_count = sizeof(all_type) / sizeof(all_type[0]);

    for (int size = 1000; size <= 4000; size *= 2)
    {
        for (size_t j = 0; j < type_count; j++)
        {
            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
            Size size1 = Size(size, size);
            std::vector<Mat> src(channels);

            for (int i = 0; i < channels; ++i)
            {
                src[i] = Mat(size1, all_type[j], cv::Scalar::all(i));
            }

            // One call ahead of the CPU_ON/CPU_OFF section, so dst already
            // exists when that section starts.
            merge(src, dst);

            CPU_ON;
            merge(src, dst);
            CPU_OFF;

#ifdef USE_OPENCL
            std::vector<ocl::oclMat> d_src(channels);

            for (int i = 0; i < channels; ++i)
            {
                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
            }

            WARMUP_ON;
            ocl::merge(d_src, d_dst);
            WARMUP_OFF;

            GPU_ON;
            ocl::merge(d_src, d_dst);
            GPU_OFF;

            GPU_FULL_ON;

            // Rebuild the device inputs with the type of the current subtest.
            // This used to be hard-coded to CV_8U, so the CV_32FC1 subtest
            // measured an 8-bit merge in this section.
            for (int i = 0; i < channels; ++i)
            {
                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
            }

            ocl::merge(d_src, d_dst);
            d_dst.download(dst);
            GPU_FULL_OFF;
#endif
        }

    }
}

///////////// Split////////////////////////
// Benchmark: cv::split versus ocl::split on a 4-channel matrix filled with
// the scalar (1, 2, 3, 4), built from 8-bit and 32-bit float base types,
// for square sizes 1000, 2000 and 4000.
TEST(Split)
{
    const int base_types[] = {CV_8UC1, CV_32FC1};
    const std::string base_type_names[] = {"CV_8UC1", "CV_32FC1"};
    const size_t type_count = sizeof(base_types) / sizeof(base_types[0]);

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << base_type_names[t];

            const Size dims(side, side);
            const int four_channel_type = CV_MAKE_TYPE(base_types[t], 4);
            const cv::Scalar fill(1, 2, 3, 4);

            Mat src(dims, four_channel_type, fill);
            std::vector<cv::Mat> planes;

            // One call ahead of the CPU_ON/CPU_OFF section.
            split(src, planes);

            CPU_ON;
            split(src, planes);
            CPU_OFF;

#ifdef USE_OPENCL
            ocl::oclMat d_src(dims, four_channel_type, fill);
            std::vector<cv::ocl::oclMat> d_planes;

            WARMUP_ON;
            ocl::split(d_src, d_planes);
            WARMUP_OFF;

            GPU_ON;
            ocl::split(d_src, d_planes);
            GPU_OFF;

            // Upload plus split; the planes are not downloaded in this section.
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::split(d_src, d_planes);
            GPU_FULL_OFF;
#endif
        }
    }
}


///////////// norm////////////////////////
// Benchmark: NORM_INF between two CV_8UC1 matrices, cv::norm versus
// ocl::norm, for square sizes 1000, 2000 and 4000.
//
// Both sides use the two-source overload so that they compute the same
// quantity. Previously the CPU side called norm(src, NORM_INF) while the
// device side called ocl::norm(d_src, d_buf, NORM_INF).
TEST(norm)
{
    Mat src, buf;
#ifdef USE_OPENCL
    ocl::oclMat d_src, d_buf;
#endif

    for (int size = 1000; size <= 4000; size *= 2)
    {
        SUBTEST << size << 'x' << size << "; CV_8UC1; NORM_INF";

        gen(src, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
        gen(buf, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));

        // One call ahead of the CPU_ON/CPU_OFF section.
        norm(src, buf, NORM_INF);

        CPU_ON;
        norm(src, buf, NORM_INF);
        CPU_OFF;

#ifdef USE_OPENCL
        d_src.upload(src);
        d_buf.upload(buf);

        WARMUP_ON;
        ocl::norm(d_src, d_buf, NORM_INF);
        WARMUP_OFF;

        GPU_ON;
        ocl::norm(d_src, d_buf, NORM_INF);
        GPU_OFF;

        // This section includes the transfer of both inputs.
        GPU_FULL_ON;
        d_src.upload(src);
        d_buf.upload(buf);
        ocl::norm(d_src, d_buf, NORM_INF);
        GPU_FULL_OFF;
#endif
    }
}
///////////// remap////////////////////////
// Benchmark: cv::remap versus ocl::remap with INTER_LINEAR and
// BORDER_CONSTANT, CV_32FC1 maps, 8-bit sources with 1 and 4 channels and
// square sizes 1000, 2000 and 4000. The maps scale coordinates by 0.75
// around the point (size / 2, size / 2).
TEST(remap)
{
    const int src_types[] = {CV_8UC1, CV_8UC4};
    const std::string src_type_names[] = {"CV_8UC1", "CV_8UC4"};
    const size_t type_count = sizeof(src_types) / sizeof(src_types[0]);

    const int interpolation = INTER_LINEAR;
    const int borderMode = BORDER_CONSTANT;

    Mat src, dst, xmap, ymap;
#ifdef USE_OPENCL
    ocl::oclMat d_src, d_dst, d_xmap, d_ymap;
#endif

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            SUBTEST << side << 'x' << side << "; src " << src_type_names[t] << "; map CV_32FC1";

            gen(src, side, side, src_types[t], 0, 256);

            xmap.create(side, side, CV_32FC1);
            ymap.create(side, side, CV_32FC1);
            dst.create(side, side, CV_32FC1);

            // Fill both coordinate maps; the y value is constant along a row.
            const float center = side * 0.5f;

            for (int row = 0; row < side; ++row)
            {
                float *x_line = xmap.ptr<float>(row);
                float *y_line = ymap.ptr<float>(row);
                const float y_value = (row - center) * 0.75f + center;

                for (int col = 0; col < side; ++col)
                {
                    x_line[col] = (col - center) * 0.75f + center;
                    y_line[col] = y_value;
                }
            }

            // One call ahead of the CPU_ON/CPU_OFF section.
            remap(src, dst, xmap, ymap, interpolation, borderMode);

            CPU_ON;
            remap(src, dst, xmap, ymap, interpolation, borderMode);
            CPU_OFF;

#ifdef USE_OPENCL
            d_src.upload(src);
            d_dst.upload(dst);
            d_xmap.upload(xmap);
            d_ymap.upload(ymap);

            WARMUP_ON;
            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
            WARMUP_OFF;

            GPU_ON;
            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
            GPU_OFF;

            // Only the source is re-uploaded here; the maps stay on the device.
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
            d_dst.download(dst);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// cvtColor////////////////////////
// Benchmark: cv::cvtColor versus ocl::cvtColor with code CV_RGBA2GRAY on
// CV_8UC4 input, for square sizes 1000, 2000 and 4000.
TEST(cvtColor)
{
    const int src_types[] = {CV_8UC4};
    const std::string src_type_names[] = {"CV_8UC4"};
    const size_t type_count = sizeof(src_types) / sizeof(src_types[0]);

    Mat src, dst;
#ifdef USE_OPENCL
    ocl::oclMat d_src, d_dst;
#endif

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            gen(src, side, side, src_types[t], 0, 256);
            SUBTEST << side << "x" << side << "; " << src_type_names[t] << " ; CV_RGBA2GRAY";

            // One call ahead of the CPU_ON/CPU_OFF section.
            cvtColor(src, dst, CV_RGBA2GRAY, 4);

            CPU_ON;
            cvtColor(src, dst, CV_RGBA2GRAY, 4);
            CPU_OFF;

#ifdef USE_OPENCL
            d_src.upload(src);

            WARMUP_ON;
            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
            WARMUP_OFF;

            GPU_ON;
            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
            GPU_OFF;

            // Conversion together with the upload and the download.
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
            d_dst.download(dst);
            GPU_FULL_OFF;
#endif
        }
    }
}
///////////// filter2D////////////////////////
// Benchmark: cv::filter2D versus ocl::filter2D with a CV_32FC1 kernel of
// size 3, 7 and 15, on 8-bit sources with 1 and 4 channels and square
// sizes 1000, 2000 and 4000.
TEST(filter2D)
{
    const int src_types[] = {CV_8UC1, CV_8UC4};
    const std::string src_type_names[] = {"CV_8UC1", "CV_8UC4"};
    const size_t type_count = sizeof(src_types) / sizeof(src_types[0]);

    Mat src;

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            gen(src, side, side, src_types[t], 0, 256);

            // The kernel size follows k -> 2k + 1, i.e. 3, 7, 15.
            for (int ksize = 3; ksize <= 15; ksize = 2 * ksize + 1)
            {
                SUBTEST << "ksize = " << ksize << "; " << side << 'x' << side << "; " << src_type_names[t] ;

                Mat kernel;
                gen(kernel, ksize, ksize, CV_32FC1, 0.0, 1.0);

                Mat dst;

                // One call ahead of the CPU_ON/CPU_OFF section.
                cv::filter2D(src, dst, -1, kernel);

                CPU_ON;
                cv::filter2D(src, dst, -1, kernel);
                CPU_OFF;
#ifdef USE_OPENCL
                ocl::oclMat d_src(src);
                ocl::oclMat d_dst;

                WARMUP_ON;
                ocl::filter2D(d_src, d_dst, -1, kernel);
                WARMUP_OFF;

                GPU_ON;
                ocl::filter2D(d_src, d_dst, -1, kernel);
                GPU_OFF;

                // Filtering together with the upload and the download.
                GPU_FULL_ON;
                d_src.upload(src);
                ocl::filter2D(d_src, d_dst, -1, kernel);
                d_dst.download(dst);
                GPU_FULL_OFF;
#endif
            }
        }
    }
}


///////////// dft ////////////////////////
// Benchmark: cv::dft versus ocl::dft on CV_32FC1 and CV_32FC2 input, for
// square sizes 1000, 2000 and 4000. The device call is given the full
// matrix size as its DFT size.
TEST(dft)
{
    const int src_types[] = {CV_32FC1, CV_32FC2};
    const std::string src_type_names[] = {"CV_32FC1", "CV_32FC2"};
    const size_t type_count = sizeof(src_types) / sizeof(src_types[0]);

    Mat src, dst;
#ifdef USE_OPENCL
    ocl::oclMat d_src, d_dst;
#endif

    for (int side = 1000; side <= 4000; side *= 2)
    {
        for (size_t t = 0; t < type_count; ++t)
        {
            SUBTEST << side << 'x' << side << "; " << src_type_names[t] << " ; complex-to-complex";

            gen(src, side, side, src_types[t], Scalar::all(0), Scalar::all(1));

            // One call ahead of the CPU_ON/CPU_OFF section.
            dft(src, dst);

            CPU_ON;
            dft(src, dst);
            CPU_OFF;

#ifdef USE_OPENCL
            const Size dft_size(side, side);

            d_src.upload(src);

            WARMUP_ON;
            ocl::dft(d_src, d_dst, dft_size);
            WARMUP_OFF;

            GPU_ON;
            ocl::dft(d_src, d_dst, dft_size);
            GPU_OFF;

            // Transform together with the upload and the download.
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::dft(d_src, d_dst, dft_size);
            d_dst.download(dst);
            GPU_FULL_OFF;
#endif
        }
    }
}

///////////// gemm ////////////////////////
// Benchmark: cv::gemm versus ocl::gemm computing 1.0 * A * B + 1.0 * C on
// square CV_32FC1 matrices of size 1000, 2000 and 4000, generated with
// bounds -10 and 10.
TEST(gemm)
{
    Mat mat_a, mat_b, mat_c, result;
#ifdef USE_OPENCL
    ocl::oclMat d_a, d_b, d_c, d_result;
#endif

    for (int side = 1000; side <= 4000; side *= 2)
    {
        SUBTEST << side << 'x' << side;

        const Scalar low = Scalar::all(-10);
        const Scalar high = Scalar::all(10);

        gen(mat_a, side, side, CV_32FC1, low, high);
        gen(mat_b, side, side, CV_32FC1, low, high);
        gen(mat_c, side, side, CV_32FC1, low, high);

        // One call ahead of the CPU_ON/CPU_OFF section.
        gemm(mat_a, mat_b, 1.0, mat_c, 1.0, result);

        CPU_ON;
        gemm(mat_a, mat_b, 1.0, mat_c, 1.0, result);
        CPU_OFF;

#ifdef USE_OPENCL
        d_a.upload(mat_a);
        d_b.upload(mat_b);
        d_c.upload(mat_c);

        WARMUP_ON;
        ocl::gemm(d_a, d_b, 1.0, d_c, 1.0, d_result);
        WARMUP_OFF;

        GPU_ON;
        ocl::gemm(d_a, d_b, 1.0, d_c, 1.0, d_result);
        GPU_OFF;

        // Multiplication together with the upload of all three operands
        // and the download of the result.
        GPU_FULL_ON;
        d_a.upload(mat_a);
        d_b.upload(mat_b);
        d_c.upload(mat_c);
        ocl::gemm(d_a, d_b, 1.0, d_c, 1.0, d_result);
        d_result.download(result);
        GPU_FULL_OFF;
#endif
    }
}

// Program entry point.
//
// With USE_OPENCL defined it enumerates and prints the OpenCL devices,
// then, after parsing the command line, selects the one requested by
// --device. It configures the TestSystem singleton from the command-line
// options and runs the registered tests.
//
// Returns 0 on success (including after printing help) and -1 when no
// device is found or the requested device id is out of range.
int main(int argc, const char *argv[])
{
#ifdef USE_OPENCL
    vector<ocl::Info> oclinfo;
    int num_devices = getDevice(oclinfo);

    if (num_devices < 1)
    {
        cerr << "no device found\n";
        return -1;
    }

    // Devices are numbered with one running index across all Info entries;
    // the same numbering is used below to resolve --device.
    int devidx = 0;

    for (size_t i = 0; i < oclinfo.size(); i++)
    {
        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
        {
            printf("device %d: %s\n", devidx++, oclinfo[i].DeviceName[j].c_str());
        }
    }

#endif
    redirectError(cvErrorCallback);

    const char *keys =
        "{ h help    | false | print help message }"
        "{ f filter  |       | filter for test }"
        "{ w workdir |       | set working directory }"
        "{ l list    | false | show all tests }"
        "{ d device  | 0     | device id }"
        "{ i iters   | 10    | iteration count }"
        "{ m warmup  | 1     | gpu warm up iteration count}"
        "{ t xtop    | 1.1   | xfactor top boundary}"
        "{ b xbottom | 0.9   | xfactor bottom boundary}"
        "{ v verify  | false | only run gpu once to verify if problems occur}";

    CommandLineParser cmd(argc, argv, keys);

    if (cmd.get<bool>("help"))
    {
        cout << "Avaible options:" << endl;
        cmd.printMessage();
        return 0;
    }

#ifdef USE_OPENCL
    int device = cmd.get<int>("device");

    if (device < 0 || device >= num_devices)
    {
        cerr << "Invalid device ID" << endl;
        return -1;
    }

    // Walk the devices in the order they were printed and activate the one
    // whose running index equals the requested id.
    devidx = 0;
    bool device_selected = false;

    for (size_t i = 0; i < oclinfo.size() && !device_selected; i++)
    {
        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
        {
            if (device == devidx)
            {
                ocl::setDevice(oclinfo[i], static_cast<int>(j));
                TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
                printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
                device_selected = true;
                break;
            }
        }
    }

#endif
    string filter = cmd.get<string>("filter");
    string workdir = cmd.get<string>("workdir");
    bool list = cmd.get<bool>("list");
    int iters = cmd.get<int>("iters");
    int wu_iters = cmd.get<int>("warmup");
    double x_top = cmd.get<double>("xtop");
    double x_bottom = cmd.get<double>("xbottom");

    TestSystem::instance().setTopThreshold(x_top);
    TestSystem::instance().setBottomThreshold(x_bottom);

    if (!filter.empty())
    {
        TestSystem::instance().setTestFilter(filter);
    }

    if (!workdir.empty())
    {
        // Make sure the directory ends with a path separator.
        const char last = workdir[workdir.size() - 1];

        if (last != '/' && last != '\\')
        {
            workdir += '/';
        }

        TestSystem::instance().setWorkingDir(workdir);
    }

    if (list)
    {
        TestSystem::instance().setListMode(true);
    }

    TestSystem::instance().setNumIters(iters);
    TestSystem::instance().setGPUWarmupIters(wu_iters);

#ifdef USE_OPENCL

    // Verify mode has to be applied after the generic iteration settings.
    // It used to run before them, so setNumIters(iters) and
    // setGPUWarmupIters(wu_iters) silently overwrote its values.
    if (cmd.get<bool>("verify"))
    {
        TestSystem::instance().setNumIters(1);
        TestSystem::instance().setGPUWarmupIters(0);
        TestSystem::instance().setCPUIters(0);
    }

#endif

    TestSystem::instance().run();

    return 0;
}