Merge pull request #536 from bitwangyaoyao:2.4_fixHaar

3b1fc16f · Andrey Kamaev · OpenCV Buildbot · e6dd4e84 · 9bbf1700 · 3b1fc16f
Commit 3b1fc16f authored Feb 24, 2013 by Andrey Kamaev Committed by OpenCV Buildbot Feb 24, 2013
Hide whitespace changes
Inline Side-by-side

Showing with 211 additions and 1454 deletions

haar.cpp modules/ocl/src/haar.cpp +4 -1252

haarobjectdetect.cl modules/ocl/src/kernels/haarobjectdetect.cl +207 -202

No files found.
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@@ -47,15 +47,10 @@
 //
 //M*/

-/* Haar features calculation */
-//#define EMU
-
 #include "precomp.hpp"
 #include <stdio.h>
 #include <string>
-#ifdef EMU
-#include "runCL.h"
-#endif
+
 using namespace cv;
 using namespace cv::ocl;
 using namespace std;
@@ -926,7 +921,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
    if( gimg.cols < minSize.width || gimg.rows < minSize.height )
        CV_Error(CV_StsError, "Image too small");

-    if( (flags & CV_HAAR_SCALE_IMAGE) && gimg.clCxt->impl->devName.find("Intel(R) HD Graphics") == string::npos )
+    if( (flags & CV_HAAR_SCALE_IMAGE) )
    {
        CvSize winSize0 = cascade->orig_window_size;
        //float scalefactor = 1.1f;
@@ -1114,30 +1109,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
        args.push_back ( make_pair(sizeof(cl_int4) , (void *)&p ));
        args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
        args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
-        /*
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&stagebuffer));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&scaleinfobuffer));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&nodebuffer));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&gsum.data));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&gsqsum.data));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&candidatebuffer));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&pixelstep));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&loopcount));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&startstage));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&splitstage));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&endstage));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&startnode));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&splitnode));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int4), (void *)&p));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int4), (void *)&pq));
-         openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_float), (void *)&correction));*/
-        //openCLSafeCall(clSetKernelArg(kernel,argcount++,sizeof(cl_int),(void*)&n));
-        //openCLSafeCall(clSetKernelArg(kernel,argcount++,sizeof(cl_int),(void*)&grpnumperline));
-        //openCLSafeCall(clSetKernelArg(kernel,argcount++,sizeof(cl_int),(void*)&totalgrp));
-
-        //    openCLSafeCall(clEnqueueNDRangeKernel(gsum.clCxt->impl->clCmdQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL));
-
-        //    openCLSafeCall(clFinish(gsum.clCxt->impl->clCmdQueue));
+
        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
        //t = (double)cvGetTickCount() - t;
        //printf( "detection time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
@@ -1258,13 +1230,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
            scaleinfo[i].factor = factor;
            int startnodenum = nodenum * i;
            float factor2 = (float)factor;
-            /*
-             openCLSafeCall(clSetKernelArg(kernel2, argcounts++, sizeof(cl_mem), (void *)&nodebuffer));
-             openCLSafeCall(clSetKernelArg(kernel2, argcounts++, sizeof(cl_mem), (void *)&newnodebuffer));
-             openCLSafeCall(clSetKernelArg(kernel2, argcounts++, sizeof(cl_float), (void *)&factor2));
-             openCLSafeCall(clSetKernelArg(kernel2, argcounts++, sizeof(cl_float), (void *)&correction[i]));
-             openCLSafeCall(clSetKernelArg(kernel2, argcounts++, sizeof(cl_int), (void *)&startnodenum));
-             */

            vector<pair<size_t, const void *> > args1;
            args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&nodebuffer ));
@@ -1298,22 +1263,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
        correctionbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_float) * loopcount);
        openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
        //int argcount = 0;
-        /*openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&stagebuffer));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&scaleinfobuffer));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&newnodebuffer));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&gsum.data));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&gsqsum.data));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&candidatebuffer));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&step));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&loopcount));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&startstage));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&splitstage));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&endstage));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&startnode));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&splitnode));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&pbuffer));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_mem), (void *)&correctionbuffer));
-        openCLSafeCall(clSetKernelArg(kernel, argcount++, sizeof(cl_int), (void *)&nodenum));*/

        vector<pair<size_t, const void *> > args;
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
@@ -1335,8 +1284,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS


        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
-        //openCLSafeCall(clEnqueueNDRangeKernel(gsum.clCxt->impl->clCmdQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL));
-        //openCLSafeCall(clFinish(gsum.clCxt->impl->clCmdQueue));

        //openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->clCmdQueue,candidatebuffer,1,0,4*sizeof(int)*outputsz,candidate,0,NULL,NULL));
        candidate = (int *)clEnqueueMapBuffer(gsum.clCxt->impl->clCmdQueue, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int), 0, 0, 0, &status);
@@ -1407,204 +1354,10 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
 }


-// static CvHaarClassifierCascade * gpuLoadCascadeCART( const char **input_cascade, int n, CvSize orig_window_size )
-// {
-//     int i;
-//     CvHaarClassifierCascade *cascade = gpuCreateHaarClassifierCascade(n);
-//     cascade->orig_window_size = orig_window_size;
-
-//     for( i = 0; i < n; i++ )
-//     {
-//         int j, count, l;
-//         float threshold = 0;
-//         const char *stage = input_cascade[i];
-//         int dl = 0;
-
-//         /* tree links */
-//         int parent = -1;
-//         int next = -1;
-
-//         sscanf( stage, "%d%n", &count, &dl );
-//         stage += dl;
-
-//         assert( count > 0 );
-//         cascade->stage_classifier[i].count = count;
-//         cascade->stage_classifier[i].classifier =
-//             (CvHaarClassifier *)cvAlloc( count * sizeof(cascade->stage_classifier[i].classifier[0]));
-
-//         for( j = 0; j < count; j++ )
-//         {
-//             CvHaarClassifier *classifier = cascade->stage_classifier[i].classifier + j;
-//             int k, rects = 0;
-//             char str[100];
-
-//             sscanf( stage, "%d%n", &classifier->count, &dl );
-//             stage += dl;
-
-//             classifier->haar_feature = (CvHaarFeature *) cvAlloc(
-//                                            classifier->count * ( sizeof( *classifier->haar_feature ) +
-//                                                    sizeof( *classifier->threshold ) +
-//                                                    sizeof( *classifier->left ) +
-//                                                    sizeof( *classifier->right ) ) +
-//                                            (classifier->count + 1) * sizeof( *classifier->alpha ) );
-//             classifier->threshold = (float *) (classifier->haar_feature + classifier->count);
-//             classifier->left = (int *) (classifier->threshold + classifier->count);
-//             classifier->right = (int *) (classifier->left + classifier->count);
-//             classifier->alpha = (float *) (classifier->right + classifier->count);
-
-//             for( l = 0; l < classifier->count; l++ )
-//             {
-//                 sscanf( stage, "%d%n", &rects, &dl );
-//                 stage += dl;
-
-//                 assert( rects >= 2 && rects <= CV_HAAR_FEATURE_MAX );
-
-//                 for( k = 0; k < rects; k++ )
-//                 {
-//                     CvRect r;
-//                     int band = 0;
-//                     sscanf( stage, "%d%d%d%d%d%f%n",
-//                             &r.x, &r.y, &r.width, &r.height, &band,
-//                             &(classifier->haar_feature[l].rect[k].weight), &dl );
-//                     stage += dl;
-//                     classifier->haar_feature[l].rect[k].r = r;
-//                 }
-//                 sscanf( stage, "%s%n", str, &dl );
-//                 stage += dl;
-
-//                 classifier->haar_feature[l].tilted = strncmp( str, "tilted", 6 ) == 0;
-
-//                 for( k = rects; k < CV_HAAR_FEATURE_MAX; k++ )
-//                 {
-//                     memset( classifier->haar_feature[l].rect + k, 0,
-//                             sizeof(classifier->haar_feature[l].rect[k]) );
-//                 }
-
-//                 sscanf( stage, "%f%d%d%n", &(classifier->threshold[l]),
-//                         &(classifier->left[l]),
-//                         &(classifier->right[l]), &dl );
-//                 stage += dl;
-//             }
-//             for( l = 0; l <= classifier->count; l++ )
-//             {
-//                 sscanf( stage, "%f%n", &(classifier->alpha[l]), &dl );
-//                 stage += dl;
-//             }
-//         }
-
-//         sscanf( stage, "%f%n", &threshold, &dl );
-//         stage += dl;
-
-//         cascade->stage_classifier[i].threshold = threshold;
-
-//         /* load tree links */
-//         if( sscanf( stage, "%d%d%n", &parent, &next, &dl ) != 2 )
-//         {
-//             parent = i - 1;
-//             next = -1;
-//         }
-//         stage += dl;
-
-//         cascade->stage_classifier[i].parent = parent;
-//         cascade->stage_classifier[i].next = next;
-//         cascade->stage_classifier[i].child = -1;
-
-//         if( parent != -1 && cascade->stage_classifier[parent].child == -1 )
-//         {
-//             cascade->stage_classifier[parent].child = i;
-//         }
-//     }
-
-//     return cascade;
-// }
-
 #ifndef _MAX_PATH
 #define _MAX_PATH 1024
 #endif

-// static CvHaarClassifierCascade * gpuLoadHaarClassifierCascade( const char *directory, CvSize orig_window_size )
-// {
-//     const char **input_cascade = 0;
-//     CvHaarClassifierCascade *cascade = 0;
-
-//     int i, n;
-//     const char *slash;
-//     char name[_MAX_PATH];
-//     int size = 0;
-//     char *ptr = 0;
-
-//     if( !directory )
-//         CV_Error( CV_StsNullPtr, "Null path is passed" );
-
-//     n = (int)strlen(directory) - 1;
-//     slash = directory[n] == '\\' || directory[n] == '/' ? "" : "/";
-
-//     /* try to read the classifier from directory */
-//     for( n = 0; ; n++ )
-//     {
-//         sprintf( name, "%s%s%d/AdaBoostCARTHaarClassifier.txt", directory, slash, n );
-//         FILE *f = fopen( name, "rb" );
-//         if( !f )
-//             break;
-//         fseek( f, 0, SEEK_END );
-//         size += ftell( f ) + 1;
-//         fclose(f);
-//     }
-
-//     if( n == 0 && slash[0] )
-//         return (CvHaarClassifierCascade *)cvLoad( directory );
-
-//     if( n == 0 )
-//         CV_Error( CV_StsBadArg, "Invalid path" );
-
-//     size += (n + 1) * sizeof(char *);
-//     input_cascade = (const char **)cvAlloc( size );
-//     ptr = (char *)(input_cascade + n + 1);
-
-//     for( i = 0; i < n; i++ )
-//     {
-//         sprintf( name, "%s/%d/AdaBoostCARTHaarClassifier.txt", directory, i );
-//         FILE *f = fopen( name, "rb" );
-//         if( !f )
-//             CV_Error( CV_StsError, "" );
-//         fseek( f, 0, SEEK_END );
-//         size = ftell( f );
-//         fseek( f, 0, SEEK_SET );
-//         CV_Assert((size_t)size == fread( ptr, 1, size, f ));
-//         fclose(f);
-//         input_cascade[i] = ptr;
-//         ptr += size;
-//         *ptr++ = '\0';
-//     }
-
-//     input_cascade[n] = 0;
-//     cascade = gpuLoadCascadeCART( input_cascade, n, orig_window_size );
-
-//     if( input_cascade )
-//         cvFree( &input_cascade );
-
-//     return cascade;
-// }
-
-
-// static void gpuReleaseHaarClassifierCascade( CvHaarClassifierCascade **_cascade )
-// {
-//     if( _cascade && *_cascade )
-//     {
-//         int i, j;
-//         CvHaarClassifierCascade *cascade = *_cascade;
-
-//         for( i = 0; i < cascade->count; i++ )
-//         {
-//             for( j = 0; j < cascade->stage_classifier[i].count; j++ )
-//                 cvFree( &cascade->stage_classifier[i].classifier[j].haar_feature );
-//             cvFree( &cascade->stage_classifier[i].classifier );
-//         }
-//         gpuReleaseHidHaarClassifierCascade( (GpuHidHaarClassifierCascade **)&cascade->hid_cascade );
-//         cvFree( _cascade );
-//     }
-// }
-

 /****************************************************************************************\
 *                                  Persistence functions                                 *
@@ -1627,937 +1380,11 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
 #define ICV_HAAR_PARENT_NAME            "parent"
 #define ICV_HAAR_NEXT_NAME              "next"

-// static int gpuIsHaarClassifier( const void *struct_ptr )
-// {
-//     return CV_IS_HAAR_CLASSIFIER( struct_ptr );
-// }
-
-// static void * gpuReadHaarClassifier( CvFileStorage *fs, CvFileNode *node )
-// {
-//     CvHaarClassifierCascade *cascade = NULL;
-
-//     char buf[256];
-//     CvFileNode *seq_fn = NULL; /* sequence */
-//     CvFileNode *fn = NULL;
-//     CvFileNode *stages_fn = NULL;
-//     CvSeqReader stages_reader;
-//     int n;
-//     int i, j, k, l;
-//     int parent, next;
-
-//     stages_fn = cvGetFileNodeByName( fs, node, ICV_HAAR_STAGES_NAME );
-//     if( !stages_fn || !CV_NODE_IS_SEQ( stages_fn->tag) )
-//         CV_Error( CV_StsError, "Invalid stages node" );
-
-//     n = stages_fn->data.seq->total;
-//     cascade = gpuCreateHaarClassifierCascade(n);
-
-//     /* read size */
-//     seq_fn = cvGetFileNodeByName( fs, node, ICV_HAAR_SIZE_NAME );
-//     if( !seq_fn || !CV_NODE_IS_SEQ( seq_fn->tag ) || seq_fn->data.seq->total != 2 )
-//         CV_Error( CV_StsError, "size node is not a valid sequence." );
-//     fn = (CvFileNode *) cvGetSeqElem( seq_fn->data.seq, 0 );
-//     if( !CV_NODE_IS_INT( fn->tag ) || fn->data.i <= 0 )
-//         CV_Error( CV_StsError, "Invalid size node: width must be positive integer" );
-//     cascade->orig_window_size.width = fn->data.i;
-//     fn = (CvFileNode *) cvGetSeqElem( seq_fn->data.seq, 1 );
-//     if( !CV_NODE_IS_INT( fn->tag ) || fn->data.i <= 0 )
-//         CV_Error( CV_StsError, "Invalid size node: height must be positive integer" );
-//     cascade->orig_window_size.height = fn->data.i;
-
-//     cvStartReadSeq( stages_fn->data.seq, &stages_reader );
-//     for( i = 0; i < n; ++i )
-//     {
-//         CvFileNode *stage_fn;
-//         CvFileNode *trees_fn;
-//         CvSeqReader trees_reader;
-
-//         stage_fn = (CvFileNode *) stages_reader.ptr;
-//         if( !CV_NODE_IS_MAP( stage_fn->tag ) )
-//         {
-//             sprintf( buf, "Invalid stage %d", i );
-//             CV_Error( CV_StsError, buf );
-//         }
-
-//         trees_fn = cvGetFileNodeByName( fs, stage_fn, ICV_HAAR_TREES_NAME );
-//         if( !trees_fn || !CV_NODE_IS_SEQ( trees_fn->tag )
-//                 || trees_fn->data.seq->total <= 0 )
-//         {
-//             sprintf( buf, "Trees node is not a valid sequence. (stage %d)", i );
-//             CV_Error( CV_StsError, buf );
-//         }
-
-//         cascade->stage_classifier[i].classifier =
-//             (CvHaarClassifier *) cvAlloc( trees_fn->data.seq->total
-//                                           * sizeof( cascade->stage_classifier[i].classifier[0] ) );
-//         for( j = 0; j < trees_fn->data.seq->total; ++j )
-//         {
-//             cascade->stage_classifier[i].classifier[j].haar_feature = NULL;
-//         }
-//         cascade->stage_classifier[i].count = trees_fn->data.seq->total;
-
-//         cvStartReadSeq( trees_fn->data.seq, &trees_reader );
-//         for( j = 0; j < trees_fn->data.seq->total; ++j )
-//         {
-//             CvFileNode *tree_fn;
-//             CvSeqReader tree_reader;
-//             CvHaarClassifier *classifier;
-//             int last_idx;
-
-//             classifier = &cascade->stage_classifier[i].classifier[j];
-//             tree_fn = (CvFileNode *) trees_reader.ptr;
-//             if( !CV_NODE_IS_SEQ( tree_fn->tag ) || tree_fn->data.seq->total <= 0 )
-//             {
-//                 sprintf( buf, "Tree node is not a valid sequence."
-//                          " (stage %d, tree %d)", i, j );
-//                 CV_Error( CV_StsError, buf );
-//             }
-
-//             classifier->count = tree_fn->data.seq->total;
-//             classifier->haar_feature = (CvHaarFeature *) cvAlloc(
-//                                            classifier->count * ( sizeof( *classifier->haar_feature ) +
-//                                                    sizeof( *classifier->threshold ) +
-//                                                    sizeof( *classifier->left ) +
-//                                                    sizeof( *classifier->right ) ) +
-//                                            (classifier->count + 1) * sizeof( *classifier->alpha ) );
-//             classifier->threshold = (float *) (classifier->haar_feature + classifier->count);
-//             classifier->left = (int *) (classifier->threshold + classifier->count);
-//             classifier->right = (int *) (classifier->left + classifier->count);
-//             classifier->alpha = (float *) (classifier->right + classifier->count);
-
-//             cvStartReadSeq( tree_fn->data.seq, &tree_reader );
-//             for( k = 0, last_idx = 0; k < tree_fn->data.seq->total; ++k )
-//             {
-//                 CvFileNode *node_fn;
-//                 CvFileNode *feature_fn;
-//                 CvFileNode *rects_fn;
-//                 CvSeqReader rects_reader;
-
-//                 node_fn = (CvFileNode *) tree_reader.ptr;
-//                 if( !CV_NODE_IS_MAP( node_fn->tag ) )
-//                 {
-//                     sprintf( buf, "Tree node %d is not a valid map. (stage %d, tree %d)",
-//                              k, i, j );
-//                     CV_Error( CV_StsError, buf );
-//                 }
-//                 feature_fn = cvGetFileNodeByName( fs, node_fn, ICV_HAAR_FEATURE_NAME );
-//                 if( !feature_fn || !CV_NODE_IS_MAP( feature_fn->tag ) )
-//                 {
-//                     sprintf( buf, "Feature node is not a valid map. "
-//                              "(stage %d, tree %d, node %d)", i, j, k );
-//                     CV_Error( CV_StsError, buf );
-//                 }
-//                 rects_fn = cvGetFileNodeByName( fs, feature_fn, ICV_HAAR_RECTS_NAME );
-//                 if( !rects_fn || !CV_NODE_IS_SEQ( rects_fn->tag )
-//                         || rects_fn->data.seq->total < 1
-//                         || rects_fn->data.seq->total > CV_HAAR_FEATURE_MAX )
-//                 {
-//                     sprintf( buf, "Rects node is not a valid sequence. "
-//                              "(stage %d, tree %d, node %d)", i, j, k );
-//                     CV_Error( CV_StsError, buf );
-//                 }
-//                 cvStartReadSeq( rects_fn->data.seq, &rects_reader );
-//                 for( l = 0; l < rects_fn->data.seq->total; ++l )
-//                 {
-//                     CvFileNode *rect_fn;
-//                     CvRect r;
-
-//                     rect_fn = (CvFileNode *) rects_reader.ptr;
-//                     if( !CV_NODE_IS_SEQ( rect_fn->tag ) || rect_fn->data.seq->total != 5 )
-//                     {
-//                         sprintf( buf, "Rect %d is not a valid sequence. "
-//                                  "(stage %d, tree %d, node %d)", l, i, j, k );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-
-//                     fn = CV_SEQ_ELEM( rect_fn->data.seq, CvFileNode, 0 );
-//                     if( !CV_NODE_IS_INT( fn->tag ) || fn->data.i < 0 )
-//                     {
-//                         sprintf( buf, "x coordinate must be non-negative integer. "
-//                                  "(stage %d, tree %d, node %d, rect %d)", i, j, k, l );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-//                     r.x = fn->data.i;
-//                     fn = CV_SEQ_ELEM( rect_fn->data.seq, CvFileNode, 1 );
-//                     if( !CV_NODE_IS_INT( fn->tag ) || fn->data.i < 0 )
-//                     {
-//                         sprintf( buf, "y coordinate must be non-negative integer. "
-//                                  "(stage %d, tree %d, node %d, rect %d)", i, j, k, l );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-//                     r.y = fn->data.i;
-//                     fn = CV_SEQ_ELEM( rect_fn->data.seq, CvFileNode, 2 );
-//                     if( !CV_NODE_IS_INT( fn->tag ) || fn->data.i <= 0
-//                             || r.x + fn->data.i > cascade->orig_window_size.width )
-//                     {
-//                         sprintf( buf, "width must be positive integer and "
-//                                  "(x + width) must not exceed window width. "
-//                                  "(stage %d, tree %d, node %d, rect %d)", i, j, k, l );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-//                     r.width = fn->data.i;
-//                     fn = CV_SEQ_ELEM( rect_fn->data.seq, CvFileNode, 3 );
-//                     if( !CV_NODE_IS_INT( fn->tag ) || fn->data.i <= 0
-//                             || r.y + fn->data.i > cascade->orig_window_size.height )
-//                     {
-//                         sprintf( buf, "height must be positive integer and "
-//                                  "(y + height) must not exceed window height. "
-//                                  "(stage %d, tree %d, node %d, rect %d)", i, j, k, l );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-//                     r.height = fn->data.i;
-//                     fn = CV_SEQ_ELEM( rect_fn->data.seq, CvFileNode, 4 );
-//                     if( !CV_NODE_IS_REAL( fn->tag ) )
-//                     {
-//                         sprintf( buf, "weight must be real number. "
-//                                  "(stage %d, tree %d, node %d, rect %d)", i, j, k, l );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-
-//                     classifier->haar_feature[k].rect[l].weight = (float) fn->data.f;
-//                     classifier->haar_feature[k].rect[l].r = r;
-
-//                     CV_NEXT_SEQ_ELEM( sizeof( *rect_fn ), rects_reader );
-//                 } /* for each rect */
-//                 for( l = rects_fn->data.seq->total; l < CV_HAAR_FEATURE_MAX; ++l )
-//                 {
-//                     classifier->haar_feature[k].rect[l].weight = 0;
-//                     classifier->haar_feature[k].rect[l].r = cvRect( 0, 0, 0, 0 );
-//                 }
-
-//                 fn = cvGetFileNodeByName( fs, feature_fn, ICV_HAAR_TILTED_NAME);
-//                 if( !fn || !CV_NODE_IS_INT( fn->tag ) )
-//                 {
-//                     sprintf( buf, "tilted must be 0 or 1. "
-//                              "(stage %d, tree %d, node %d)", i, j, k );
-//                     CV_Error( CV_StsError, buf );
-//                 }
-//                 classifier->haar_feature[k].tilted = ( fn->data.i != 0 );
-//                 fn = cvGetFileNodeByName( fs, node_fn, ICV_HAAR_THRESHOLD_NAME);
-//                 if( !fn || !CV_NODE_IS_REAL( fn->tag ) )
-//                 {
-//                     sprintf( buf, "threshold must be real number. "
-//                              "(stage %d, tree %d, node %d)", i, j, k );
-//                     CV_Error( CV_StsError, buf );
-//                 }
-//                 classifier->threshold[k] = (float) fn->data.f;
-//                 fn = cvGetFileNodeByName( fs, node_fn, ICV_HAAR_LEFT_NODE_NAME);
-//                 if( fn )
-//                 {
-//                     if( !CV_NODE_IS_INT( fn->tag ) || fn->data.i <= k
-//                             || fn->data.i >= tree_fn->data.seq->total )
-//                     {
-//                         sprintf( buf, "left node must be valid node number. "
-//                                  "(stage %d, tree %d, node %d)", i, j, k );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-//                     /* left node */
-//                     classifier->left[k] = fn->data.i;
-//                 }
-//                 else
-//                 {
-//                     fn = cvGetFileNodeByName( fs, node_fn, ICV_HAAR_LEFT_VAL_NAME );
-//                     if( !fn )
-//                     {
-//                         sprintf( buf, "left node or left value must be specified. "
-//                                  "(stage %d, tree %d, node %d)", i, j, k );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-//                     if( !CV_NODE_IS_REAL( fn->tag ) )
-//                     {
-//                         sprintf( buf, "left value must be real number. "
-//                                  "(stage %d, tree %d, node %d)", i, j, k );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-//                     /* left value */
-//                     if( last_idx >= classifier->count + 1 )
-//                     {
-//                         sprintf( buf, "Tree structure is broken: too many values. "
-//                                  "(stage %d, tree %d, node %d)", i, j, k );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-//                     classifier->left[k] = -last_idx;
-//                     classifier->alpha[last_idx++] = (float) fn->data.f;
-//                 }
-//                 fn = cvGetFileNodeByName( fs, node_fn, ICV_HAAR_RIGHT_NODE_NAME);
-//                 if( fn )
-//                 {
-//                     if( !CV_NODE_IS_INT( fn->tag ) || fn->data.i <= k
-//                             || fn->data.i >= tree_fn->data.seq->total )
-//                     {
-//                         sprintf( buf, "right node must be valid node number. "
-//                                  "(stage %d, tree %d, node %d)", i, j, k );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-//                     /* right node */
-//                     classifier->right[k] = fn->data.i;
-//                 }
-//                 else
-//                 {
-//                     fn = cvGetFileNodeByName( fs, node_fn, ICV_HAAR_RIGHT_VAL_NAME );
-//                     if( !fn )
-//                     {
-//                         sprintf( buf, "right node or right value must be specified. "
-//                                  "(stage %d, tree %d, node %d)", i, j, k );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-//                     if( !CV_NODE_IS_REAL( fn->tag ) )
-//                     {
-//                         sprintf( buf, "right value must be real number. "
-//                                  "(stage %d, tree %d, node %d)", i, j, k );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-//                     /* right value */
-//                     if( last_idx >= classifier->count + 1 )
-//                     {
-//                         sprintf( buf, "Tree structure is broken: too many values. "
-//                                  "(stage %d, tree %d, node %d)", i, j, k );
-//                         CV_Error( CV_StsError, buf );
-//                     }
-//                     classifier->right[k] = -last_idx;
-//                     classifier->alpha[last_idx++] = (float) fn->data.f;
-//                 }
-
-//                 CV_NEXT_SEQ_ELEM( sizeof( *node_fn ), tree_reader );
-//             } /* for each node */
-//             if( last_idx != classifier->count + 1 )
-//             {
-//                 sprintf( buf, "Tree structure is broken: too few values. "
-//                          "(stage %d, tree %d)", i, j );
-//                 CV_Error( CV_StsError, buf );
-//             }
-
-//             CV_NEXT_SEQ_ELEM( sizeof( *tree_fn ), trees_reader );
-//         } /* for each tree */
-
-//         fn = cvGetFileNodeByName( fs, stage_fn, ICV_HAAR_STAGE_THRESHOLD_NAME);
-//         if( !fn || !CV_NODE_IS_REAL( fn->tag ) )
-//         {
-//             sprintf( buf, "stage threshold must be real number. (stage %d)", i );
-//             CV_Error( CV_StsError, buf );
-//         }
-//         cascade->stage_classifier[i].threshold = (float) fn->data.f;
-
-//         parent = i - 1;
-//         next = -1;
-
-//         fn = cvGetFileNodeByName( fs, stage_fn, ICV_HAAR_PARENT_NAME );
-//         if( !fn || !CV_NODE_IS_INT( fn->tag )
-//                 || fn->data.i < -1 || fn->data.i >= cascade->count )
-//         {
-//             sprintf( buf, "parent must be integer number. (stage %d)", i );
-//             CV_Error( CV_StsError, buf );
-//         }
-//         parent = fn->data.i;
-//         fn = cvGetFileNodeByName( fs, stage_fn, ICV_HAAR_NEXT_NAME );
-//         if( !fn || !CV_NODE_IS_INT( fn->tag )
-//                 || fn->data.i < -1 || fn->data.i >= cascade->count )
-//         {
-//             sprintf( buf, "next must be integer number. (stage %d)", i );
-//             CV_Error( CV_StsError, buf );
-//         }
-//         next = fn->data.i;
-
-//         cascade->stage_classifier[i].parent = parent;
-//         cascade->stage_classifier[i].next = next;
-//         cascade->stage_classifier[i].child = -1;
-
-//         if( parent != -1 && cascade->stage_classifier[parent].child == -1 )
-//         {
-//             cascade->stage_classifier[parent].child = i;
-//         }
-
-//         CV_NEXT_SEQ_ELEM( sizeof( *stage_fn ), stages_reader );
-//     } /* for each stage */
-
-//     return cascade;
-// }
-
-// static void gpuWriteHaarClassifier( CvFileStorage *fs, const char *name, const void *struct_ptr,
-//                         CvAttrList attributes )
-// {
-//     int i, j, k, l;
-//     char buf[256];
-//     const CvHaarClassifierCascade *cascade = (const CvHaarClassifierCascade *) struct_ptr;
-
-//     /* TODO: parameters check */
-
-//     cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_HAAR, attributes );
-
-//     cvStartWriteStruct( fs, ICV_HAAR_SIZE_NAME, CV_NODE_SEQ | CV_NODE_FLOW );
-//     cvWriteInt( fs, NULL, cascade->orig_window_size.width );
-//     cvWriteInt( fs, NULL, cascade->orig_window_size.height );
-//     cvEndWriteStruct( fs ); /* size */
-
-//     cvStartWriteStruct( fs, ICV_HAAR_STAGES_NAME, CV_NODE_SEQ );
-//     for( i = 0; i < cascade->count; ++i )
-//     {
-//         cvStartWriteStruct( fs, NULL, CV_NODE_MAP );
-//         sprintf( buf, "stage %d", i );
-//         cvWriteComment( fs, buf, 1 );
-
-//         cvStartWriteStruct( fs, ICV_HAAR_TREES_NAME, CV_NODE_SEQ );
-
-//         for( j = 0; j < cascade->stage_classifier[i].count; ++j )
-//         {
-//             CvHaarClassifier *tree = &cascade->stage_classifier[i].classifier[j];
-
-//             cvStartWriteStruct( fs, NULL, CV_NODE_SEQ );
-//             sprintf( buf, "tree %d", j );
-//             cvWriteComment( fs, buf, 1 );
-
-//             for( k = 0; k < tree->count; ++k )
-//             {
-//                 CvHaarFeature *feature = &tree->haar_feature[k];
-
-//                 cvStartWriteStruct( fs, NULL, CV_NODE_MAP );
-//                 if( k )
-//                 {
-//                     sprintf( buf, "node %d", k );
-//                 }
-//                 else
-//                 {
-//                     sprintf( buf, "root node" );
-//                 }
-//                 cvWriteComment( fs, buf, 1 );
-
-//                 cvStartWriteStruct( fs, ICV_HAAR_FEATURE_NAME, CV_NODE_MAP );
-
-//                 cvStartWriteStruct( fs, ICV_HAAR_RECTS_NAME, CV_NODE_SEQ );
-//                 for( l = 0; l < CV_HAAR_FEATURE_MAX && feature->rect[l].r.width != 0; ++l )
-//                 {
-//                     cvStartWriteStruct( fs, NULL, CV_NODE_SEQ | CV_NODE_FLOW );
-//                     cvWriteInt(  fs, NULL, feature->rect[l].r.x );
-//                     cvWriteInt(  fs, NULL, feature->rect[l].r.y );
-//                     cvWriteInt(  fs, NULL, feature->rect[l].r.width );
-//                     cvWriteInt(  fs, NULL, feature->rect[l].r.height );
-//                     cvWriteReal( fs, NULL, feature->rect[l].weight );
-//                     cvEndWriteStruct( fs ); /* rect */
-//                 }
-//                 cvEndWriteStruct( fs ); /* rects */
-//                 cvWriteInt( fs, ICV_HAAR_TILTED_NAME, feature->tilted );
-//                 cvEndWriteStruct( fs ); /* feature */
-
-//                 cvWriteReal( fs, ICV_HAAR_THRESHOLD_NAME, tree->threshold[k]);
-
-//                 if( tree->left[k] > 0 )
-//                 {
-//                     cvWriteInt( fs, ICV_HAAR_LEFT_NODE_NAME, tree->left[k] );
-//                 }
-//                 else
-//                 {
-//                     cvWriteReal( fs, ICV_HAAR_LEFT_VAL_NAME,
-//                                  tree->alpha[-tree->left[k]] );
-//                 }
-
-//                 if( tree->right[k] > 0 )
-//                 {
-//                     cvWriteInt( fs, ICV_HAAR_RIGHT_NODE_NAME, tree->right[k] );
-//                 }
-//                 else
-//                 {
-//                     cvWriteReal( fs, ICV_HAAR_RIGHT_VAL_NAME,
-//                                  tree->alpha[-tree->right[k]] );
-//                 }
-
-//                 cvEndWriteStruct( fs ); /* split */
-//             }
-
-//             cvEndWriteStruct( fs ); /* tree */
-//         }
-
-//         cvEndWriteStruct( fs ); /* trees */
-
-//         cvWriteReal( fs, ICV_HAAR_STAGE_THRESHOLD_NAME, cascade->stage_classifier[i].threshold);
-//         cvWriteInt( fs, ICV_HAAR_PARENT_NAME, cascade->stage_classifier[i].parent );
-//         cvWriteInt( fs, ICV_HAAR_NEXT_NAME, cascade->stage_classifier[i].next );
-
-//         cvEndWriteStruct( fs ); /* stage */
-//     } /* for each stage */
-
-//     cvEndWriteStruct( fs ); /* stages */
-//     cvEndWriteStruct( fs ); /* root */
-// }
-
-// static void * gpuCloneHaarClassifier( const void *struct_ptr )
-// {
-//     CvHaarClassifierCascade *cascade = NULL;
-
-//     int i, j, k, n;
-//     const CvHaarClassifierCascade *cascade_src =
-//         (const CvHaarClassifierCascade *) struct_ptr;
-
-//     n = cascade_src->count;
-//     cascade = gpuCreateHaarClassifierCascade(n);
-//     cascade->orig_window_size = cascade_src->orig_window_size;
-
-//     for( i = 0; i < n; ++i )
-//     {
-//         cascade->stage_classifier[i].parent = cascade_src->stage_classifier[i].parent;
-//         cascade->stage_classifier[i].next = cascade_src->stage_classifier[i].next;
-//         cascade->stage_classifier[i].child = cascade_src->stage_classifier[i].child;
-//         cascade->stage_classifier[i].threshold = cascade_src->stage_classifier[i].threshold;
-
-//         cascade->stage_classifier[i].count = 0;
-//         cascade->stage_classifier[i].classifier =
-//             (CvHaarClassifier *) cvAlloc( cascade_src->stage_classifier[i].count
-//                                           * sizeof( cascade->stage_classifier[i].classifier[0] ) );
-
-//         cascade->stage_classifier[i].count = cascade_src->stage_classifier[i].count;
-
-//         for( j = 0; j < cascade->stage_classifier[i].count; ++j )
-//             cascade->stage_classifier[i].classifier[j].haar_feature = NULL;
-
-//         for( j = 0; j < cascade->stage_classifier[i].count; ++j )
-//         {
-//             const CvHaarClassifier *classifier_src =
-//                 &cascade_src->stage_classifier[i].classifier[j];
-//             CvHaarClassifier *classifier =
-//                 &cascade->stage_classifier[i].classifier[j];
-
-//             classifier->count = classifier_src->count;
-//             classifier->haar_feature = (CvHaarFeature *) cvAlloc(
-//                                            classifier->count * ( sizeof( *classifier->haar_feature ) +
-//                                                    sizeof( *classifier->threshold ) +
-//                                                    sizeof( *classifier->left ) +
-//                                                    sizeof( *classifier->right ) ) +
-//                                            (classifier->count + 1) * sizeof( *classifier->alpha ) );
-//             classifier->threshold = (float *) (classifier->haar_feature + classifier->count);
-//             classifier->left = (int *) (classifier->threshold + classifier->count);
-//             classifier->right = (int *) (classifier->left + classifier->count);
-//             classifier->alpha = (float *) (classifier->right + classifier->count);
-//             for( k = 0; k < classifier->count; ++k )
-//             {
-//                 classifier->haar_feature[k] = classifier_src->haar_feature[k];
-//                 classifier->threshold[k] = classifier_src->threshold[k];
-//                 classifier->left[k] = classifier_src->left[k];
-//                 classifier->right[k] = classifier_src->right[k];
-//                 classifier->alpha[k] = classifier_src->alpha[k];
-//             }
-//             classifier->alpha[classifier->count] =
-//                 classifier_src->alpha[classifier->count];
-//         }
-//     }
-
-//     return cascade;
-// }
-
-#if 0
-CvType haar_type( CV_TYPE_NAME_HAAR, gpuIsHaarClassifier,
-                  (CvReleaseFunc)gpuReleaseHaarClassifierCascade,
-                  gpuReadHaarClassifier, gpuWriteHaarClassifier,
-                  gpuCloneHaarClassifier );
-
-
-namespace cv
-{
-
-HaarClassifierCascade::HaarClassifierCascade() {}
-HaarClassifierCascade::HaarClassifierCascade(const String &filename)
-{
-    load(filename);
-}
-
-bool HaarClassifierCascade::load(const String &filename)
-{
-    cascade = Ptr<CvHaarClassifierCascade>((CvHaarClassifierCascade *)cvLoad(filename.c_str(), 0, 0, 0));
-    return (CvHaarClassifierCascade *)cascade != 0;
-}
-
-void HaarClassifierCascade::detectMultiScale( const Mat &image,
-        Vector<Rect> &objects, double scaleFactor,
-        int minNeighbors, int flags,
-        Size minSize )
-{
-    MemStorage storage(cvCreateMemStorage(0));
-    CvMat _image = image;
-    CvSeq *_objects = gpuHaarDetectObjects( &_image, cascade, storage, scaleFactor,
-                                            minNeighbors, flags, minSize );
-    Seq<Rect>(_objects).copyTo(objects);
-}
-
-int HaarClassifierCascade::runAt(Point pt, int startStage, int) const
-{
-    return gpuRunHaarClassifierCascade(cascade, pt, startStage);
-}
-
-void HaarClassifierCascade::setImages( const Mat &sum, const Mat &sqsum,
-                                       const Mat &tilted, double scale )
-{
-    CvMat _sum = sum, _sqsum = sqsum, _tilted = tilted;
-    gpuSetImagesForHaarClassifierCascade( cascade, &_sum, &_sqsum, &_tilted, scale );
-}
-
-}
-#endif
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////reserved functios//////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-/*#if CV_SSE2
-#   if CV_SSE4 || defined __SSE4__
-#       include <smmintrin.h>
-#   else
-#       define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
-#       define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m))
-#   endif
-#if defined CV_ICC
-#   define CV_HAAR_USE_SSE 1
-#endif
-#endif*/
-
-
-/*
-CV_IMPL void
-gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
-const CvArr* _sum,
-const CvArr* _sqsum,
-const CvArr* _tilted_sum,
-double scale )
-{
-CvMat sum_stub, *sum = (CvMat*)_sum;
-CvMat sqsum_stub, *sqsum = (CvMat*)_sqsum;
-CvMat tilted_stub, *tilted = (CvMat*)_tilted_sum;
-GpuHidHaarClassifierCascade* cascade;
-int coi0 = 0, coi1 = 0;
-int i;
-int datasize;
-int totalclassifier;
-CvRect equRect;
-double weight_scale;
-int rows,cols;
-
-if( !CV_IS_HAAR_CLASSIFIER(_cascade) )
-CV_Error( !_cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier pointer" );
-
-if( scale <= 0 )
-CV_Error( CV_StsOutOfRange, "Scale must be positive" );
-
-sum = cvGetMat( sum, &sum_stub, &coi0 );
-sqsum = cvGetMat( sqsum, &sqsum_stub, &coi1 );
-
-if( coi0 || coi1 )
-CV_Error( CV_BadCOI, "COI is not supported" );
-
-if( !CV_ARE_SIZES_EQ( sum, sqsum ))
-CV_Error( CV_StsUnmatchedSizes, "All integral images must have the same size" );
-
-if( CV_MAT_TYPE(sqsum->type) != CV_64FC1 ||
-CV_MAT_TYPE(sum->type) != CV_32SC1 )
-CV_Error( CV_StsUnsupportedFormat,
-"Only (32s, 64f, 32s) combination of (sum,sqsum,tilted_sum) formats is allowed" );
-
-if( !_cascade->hid_cascade )
-gpuCreateHidHaarClassifierCascade(_cascade,&datasize,&totalclassifier);
-
-cascade =(GpuHidHaarClassifierCascade *)_cascade->hid_cascade;
-
-if( cascade->has_tilted_features )
-{
-tilted = cvGetMat( tilted, &tilted_stub, &coi1 );
-
-if( CV_MAT_TYPE(tilted->type) != CV_32SC1 )
-CV_Error( CV_StsUnsupportedFormat,
-"Only (32s, 64f, 32s) combination of (sum,sqsum,tilted_sum) formats is allowed" );
-
-if( sum->step != tilted->step )
-CV_Error( CV_StsUnmatchedSizes,
-"Sum and tilted_sum must have the same stride (step, widthStep)" );
-
-if( !CV_ARE_SIZES_EQ( sum, tilted ))
-CV_Error( CV_StsUnmatchedSizes, "All integral images must have the same size" );
-//cascade->tilted = *tilted;
-}
-
-_cascade->scale = scale;
-_cascade->real_window_size.width = cvRound( _cascade->orig_window_size.width * scale );
-_cascade->real_window_size.height = cvRound( _cascade->orig_window_size.height * scale );
-
-//cascade->sum = *sum;
-//cascade->sqsum = *sqsum;
-
-equRect.x = equRect.y = cvRound(scale);
-equRect.width = cvRound((_cascade->orig_window_size.width-2)*scale);
-equRect.height = cvRound((_cascade->orig_window_size.height-2)*scale);
-weight_scale = 1./(equRect.width*equRect.height);
-cascade->inv_window_area = weight_scale;
-
-cascade->p0 = sum_elem_ptr(*sum, equRect.y, equRect.x);
-cascade->p1 = sum_elem_ptr(*sum, equRect.y, equRect.x + equRect.width );
-cascade->p2 = sum_elem_ptr(*sum, equRect.y + equRect.height, equRect.x );
-cascade->p3 = sum_elem_ptr(*sum, equRect.y + equRect.height,
-equRect.x + equRect.width );
-*/
-/*    rows=sum->rows;
-cols=sum->cols;
-cascade->p0 = equRect.y*cols + equRect.x;
-cascade->p1 = equRect.y*cols + equRect.x + equRect.width;
-cascade->p2 = (equRect.y + equRect.height) * cols + equRect.x;
-cascade->p3 = (equRect.y + equRect.height) * cols + equRect.x + equRect.width ;
-*/
-/*
-cascade->pq0 = sqsum_elem_ptr(*sqsum, equRect.y, equRect.x);
-cascade->pq1 = sqsum_elem_ptr(*sqsum, equRect.y, equRect.x + equRect.width );
-cascade->pq2 = sqsum_elem_ptr(*sqsum, equRect.y + equRect.height, equRect.x );
-cascade->pq3 = sqsum_elem_ptr(*sqsum, equRect.y + equRect.height,
-equRect.x + equRect.width );
-*/
-/* init pointers in haar features according to real window size and
-given image pointers */
-/*    for( i = 0; i < _cascade->count; i++ )
-{
-int j, k, l;
-for( j = 0; j < cascade->stage_classifier[i].count; j++ )
-{
-for( l = 0; l < cascade->stage_classifier[i].classifier[j].count; l++ )
-{
-CvHaarFeature* feature =
-&_cascade->stage_classifier[i].classifier[j].haar_feature[l];
-*/                /* GpuHidHaarClassifier* classifier =
-cascade->stage_classifier[i].classifier + j; */
-//GpuHidHaarFeature* hidfeature =
-//  &cascade->stage_classifier[i].classifier[j].node[l].feature;
-/*                double sum0 = 0, area0 = 0;
-CvRect r[3];
-
-int base_w = -1, base_h = -1;
-int new_base_w = 0, new_base_h = 0;
-int kx, ky;
-int flagx = 0, flagy = 0;
-int x0 = 0, y0 = 0;
-int nr;
-*/
-/* align blocks */
-/*                for( k = 0; k < CV_HAAR_FEATURE_MAX; k++ )
-{
-//if( !hidfeature->rect[k].p0 )
-//    break;
-r[k] = feature->rect[k].r;
-base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].width-1) );
-base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].x - r[0].x-1) );
-base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].height-1) );
-base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].y - r[0].y-1) );
-}
-
-nr = k;
-
-base_w += 1;
-base_h += 1;
-kx = r[0].width / base_w;
-ky = r[0].height / base_h;
-
-if( kx <= 0 )
-{
-flagx = 1;
-new_base_w = cvRound( r[0].width * scale ) / kx;
-x0 = cvRound( r[0].x * scale );
-}
-
-if( ky <= 0 )
-{
-flagy = 1;
-new_base_h = cvRound( r[0].height * scale ) / ky;
-y0 = cvRound( r[0].y * scale );
-}
-
-for( k = 0; k < nr; k++ )
-{
-CvRect tr;
-double correction_ratio;
-
-if( flagx )
-{
-tr.x = (r[k].x - r[0].x) * new_base_w / base_w + x0;
-tr.width = r[k].width * new_base_w / base_w;
-}
-else
+static int gpuRunHaarClassifierCascade( /*const CvHaarClassifierCascade *_cascade, CvPoint pt, int start_stage */)
 {
-tr.x = cvRound( r[k].x * scale );
-tr.width = cvRound( r[k].width * scale );
-}
-
-if( flagy )
-{
-tr.y = (r[k].y - r[0].y) * new_base_h / base_h + y0;
-tr.height = r[k].height * new_base_h / base_h;
-}
-else
-{
-tr.y = cvRound( r[k].y * scale );
-tr.height = cvRound( r[k].height * scale );
-}
-
-#if CV_ADJUST_WEIGHTS
-{
-// RAINER START
-const float orig_feature_size =  (float)(feature->rect[k].r.width)*feature->rect[k].r.height;
-const float orig_norm_size = (float)(_cascade->orig_window_size.width)*(_cascade->orig_window_size.height);
-const float feature_size = float(tr.width*tr.height);
-//const float normSize    = float(equRect.width*equRect.height);
-float target_ratio = orig_feature_size / orig_norm_size;
-//float isRatio = featureSize / normSize;
-//correctionRatio = targetRatio / isRatio / normSize;
-correction_ratio = target_ratio / feature_size;
-// RAINER END
-}
-#else
-correction_ratio = weight_scale * (!feature->tilted ? 1 : 0.5);
-#endif
-
-if( !feature->tilted )
-{
-hidfeature->rect[k].p0 = tr.y * rows + tr.x;
-hidfeature->rect[k].p1 = tr.y * rows + tr.x + tr.width;
-hidfeature->rect[k].p2 = (tr.y + tr.height) * rows + tr.x;
-hidfeature->rect[k].p3 = (tr.y + tr.height) * rows + tr.x + tr.width;
-
-}
-else
-{
-hidfeature->rect[k].p2 = (tr.y + tr.width) * rows + tr.x + tr.width;
-hidfeature->rect[k].p3 = (tr.y + tr.width + tr.height) * rows + tr.x + tr.width - tr.height;
-hidfeature->rect[k].p0 = tr.y*rows + tr.x;
-hidfeature->rect[k].p1 = (tr.y + tr.height) * rows + tr.x - tr.height;
-
-}
-
-//hidfeature->rect[k].weight = (float)(feature->rect[k].weight * correction_ratio);
-
-if( k == 0 )
-area0 = tr.width * tr.height;
-else
-;//  sum0 += hidfeature->rect[k].weight * tr.width * tr.height;
-}
-
-//hidfeature->rect[0].weight = (float)(-sum0/area0);*/
-//            } /* l */
-//        } /* j */
-//    }
-//}
-/*
-CV_INLINE
-double gpuEvalHidHaarClassifier( GpuHidHaarClassifier *classifier,
-double variance_norm_factor,
-size_t p_offset )
-{
-
-    int idx = 0;
-    do
-    {
-    GpuHidHaarTreeNode* node = classifier->node + idx;
-    double t = node->threshold * variance_norm_factor;
-
-    double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
-    sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
-
-    if( node->feature.rect[2].p0 )
-    sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
-
-    idx = sum < t ? node->left : node->right;
-    }
-    while( idx > 0 );
-    return classifier->alpha[-idx];
-
-    return 0.;
-}
-
-
-*/
-static int gpuRunHaarClassifierCascade( /*const CvHaarClassifierCascade *_cascade,
-CvPoint pt, int start_stage */)
-{
-    /*
-    int result = -1;
-
-    int p_offset, pq_offset;
-    int i, j;
-    double mean, variance_norm_factor;
-    GpuHidHaarClassifierCascade* cascade;
-
-    if( !CV_IS_HAAR_CLASSIFIER(_cascade) )
-    CV_Error( !_cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid cascade pointer" );
-
-    cascade = (GpuHidHaarClassifierCascade*) _cascade->hid_cascade;
-    if( !cascade )
-    CV_Error( CV_StsNullPtr, "Hidden cascade has not been created.\n"
-    "Use gpuSetImagesForHaarClassifierCascade" );
-
-    if( pt.x < 0 || pt.y < 0 ||
-    pt.x + _cascade->real_window_size.width >= cascade->sum.width-2 ||
-    pt.y + _cascade->real_window_size.height >= cascade->sum.height-2 )
-    return -1;
-
-    p_offset = pt.y * (cascade->sum.step/sizeof(sumtype)) + pt.x;
-    pq_offset = pt.y * (cascade->sqsum.step/sizeof(sqsumtype)) + pt.x;
-    mean = calc_sum(*cascade,p_offset)*cascade->inv_window_area;
-    variance_norm_factor = cascade->pq0[pq_offset] - cascade->pq1[pq_offset] -
-    cascade->pq2[pq_offset] + cascade->pq3[pq_offset];
-    variance_norm_factor = variance_norm_factor*cascade->inv_window_area - mean*mean;
-    if( variance_norm_factor >= 0. )
-    variance_norm_factor = sqrt(variance_norm_factor);
-    else
-    variance_norm_factor = 1.;
-
-
-    if( cascade->is_stump_based )
-    {
-    for( i = start_stage; i < cascade->count; i++ )
-    {
-    double stage_sum = 0;
-
-    if( cascade->stage_classifier[i].two_rects )
-    {
-    for( j = 0; j < cascade->stage_classifier[i].count; j++ )
-    {
-    GpuHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
-    GpuHidHaarTreeNode* node = classifier->node;
-    double t = node->threshold*variance_norm_factor;
-    double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
-    sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
-    stage_sum += classifier->alpha[sum >= t];
-    }
-    }
-    else
-    {
-    for( j = 0; j < cascade->stage_classifier[i].count; j++ )
-    {
-    GpuHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
-    GpuHidHaarTreeNode* node = classifier->node;
-    double t = node->threshold*variance_norm_factor;
-    double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
-    sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
-    if( node->feature.rect[2].p0 )
-    sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
-
-    stage_sum += classifier->alpha[sum >= t];
-    }
-    }
-
-    if( stage_sum < cascade->stage_classifier[i].threshold )
-    return -i;
-    }
-    }
-    */
    return 1;
 }

-
 namespace cv
 {
 namespace ocl
@@ -2671,78 +1498,3 @@ struct gpuHaarDetectObjects_ScaleCascade_Invoker

 }
 }
-
-/*
-typedef struct _ALIGNED_ON(128) GpuHidHaarFeature
-{
-struct _ALIGNED_ON(32)
-{
-int    p0 _ALIGNED_ON(4);
-int    p1 _ALIGNED_ON(4);
-int    p2 _ALIGNED_ON(4);
-int    p3 _ALIGNED_ON(4);
-float weight  _ALIGNED_ON(4);
-}
-rect[CV_HAAR_FEATURE_MAX] _ALIGNED_ON(32);
-}
-GpuHidHaarFeature;
-
-
-typedef struct _ALIGNED_ON(128) GpuHidHaarTreeNode
-{
-int left _ALIGNED_ON(4);
-int right _ALIGNED_ON(4);
-float threshold _ALIGNED_ON(4);
-int p0[CV_HAAR_FEATURE_MAX] _ALIGNED_ON(16);
-int p1[CV_HAAR_FEATURE_MAX] _ALIGNED_ON(16);
-int p2[CV_HAAR_FEATURE_MAX] _ALIGNED_ON(16);
-int p3[CV_HAAR_FEATURE_MAX] _ALIGNED_ON(16);
-float weight[CV_HAAR_FEATURE_MAX] _ALIGNED_ON(16);
-float alpha[2] _ALIGNED_ON(8);
-// GpuHidHaarFeature feature __attribute__((aligned (128)));
-}
-GpuHidHaarTreeNode;
-
-
-typedef struct _ALIGNED_ON(32) GpuHidHaarClassifier
-{
-int count _ALIGNED_ON(4);
-//CvHaarFeature* orig_feature;
-GpuHidHaarTreeNode* node _ALIGNED_ON(8);
-float* alpha _ALIGNED_ON(8);
-}
-GpuHidHaarClassifier;
-
-
-typedef struct _ALIGNED_ON(64) __attribute__((aligned (64))) GpuHidHaarStageClassifier
-{
-int  count _ALIGNED_ON(4);
-float threshold _ALIGNED_ON(4);
-int two_rects _ALIGNED_ON(4);
-GpuHidHaarClassifier* classifier _ALIGNED_ON(8);
-struct GpuHidHaarStageClassifier* next _ALIGNED_ON(8);
-struct GpuHidHaarStageClassifier* child _ALIGNED_ON(8);
-struct GpuHidHaarStageClassifier* parent _ALIGNED_ON(8);
-}
-GpuHidHaarStageClassifier;
-
-
-typedef struct _ALIGNED_ON(64) GpuHidHaarClassifierCascade
-{
-int  count _ALIGNED_ON(4);
-int  is_stump_based _ALIGNED_ON(4);
-int  has_tilted_features _ALIGNED_ON(4);
-int  is_tree _ALIGNED_ON(4);
-int pq0 _ALIGNED_ON(4);
-int pq1 _ALIGNED_ON(4);
-int pq2 _ALIGNED_ON(4);
-int pq3 _ALIGNED_ON(4);
-int p0 _ALIGNED_ON(4);
-int p1 _ALIGNED_ON(4);
-int p2 _ALIGNED_ON(4);
-int p3 _ALIGNED_ON(4);
-float inv_window_area _ALIGNED_ON(4);
-// GpuHidHaarStageClassifier* stage_classifier __attribute__((aligned (8)));
-}GpuHidHaarClassifierCascade;
-*/
-/* End of file. */
--- a/modules/ocl/src/kernels/haarobjectdetect.cl
+++ b/modules/ocl/src/kernels/haarobjectdetect.cl
@@ -9,6 +9,7 @@
 //    Niko Li, newlife20080214@gmail.com
 //    Wang Weiyan, wangweiyanster@gmail.com
 //    Jia Haipeng, jiahaipeng95@gmail.com
+//    Nathan, liujun@multicorewareinc.com
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -47,14 +48,14 @@ typedef float sqsumtype;
 typedef struct  __attribute__((aligned (128)))  GpuHidHaarFeature
 {
    struct __attribute__((aligned (32)))
-    {
-        int p0 __attribute__((aligned (4)));
-        int p1 __attribute__((aligned (4)));
-        int p2 __attribute__((aligned (4)));
-        int p3 __attribute__((aligned (4)));
-        float weight __attribute__((aligned (4)));
-    }
-    rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
+{
+    int p0 __attribute__((aligned (4)));
+    int p1 __attribute__((aligned (4)));
+    int p2 __attribute__((aligned (4)));
+    int p3 __attribute__((aligned (4)));
+    float weight __attribute__((aligned (4)));
+}
+rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
 }
 GpuHidHaarFeature;

@@ -108,31 +109,31 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
    int p2 __attribute__((aligned (4)));
    int p3 __attribute__((aligned (4)));
    float inv_window_area __attribute__((aligned (4)));
-}GpuHidHaarClassifierCascade;
+} GpuHidHaarClassifierCascade;


 __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(//constant GpuHidHaarClassifierCascade * cascade,
-                                          global GpuHidHaarStageClassifier * stagecascadeptr,
-                                          global int4 * info,
-                                          global GpuHidHaarTreeNode * nodeptr,
-                                          global const int * restrict sum1,
-                                          global const float * restrict sqsum1,
-                                          global int4 * candidate,
-                                          const int pixelstep,
-                                          const int loopcount,
-                                          const int start_stage,
-                                          const int split_stage,
-                                          const int end_stage,
-                                          const int startnode,
-                                          const int splitnode,
-                                          const int4 p,
-                                          const int4 pq,
-                                          const float correction
-                                          //const int width,
-                                          //const int height,
-                                          //const int grpnumperline,
-                                          //const int totalgrp
-                                          )
+    global GpuHidHaarStageClassifier * stagecascadeptr,
+    global int4 * info,
+    global GpuHidHaarTreeNode * nodeptr,
+    global const int * restrict sum1,
+    global const float * restrict sqsum1,
+    global int4 * candidate,
+    const int pixelstep,
+    const int loopcount,
+    const int start_stage,
+    const int split_stage,
+    const int end_stage,
+    const int startnode,
+    const int splitnode,
+    const int4 p,
+    const int4 pq,
+    const float correction
+    //const int width,
+    //const int height,
+    //const int grpnumperline,
+    //const int totalgrp
+)
 {
    int grpszx = get_local_size(0);
    int grpszy = get_local_size(1);
@@ -184,7 +185,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa

        __global const int * sum = sum1 + imgoff;
        __global const float * sqsum = sqsum1 + imgoff;
-        for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
+        for(int grploop=grpidx; grploop<totalgrp; grploop+=grpnumx)
        {
            int grpidy = grploop / grpnumperline;
            int grpidx = grploop - mul24(grpidy, grpnumperline);
@@ -195,7 +196,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
            int grpoffx = x-lclidx;
            int grpoffy = y-lclidy;

-            for(int i=0;i<read_loop;i++)
+            for(int i=0; i<read_loop; i++)
            {
                int pos_id = mad24(i,lcl_sz,lcl_id);
                pos_id = pos_id < total_read ? pos_id : 0;
@@ -234,15 +235,15 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                cascadeinfo1.x +=lcl_off;
                cascadeinfo1.z +=lcl_off;
                mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
-                    lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
-                    *correction;
+                        lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
+                       *correction;

                int p_offset = mad24(y, pixelstep, x);

                cascadeinfo2.x +=p_offset;
                cascadeinfo2.z +=p_offset;
                variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
-                    sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
+                                      sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];

                variance_norm_factor = variance_norm_factor * correction - mean * mean;
                variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
@@ -270,19 +271,19 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                        info2.z +=lcl_off;

                        float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
-                            lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
+                                          lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;


                        classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
-                            lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
+                                     lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;


                        //if((info3.z - info3.x) && (!stageinfo.z))
                        //{
-                            info3.x +=lcl_off;
-                            info3.z +=lcl_off;
-                            classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
-                                lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
+                        info3.x +=lcl_off;
+                        info3.z +=lcl_off;
+                        classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
+                                     lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
                        //}
                        stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
                        nodecounter++;
@@ -299,12 +300,13 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                int queuecount  = lclcount[0];
+                barrier(CLK_LOCAL_MEM_FENCE);
                nodecounter = splitnode;
-                for(int stageloop = split_stage; stageloop< end_stage && queuecount>0;stageloop++)
+                for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
                {
-                  //barrier(CLK_LOCAL_MEM_FENCE);
+                    //barrier(CLK_LOCAL_MEM_FENCE);
                    //if(lcl_id == 0)
-            lclcount[0]=0;
+                    lclcount[0]=0;
                    barrier(CLK_LOCAL_MEM_FENCE);

                    int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
@@ -316,70 +318,73 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                    int lcl_compute_win_id = (lcl_id >>(6-perfscale));
                    int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
                    int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
-                    for(int queueloop=0;queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/;queueloop++)
+                    for(int queueloop=0; queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/; queueloop++)
                    {
                        float stage_sum = 0.f;
                        int temp_coord = lcloutindex[lcl_compute_win_id<<1];
                        float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
                        int queue_pixel = mad24(((temp_coord  & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);

-                      //barrier(CLK_LOCAL_MEM_FENCE);
-            if(lcl_compute_win_id < queuecount) {
-
-            int tempnodecounter = lcl_compute_id;
-                        float part_sum = 0.f;
-                        for(int lcl_loop=0;lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;lcl_loop++)
+                        //barrier(CLK_LOCAL_MEM_FENCE);
+                        if(lcl_compute_win_id < queuecount)
                        {
-                            __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);

-                            int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
-                            int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
-                            int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
-                            float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
-                            float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
-                            float nodethreshold  = w.w * variance_norm_factor;
+                            int tempnodecounter = lcl_compute_id;
+                            float part_sum = 0.f;
+                            for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
+                            {
+                                __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);

-                            info1.x +=queue_pixel;
-                            info1.z +=queue_pixel;
-                            info2.x +=queue_pixel;
-                            info2.z +=queue_pixel;
+                                int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
+                                int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
+                                int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
+                                float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
+                                float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
+                                float nodethreshold  = w.w * variance_norm_factor;

-                            float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
-                                lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
+                                info1.x +=queue_pixel;
+                                info1.z +=queue_pixel;
+                                info2.x +=queue_pixel;
+                                info2.z +=queue_pixel;

+                                float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
+                                                  lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;

-                            classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
-                                lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
-                        //if((info3.z - info3.x) && (!stageinfo.z))
-                        //{
+
+                                classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
+                                             lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
+                                //if((info3.z - info3.x) && (!stageinfo.z))
+                                //{
                                info3.x +=queue_pixel;
                                info3.z +=queue_pixel;
                                classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
-                                    lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
-                        //}
-                            part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
-                            tempnodecounter +=lcl_compute_win;
-                        }//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
-                        partialsum[lcl_id]=part_sum;
-            }
-                        barrier(CLK_LOCAL_MEM_FENCE);
-            if(lcl_compute_win_id < queuecount) {
-                        for(int i=0;i<lcl_compute_win && (lcl_compute_id==0);i++)
-                        {
-                            stage_sum += partialsum[lcl_id+i];
+                                             lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
+                                //}
+                                part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
+                                tempnodecounter +=lcl_compute_win;
+                            }//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
+                            partialsum[lcl_id]=part_sum;
                        }
-                        if(stage_sum >= stagethreshold && (lcl_compute_id==0))
+                        barrier(CLK_LOCAL_MEM_FENCE);
+                        if(lcl_compute_win_id < queuecount)
                        {
-                            int queueindex = atomic_inc(lclcount);
-                            lcloutindex[queueindex<<1] = temp_coord;
-                            lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
+                            for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
+                            {
+                                stage_sum += partialsum[lcl_id+i];
+                            }
+                            if(stage_sum >= stagethreshold && (lcl_compute_id==0))
+                            {
+                                int queueindex = atomic_inc(lclcount);
+                                lcloutindex[queueindex<<1] = temp_coord;
+                                lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
+                            }
+                            lcl_compute_win_id +=(1<<perfscale);
                        }
-                        lcl_compute_win_id +=(1<<perfscale);
-            }
                        barrier(CLK_LOCAL_MEM_FENCE);
                    }//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
-                  barrier(CLK_LOCAL_MEM_FENCE);
+                    //barrier(CLK_LOCAL_MEM_FENCE);
                    queuecount = lclcount[0];
+                    barrier(CLK_LOCAL_MEM_FENCE);
                    nodecounter += stageinfo.x;
                }//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
                //barrier(CLK_LOCAL_MEM_FENCE);
@@ -420,138 +425,138 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa



-                /*
-                if(stagecascade->two_rects)
-                {
-                    #pragma unroll
-                    for( n = 0; n < stagecascade->count; n++ )
-                    {
-                        t1 = *(node + counter);
-                        t = t1.threshold * variance_norm_factor;
-                        classsum = calc_sum1(t1,p_offset,0) * t1.weight[0];
+/*
+if(stagecascade->two_rects)
+{
+    #pragma unroll
+    for( n = 0; n < stagecascade->count; n++ )
+    {
+        t1 = *(node + counter);
+        t = t1.threshold * variance_norm_factor;
+        classsum = calc_sum1(t1,p_offset,0) * t1.weight[0];

-                        classsum  += calc_sum1(t1, p_offset,1) * t1.weight[1];
-                        stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0];
+        classsum  += calc_sum1(t1, p_offset,1) * t1.weight[1];
+        stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0];

-                        counter++;
-                    }
-                }
-                else
-                {
-                    #pragma unroll
-                    for( n = 0; n < stagecascade->count; n++ )
-                    {
-                        t = node[counter].threshold*variance_norm_factor;
-                        classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0];
-                        classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1];
+        counter++;
+    }
+}
+else
+{
+    #pragma unroll
+    for( n = 0; n < stagecascade->count; n++ )
+    {
+        t = node[counter].threshold*variance_norm_factor;
+        classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0];
+        classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1];

-                        if( node[counter].p0[2] )
-                            classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2];
+        if( node[counter].p0[2] )
+            classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2];

-                        stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify
+        stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify

-                        counter++;
-                    }
-                }
-                */
-                /*
+        counter++;
+    }
+}
+*/
+/*
 __kernel void gpuRunHaarClassifierCascade_ScaleWindow(
-                                          constant GpuHidHaarClassifierCascade * _cascade,
-                                          global GpuHidHaarStageClassifier * stagecascadeptr,
-                                          //global GpuHidHaarClassifier * classifierptr,
-                                          global GpuHidHaarTreeNode * nodeptr,
-                                          global int * sum,
-                                          global float * sqsum,
-                                          global int * _candidate,
-                                          int pixel_step,
-                                          int cols,
-                                          int rows,
-                                          int start_stage,
-                                          int end_stage,
-                                          //int counts,
-                                          int nodenum,
-                                          int ystep,
-                                          int detect_width,
-                                          //int detect_height,
-                                          int loopcount,
-                                          int outputstep)
-                                          //float scalefactor)
+                          constant GpuHidHaarClassifierCascade * _cascade,
+                          global GpuHidHaarStageClassifier * stagecascadeptr,
+                          //global GpuHidHaarClassifier * classifierptr,
+                          global GpuHidHaarTreeNode * nodeptr,
+                          global int * sum,
+                          global float * sqsum,
+                          global int * _candidate,
+                          int pixel_step,
+                          int cols,
+                          int rows,
+                          int start_stage,
+                          int end_stage,
+                          //int counts,
+                          int nodenum,
+                          int ystep,
+                          int detect_width,
+                          //int detect_height,
+                          int loopcount,
+                          int outputstep)
+                          //float scalefactor)
 {
-    unsigned int x1 = get_global_id(0);
-    unsigned int y1 = get_global_id(1);
-    int p_offset;
-    int m, n;
-    int result;
-    int counter;
-    float mean, variance_norm_factor;
-    for(int i=0;i<loopcount;i++)
-    {
-        constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
-        global int * candidate = _candidate + i*outputstep;
-        int window_width = cascade->p1 - cascade->p0;
-        int window_height = window_width;
-        result = 1;
-        counter = 0;
-            unsigned int x = mul24(x1,ystep);
-            unsigned int y = mul24(y1,ystep);
-        if((x < cols - window_width - 1) && (y < rows - window_height -1))
-        {
-            global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
-            //global GpuHidHaarClassifier      *classifier   = classifierptr;
-            global GpuHidHaarTreeNode        *node         = nodeptr + nodenum*i;
+unsigned int x1 = get_global_id(0);
+unsigned int y1 = get_global_id(1);
+int p_offset;
+int m, n;
+int result;
+int counter;
+float mean, variance_norm_factor;
+for(int i=0;i<loopcount;i++)
+{
+constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
+global int * candidate = _candidate + i*outputstep;
+int window_width = cascade->p1 - cascade->p0;
+int window_height = window_width;
+result = 1;
+counter = 0;
+unsigned int x = mul24(x1,ystep);
+unsigned int y = mul24(y1,ystep);
+if((x < cols - window_width - 1) && (y < rows - window_height -1))
+{
+global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
+//global GpuHidHaarClassifier      *classifier   = classifierptr;
+global GpuHidHaarTreeNode        *node         = nodeptr + nodenum*i;

-            p_offset = mad24(y, pixel_step, x);// modify
+p_offset = mad24(y, pixel_step, x);// modify

-            mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
-                    *(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
-                    *cascade->inv_window_area;
+mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
+    *(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
+    *cascade->inv_window_area;

-            variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
-                                    *(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
-            variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
-            variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify
+variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
+                    *(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
+variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
+variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify

-            // if( cascade->is_stump_based )
-            //{
-            for( m = start_stage; m < end_stage; m++ )
-            {
-                float stage_sum = 0.f;
-                float t,  classsum;
-                GpuHidHaarTreeNode t1;
+// if( cascade->is_stump_based )
+//{
+for( m = start_stage; m < end_stage; m++ )
+{
+float stage_sum = 0.f;
+float t,  classsum;
+GpuHidHaarTreeNode t1;

-                //#pragma unroll
-                for( n = 0; n < stagecascade->count; n++ )
-                {
-                     t1 = *(node + counter);
-                     t  = t1.threshold * variance_norm_factor;
-                     classsum = calc_sum1(t1, p_offset ,0) * t1.weight[0] + calc_sum1(t1, p_offset ,1) * t1.weight[1];
+//#pragma unroll
+for( n = 0; n < stagecascade->count; n++ )
+{
+     t1 = *(node + counter);
+     t  = t1.threshold * variance_norm_factor;
+     classsum = calc_sum1(t1, p_offset ,0) * t1.weight[0] + calc_sum1(t1, p_offset ,1) * t1.weight[1];

-                     if((t1.p0[2]) && (!stagecascade->two_rects))
-                         classsum += calc_sum1(t1, p_offset, 2) * t1.weight[2];
+     if((t1.p0[2]) && (!stagecascade->two_rects))
+         classsum += calc_sum1(t1, p_offset, 2) * t1.weight[2];

-                     stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify
-                     counter++;
-                }
+     stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify
+     counter++;
+}

-                if (stage_sum < stagecascade->threshold)
-                {
-                    result = 0;
-                    break;
-                }
+if (stage_sum < stagecascade->threshold)
+{
+    result = 0;
+    break;
+}

-                stagecascade++;
+stagecascade++;

-            }
-                if(result)
-                {
-                    candidate[4 * (y1 * detect_width + x1)]     = x;
-                    candidate[4 * (y1 * detect_width + x1) + 1] = y;
-                    candidate[4 * (y1 * detect_width + x1)+2]     = window_width;
-                    candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
-                }
-            //}
-        }
-    }
+}
+if(result)
+{
+    candidate[4 * (y1 * detect_width + x1)]     = x;
+    candidate[4 * (y1 * detect_width + x1) + 1] = y;
+    candidate[4 * (y1 * detect_width + x1)+2]     = window_width;
+    candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
+}
+//}
+}
+}
 }
 */