Commit 302a5adc authored by Vadim Pisarevsky's avatar Vadim Pisarevsky

converted Haar cascades to the new format; now they are handled with C++ code.

parent fdf1996e
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -474,16 +474,18 @@ HaarEvaluator::~HaarEvaluator()
bool HaarEvaluator::read(const FileNode& node)
{
features->resize(node.size());
featuresPtr = &(*features)[0];
FileNodeIterator it = node.begin(), it_end = node.end();
size_t i, n = node.size();
CV_Assert(n > 0);
features.resize(n);
featuresPtr = &features[0];
FileNodeIterator it = node.begin();
hasTiltedFeatures = false;
for(int i = 0; it != it_end; ++it, i++)
for(i = 0; i < n; i++, ++it)
{
if(!featuresPtr[i].read(*it))
if(!features[i].read(*it))
return false;
if( featuresPtr[i].tilted )
if( features[i].tilted )
hasTiltedFeatures = true;
}
return true;
......@@ -494,7 +496,6 @@ Ptr<FeatureEvaluator> HaarEvaluator::clone() const
Ptr<HaarEvaluator> ret = makePtr<HaarEvaluator>();
ret->origWinSize = origWinSize;
ret->features = features;
ret->featuresPtr = &(*ret->features)[0];
ret->hasTiltedFeatures = hasTiltedFeatures;
ret->sum0 = sum0, ret->sqsum0 = sqsum0, ret->tilted0 = tilted0;
ret->sum = sum, ret->sqsum = sqsum, ret->tilted = tilted;
......@@ -540,10 +541,10 @@ bool HaarEvaluator::setImage( const Mat &image, Size _origWinSize )
CV_SUM_PTRS( p[0], p[1], p[2], p[3], sdata, normrect, sumStep );
CV_SUM_PTRS( pq[0], pq[1], pq[2], pq[3], sqdata, normrect, sqsumStep );
size_t fi, nfeatures = features->size();
size_t fi, nfeatures = features.size();
for( fi = 0; fi < nfeatures; fi++ )
featuresPtr[fi].updatePtrs( !featuresPtr[fi].tilted ? sum : tilted );
optfeaturesPtr[fi].updatePtrs( !featuresPtr[fi].tilted ? sum : tilted );
return true;
}
......
......@@ -186,6 +186,32 @@ protected:
#define CALC_SUM(rect,offset) CALC_SUM_((rect)[0], (rect)[1], (rect)[2], (rect)[3], offset)
#define CV_SUM_OFS( p0, p1, p2, p3, sum, rect, step ) \
/* (x, y) */ \
(p0) = sum + (rect).x + (step) * (rect).y, \
/* (x + w, y) */ \
(p1) = sum + (rect).x + (rect).width + (step) * (rect).y, \
/* (x + w, y) */ \
(p2) = sum + (rect).x + (step) * ((rect).y + (rect).height), \
/* (x + w, y + h) */ \
(p3) = sum + (rect).x + (rect).width + (step) * ((rect).y + (rect).height)
#define CV_TILTED_OFS( p0, p1, p2, p3, tilted, rect, step ) \
/* (x, y) */ \
(p0) = tilted + (rect).x + (step) * (rect).y, \
/* (x - h, y + h) */ \
(p1) = tilted + (rect).x - (rect).height + (step) * ((rect).y + (rect).height), \
/* (x + w, y + w) */ \
(p2) = tilted + (rect).x + (rect).width + (step) * ((rect).y + (rect).width), \
/* (x + w - h, y + w + h) */ \
(p3) = tilted + (rect).x + (rect).width - (rect).height \
+ (step) * ((rect).y + (rect).width + (rect).height)
#define CALC_SUM_(p0, p1, p2, p3, offset) \
((p0)[offset] - (p1)[offset] - (p2)[offset] + (p3)[offset])
#define CALC_SUM(rect,offset) CALC_SUM_((rect)[0], (rect)[1], (rect)[2], (rect)[3], offset)
//---------------------------------------------- HaarEvaluator ---------------------------------------
class HaarEvaluator : public FeatureEvaluator
......@@ -195,8 +221,6 @@ public:
{
Feature();
float calc( int offset ) const;
void updatePtrs( const Mat& sum );
bool read( const FileNode& node );
bool tilted;
......@@ -208,8 +232,19 @@ public:
Rect r;
float weight;
} rect[RECT_NUM];
};
struct OptFeature
{
OptFeature();
const int* p[RECT_NUM][4];
enum { RECT_NUM = Feature::RECT_NUM };
float calc( const int* pwin ) const;
void setPtrs( const Mat& sum, const Feature& f );
int ofs[RECT_NUM][4];
float weight[RECT_NUM];
};
HaarEvaluator();
......@@ -223,23 +258,26 @@ public:
virtual bool setWindow(Point pt);
double operator()(int featureIdx) const
{ return featuresPtr[featureIdx].calc(offset) * varianceNormFactor; }
{ return optfeaturesPtr[featureIdx].calc(pwin) * varianceNormFactor; }
virtual double calcOrd(int featureIdx) const
{ return (*this)(featureIdx); }
protected:
Size origWinSize;
Ptr<std::vector<Feature> > features;
Feature* featuresPtr; // optimization
std::vector<Feature> features;
std::vector<OptFeature> optfeatures;
OptFeature* optfeaturesPtr; // optimization
bool hasTiltedFeatures;
Mat sum0, sqsum0, tilted0;
Mat sum, sqsum, tilted;
Rect normrect;
const int *p[4];
const double *pq[4];
int p[4];
int pq[4];
const int* pwin;
const double* pqwin;
int offset;
double varianceNormFactor;
};
......@@ -249,12 +287,18 @@ inline HaarEvaluator::Feature :: Feature()
tilted = false;
rect[0].r = rect[1].r = rect[2].r = Rect();
rect[0].weight = rect[1].weight = rect[2].weight = 0;
p[0][0] = p[0][1] = p[0][2] = p[0][3] =
p[1][0] = p[1][1] = p[1][2] = p[1][3] =
p[2][0] = p[2][1] = p[2][2] = p[2][3] = 0;
}
inline float HaarEvaluator::Feature :: calc( int _offset ) const
inline HaarEvaluator::OptFeature :: OptFeature()
{
weight[0] = weight[1] = weight[2] = 0.f;
ofs[0][0] = ofs[0][1] = ofs[0][2] = ofs[0][3] =
ofs[1][0] = ofs[1][1] = ofs[1][2] = ofs[1][3] =
ofs[2][0] = ofs[2][1] = ofs[2][2] = ofs[2][3] = 0;
}
/*inline float HaarEvaluator::Feature :: calc( int _offset ) const
{
float ret = rect[0].weight * CALC_SUM(p[0], _offset) + rect[1].weight * CALC_SUM(p[1], _offset);
......@@ -262,12 +306,13 @@ inline float HaarEvaluator::Feature :: calc( int _offset ) const
ret += rect[2].weight * CALC_SUM(p[2], _offset);
return ret;
}
}*/
inline void HaarEvaluator::Feature :: updatePtrs( const Mat& _sum )
inline void HaarEvaluator::OptFeature :: setPtrs( const Mat& _sum, const Feature& _f )
{
const int* ptr = (const int*)_sum.data;
size_t step = _sum.step/sizeof(ptr[0]);
size_t tiltedofs =
if (tilted)
{
CV_TILTED_PTRS( p[0][0], p[0][1], p[0][2], p[0][3], ptr, rect[0].r, step );
......
......@@ -203,7 +203,8 @@ static bool convert(const String& oldcascade, const String& newcascade)
for( i = 0; i < nstages; i++ )
maxWeakCount = std::max(maxWeakCount, stages[i].weaks.size());
newfs << "stageType" << "BOOST"
newfs << "cascade" << "{:opencv-cascade-classifier"
<< "stageType" << "BOOST"
<< "featureType" << "HAAR"
<< "height" << cascadesize.width
<< "width" << cascadesize.height
......@@ -250,8 +251,8 @@ static bool convert(const String& oldcascade, const String& newcascade)
{
if( j >= 2 && fabs(f.rect[j].weight) < FLT_EPSILON )
break;
newfs << f.rect[j].r.x << f.rect[j].r.y <<
f.rect[j].r.width << f.rect[j].r.height << f.rect[j].weight;
newfs << "[" << f.rect[j].r.x << f.rect[j].r.y <<
f.rect[j].r.width << f.rect[j].r.height << f.rect[j].weight << "]";
}
newfs << "]";
if( f.tilted )
......@@ -259,7 +260,7 @@ static bool convert(const String& oldcascade, const String& newcascade)
newfs << "}";
}
newfs << "]";
newfs << "]" << "}";
return true;
}
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Wang Weiyan, wangweiyanster@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Wu Xinglong, wxl370@126.com
// Wang Yao, bitwangyaoyao@gmail.com
// Sen Liu, swjtuls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "opencl_kernels.hpp"
#if 0
using namespace cv;
using namespace cv::ocl;
/* these settings affect the quality of detection: change with care */
#define CV_ADJUST_FEATURES 1
#define CV_ADJUST_WEIGHTS 0
/* Element types of the integral images: 32-bit running sums and
   double-precision squared sums (for variance normalization). */
typedef int sumtype;
typedef double sqsumtype;
/* "Hidden" (internal, CPU-side) Haar feature: each of up to
   CV_HAAR_FEATURE_MAX weighted rectangles is stored as four precomputed
   corner pointers into the integral image, so the rectangle sum is
   p0 - p1 - p2 + p3 at a given window offset. */
typedef struct CvHidHaarFeature
{
struct
{
sumtype *p0, *p1, *p2, *p3;
float weight;
}
rect[CV_HAAR_FEATURE_MAX];
}
CvHidHaarFeature;
/* One node of a boosted classifier tree: a feature, its decision threshold,
   and indices of the left/right child nodes within the classifier. */
typedef struct CvHidHaarTreeNode
{
CvHidHaarFeature feature;
float threshold;
int left;
int right;
}
CvHidHaarTreeNode;
/* A weak classifier: `count` tree nodes plus the alpha (leaf output) array. */
typedef struct CvHidHaarClassifier
{
int count;
//CvHaarFeature* orig_feature;
CvHidHaarTreeNode *node;
float *alpha;
}
CvHidHaarClassifier;
/* One boosting stage: `count` weak classifiers, the stage rejection
   threshold, and tree links (next/child/parent) for tree-structured
   cascades. `two_rects` is a fast-path flag: nonzero when every feature in
   the stage uses only two rectangles. */
typedef struct CvHidHaarStageClassifier
{
int count;
float threshold;
CvHidHaarClassifier *classifier;
int two_rects;
struct CvHidHaarStageClassifier *next;
struct CvHidHaarStageClassifier *child;
struct CvHidHaarStageClassifier *parent;
}
CvHidHaarStageClassifier;
/* CPU-side "hidden" cascade header. In this file it is used mainly as the
   opaque type stored in CvHaarClassifierCascade::hid_cascade (the GPU code
   below casts GpuHidHaarClassifierCascade* to this type — see
   gpuCreateHidHaarClassifierCascade). p0..p3 / pq0..pq3 are the corner
   pointers of the normalization window in sum/sqsum. */
struct CvHidHaarClassifierCascade
{
int count;
int is_stump_based;
int has_tilted_features;
int is_tree;
double inv_window_area;
CvMat sum, sqsum, tilted;
CvHidHaarStageClassifier *stage_classifier;
sqsumtype *pq0, *pq1, *pq2, *pq3;
sumtype *p0, *p1, *p2, *p3;
void **ipp_stages;
};
/* Per-scale description passed to the OpenCL kernel. The int fields pack two
   16-bit values each: width_height = (width << 16) | height and
   grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp (see how they
   are filled in oclHaarDetectObjects). imgoff is the offset of this scale's
   integral image, in 4-byte elements. */
typedef struct
{
int width_height;
int grpnumperline_totalgrp;
int imgoff;
float factor;
} detect_piramid_info;
/* GPU-side mirror structures. These are copied verbatim into OpenCL buffers
   (clEnqueueWriteBuffer in oclHaarDetectObjects), so their size, alignment
   and member order form an ABI shared with the OpenCL kernels — do not
   reorder or repack. MSVC spells alignment as __declspec(align(N)). */
#ifdef _MSC_VER
#define _ALIGNED_ON(_ALIGNMENT) __declspec(align(_ALIGNMENT))
/* Tree node: rectangle corner offsets p[k][0..3], per-rectangle weights,
   threshold, leaf outputs alpha[], and child indices. */
typedef _ALIGNED_ON(128) struct GpuHidHaarTreeNode
{
_ALIGNED_ON(64) int p[CV_HAAR_FEATURE_MAX][4];
float weight[CV_HAAR_FEATURE_MAX] ;
float threshold ;
_ALIGNED_ON(16) float alpha[3] ;
_ALIGNED_ON(4) int left ;
_ALIGNED_ON(4) int right ;
}
GpuHidHaarTreeNode;
/* Weak classifier: node count plus host pointers (the kernels index nodes
   by position in the node buffer, not through these pointers). */
typedef _ALIGNED_ON(32) struct GpuHidHaarClassifier
{
_ALIGNED_ON(4) int count;
_ALIGNED_ON(8) GpuHidHaarTreeNode *node ;
_ALIGNED_ON(8) float *alpha ;
}
GpuHidHaarClassifier;
/* Boosting stage header, GPU layout. */
typedef _ALIGNED_ON(64) struct GpuHidHaarStageClassifier
{
_ALIGNED_ON(4) int count ;
_ALIGNED_ON(4) float threshold ;
_ALIGNED_ON(4) int two_rects ;
_ALIGNED_ON(8) GpuHidHaarClassifier *classifier ;
_ALIGNED_ON(8) struct GpuHidHaarStageClassifier *next;
_ALIGNED_ON(8) struct GpuHidHaarStageClassifier *child ;
_ALIGNED_ON(8) struct GpuHidHaarStageClassifier *parent ;
}
GpuHidHaarStageClassifier;
/* Cascade header, GPU layout: p0..p3 / pq0..pq3 are scalar corner offsets
   (not pointers as in the CPU CvHid* variant). */
typedef _ALIGNED_ON(64) struct GpuHidHaarClassifierCascade
{
_ALIGNED_ON(4) int count ;
_ALIGNED_ON(4) int is_stump_based ;
_ALIGNED_ON(4) int has_tilted_features ;
_ALIGNED_ON(4) int is_tree ;
_ALIGNED_ON(4) int pq0 ;
_ALIGNED_ON(4) int pq1 ;
_ALIGNED_ON(4) int pq2 ;
_ALIGNED_ON(4) int pq3 ;
_ALIGNED_ON(4) int p0 ;
_ALIGNED_ON(4) int p1 ;
_ALIGNED_ON(4) int p2 ;
_ALIGNED_ON(4) int p3 ;
_ALIGNED_ON(4) float inv_window_area ;
} GpuHidHaarClassifierCascade;
#else
/* Non-MSVC branch: same GPU ABI structures as above, with GCC/Clang
   __attribute__((aligned(N))) spelling. Must stay field-for-field identical
   to the _MSC_VER branch. */
#define _ALIGNED_ON(_ALIGNMENT) __attribute__((aligned(_ALIGNMENT) ))
typedef struct _ALIGNED_ON(128) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] _ALIGNED_ON(64);
float weight[CV_HAAR_FEATURE_MAX];// _ALIGNED_ON(16);
float threshold;// _ALIGNED_ON(4);
float alpha[3] _ALIGNED_ON(16);
int left _ALIGNED_ON(4);
int right _ALIGNED_ON(4);
}
GpuHidHaarTreeNode;
typedef struct _ALIGNED_ON(32) GpuHidHaarClassifier
{
int count _ALIGNED_ON(4);
GpuHidHaarTreeNode *node _ALIGNED_ON(8);
float *alpha _ALIGNED_ON(8);
}
GpuHidHaarClassifier;
typedef struct _ALIGNED_ON(64) GpuHidHaarStageClassifier
{
int count _ALIGNED_ON(4);
float threshold _ALIGNED_ON(4);
int two_rects _ALIGNED_ON(4);
GpuHidHaarClassifier *classifier _ALIGNED_ON(8);
struct GpuHidHaarStageClassifier *next _ALIGNED_ON(8);
struct GpuHidHaarStageClassifier *child _ALIGNED_ON(8);
struct GpuHidHaarStageClassifier *parent _ALIGNED_ON(8);
}
GpuHidHaarStageClassifier;
typedef struct _ALIGNED_ON(64) GpuHidHaarClassifierCascade
{
int count _ALIGNED_ON(4);
int is_stump_based _ALIGNED_ON(4);
int has_tilted_features _ALIGNED_ON(4);
int is_tree _ALIGNED_ON(4);
int pq0 _ALIGNED_ON(4);
int pq1 _ALIGNED_ON(4);
int pq2 _ALIGNED_ON(4);
int pq3 _ALIGNED_ON(4);
int p0 _ALIGNED_ON(4);
int p1 _ALIGNED_ON(4);
int p2 _ALIGNED_ON(4);
int p3 _ALIGNED_ON(4);
float inv_window_area _ALIGNED_ON(4);
} GpuHidHaarClassifierCascade;
#endif
/* Border (in pixels) reserved around the detection window. */
const int icv_object_win_border = 1;
/* Small bias subtracted from each stage threshold to compensate for
   floating-point rounding when comparing stage sums. */
const float icv_stage_threshold_bias = 0.0001f;
/* Accumulated timing, in this translation unit's profiling code. */
double globaltime = 0;
/* create more efficient internal representation of haar classifier cascade */
/* Build the GPU-side "hidden" representation of a Haar cascade in one
   contiguous cvAlloc'd buffer laid out as:
     [GpuHidHaarClassifierCascade][stages...][classifiers...][tree nodes...]
   Validates every feature rectangle against the original window first,
   stores the result in cascade->hid_cascade, and reports the buffer size
   (*size) and total weak-classifier count (*totalclassifier).
   Returns the newly allocated cascade header. */
static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarClassifierCascade *cascade, int *size, int *totalclassifier)
{
GpuHidHaarClassifierCascade *out = 0;
int i, j, k, l;
int datasize;
int total_classifiers = 0;
int total_nodes = 0;
char errorstr[256];
GpuHidHaarStageClassifier *stage_classifier_ptr;
GpuHidHaarClassifier *haar_classifier_ptr;
GpuHidHaarTreeNode *haar_node_ptr;
CvSize orig_window_size;
int has_tilted_features = 0;
/* Precondition checks: valid cascade, not already converted, non-empty. */
if( !CV_IS_HAAR_CLASSIFIER(cascade) )
CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier pointer" );
if( cascade->hid_cascade )
CV_Error( CV_StsError, "hid_cascade has been already created" );
if( !cascade->stage_classifier )
CV_Error( CV_StsNullPtr, "" );
if( cascade->count <= 0 )
CV_Error( CV_StsOutOfRange, "Negative number of cascade stages" );
orig_window_size = cascade->orig_window_size;
/* check input structure correctness and calculate total memory size needed for
internal representation of the classifier cascade */
for( i = 0; i < cascade->count; i++ )
{
CvHaarStageClassifier *stage_classifier = cascade->stage_classifier + i;
if( !stage_classifier->classifier ||
stage_classifier->count <= 0 )
{
sprintf( errorstr, "header of the stage classifier #%d is invalid "
"(has null pointers or non-positive classfier count)", i );
CV_Error( CV_StsError, errorstr );
}
total_classifiers += stage_classifier->count;
for( j = 0; j < stage_classifier->count; j++ )
{
CvHaarClassifier *classifier = stage_classifier->classifier + j;
total_nodes += classifier->count;
for( l = 0; l < classifier->count; l++ )
{
for( k = 0; k < CV_HAAR_FEATURE_MAX; k++ )
{
/* A zero-width rectangle marks an unused slot. */
if( classifier->haar_feature[l].rect[k].r.width )
{
CvRect r = classifier->haar_feature[l].rect[k].r;
int tilted = classifier->haar_feature[l].tilted;
has_tilted_features |= tilted != 0;
/* Tilted rectangles occupy a rotated footprint, hence
   the different bounds check. */
if( r.width < 0 || r.height < 0 || r.y < 0 ||
r.x + r.width > orig_window_size.width
||
(!tilted &&
(r.x < 0 || r.y + r.height > orig_window_size.height))
||
(tilted && (r.x - r.height < 0 ||
r.y + r.width + r.height > orig_window_size.height)))
{
sprintf( errorstr, "rectangle #%d of the classifier #%d of "
"the stage classifier #%d is not inside "
"the reference (original) cascade window", k, j, i );
CV_Error( CV_StsNullPtr, errorstr );
}
}
}
}
}
}
// this is an upper boundary for the whole hidden cascade size
datasize = sizeof(GpuHidHaarClassifierCascade) +
sizeof(GpuHidHaarStageClassifier) * cascade->count +
sizeof(GpuHidHaarClassifier) * total_classifiers +
sizeof(GpuHidHaarTreeNode) * total_nodes;
*totalclassifier = total_classifiers;
*size = datasize;
out = (GpuHidHaarClassifierCascade *)cvAlloc( datasize );
/* Only the header is zeroed; the arrays behind it are fully written below. */
memset( out, 0, sizeof(*out) );
/* init header */
out->count = cascade->count;
/* Carve the single allocation into the three trailing arrays. */
stage_classifier_ptr = (GpuHidHaarStageClassifier *)(out + 1);
haar_classifier_ptr = (GpuHidHaarClassifier *)(stage_classifier_ptr + cascade->count);
haar_node_ptr = (GpuHidHaarTreeNode *)(haar_classifier_ptr + total_classifiers);
out->is_stump_based = 1;
out->has_tilted_features = has_tilted_features;
out->is_tree = 0;
/* initialize internal representation */
for( i = 0; i < cascade->count; i++ )
{
CvHaarStageClassifier *stage_classifier = cascade->stage_classifier + i;
GpuHidHaarStageClassifier *hid_stage_classifier = stage_classifier_ptr + i;
hid_stage_classifier->count = stage_classifier->count;
/* Bias the threshold slightly down to absorb float rounding. */
hid_stage_classifier->threshold = stage_classifier->threshold - icv_stage_threshold_bias;
hid_stage_classifier->classifier = haar_classifier_ptr;
hid_stage_classifier->two_rects = 1;
haar_classifier_ptr += stage_classifier->count;
for( j = 0; j < stage_classifier->count; j++ )
{
CvHaarClassifier *classifier = stage_classifier->classifier + j;
GpuHidHaarClassifier *hid_classifier = hid_stage_classifier->classifier + j;
int node_count = classifier->count;
float *alpha_ptr = &haar_node_ptr->alpha[0];
hid_classifier->count = node_count;
hid_classifier->node = haar_node_ptr;
hid_classifier->alpha = alpha_ptr;
for( l = 0; l < node_count; l++ )
{
GpuHidHaarTreeNode *node = hid_classifier->node + l;
CvHaarFeature *feature = classifier->haar_feature + l;
/* -1 fill marks all p[][] slots as "unused" until set. */
memset( node, -1, sizeof(*node) );
node->threshold = classifier->threshold[l];
node->left = classifier->left[l];
node->right = classifier->right[l];
/* Third rectangle absent (zero weight or degenerate rect):
   zero it out; otherwise the stage loses its two-rect fast path. */
if( fabs(feature->rect[2].weight) < DBL_EPSILON ||
feature->rect[2].r.width == 0 ||
feature->rect[2].r.height == 0 )
{
node->p[2][0] = 0;
node->p[2][1] = 0;
node->p[2][2] = 0;
node->p[2][3] = 0;
node->weight[2] = 0;
}
else
hid_stage_classifier->two_rects = 0;
/* NOTE(review): node->alpha has only 3 elements, but node_count+1
   floats are copied; this looks safe only for stump-based
   cascades (node_count <= 2) — TODO confirm trees are rejected
   upstream. */
memcpy( node->alpha, classifier->alpha, (node_count + 1)*sizeof(alpha_ptr[0]));
haar_node_ptr = haar_node_ptr + 1;
}
out->is_stump_based &= node_count == 1;
}
}
cascade->hid_cascade = (CvHidHaarClassifierCascade *)out;
assert( (char *)haar_node_ptr - (char *)out <= datasize );
return out;
}
/* Typed element accessors into the sum / squared-sum integral images. */
#define sum_elem_ptr(sum,row,col) \
((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))
#define sqsum_elem_ptr(sqsum,row,col) \
((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))
/* Rectangle sum from four precomputed corner pointers, at window `offset`. */
#define calc_sum(rect,offset) \
((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
/* Prepare the hidden cascade for detection at a given `scale`: scales the
   normalization window, converts every feature rectangle into integral-image
   corner offsets (using `step`, the integral-image row stride in elements,
   for tilted features), applies per-rectangle weight correction, and
   recomputes weight[0] so each feature's weighted area sums to zero. */
static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_cascade,
double scale,
int step)
{
GpuHidHaarClassifierCascade *cascade;
int coi0 = 0, coi1 = 0;
int i;
int datasize;
int total;
CvRect equRect;
double weight_scale;
GpuHidHaarStageClassifier *stage_classifier;
if( !CV_IS_HAAR_CLASSIFIER(_cascade) )
CV_Error( !_cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier pointer" );
if( scale <= 0 )
CV_Error( CV_StsOutOfRange, "Scale must be positive" );
/* coi0/coi1 are always 0 here; the check is kept from the CPU original. */
if( coi0 || coi1 )
CV_Error( CV_BadCOI, "COI is not supported" );
if( !_cascade->hid_cascade )
gpuCreateHidHaarClassifierCascade(_cascade, &datasize, &total);
cascade = (GpuHidHaarClassifierCascade *) _cascade->hid_cascade;
/* Stage array lives immediately after the cascade header (see layout in
   gpuCreateHidHaarClassifierCascade). */
stage_classifier = (GpuHidHaarStageClassifier *) (cascade + 1);
_cascade->scale = scale;
_cascade->real_window_size.width = cvRound( _cascade->orig_window_size.width * scale );
_cascade->real_window_size.height = cvRound( _cascade->orig_window_size.height * scale );
/* Normalization window: original window shrunk by a 1-pixel border, scaled. */
equRect.x = equRect.y = cvRound(scale);
equRect.width = cvRound((_cascade->orig_window_size.width - 2) * scale);
equRect.height = cvRound((_cascade->orig_window_size.height - 2) * scale);
weight_scale = 1. / (equRect.width * equRect.height);
cascade->inv_window_area = weight_scale;
/* Corner coordinates of the normalization window in sqsum (pq*) and sum (p*). */
cascade->pq0 = equRect.x;
cascade->pq1 = equRect.y;
cascade->pq2 = equRect.x + equRect.width;
cascade->pq3 = equRect.y + equRect.height;
cascade->p0 = equRect.x;
cascade->p1 = equRect.y;
cascade->p2 = equRect.x + equRect.width;
cascade->p3 = equRect.y + equRect.height;
/* init pointers in haar features according to real window size and
given image pointers */
for( i = 0; i < _cascade->count; i++ )
{
int j, k, l;
for( j = 0; j < stage_classifier[i].count; j++ )
{
for( l = 0; l < stage_classifier[i].classifier[j].count; l++ )
{
CvHaarFeature *feature =
&_cascade->stage_classifier[i].classifier[j].haar_feature[l];
GpuHidHaarTreeNode *hidnode = &stage_classifier[i].classifier[j].node[l];
double sum0 = 0, area0 = 0;
CvRect r[3];
int base_w = -1, base_h = -1;
int new_base_w = 0, new_base_h = 0;
int kx, ky;
int flagx = 0, flagy = 0;
int x0 = 0, y0 = 0;
int nr;
/* align blocks */
for( k = 0; k < CV_HAAR_FEATURE_MAX; k++ )
{
/* p[k][0] == 0 marks an unused rectangle slot (zeroed during
   creation); -1 fill means "in use but not yet positioned". */
if(!hidnode->p[k][0])
break;
r[k] = feature->rect[k].r;
/* Unsigned-min trick: -1 wraps to UINT_MAX, so the first
   iteration just records the candidate value. */
base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].width - 1) );
base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].x - r[0].x - 1) );
base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].height - 1) );
base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].y - r[0].y - 1) );
}
nr = k;
base_w += 1;
base_h += 1;
if(base_w == 0)
base_w = 1;
kx = r[0].width / base_w;
if(base_h == 0)
base_h = 1;
ky = r[0].height / base_h;
/* NOTE(review): dividing by kx/ky inside a branch entered only
   when kx <= 0 (resp. ky <= 0) divides by zero or a negative
   value; the CPU original guards differently — TODO confirm
   whether this branch is ever reachable. */
if( kx <= 0 )
{
flagx = 1;
new_base_w = cvRound( r[0].width * scale ) / kx;
x0 = cvRound( r[0].x * scale );
}
if( ky <= 0 )
{
flagy = 1;
new_base_h = cvRound( r[0].height * scale ) / ky;
y0 = cvRound( r[0].y * scale );
}
for( k = 0; k < nr; k++ )
{
CvRect tr;
double correction_ratio;
/* Scale each rectangle either on the shared base grid (flag set)
   or by direct rounding. */
if( flagx )
{
tr.x = (r[k].x - r[0].x) * new_base_w / base_w + x0;
tr.width = r[k].width * new_base_w / base_w;
}
else
{
tr.x = cvRound( r[k].x * scale );
tr.width = cvRound( r[k].width * scale );
}
if( flagy )
{
tr.y = (r[k].y - r[0].y) * new_base_h / base_h + y0;
tr.height = r[k].height * new_base_h / base_h;
}
else
{
tr.y = cvRound( r[k].y * scale );
tr.height = cvRound( r[k].height * scale );
}
#if CV_ADJUST_WEIGHTS
{
// RAINER START
const float orig_feature_size = (float)(feature->rect[k].r.width) * feature->rect[k].r.height;
const float orig_norm_size = (float)(_cascade->orig_window_size.width) * (_cascade->orig_window_size.height);
const float feature_size = float(tr.width * tr.height);
//const float normSize = float(equRect.width*equRect.height);
float target_ratio = orig_feature_size / orig_norm_size;
//float isRatio = featureSize / normSize;
//correctionRatio = targetRatio / isRatio / normSize;
correction_ratio = target_ratio / feature_size;
// RAINER END
}
#else
/* Tilted rectangles get half weight (their diagonal sum counts
   pixels twice as densely). */
correction_ratio = weight_scale * (!feature->tilted ? 1 : 0.5);
#endif
if( !feature->tilted )
{
/* Upright: store the four corner coordinates; the kernel turns
   them into offsets. */
hidnode->p[k][0] = tr.x;
hidnode->p[k][1] = tr.y;
hidnode->p[k][2] = tr.x + tr.width;
hidnode->p[k][3] = tr.y + tr.height;
}
else
{
/* Tilted: precompute the four linearized offsets into the
   tilted integral image using the row stride `step`. */
hidnode->p[k][2] = (tr.y + tr.width) * step + tr.x + tr.width;
hidnode->p[k][3] = (tr.y + tr.width + tr.height) * step + tr.x + tr.width - tr.height;
hidnode->p[k][0] = tr.y * step + tr.x;
hidnode->p[k][1] = (tr.y + tr.height) * step + tr.x - tr.height;
}
hidnode->weight[k] = (float)(feature->rect[k].weight * correction_ratio);
/* Accumulate the weighted areas of rects 1..nr-1 so weight[0]
   can be set to make the whole feature zero-mean. */
if( k == 0 )
area0 = tr.width * tr.height;
else
sum0 += hidnode->weight[k] * tr.width * tr.height;
}
hidnode->weight[0] = (float)(-sum0 / area0);
} /* l */
} /* j */
}
}
/* Prepare the hidden cascade for detection at the original (unscaled)
   window size: scale is fixed at 1.0, weight correction is applied, and
   feature rectangles are stored verbatim as (x, y, width, height) — unlike
   gpuSetImagesForHaarClassifierCascade(), which converts them to corner
   coordinates/offsets. Creates the hidden cascade on demand. */
static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade)
{
    GpuHidHaarClassifierCascade *cascade;
    int i;
    int datasize;
    int total;
    CvRect equRect;
    double weight_scale;
    GpuHidHaarStageClassifier *stage_classifier;

    if( !CV_IS_HAAR_CLASSIFIER(_cascade) )
        CV_Error( !_cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier pointer" );

    if( !_cascade->hid_cascade )
        gpuCreateHidHaarClassifierCascade(_cascade, &datasize, &total);

    cascade = (GpuHidHaarClassifierCascade *) _cascade->hid_cascade;
    /* BUGFIX: the stage array lives immediately after the cascade header, so
       advance by one GpuHidHaarClassifierCascade, i.e. (cascade + 1). The
       previous expression "(GpuHidHaarStageClassifier *) cascade + 1" cast
       first and then added one *stage-classifier* stride; it only yielded the
       right address because both structs happen to pad to 64 bytes. This now
       matches gpuSetImagesForHaarClassifierCascade(). */
    stage_classifier = (GpuHidHaarStageClassifier *) (cascade + 1);

    _cascade->scale = 1.0;
    _cascade->real_window_size.width = _cascade->orig_window_size.width ;
    _cascade->real_window_size.height = _cascade->orig_window_size.height;

    /* Normalization window: original window shrunk by a 1-pixel border. */
    equRect.x = equRect.y = 1;
    equRect.width = _cascade->orig_window_size.width - 2;
    equRect.height = _cascade->orig_window_size.height - 2;
    weight_scale = 1;
    cascade->inv_window_area = weight_scale;

    /* NOTE(review): here p2/p3 hold height/width while the scaled-path
       function stores x+width / y+height; presumably the kernel consuming the
       unscaled path expects this layout — verify against the OpenCL kernel
       before changing. pq0..pq3 are left untouched here. */
    cascade->p0 = equRect.x;
    cascade->p1 = equRect.y;
    cascade->p2 = equRect.height;
    cascade->p3 = equRect.width ;

    for( i = 0; i < _cascade->count; i++ )
    {
        int j, l;
        for( j = 0; j < stage_classifier[i].count; j++ )
        {
            for( l = 0; l < stage_classifier[i].classifier[j].count; l++ )
            {
                const CvHaarFeature *feature =
                    &_cascade->stage_classifier[i].classifier[j].haar_feature[l];
                GpuHidHaarTreeNode *hidnode = &stage_classifier[i].classifier[j].node[l];
                for( int k = 0; k < CV_HAAR_FEATURE_MAX; k++ )
                {
                    const CvRect tr = feature->rect[k].r;
                    /* Zero width marks an unused rectangle slot. */
                    if (tr.width == 0)
                        break;
                    /* Tilted rectangles get half weight (diagonal sums count
                       pixels twice as densely). */
                    double correction_ratio = weight_scale * (!feature->tilted ? 1 : 0.5);
                    hidnode->p[k][0] = tr.x;
                    hidnode->p[k][1] = tr.y;
                    hidnode->p[k][2] = tr.width;
                    hidnode->p[k][3] = tr.height;
                    hidnode->weight[k] = (float)(feature->rect[k].weight * correction_ratio);
                }
            } /* l */
        } /* j */
    }
}
CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemStorage *storage, double scaleFactor,
int minNeighbors, int flags, CvSize minSize, CvSize maxSize)
{
CvHaarClassifierCascade *cascade = oldCascade;
const double GROUP_EPS = 0.2;
CvSeq *result_seq = 0;
cv::ConcurrentRectVector allCandidates;
std::vector<cv::Rect> rectList;
std::vector<int> rweights;
double factor;
int datasize=0;
int totalclassifier=0;
GpuHidHaarClassifierCascade *gcascade;
GpuHidHaarStageClassifier *stage;
GpuHidHaarClassifier *classifier;
GpuHidHaarTreeNode *node;
int *candidate;
cl_int status;
bool findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
if( maxSize.height == 0 || maxSize.width == 0 )
{
maxSize.height = gimg.rows;
maxSize.width = gimg.cols;
}
if( !CV_IS_HAAR_CLASSIFIER(cascade) )
CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
if( !storage )
CV_Error( CV_StsNullPtr, "Null storage pointer" );
if( CV_MAT_DEPTH(gimg.type()) != CV_8U )
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit images are supported" );
if( scaleFactor <= 1 )
CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
if( findBiggestObject )
flags &= ~CV_HAAR_SCALE_IMAGE;
if( !cascade->hid_cascade )
gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), storage );
if( CV_MAT_CN(gimg.type()) > 1 )
{
oclMat gtemp;
cvtColor( gimg, gtemp, CV_BGR2GRAY );
gimg = gtemp;
}
if( findBiggestObject )
flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
if( gimg.cols < minSize.width || gimg.rows < minSize.height )
CV_Error(CV_StsError, "Image too small");
cl_command_queue qu = getClCommandQueue(Context::getContext());
if( (flags & CV_HAAR_SCALE_IMAGE) )
{
CvSize winSize0 = cascade->orig_window_size;
int totalheight = 0;
int indexy = 0;
CvSize sz;
vector<CvSize> sizev;
vector<float> scalev;
for(factor = 1.f;; factor *= scaleFactor)
{
CvSize winSize = { cvRound(winSize0.width * factor), cvRound(winSize0.height * factor) };
sz.width = cvRound( gimg.cols / factor ) + 1;
sz.height = cvRound( gimg.rows / factor ) + 1;
CvSize sz1 = { sz.width - winSize0.width - 1, sz.height - winSize0.height - 1 };
if( sz1.width <= 0 || sz1.height <= 0 )
break;
if( winSize.width > maxSize.width || winSize.height > maxSize.height )
break;
if( winSize.width < minSize.width || winSize.height < minSize.height )
continue;
totalheight += sz.height;
sizev.push_back(sz);
scalev.push_back(factor);
}
oclMat gimg1(gimg.rows, gimg.cols, CV_8UC1);
oclMat gsum(totalheight + 4, gimg.cols + 1, CV_32SC1);
oclMat gsqsum(totalheight + 4, gimg.cols + 1, CV_32FC1);
int sdepth = 0;
if(Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
sdepth = CV_64FC1;
else
sdepth = CV_32FC1;
sdepth = CV_MAT_DEPTH(sdepth);
int type = CV_MAKE_TYPE(sdepth, 1);
oclMat gsqsum_t(totalheight + 4, gimg.cols + 1, type);
cl_mem stagebuffer;
cl_mem nodebuffer;
cl_mem candidatebuffer;
cl_mem scaleinfobuffer;
cv::Rect roi, roi2;
cv::Mat imgroi, imgroisq;
cv::ocl::oclMat resizeroi, gimgroi, gimgroisq;
int grp_per_CU = 12;
size_t blocksize = 8;
size_t localThreads[3] = { blocksize, blocksize , 1 };
size_t globalThreads[3] = { grp_per_CU *(gsum.clCxt->getDeviceInfo().maxComputeUnits) *localThreads[0],
localThreads[1], 1
};
int outputsz = 256 * globalThreads[0] / localThreads[0];
int loopcount = sizev.size();
detect_piramid_info *scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
for( int i = 0; i < loopcount; i++ )
{
sz = sizev[i];
factor = scalev[i];
roi = Rect(0, indexy, sz.width, sz.height);
roi2 = Rect(0, 0, sz.width - 1, sz.height - 1);
resizeroi = gimg1(roi2);
gimgroi = gsum(roi);
gimgroisq = gsqsum_t(roi);
int width = gimgroi.cols - 1 - cascade->orig_window_size.width;
int height = gimgroi.rows - 1 - cascade->orig_window_size.height;
scaleinfo[i].width_height = (width << 16) | height;
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
scaleinfo[i].imgoff = gimgroi.offset >> 2;
scaleinfo[i].factor = factor;
cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR);
cv::ocl::integral(resizeroi, gimgroi, gimgroisq);
indexy += sz.height;
}
if(gsqsum_t.depth() == CV_64F)
gsqsum_t.convertTo(gsqsum, CV_32FC1);
else
gsqsum = gsqsum_t;
gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
node = (GpuHidHaarTreeNode *)(classifier->node);
int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) -
sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
candidate = (int *)malloc(4 * sizeof(int) * outputsz);
gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode));
openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0, nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY, 4 * sizeof(int) * outputsz);
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
int startstage = 0;
int endstage = gcascade->count;
int startnode = 0;
int pixelstep = gsum.step / 4;
int splitstage = 3;
int splitnode = stage[0].count + stage[1].count + stage[2].count;
cl_int4 p, pq;
p.s[0] = gcascade->p0;
p.s[1] = gcascade->p1;
p.s[2] = gcascade->p2;
p.s[3] = gcascade->p3;
pq.s[0] = gcascade->pq0;
pq.s[1] = gcascade->pq1;
pq.s[2] = gcascade->pq2;
pq.s[3] = gcascade->pq3;
float correction = gcascade->inv_window_area;
vector<pair<size_t, const void *> > args;
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&nodebuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&pixelstep ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&loopcount ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitnode ));
args.push_back ( make_pair(sizeof(cl_int4) , (void *)&p ));
args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE))
{
//setup local group size
localThreads[0] = 8;
localThreads[1] = 16;
localThreads[2] = 1;
//init maximal number of workgroups
int WGNumX = 1+(sizev[0].width /(localThreads[0]));
int WGNumY = 1+(sizev[0].height/(localThreads[1]));
int WGNumZ = loopcount;
int WGNum = 0; //accurate number of non -empty workgroups
oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U);
{
cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status);
openCLVerifyCall(status);
for(int z=0;z<WGNumZ;++z)
{
int Width = (scaleinfo[z].width_height >> 16)&0xFFFF;
int Height = (scaleinfo[z].width_height >> 0 )& 0xFFFF;
for(int y=0;y<WGNumY;++y)
{
int gy = y*localThreads[1];
if(gy>=(Height-cascade->orig_window_size.height))
continue; // no data to process
for(int x=0;x<WGNumX;++x)
{
int gx = x*localThreads[0];
if(gx>=(Width-cascade->orig_window_size.width))
continue; // no data to process
// save no-empty workgroup info into array
pWGInfo[WGNum].s[0] = scaleinfo[z].width_height;
pWGInfo[WGNum].s[1] = (gx << 16) | gy;
pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff;
memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float));
WGNum++;
}
}
}
openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,pWGInfo,0,0,0));
pWGInfo = NULL;
}
// setup global sizes to have linear array of workgroups with WGNum size
globalThreads[0] = localThreads[0]*WGNum;
globalThreads[1] = localThreads[1];
globalThreads[2] = 1;
#define NODE_SIZE 12
// pack node info to have less memory loads
oclMat oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U);
{
cl_int status;
cl_int* pNodesPK = (cl_int*)clEnqueueMapBuffer(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,true,CL_MAP_WRITE, 0, oclNodesPK.step, 0,0,0,&status);
openCLVerifyCall(status);
//use known local data stride to precalulate indexes
int DATA_SIZE_X = (localThreads[0]+cascade->orig_window_size.width);
// check that maximal value is less than maximal unsigned short
assert(DATA_SIZE_X*cascade->orig_window_size.height+cascade->orig_window_size.width < (int)USHRT_MAX);
for(int i = 0;i<nodenum;++i)
{//process each node from classifier
struct NodePK
{
unsigned short slm_index[3][4];
float weight[3];
float threshold;
float alpha[2];
};
struct NodePK * pOut = (struct NodePK *)(pNodesPK + NODE_SIZE*i);
for(int k=0;k<3;++k)
{// calc 4 short indexes in shared local mem for each rectangle instead of 2 (x,y) pair.
int* p = &(node[i].p[k][0]);
pOut->slm_index[k][0] = (unsigned short)(p[1]*DATA_SIZE_X+p[0]);
pOut->slm_index[k][1] = (unsigned short)(p[1]*DATA_SIZE_X+p[2]);
pOut->slm_index[k][2] = (unsigned short)(p[3]*DATA_SIZE_X+p[0]);
pOut->slm_index[k][3] = (unsigned short)(p[3]*DATA_SIZE_X+p[2]);
}
//store used float point values for each node
pOut->weight[0] = node[i].weight[0];
pOut->weight[1] = node[i].weight[1];
pOut->weight[2] = node[i].weight[2];
pOut->threshold = node[i].threshold;
pOut->alpha[0] = node[i].alpha[0];
pOut->alpha[1] = node[i].alpha[1];
}
openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,pNodesPK,0,0,0));
pNodesPK = NULL;
}
// add 2 additional buffers (WGinfo and packed nodes) as 2 last args
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&oclNodesPK.datastart ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&oclWGInfo.datastart ));
//form build options for kernel
string options = "-D PACKED_CLASSIFIER";
options += format(" -D NODE_SIZE=%d",NODE_SIZE);
options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
options += format(" -D LSx=%d",localThreads[0]);
options += format(" -D LSy=%d",localThreads[1]);
options += format(" -D SPLITNODE=%d",splitnode);
options += format(" -D SPLITSTAGE=%d",splitstage);
options += format(" -D OUTPUTSZ=%d",outputsz);
// init candiate global count by 0
int pattern = 0;
openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL));
// execute face detector
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str());
//read candidate buffer back and put it into host list
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
assert(candidate[0]<outputsz);
//printf("candidate[0]=%d\n",candidate[0]);
for(int i = 1; i <= candidate[0]; i++)
{
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],candidate[4 * i + 2], candidate[4 * i + 3]));
}
}
else
{
const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
for(int i = 0; i < outputsz; i++)
if(candidate[4 * i + 2] != 0)
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
candidate[4 * i + 2], candidate[4 * i + 3]));
}
free(scaleinfo);
free(candidate);
openCLSafeCall(clReleaseMemObject(stagebuffer));
openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(nodebuffer));
openCLSafeCall(clReleaseMemObject(candidatebuffer));
}
else
{
CvSize winsize0 = cascade->orig_window_size;
int n_factors = 0;
oclMat gsum;
oclMat gsqsum;
oclMat gsqsum_t;
cv::ocl::integral(gimg, gsum, gsqsum_t);
if(gsqsum_t.depth() == CV_64F)
gsqsum_t.convertTo(gsqsum, CV_32FC1);
else
gsqsum = gsqsum_t;
CvSize sz;
vector<CvSize> sizev;
vector<float> scalev;
gpuSetHaarClassifierCascade(cascade);
gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
node = (GpuHidHaarTreeNode *)(classifier->node);
cl_mem stagebuffer;
cl_mem nodebuffer;
cl_mem candidatebuffer;
cl_mem scaleinfobuffer;
cl_mem pbuffer;
cl_mem correctionbuffer;
for( n_factors = 0, factor = 1;
cvRound(factor * winsize0.width) < gimg.cols - 10 &&
cvRound(factor * winsize0.height) < gimg.rows - 10;
n_factors++, factor *= scaleFactor )
{
CvSize winSize = { cvRound( winsize0.width * factor ),
cvRound( winsize0.height * factor )
};
if( winSize.width < minSize.width || winSize.height < minSize.height )
{
continue;
}
sizev.push_back(winSize);
scalev.push_back(factor);
}
int loopcount = scalev.size();
if(loopcount == 0)
{
loopcount = 1;
n_factors = 1;
sizev.push_back(minSize);
scalev.push_back( std::min(cvRound(minSize.width / winsize0.width), cvRound(minSize.height / winsize0.height)) );
}
detect_piramid_info *scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
cl_int4 *p = (cl_int4 *)malloc(sizeof(cl_int4) * loopcount);
float *correction = (float *)malloc(sizeof(float) * loopcount);
int grp_per_CU = 12;
size_t blocksize = 8;
size_t localThreads[3] = { blocksize, blocksize , 1 };
size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->getDeviceInfo().maxComputeUnits *localThreads[0],
localThreads[1], 1 };
int outputsz = 256 * globalThreads[0] / localThreads[0];
int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) -
sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY,
nodenum * sizeof(GpuHidHaarTreeNode));
openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0,
nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
cl_mem newnodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_WRITE,
loopcount * nodenum * sizeof(GpuHidHaarTreeNode));
int startstage = 0;
int endstage = gcascade->count;
for(int i = 0; i < loopcount; i++)
{
sz = sizev[i];
factor = scalev[i];
double ystep = std::max(2., factor);
int equRect_x = cvRound(factor * gcascade->p0);
int equRect_y = cvRound(factor * gcascade->p1);
int equRect_w = cvRound(factor * gcascade->p3);
int equRect_h = cvRound(factor * gcascade->p2);
p[i].s[0] = equRect_x;
p[i].s[1] = equRect_y;
p[i].s[2] = equRect_x + equRect_w;
p[i].s[3] = equRect_y + equRect_h;
correction[i] = 1. / (equRect_w * equRect_h);
int width = (gsum.cols - 1 - sz.width + ystep - 1) / ystep;
int height = (gsum.rows - 1 - sz.height + ystep - 1) / ystep;
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
scaleinfo[i].width_height = (width << 16) | height;
scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
scaleinfo[i].imgoff = 0;
scaleinfo[i].factor = factor;
int startnodenum = nodenum * i;
float factor2 = (float)factor;
vector<pair<size_t, const void *> > args1;
args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&nodebuffer ));
args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&newnodebuffer ));
args1.push_back ( make_pair(sizeof(cl_float) , (void *)&factor2 ));
args1.push_back ( make_pair(sizeof(cl_float) , (void *)&correction[i] ));
args1.push_back ( make_pair(sizeof(cl_int) , (void *)&startnodenum ));
size_t globalThreads2[3] = {nodenum, 1, 1};
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
}
int step = gsum.step / 4;
int startnode = 0;
int splitstage = 3;
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, 4 * sizeof(int) * outputsz);
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
pbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_int4) * loopcount);
openCLSafeCall(clEnqueueWriteBuffer(qu, pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL));
correctionbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_float) * loopcount);
openCLSafeCall(clEnqueueWriteBuffer(qu, correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
vector<pair<size_t, const void *> > args;
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&newnodebuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&loopcount ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&pbuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&correctionbuffer ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&nodenum ));
const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1, build_options);
candidate = (int *)clEnqueueMapBuffer(qu, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, &status);
for(int i = 0; i < outputsz; i++)
{
if(candidate[4 * i + 2] != 0)
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], candidate[4 * i + 2], candidate[4 * i + 3]));
}
free(scaleinfo);
free(p);
free(correction);
clEnqueueUnmapMemObject(qu, candidatebuffer, candidate, 0, 0, 0);
openCLSafeCall(clReleaseMemObject(stagebuffer));
openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(nodebuffer));
openCLSafeCall(clReleaseMemObject(newnodebuffer));
openCLSafeCall(clReleaseMemObject(candidatebuffer));
openCLSafeCall(clReleaseMemObject(pbuffer));
openCLSafeCall(clReleaseMemObject(correctionbuffer));
}
cvFree(&cascade->hid_cascade);
rectList.resize(allCandidates.size());
if(!allCandidates.empty())
std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
if( minNeighbors != 0 || findBiggestObject )
groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
else
rweights.resize(rectList.size(), 0);
if( findBiggestObject && rectList.size() )
{
CvAvgComp result_comp = {{0, 0, 0, 0}, 0};
for( size_t i = 0; i < rectList.size(); i++ )
{
cv::Rect r = rectList[i];
if( r.area() > cv::Rect(result_comp.rect).area() )
{
result_comp.rect = r;
result_comp.neighbors = rweights[i];
}
}
cvSeqPush( result_seq, &result_comp );
}
else
{
for( size_t i = 0; i < rectList.size(); i++ )
{
CvAvgComp c;
c.rect = rectList[i];
c.neighbors = rweights[i];
cvSeqPush( result_seq, &c );
}
}
return result_seq;
}
// Functor that extracts the bounding rectangle from a CvAvgComp entry;
// used with std::transform to convert legacy detector output to cv::Rect.
struct getRect
{
    Rect operator()(const CvAvgComp &comp) const
    {
        return comp.rect;
    }
};
// Runs the legacy C-style OpenCL Haar detector and converts the resulting
// CvSeq of CvAvgComp entries into a vector of rectangles for the caller.
void cv::ocl::OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv::Rect>& faces,
                                                     double scaleFactor, int minNeighbors, int flags,
                                                     Size minSize, Size maxSize)
{
    // The storage owns the sequence returned by the detector; it is freed
    // automatically when MemStorage goes out of scope.
    MemStorage storage(cvCreateMemStorage(0));
    CvSeq* objs = oclHaarDetectObjects(gimg, storage, scaleFactor, minNeighbors, flags, minSize, maxSize);
    vector<CvAvgComp> comps;
    Seq<CvAvgComp>(objs).copyTo(comps);
    faces.resize(comps.size());
    std::transform(comps.begin(), comps.end(), faces.begin(), getRect());
}
// Aggregates all cl_mem objects owned by OclCascadeClassifierBuf so that a
// single opaque pointer (the classifier's 'buffers' member) can carry them.
struct OclBuffers
{
    cl_mem stagebuffer;      // stage classifiers (GpuHidHaarStageClassifier[])
    cl_mem nodebuffer;       // original tree nodes (GpuHidHaarTreeNode[])
    cl_mem candidatebuffer;  // kernel output: 4 ints (x, y, w, h) per candidate slot
    cl_mem scaleinfobuffer;  // per-scale geometry (detect_piramid_info[])
    cl_mem pbuffer;          // per-scale normalization-window corners (cl_int4[])
    cl_mem correctionbuffer; // per-scale inverse-window-area factors (cl_float[])
    cl_mem newnodebuffer;    // per-factor rescaled nodes filled by the gpuscaleclassifier kernel
};
// Detects objects at multiple scales on the GPU, reusing the OpenCL buffers
// allocated once by Init(). Candidate rectangles produced by the kernels are
// grouped on the host (groupRectangles) and returned through 'faces'.
void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv::Rect>& faces,
                                                        double scaleFactor, int minNeighbors, int flags,
                                                        Size minSize, Size maxSize)
{
    // Kernel launch geometry: 8x8 work-groups, 12 groups per compute unit.
    int blocksize = 8;
    int grp_per_CU = 12;
    size_t localThreads[3] = { blocksize, blocksize, 1 };
    size_t globalThreads[3] = { grp_per_CU * cv::ocl::Context::getContext()->getDeviceInfo().maxComputeUnits *localThreads[0],
                                localThreads[1],
                                1 };
    // Capacity of the candidate buffer: 256 slots per work-group.
    int outputsz = 256 * globalThreads[0] / localThreads[0];
    // One-time buffer allocation and classifier upload; no-op after first call.
    Init(gimg.rows, gimg.cols, scaleFactor, flags, outputsz, localThreads, minSize, maxSize);
    const double GROUP_EPS = 0.2;
    cv::ConcurrentRectVector allCandidates;
    std::vector<cv::Rect> rectList;
    std::vector<int> rweights;
    CvHaarClassifierCascade *cascade = oldCascade;
    GpuHidHaarClassifierCascade *gcascade;
    GpuHidHaarStageClassifier *stage;
    // Only single-channel 8-bit input is processed; color input is converted.
    if( CV_MAT_DEPTH(gimg.type()) != CV_8U )
        CV_Error( CV_StsUnsupportedFormat, "Only 8-bit images are supported" );
    if( CV_MAT_CN(gimg.type()) > 1 )
    {
        oclMat gtemp;
        cvtColor( gimg, gtemp, CV_BGR2GRAY );
        gimg = gtemp;
    }
    int *candidate;
    cl_command_queue qu = getClCommandQueue(Context::getContext());
    if( (flags & CV_HAAR_SCALE_IMAGE) )
    {
        // --- Pyramid mode: resize the image once per precomputed scale and
        // stack the integral images of all scales vertically in gsum/gsqsum_t.
        int indexy = 0;
        CvSize sz;
        cv::Rect roi, roi2;
        cv::ocl::oclMat resizeroi, gimgroi, gimgroisq;
        for( int i = 0; i < m_loopcount; i++ )
        {
            sz = sizev[i];
            // Integral images are one pixel larger than the resized image,
            // hence the -1 on the resize ROI.
            roi = Rect(0, indexy, sz.width, sz.height);
            roi2 = Rect(0, 0, sz.width - 1, sz.height - 1);
            resizeroi = gimg1(roi2);
            gimgroi = gsum(roi);
            gimgroisq = gsqsum_t(roi);
            cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR);
            cv::ocl::integral(resizeroi, gimgroi, gimgroisq);
            indexy += sz.height;
        }
        // The kernel consumes a float squared-sum image; convert if the device
        // computed the integral in double precision.
        if(gsqsum_t.depth() == CV_64F)
            gsqsum_t.convertTo(gsqsum, CV_32FC1);
        else
            gsqsum = gsqsum_t;
        // Hidden cascade layout: header, then stages, laid out contiguously.
        gcascade = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
        stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
        int startstage = 0;
        int endstage = gcascade->count;
        int startnode = 0;
        int pixelstep = gsum.step / 4;
        // The kernel splits processing after the first three stages.
        int splitstage = 3;
        int splitnode = stage[0].count + stage[1].count + stage[2].count;
        // Window corner offsets (p) and normalization-window offsets (pq)
        // precomputed on the hidden cascade, plus the inverse window area.
        cl_int4 p, pq;
        p.s[0] = gcascade->p0;
        p.s[1] = gcascade->p1;
        p.s[2] = gcascade->p2;
        p.s[3] = gcascade->p3;
        pq.s[0] = gcascade->pq0;
        pq.s[1] = gcascade->pq1;
        pq.s[2] = gcascade->pq2;
        pq.s[3] = gcascade->pq3;
        float correction = gcascade->inv_window_area;
        // Kernel argument list; order must match gpuRunHaarClassifierCascade.
        vector<pair<size_t, const void *> > args;
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->nodebuffer ));
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&pixelstep ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitnode ));
        args.push_back ( make_pair(sizeof(cl_int4) , (void *)&p ));
        args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
        args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
        const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
        // Read the whole candidate buffer back; a nonzero width in slot [2]
        // marks a valid candidate rectangle.
        candidate = (int *)malloc(4 * sizeof(int) * outputsz);
        memset(candidate, 0, 4 * sizeof(int) * outputsz);
        openCLReadBuffer( gsum.clCxt, ((OclBuffers *)buffers)->candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
        for(int i = 0; i < outputsz; i++)
        {
            if(candidate[4 * i + 2] != 0)
            {
                allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
                                             candidate[4 * i + 2], candidate[4 * i + 3]));
            }
        }
        free((void *)candidate);
        candidate = NULL;
    }
    else
    {
        // --- Scaled-classifier mode: one integral image; the tree nodes were
        // pre-scaled per factor into newnodebuffer by Init().
        cv::ocl::integral(gimg, gsum, gsqsum_t);
        if(gsqsum_t.depth() == CV_64F)
            gsqsum_t.convertTo(gsqsum, CV_32FC1);
        else
            gsqsum = gsqsum_t;
        gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
        int step = gsum.step / 4;
        int startnode = 0;
        int splitstage = 3;
        int startstage = 0;
        int endstage = gcascade->count;
        // Kernel argument list; order must match gpuRunHaarClassifierCascade_scaled2.
        vector<pair<size_t, const void *> > args;
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer ));
        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer ));
        args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum ));
        const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1, build_options);
        // Map the candidate buffer (allocated with CL_MEM_ALLOC_HOST_PTR) for
        // host access. NOTE(review): both the errcode_ret of the map call and
        // the returned pointer are unchecked — confirm failure handling.
        candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL);
        for(int i = 0; i < outputsz; i++)
        {
            if(candidate[4 * i + 2] != 0)
                allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
                                             candidate[4 * i + 2], candidate[4 * i + 3]));
        }
        clEnqueueUnmapMemObject(qu, ((OclBuffers *)buffers)->candidatebuffer, candidate, 0, 0, 0);
    }
    // Group overlapping candidates on the host and emit the final result.
    rectList.resize(allCandidates.size());
    if(!allCandidates.empty())
        std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
    if( minNeighbors != 0 || findBiggestObject )
        groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
    else
        rweights.resize(rectList.size(), 0);
    GenResult(faces, rectList, rweights);
}
// One-time initialization of the buffered classifier: validates arguments,
// builds the hidden GPU cascade, allocates the OpenCL buffers and uploads the
// classifier data (pre-scaling tree nodes per factor when the image is not
// resized). Subsequent calls return immediately.
void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
                                            double scaleFactor, int flags,
                                            const int outputsz, const size_t localThreads[],
                                            CvSize minSize, CvSize maxSize)
{
    if(initialized)
    {
        return; // we only allow one time initialization
    }
    CvHaarClassifierCascade *cascade = oldCascade;
    if( !CV_IS_HAAR_CLASSIFIER(cascade) )
        CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
    if( scaleFactor <= 1 )
        CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
    if( cols < minSize.width || rows < minSize.height )
        CV_Error(CV_StsError, "Image too small");
    int datasize=0;
    int totalclassifier=0;
    // Build the flat GPU representation of the cascade on first use.
    if( !cascade->hid_cascade )
    {
        gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
    }
    // A zero maxSize means "no upper bound": clamp to the full image.
    if( maxSize.height == 0 || maxSize.width == 0 )
    {
        maxSize.height = rows;
        maxSize.width = cols;
    }
    // Biggest-object search is incompatible with pyramid scaling and Canny
    // pruning, so strip those flags.
    findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
    if( findBiggestObject )
        flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
    CreateBaseBufs(datasize, totalclassifier, flags, outputsz);
    CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize);
    // Remember the parameters so later re-initialization logic can compare.
    m_scaleFactor = scaleFactor;
    m_rows = rows;
    m_cols = cols;
    m_flags = flags;
    m_minSize = minSize;
    m_maxSize = maxSize;
    // initialize nodes
    GpuHidHaarClassifierCascade *gcascade;
    GpuHidHaarStageClassifier *stage;
    GpuHidHaarClassifier *classifier;
    GpuHidHaarTreeNode *node;
    cl_command_queue qu = getClCommandQueue(Context::getContext());
    if( (flags & CV_HAAR_SCALE_IMAGE) )
    {
        // Pyramid mode: upload the stages and the unscaled nodes; the image
        // itself is resized per scale at detection time.
        gcascade = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
        stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
        classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
        node = (GpuHidHaarTreeNode *)(classifier->node);
        gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0,
                                            sizeof(GpuHidHaarStageClassifier) * gcascade->count,
                                            stage, 0, NULL, NULL));
        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
                                            m_nodenum * sizeof(GpuHidHaarTreeNode),
                                            node, 0, NULL, NULL));
    }
    else
    {
        // Scaled-classifier mode: upload the original nodes, then run the
        // gpuscaleclassifier kernel once per factor to fill newnodebuffer
        // with rescaled node copies.
        gpuSetHaarClassifierCascade(cascade);
        gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
        stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
        classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
        node = (GpuHidHaarTreeNode *)(classifier->node);
        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
                                            m_nodenum * sizeof(GpuHidHaarTreeNode),
                                            node, 0, NULL, NULL));
        cl_int4 *p = (cl_int4 *)malloc(sizeof(cl_int4) * m_loopcount);
        float *correction = (float *)malloc(sizeof(float) * m_loopcount);
        double factor;
        for(int i = 0; i < m_loopcount; i++)
        {
            factor = scalev[i];
            // Normalization window scaled to this factor. NOTE(review): p3 is
            // used as the width and p2 as the height, mirroring the
            // equivalent loop in oclHaarDetectObjects — confirm the p0..p3
            // field layout in GpuHidHaarClassifierCascade.
            int equRect_x = (int)(factor * gcascade->p0 + 0.5);
            int equRect_y = (int)(factor * gcascade->p1 + 0.5);
            int equRect_w = (int)(factor * gcascade->p3 + 0.5);
            int equRect_h = (int)(factor * gcascade->p2 + 0.5);
            p[i].s[0] = equRect_x;
            p[i].s[1] = equRect_y;
            p[i].s[2] = equRect_x + equRect_w;
            p[i].s[3] = equRect_y + equRect_h;
            // Inverse window area used to normalize feature sums.
            correction[i] = 1. / (equRect_w * equRect_h);
            int startnodenum = m_nodenum * i;
            float factor2 = (float)factor;
            vector<pair<size_t, const void *> > args1;
            args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->nodebuffer ));
            args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
            args1.push_back ( make_pair(sizeof(cl_float) , (void *)&factor2 ));
            args1.push_back ( make_pair(sizeof(cl_float) , (void *)&correction[i] ));
            args1.push_back ( make_pair(sizeof(cl_int) , (void *)&startnodenum ));
            // One work-item per node; let the runtime choose the local size.
            size_t globalThreads2[3] = {m_nodenum, 1, 1};
            openCLExecuteKernel(Context::getContext(), &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
        }
        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->pbuffer, 1, 0, sizeof(cl_int4)*m_loopcount, p, 0, NULL, NULL));
        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->correctionbuffer, 1, 0, sizeof(cl_float)*m_loopcount, correction, 0, NULL, NULL));
        free(p);
        free(correction);
    }
    initialized = true;
}
// Allocates the buffers that do not depend on the scale factor: the stage and
// node buffers (first call only) plus the candidate output buffer, whose
// allocation flags depend on whether pyramid mode (CV_HAAR_SCALE_IMAGE) is on.
void cv::ocl::OclCascadeClassifierBuf::CreateBaseBufs(const int datasize, const int totalclassifier,
                                                      const int flags, const int outputsz)
{
    if (!initialized)
    {
        buffers = malloc(sizeof(OclBuffers));
        size_t tempSize =
            sizeof(GpuHidHaarStageClassifier) * ((GpuHidHaarClassifierCascade *)oldCascade->hid_cascade)->count;
        // Node count: whatever remains of the flat cascade blob after the
        // cascade header, the stage array and the classifier array.
        m_nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) - tempSize - sizeof(GpuHidHaarClassifier) * totalclassifier)
            / sizeof(GpuHidHaarTreeNode);
        ((OclBuffers *)buffers)->stagebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, tempSize);
        ((OclBuffers *)buffers)->nodebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, m_nodenum * sizeof(GpuHidHaarTreeNode));
    }
    else
    {
        // Both branches of the previous call created a candidate buffer, so
        // one always exists here; release it unconditionally before the
        // re-allocation below. (Previously it was released only when the
        // CV_HAAR_SCALE_IMAGE bit differed from m_flags, which leaked the old
        // cl_mem whenever the flag stayed the same across re-initialization.)
        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
    }
    if (flags & CV_HAAR_SCALE_IMAGE)
    {
        // Pyramid mode reads results back with openCLReadBuffer.
        ((OclBuffers *)buffers)->candidatebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(),
                                                                     CL_MEM_WRITE_ONLY,
                                                                     4 * sizeof(int) * outputsz);
    }
    else
    {
        // Scaled-classifier mode maps the buffer on the host, so ask the
        // runtime for host-accessible memory.
        ((OclBuffers *)buffers)->candidatebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(),
                                                                     CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                                                     4 * sizeof(int) * outputsz);
    }
}
void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
const int rows, const int cols, const int flags,
const double scaleFactor, const size_t localThreads[],
CvSize minSize, CvSize maxSize)
{
if (initialized)
{
if ((m_flags & CV_HAAR_SCALE_IMAGE) && !(flags & CV_HAAR_SCALE_IMAGE))
{
gimg1.release();
gsum.release();
gsqsum.release();
gsqsum_t.release();
}
else if (!(m_flags & CV_HAAR_SCALE_IMAGE) && (flags & CV_HAAR_SCALE_IMAGE))
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
}
else if ((m_flags & CV_HAAR_SCALE_IMAGE) && (flags & CV_HAAR_SCALE_IMAGE))
{
if (fabs(m_scaleFactor - scaleFactor) < 1e-6
&& (rows == m_rows && cols == m_cols)
&& (minSize.width == m_minSize.width)
&& (minSize.height == m_minSize.height)
&& (maxSize.width == m_maxSize.width)
&& (maxSize.height == m_maxSize.height))
{
return;
}
}
else
{
if (fabs(m_scaleFactor - scaleFactor) < 1e-6
&& (rows == m_rows && cols == m_cols)
&& (minSize.width == m_minSize.width)
&& (minSize.height == m_minSize.height)
&& (maxSize.width == m_maxSize.width)
&& (maxSize.height == m_maxSize.height))
{
return;
}
else
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
}
}
}
int loopcount;
int indexy = 0;
int totalheight = 0;
double factor;
Rect roi;
CvSize sz;
CvSize winSize0 = oldCascade->orig_window_size;
detect_piramid_info *scaleinfo;
cl_command_queue qu = getClCommandQueue(Context::getContext());
if (flags & CV_HAAR_SCALE_IMAGE)
{
for(factor = 1.f;; factor *= scaleFactor)
{
CvSize winSize = { cvRound(winSize0.width * factor), cvRound(winSize0.height * factor) };
sz.width = cvRound( cols / factor ) + 1;
sz.height = cvRound( rows / factor ) + 1;
CvSize sz1 = { sz.width - winSize0.width - 1, sz.height - winSize0.height - 1 };
if( sz1.width <= 0 || sz1.height <= 0 )
break;
if( winSize.width > maxSize.width || winSize.height > maxSize.height )
break;
if( winSize.width < minSize.width || winSize.height < minSize.height )
continue;
totalheight += sz.height;
sizev.push_back(sz);
scalev.push_back(static_cast<float>(factor));
}
loopcount = sizev.size();
gimg1.create(rows, cols, CV_8UC1);
gsum.create(totalheight + 4, cols + 1, CV_32SC1);
gsqsum.create(totalheight + 4, cols + 1, CV_32FC1);
int sdepth = 0;
if(Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
sdepth = CV_64FC1;
else
sdepth = CV_32FC1;
sdepth = CV_MAT_DEPTH(sdepth);
int type = CV_MAKE_TYPE(sdepth, 1);
gsqsum_t.create(totalheight + 4, cols + 1, type);
scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
for( int i = 0; i < loopcount; i++ )
{
sz = sizev[i];
roi = Rect(0, indexy, sz.width, sz.height);
int width = sz.width - 1 - oldCascade->orig_window_size.width;
int height = sz.height - 1 - oldCascade->orig_window_size.height;
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
((detect_piramid_info *)scaleinfo)[i].width_height = (width << 16) | height;
((detect_piramid_info *)scaleinfo)[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
((detect_piramid_info *)scaleinfo)[i].imgoff = gsum(roi).offset >> 2;
((detect_piramid_info *)scaleinfo)[i].factor = scalev[i];
indexy += sz.height;
}
}
else
{
for(factor = 1;
cvRound(factor * winSize0.width) < cols - 10 && cvRound(factor * winSize0.height) < rows - 10;
factor *= scaleFactor)
{
CvSize winSize = { cvRound( winSize0.width * factor ), cvRound( winSize0.height * factor ) };
if( winSize.width < minSize.width || winSize.height < minSize.height )
{
continue;
}
sizev.push_back(winSize);
scalev.push_back(factor);
}
loopcount = scalev.size();
if(loopcount == 0)
{
loopcount = 1;
sizev.push_back(minSize);
scalev.push_back( std::min(cvRound(minSize.width / winSize0.width), cvRound(minSize.height / winSize0.height)) );
}
((OclBuffers *)buffers)->pbuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY,
sizeof(cl_int4) * loopcount);
((OclBuffers *)buffers)->correctionbuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY,
sizeof(cl_float) * loopcount);
((OclBuffers *)buffers)->newnodebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_WRITE,
loopcount * m_nodenum * sizeof(GpuHidHaarTreeNode));
scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
for( int i = 0; i < loopcount; i++ )
{
sz = sizev[i];
factor = scalev[i];
double ystep = cv::max(2.,factor);
int width = cvRound((cols - 1 - sz.width + ystep - 1) / ystep);
int height = cvRound((rows - 1 - sz.height + ystep - 1) / ystep);
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
((detect_piramid_info *)scaleinfo)[i].width_height = (width << 16) | height;
((detect_piramid_info *)scaleinfo)[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
((detect_piramid_info *)scaleinfo)[i].imgoff = 0;
((detect_piramid_info *)scaleinfo)[i].factor = factor;
}
}
if (loopcount != m_loopcount)
{
if (initialized)
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
}
((OclBuffers *)buffers)->scaleinfobuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
}
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
sizeof(detect_piramid_info)*loopcount,
scaleinfo, 0, NULL, NULL));
free(scaleinfo);
m_loopcount = loopcount;
}
// Convert the grouped rectangle list produced by detection into the output
// face vector. When findBiggestObject is set, only the largest rectangle
// (with its neighbor count) is reported; otherwise every grouped rectangle
// is passed through. Results are staged in a legacy CvSeq of CvAvgComp so
// the behavior matches the C-API cascade path.
void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector<cv::Rect>& faces,
                                                 const std::vector<cv::Rect> &rectList,
                                                 const std::vector<int> &rweights)
{
    MemStorage tempStorage(cvCreateMemStorage(0));
    CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), tempStorage );

    if( findBiggestObject && !rectList.empty() )
    {
        // Keep only the rectangle with the maximal area.
        CvAvgComp best = {{0, 0, 0, 0}, 0};
        for( size_t k = 0; k < rectList.size(); k++ )
        {
            const cv::Rect& cand = rectList[k];
            if( cand.area() > cv::Rect(best.rect).area() )
            {
                best.rect = cand;
                best.neighbors = rweights[k];
            }
        }
        cvSeqPush( result_seq, &best );
    }
    else
    {
        // Forward every grouped rectangle together with its neighbor count.
        for( size_t k = 0; k < rectList.size(); k++ )
        {
            CvAvgComp comp;
            comp.rect = rectList[k];
            comp.neighbors = rweights[k];
            cvSeqPush( result_seq, &comp );
        }
    }

    // Copy the sequence out and strip the neighbor counts, keeping rects only.
    std::vector<CvAvgComp> vecAvgComp;
    Seq<CvAvgComp>(result_seq).copyTo(vecAvgComp);
    faces.resize(vecAvgComp.size());
    std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
}
void cv::ocl::OclCascadeClassifierBuf::release()
{
if(initialized)
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
if( (m_flags & CV_HAAR_SCALE_IMAGE) )
{
cvFree(&oldCascade->hid_cascade);
}
else
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
}
free(buffers);
buffers = NULL;
initialized = false;
}
}
#ifndef _MAX_PATH
#define _MAX_PATH 1024
#endif
#endif
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Wang Weiyan, wangweiyanster@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Nathan, liujun@multicorewareinc.com
// Peng Xiao, pengxiao@outlook.com
// Erping Pang, erping@multicorewareinc.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
#define CV_HAAR_FEATURE_MAX 3
#define calc_sum(rect,offset) (sum[(rect).p0+offset] - sum[(rect).p1+offset] - sum[(rect).p2+offset] + sum[(rect).p3+offset])
#define calc_sum1(rect,offset,i) (sum[(rect).p0[i]+offset] - sum[(rect).p1[i]+offset] - sum[(rect).p2[i]+offset] + sum[(rect).p3[i]+offset])
typedef int sumtype;
typedef float sqsumtype;
#ifndef STUMP_BASED
#define STUMP_BASED 1
#endif
// One weak-classifier node of the GPU cascade. The field layout and the
// 128-byte alignment must match the host-side mirror of this struct exactly:
// the host uploads node arrays into a device buffer as raw bytes and the
// kernels index them with sizeof(GpuHidHaarTreeNode).
typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
{
// Precomputed integral-image corner offsets (p0..p3) for up to 3 feature rects.
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX]; // weight applied to each rect sum
float threshold;                   // node threshold; scaled by variance_norm_factor at runtime
// Leaf values: kernels use alpha[0]/alpha[1] for stumps and alpha[2] for the
// second level of 2-node trees (non-STUMP_BASED path).
float alpha[3] __attribute__((aligned (16)));
int left __attribute__((aligned (4)));  // nonzero => descend on "fail" (see non-stump kernel path)
int right __attribute__((aligned (4))); // nonzero => descend on "pass"
}
GpuHidHaarTreeNode;
//typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
//{
// int count __attribute__((aligned (4)));
// GpuHidHaarTreeNode* node __attribute__((aligned (8)));
// float* alpha __attribute__((aligned (8)));
//}
//GpuHidHaarClassifier;
// One cascade stage. Layout/alignment must match the host-side mirror since
// stages are uploaded as raw bytes and indexed by sizeof() in the kernels.
typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
{
int count __attribute__((aligned (4)));       // number of tree nodes in this stage
float threshold __attribute__((aligned (4))); // stage passes when the summed node responses reach this
int two_rects __attribute__((aligned (4)));   // assumed: set when all stage features use only 2 rects -- not read by these kernels
int reserved0 __attribute__((aligned (8)));   // padding to keep the 64-byte stride
int reserved1 __attribute__((aligned (8)));
int reserved2 __attribute__((aligned (8)));
int reserved3 __attribute__((aligned (8)));
}
GpuHidHaarStageClassifier;
//typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
//{
// int count __attribute__((aligned (4)));
// int is_stump_based __attribute__((aligned (4)));
// int has_tilted_features __attribute__((aligned (4)));
// int is_tree __attribute__((aligned (4)));
// int pq0 __attribute__((aligned (4)));
// int pq1 __attribute__((aligned (4)));
// int pq2 __attribute__((aligned (4)));
// int pq3 __attribute__((aligned (4)));
// int p0 __attribute__((aligned (4)));
// int p1 __attribute__((aligned (4)));
// int p2 __attribute__((aligned (4)));
// int p3 __attribute__((aligned (4)));
// float inv_window_area __attribute__((aligned (4)));
//} GpuHidHaarClassifierCascade;
#ifdef PACKED_CLASSIFIER
// this code is scalar, one pixel -> one workitem
// Scalar Haar-cascade kernel: one work-item classifies one window position.
// The host precomputes per-work-group metadata (pWGInfo) so that no
// work-group is empty, and repacks tree nodes into int4 triplets (pNodesPK)
// whose 16-bit halves are offsets directly into the local integral-image tile.
__kernel void gpuRunHaarClassifierCascadePacked(
global const GpuHidHaarStageClassifier * stagecascadeptr,
global const int4 * info,
global const GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum,
global const float * restrict sqsum,
volatile global int4 * candidate,
const int pixelstep,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
const int splitnode,
const int4 p,
const int4 pq,
const float correction,
global const int* pNodesPK,
global const int4* pWGInfo
)
{
// this version used information provided for each workgroup
// no empty WG
int gid = (int)get_group_id(0);
int lid_x = (int)get_local_id(0);
int lid_y = (int)get_local_id(1);
int lid = lid_y*LSx+lid_x;
int4 WGInfo = pWGInfo[gid];
// WGInfo packing: .x = (Width<<16)|Height, .y = (GroupX<<16)|GroupY,
// .z = offset of this scale's integral image, .w = scale factor (float bits).
int GroupX = (WGInfo.y >> 16)&0xFFFF;
int GroupY = (WGInfo.y >> 0 )& 0xFFFF;
int Width = (WGInfo.x >> 16)&0xFFFF;
int Height = (WGInfo.x >> 0 )& 0xFFFF;
int ImgOffset = WGInfo.z;
float ScaleFactor = as_float(WGInfo.w);
// Local tile: the group's windows plus a WND_SIZE_X/Y apron.
#define DATA_SIZE_X (LSx+WND_SIZE_X)
#define DATA_SIZE_Y (LSy+WND_SIZE_Y)
#define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y)
local int SumL[DATA_SIZE];
// read input data window into local mem
for(int i = 0; i<DATA_SIZE; i+=(LSx*LSy))
{
int index = i+lid; // index in shared local memory
if(index<DATA_SIZE)
{// calc global x,y coordinat and read data from there
// Clamp to the image border so apron reads stay in bounds.
int x = min(GroupX + (index % (DATA_SIZE_X)),Width-1);
int y = min(GroupY + (index / (DATA_SIZE_X)),Height-1);
SumL[index] = sum[ImgOffset+y*pixelstep+x];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// calc variance_norm_factor for all stages
float variance_norm_factor;
int nodecounter= startnode;
int4 info1 = p;  // window corner offsets within the local tile
int4 info2 = pq; // window corner offsets within the global sqsum image
{
int xl = lid_x;
int yl = lid_y;
int OffsetLocal = yl * DATA_SIZE_X + xl;
int OffsetGlobal = (GroupY+yl)* pixelstep + (GroupX+xl);
// add shift to get position on scaled image
OffsetGlobal += ImgOffset;
// Window sum from the cached integral image.
float mean =
SumL[info1.y*DATA_SIZE_X+info1.x+OffsetLocal] -
SumL[info1.y*DATA_SIZE_X+info1.z+OffsetLocal] -
SumL[info1.w*DATA_SIZE_X+info1.x+OffsetLocal] +
SumL[info1.w*DATA_SIZE_X+info1.z+OffsetLocal];
// Window sum of squares from the global squared-integral image.
float sq =
sqsum[info2.y*pixelstep+info2.x+OffsetGlobal] -
sqsum[info2.y*pixelstep+info2.z+OffsetGlobal] -
sqsum[info2.w*pixelstep+info2.x+OffsetGlobal] +
sqsum[info2.w*pixelstep+info2.z+OffsetGlobal];
mean *= correction;
sq *= correction;
// Standard deviation of the window; degenerate windows fall back to 1.0.
variance_norm_factor = sq - mean * mean;
variance_norm_factor = (variance_norm_factor >=0.f) ? sqrt(variance_norm_factor) : 1.f;
}// end calc variance_norm_factor for all stages
int result = (1.0f>0.0f); // start as "still a candidate"
for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ )
{// iterate until candidate is exist
float stage_sum = 0.0f;
__global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
int stagecount = stageinfo->count;
float stagethreshold = stageinfo->threshold;
int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ )
{
// simple macro to extract shorts from int
#define M0(_t) ((_t)&0xFFFF)
#define M1(_t) (((_t)>>16)&0xFFFF)
// load packed node data from global memory (L3) into registers
global const int4* pN = (__global int4*)(pNodesPK+nodecounter*NODE_SIZE);
int4 n0 = pN[0]; // packed 16-bit tile offsets: rect0 and rect1 corner pairs
int4 n1 = pN[1]; // .xy: rect2 corner pairs; .zw: rect0/rect1 weights (float bits)
int4 n2 = pN[2]; // .x: rect2 weight, .y: node threshold, .zw: the two leaf alphas (float bits)
float nodethreshold = as_float(n2.y) * variance_norm_factor;
// calc sum of intensity pixels according to node information
float classsum =
(SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) +
(SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) +
(SumL[M0(n1.x)+lcl_off] - SumL[M1(n1.x)+lcl_off] - SumL[M0(n1.y)+lcl_off] + SumL[M1(n1.y)+lcl_off]) * as_float(n2.x);
// accumulate stage response
stage_sum += (classsum >= nodethreshold) ? as_float(n2.w) : as_float(n2.z);
}
result = (stage_sum >= stagethreshold);
}// next stage if needed
if(result)
{// all stages will be passed and there is a detected face on the tested position
// candidate[0].x doubles as a global atomic counter; slots 1..OUTPUTSZ-1 hold results.
int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info
if(index<OUTPUTSZ)
{
int x = GroupX+lid_x;
int y = GroupY+lid_y;
int4 candidate_result;
// Map the hit back to original-image coordinates/size.
candidate_result.x = convert_int_rtn(x*ScaleFactor);
candidate_result.y = convert_int_rtn(y*ScaleFactor);
candidate_result.z = convert_int_rtn(ScaleFactor*WND_SIZE_X);
candidate_result.w = convert_int_rtn(ScaleFactor*WND_SIZE_Y);
candidate[index] = candidate_result;
}
}
}//end gpuRunHaarClassifierCascade
#else
// Two-phase Haar-cascade kernel for the image-pyramid path (window fixed at
// 20x20, image rescaled per level). Each 8x8 work-group caches an integral-
// image tile in local memory, runs the early stages (start_stage..split_stage)
// with one work-item per pixel, queues survivors in local memory, then
// evaluates the remaining stages cooperatively (several work-items share one
// queued candidate and reduce partial sums).
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum1,
global const float * restrict sqsum1,
global int4 * candidate,
const int pixelstep,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
const int splitnode,
const int4 p,
const int4 pq,
const float correction)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx = get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
// 1024 ints of local memory, manually partitioned:
//   [0, 28*28)          integral-image tile (readwidth x readheight fits 28x28 for 8x8 WG + 21-px window)
//   [784]               cached global output index
//   [785]               count of candidates queued this round
//   [786, 786+128)      queued candidates: (y<<16|x, variance_norm_factor bits) pairs
//   remainder           per-work-item partial stage sums (as float)
__local int lclshare[1024];
__local int* lcldata = lclshare;//for save win data
__local int* glboutindex = lcldata + 28*28;//for save global out index
__local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel
__local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel
__local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1));
glboutindex[0]=0;
int outputoff = mul24(grpidx,256); // each group owns a 256-slot region of 'candidate'
//assume window size is 20X20
#define WINDOWSIZE 20+1
//make sure readwidth is the multiple of 4
//ystep =1, from host code
int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2;
int readheight = grpszy-1+WINDOWSIZE;
int read_horiz_cnt = readwidth >> 2;//each read int4
int total_read = mul24(read_horiz_cnt,readheight);
int read_loop = (total_read + lcl_sz - 1) >> 6;
// Clear this group's output region (4 int4 slots per work-item = 256 slots).
candidate[outputoff+(lcl_id<<2)] = (int4)0;
candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
for(int scalei = 0; scalei <loopcount; scalei++)
{
// Per-scale metadata packed by the host: .x low 16 bits = height,
// .y = (groups-per-line<<16)|total-groups, .z = image offset, .w = factor bits.
int4 scaleinfo1= info[scalei];
int height = scaleinfo1.x & 0xffff;
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
int imgoff = scaleinfo1.z;
float factor = as_float(scaleinfo1.w);
__global const int * sum = sum1 + imgoff;
__global const float * sqsum = sqsum1 + imgoff;
// Stripe the scale's work-groups over the launched groups.
for(int grploop=grpidx; grploop<totalgrp; grploop+=grpnumx)
{
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int x = mad24(grpidx,grpszx,lclidx);
int y = mad24(grpidy,grpszy,lclidy);
int grpoffx = x-lclidx;
int grpoffy = y-lclidy;
// Cooperative int4 loads of the tile into local memory.
for(int i=0; i<read_loop; i++)
{
int pos_id = mad24(i,lcl_sz,lcl_id);
pos_id = pos_id < total_read ? pos_id : 0;
int lcl_y = pos_id / read_horiz_cnt;
int lcl_x = pos_id - mul24(lcl_y, read_horiz_cnt);
int glb_x = grpoffx + (lcl_x<<2);
int glb_y = grpoffy + lcl_y;
// Row index clamped so the bottom apron does not run past the scale's data.
int glb_off = mad24(min(glb_y, height + WINDOWSIZE - 1),pixelstep,glb_x);
int4 data = *(__global int4*)&sum[glb_off];
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
vstore4(data, 0, &lcldata[lcl_off]);
}
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int result = 1;
int nodecounter= startnode;
float mean, variance_norm_factor;
barrier(CLK_LOCAL_MEM_FENCE);
int lcl_off = mad24(lclidy,readwidth,lclidx);
int4 cascadeinfo1, cascadeinfo2;
cascadeinfo1 = p;
cascadeinfo2 = pq;
// Window mean from the local tile; sum of squares from global sqsum.
cascadeinfo1.x +=lcl_off;
cascadeinfo1.z +=lcl_off;
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
*correction;
int p_offset = mad24(y, pixelstep, x);
cascadeinfo2.x +=p_offset;
cascadeinfo2.z +=p_offset;
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
// Std deviation of the window; degenerate windows fall back to 1.0.
variance_norm_factor = variance_norm_factor * correction - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
// ---- Phase 1: per-pixel evaluation of the early stages ----
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
{
float stage_sum = 0.f;
__global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
int stagecount = stageinfo->count;
float stagethreshold = stageinfo->threshold;
for(int nodeloop = 0; nodeloop < stagecount; )
{
__global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*)
(((__global uchar*)nodeptr) + nodecounter * sizeof(GpuHidHaarTreeNode));
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor; // w.w aliases the node threshold (follows weight[3] in memory)
info1.x +=lcl_off;
info1.z +=lcl_off;
info2.x +=lcl_off;
info2.z +=lcl_off;
// Weighted sums of the (up to) three feature rects, read from the local tile.
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
info3.x +=lcl_off;
info3.z +=lcl_off;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
bool passThres = classsum >= nodethreshold;
#if STUMP_BASED
stage_sum += passThres ? alpha3.y : alpha3.x;
nodecounter++;
nodeloop++;
#else
// Two-level trees: even node indices are roots; a nonzero child flag
// routes evaluation into the next node, otherwise alpha is emitted.
bool isRootNode = (nodecounter & 1) == 0;
if(isRootNode)
{
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
nodecounter ++;
}
else
{
stage_sum += alpha3.x;
nodecounter += 2;
nodeloop ++;
}
}
else
{
stage_sum += passThres ? alpha3.z : alpha3.y;
nodecounter ++;
nodeloop ++;
}
#endif
}
result = (stage_sum >= stagethreshold) ? 1 : 0;
}
// Queue survivors for phase 2. For factor < 2 only every other pixel in
// each direction is kept, emulating the host's ystep of 2.
if(factor < 2)
{
if(result && lclidx %2 ==0 && lclidy %2 ==0 )
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor);
}
}
else
{
if(result)
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
barrier(CLK_LOCAL_MEM_FENCE);
nodecounter = splitnode;
// ---- Phase 2: cooperative evaluation of the remaining stages ----
// Work-items are grouped into "compute windows" of lcl_sz>>perfscale items;
// each window shares one queued candidate and splits its stage nodes.
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
{
lclcount[0]=0;
barrier(CLK_LOCAL_MEM_FENCE);
//int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
__global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
int stagecount = stageinfo->count;
float stagethreshold = stageinfo->threshold;
int perfscale = queuecount > 4 ? 3 : 2;
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
int lcl_compute_win = lcl_sz >> perfscale;
int lcl_compute_win_id = (lcl_id >>(6-perfscale)); // 6 == log2(lcl_sz) for an 8x8 group
int lcl_loops = (stagecount + lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0; queueloop<queuecount_loop; queueloop++)
{
float stage_sum = 0.f;
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
if(lcl_compute_win_id < queuecount)
{
// Each work-item strides through the stage's nodes by lcl_compute_win.
int tempnodecounter = lcl_compute_id;
float part_sum = 0.f;
const int stump_factor = STUMP_BASED ? 1 : 2;
int root_offset = 0;
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stagecount;)
{
__global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*)
(((__global uchar*)nodeptr) + sizeof(GpuHidHaarTreeNode) * ((nodecounter + tempnodecounter) * stump_factor + root_offset));
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_pixel;
info1.z +=queue_pixel;
info2.x +=queue_pixel;
info2.z +=queue_pixel;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
info3.x +=queue_pixel;
info3.z +=queue_pixel;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
bool passThres = classsum >= nodethreshold;
#if STUMP_BASED
part_sum += passThres ? alpha3.y : alpha3.x;
tempnodecounter += lcl_compute_win;
lcl_loop++;
#else
// root_offset toggles between root node (0) and its second node (1).
if(root_offset == 0)
{
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
root_offset = 1;
}
else
{
part_sum += alpha3.x;
tempnodecounter += lcl_compute_win;
lcl_loop++;
}
}
else
{
part_sum += passThres ? alpha3.z : alpha3.y;
tempnodecounter += lcl_compute_win;
lcl_loop++;
root_offset = 0;
}
#endif
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount)
{
// First item of each compute window reduces the partial sums and
// re-queues the candidate if the stage passes.
for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
{
stage_sum += partialsum[lcl_id+i];
}
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = temp_coord;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
lcl_compute_win_id +=(1<<perfscale);
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
queuecount = lclcount[0];
barrier(CLK_LOCAL_MEM_FENCE);
nodecounter += stagecount;
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
// Emit the survivors into this group's region of the candidate buffer.
// A slot with .z == 0 is free; occupied slots are skipped forward.
// NOTE(review): the forward scan below has no upper bound -- it relies on
// the 256-slot region always having a free slot; confirm against host sizing.
if(lcl_id<queuecount)
{
int temp = lcloutindex[lcl_id<<1];
int x = mad24(grpidx,grpszx,temp & 0xffff);
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
temp = glboutindex[0];
int4 candidate_result;
// Map back to original-image coordinates; size is factor * 20x20 window.
candidate_result.zw = (int2)convert_int_rte(factor*20.f);
candidate_result.x = convert_int_rte(x*factor);
candidate_result.y = convert_int_rte(y*factor);
atomic_inc(glboutindex);
int i = outputoff+temp+lcl_id;
if(candidate[i].z == 0)
{
candidate[i] = candidate_result;
}
else
{
for(i=i+1;;i++)
{
if(candidate[i].z == 0)
{
candidate[i] = candidate_result;
break;
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
}//end for(int scalei = 0; scalei <loopcount; scalei++)
}
#endif
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Wu Xinglong, wxl370@126.com
// Sen Liu, swjtuls1987@126.com
// Peng Xiao, pengxiao@outlook.com
// Erping Pang, erping@multicorewareinc.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#define CV_HAAR_FEATURE_MAX 3
typedef int sumtype;
typedef float sqsumtype;
// One weak-classifier node of the GPU cascade (scaled-window kernel's copy).
// The field layout and 128-byte alignment must match the host-side mirror:
// node arrays are uploaded as raw bytes and indexed with sizeof().
typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
{
// Precomputed integral-image corner offsets (p0..p3) for up to 3 feature rects.
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/; // weight per feature rect
float threshold /*__attribute__((aligned (4)))*/; // node threshold; scaled by variance_norm_factor at runtime
// Leaf values: alpha[0]/alpha[1] for stumps, alpha[2] for 2-level trees.
float alpha[3] __attribute__((aligned(16)));
int left __attribute__((aligned(4)));  // nonzero => descend on "fail" (non-stump path)
int right __attribute__((aligned(4))); // nonzero => descend on "pass"
}
GpuHidHaarTreeNode;
//typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier
//{
// int count __attribute__((aligned(4)));
// GpuHidHaarTreeNode *node __attribute__((aligned(8)));
// float *alpha __attribute__((aligned(8)));
//}
//GpuHidHaarClassifier;
// One cascade stage (scaled-window kernel's copy). Layout/alignment must
// match the host-side mirror; stages are uploaded as raw bytes.
typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
{
int count __attribute__((aligned(4)));       // number of tree nodes in this stage
float threshold __attribute__((aligned(4))); // stage passes when summed node responses reach this
int two_rects __attribute__((aligned(4)));   // assumed: set when all stage features use only 2 rects -- not read by this kernel
int reserved0 __attribute__((aligned(8)));   // padding to keep the 64-byte stride
int reserved1 __attribute__((aligned(8)));
int reserved2 __attribute__((aligned(8)));
int reserved3 __attribute__((aligned(8)));
}
GpuHidHaarStageClassifier;
//typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade
//{
// int count __attribute__((aligned(4)));
// int is_stump_based __attribute__((aligned(4)));
// int has_tilted_features __attribute__((aligned(4)));
// int is_tree __attribute__((aligned(4)));
// int pq0 __attribute__((aligned(4)));
// int pq1 __attribute__((aligned(4)));
// int pq2 __attribute__((aligned(4)));
// int pq3 __attribute__((aligned(4)));
// int p0 __attribute__((aligned(4)));
// int p1 __attribute__((aligned(4)));
// int p2 __attribute__((aligned(4)));
// int p3 __attribute__((aligned(4)));
// float inv_window_area __attribute__((aligned(4)));
//} GpuHidHaarClassifierCascade;
__kernel void gpuRunHaarClassifierCascade_scaled2(
global GpuHidHaarStageClassifier *stagecascadeptr_,
global int4 *info,
global GpuHidHaarTreeNode *nodeptr_,
global const int *restrict sum,
global const float *restrict sqsum,
global int4 *candidate,
const int rows,
const int cols,
const int step,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
global int4 *p,
global float *correction,
const int nodecount)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx = get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_id = mad24(lclidy, grpszx, lclidx);
__local int glboutindex[1];
__local int lclcount[1];
__local int lcloutindex[64];
glboutindex[0] = 0;
int outputoff = mul24(grpidx, 256);
candidate[outputoff + (lcl_id << 2)] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
// ---- tail of the Haar-cascade detection kernel (signature is above this chunk) ----
// NOTE(review): rows/cols/sum/sqsum/info/p/correction/outputoff/startnode/nodecount,
// the lcl* ids and the lcl* local buffers come from the kernel signature / earlier
// code not visible here; comments on them are inferred and should be confirmed.
// Zero the sentinel slot for this work-item's candidate group (z==0 marks "free").
candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
// Highest valid linear index into sum/sqsum; used to clamp every lookup in bounds.
int max_idx = rows * cols - 1;
// One iteration per pyramid scale.
for (int scalei = 0; scalei < loopcount; scalei++)
{
// Packed per-scale metadata: y holds group counts, w holds the scale factor bits.
int4 scaleinfo1 = info[scalei];
int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
// Reinterpret the raw int bits as the float scale factor.
float factor = as_float(scaleinfo1.w);
// Per-scale normalization constant (presumably 1/window_area — confirm against host).
float correction_t = correction[scalei];
// Window stride in pixels: at least 2, grows with the scale factor.
float ystep = max(2.0f, factor);
// Work-groups stride over all window groups of this scale.
for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
{
// Per-scale window rectangle (x, top, x2, bottom offsets) for normalization.
int4 cascadeinfo = p[scalei];
// Decompose the linear group id into a 2-D group coordinate.
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
// Window grid coordinate of this work-item, then its pixel position.
int ix = mad24(grpidx, grpszx, lclidx);
int iy = mad24(grpidy, grpszy, lclidy);
int x = round(ix * ystep);
int y = round(iy * ystep);
// Reset the local candidate queue for this group iteration.
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int nodecounter;
float mean, variance_norm_factor;
//if((ix < width) && (iy < height))
{
// Linear offset of the window's top-left corner in the integral images.
const int p_offset = mad24(y, step, x);
cascadeinfo.x += p_offset;
cascadeinfo.z += p_offset;
// Window mean from the integral image: TL - TR - BL + BR, scaled by correction_t.
mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
- sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+ sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
* correction_t;
// Same four-corner lookup on the squared-sum image for the variance.
variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
- sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+ sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
// var = E[x^2] - (E[x])^2; fall back to 1 when numerically negative.
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
// Evaluate cascade stages; `result` stays true while the window survives.
bool result = true;
// Nodes for this scale start after `nodecount` nodes per earlier scale.
nodecounter = startnode + nodecount * scalei;
for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
{
float stage_sum = 0.f;
// Byte-offset addressing avoids relying on pointer arithmetic over the struct type.
__global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
(((__global uchar*)stagecascadeptr_)+stageloop*sizeof(GpuHidHaarStageClassifier));
int stagecount = stageinfo->count;
// Walk this stage's classifiers; nodeloop advances once per *classifier*,
// while nodecounter advances once per *node* (they differ for tree classifiers).
for (int nodeloop = 0; nodeloop < stagecount;)
{
__global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*)
(((__global uchar*)nodeptr_) + nodecounter * sizeof(GpuHidHaarTreeNode));
// Three feature rectangles (x, y, x2, y2 each), their weights, and leaf alphas.
int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
// w.w is the node threshold; scale it by the window's std deviation.
float nodethreshold = w.w * variance_norm_factor;
// Shift all rectangle corners to this window's position.
info1.x += p_offset;
info1.z += p_offset;
info2.x += p_offset;
info2.z += p_offset;
info3.x += p_offset;
info3.z += p_offset;
// Weighted sum of the three rectangle sums (four-corner integral lookups each).
float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
- sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
+ sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
- sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
+ sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
- sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
+ sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
bool passThres = (classsum >= nodethreshold) ? 1 : 0;
#if STUMP_BASED
// Stump classifier: one node per classifier, pick the right/left leaf alpha.
stage_sum += passThres ? alpha3.y : alpha3.x;
nodecounter++;
nodeloop++;
#else
// Depth-2 tree layout: even node index = root, odd = its single child.
bool isRootNode = (nodecounter & 1) == 0;
if(isRootNode)
{
// Non-zero left/right means the matching branch continues into the child node.
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
nodecounter ++;
}
else
{
// Leaf reached at the root: take its alpha and skip the unused child slot.
stage_sum += alpha3.x;
nodecounter += 2;
nodeloop ++;
}
}
else
{
// Child node: it is always a leaf; pick between its two alphas.
stage_sum += (passThres ? alpha3.z : alpha3.y);
nodecounter ++;
nodeloop ++;
}
#endif
}
// Window survives the stage iff the accumulated sum reaches the stage threshold.
result = (stage_sum >= stageinfo->threshold) ? 1 : 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (result)
{
// Append the surviving window to the group-local queue (packed y|x).
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex] = (y << 16) | x;
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
// First `queuecount` work-items drain the queue into the global candidate list.
if (lcl_id < queuecount)
{
int temp = lcloutindex[lcl_id];
int x = temp & 0xffff;
int y = (temp & (int)0xffff0000) >> 16;
temp = atomic_inc(glboutindex);
int4 candidate_result;
// z and w carry the detected window size (base window 20px times the scale).
candidate_result.zw = (int2)convert_int_rte(factor * 20.f);
candidate_result.x = x;
candidate_result.y = y;
int i = outputoff+temp+lcl_id;
// Linear-probe forward for a free slot (z == 0 marks an unused entry).
if(candidate[i].z == 0)
{
candidate[i] = candidate_result;
}
else
{
// NOTE(review): unbounded probe — relies on the host over-allocating
// `candidate` with enough zeroed slots; confirm against the caller.
for(i=i+1;;i++)
{
if(candidate[i].z == 0)
{
candidate[i] = candidate_result;
break;
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
// Scale one Haar tree node per work-item.
//
// Reads node get_global_id(0) from `orinode`, scales its three feature
// rectangles by `scale` (rounding to the nearest integer pixel), and writes
// the result to slot (get_global_id(0) + nodenum) of `newnode`, i.e. the
// scaled nodes are appended after an existing block of `nodenum` nodes.
//
// The first rectangle's weight is re-derived from the *scaled* rectangle
// areas so that sum(weight[i] * area[i]) stays zero (zero-mean feature),
// then all weights are multiplied by `weight_scale`. Rectangles are stored
// in the output as (x1, y1, x2, y2) corner form rather than (x, y, w, h).
__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, const int nodenum)
{
    const int gid = get_global_id(0);
    // Byte-offset addressing keeps the layout explicit for both buffers.
    GpuHidHaarTreeNode src = *(__global GpuHidHaarTreeNode*)
        (((__global uchar*)orinode) + gid * sizeof(GpuHidHaarTreeNode));
    __global GpuHidHaarTreeNode* dst = (__global GpuHidHaarTreeNode*)
        (((__global uchar*)newnode) + (gid + nodenum) * sizeof(GpuHidHaarTreeNode));

    // Scaled rectangle origin and size per feature, rounded to nearest pixel.
    int rx[3], ry[3], rw[3], rh[3];
    int k;
    #pragma unroll
    for (k = 0; k < 3; k++)
    {
        rx[k] = (int)(src.p[k][0] * scale + 0.5f);
        ry[k] = (int)(src.p[k][1] * scale + 0.5f);
        rw[k] = (int)(src.p[k][2] * scale + 0.5f);
        rh[k] = (int)(src.p[k][3] * scale + 0.5f);
    }

    // Rebalance weight[0] against the scaled areas of rectangles 1 and 2
    // so the weighted areas cancel exactly after integer rounding.
    src.weight[0] = -(src.weight[1] * rh[1] * rw[1] + src.weight[2] * rh[2] * rw[2]) / (rh[0] * rw[0]);

    #pragma unroll
    for (k = 0; k < 3; k++)
    {
        dst->p[k][0] = rx[k];
        dst->p[k][1] = ry[k];
        dst->p[k][2] = rx[k] + rw[k];   // right edge (corner form)
        dst->p[k][3] = ry[k] + rh[k];   // bottom edge (corner form)
        dst->weight[k] = src.weight[k] * weight_scale;
    }

    // Tree topology and per-leaf responses are copied through unchanged.
    dst->left = src.left;
    dst->right = src.right;
    dst->threshold = src.threshold;
    dst->alpha[0] = src.alpha[0];
    dst->alpha[1] = src.alpha[1];
    dst->alpha[2] = src.alpha[2];
}
......@@ -98,6 +98,8 @@ int main( int argc, const char** argv )
return -1;
}
cout << "old cascade: " << (cascade.isOldFormatCascade() ? "TRUE" : "FALSE") << endl;
if( inputName.empty() || (isdigit(inputName.c_str()[0]) && inputName.c_str()[1] == '\0') )
{
int c = inputName.empty() ? 0 : inputName.c_str()[0] - '0';
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment