Commit 6de42270 authored by LeonidBeynenson

Made changes to allow ml module to work with big data.

parent 8521ac5d
This diff is collapsed.
...@@ -796,7 +796,7 @@ struct CV_EXPORTS CvDTreeTrainData ...@@ -796,7 +796,7 @@ struct CV_EXPORTS CvDTreeTrainData
const CvMat* responses; const CvMat* responses;
CvMat* responses_copy; // used in Boosting CvMat* responses_copy; // used in Boosting
int buf_count, buf_size; int buf_count, buf_size; // buf_size is obsolete, please do not use it, use expression ((int64)buf->rows * (int64)buf->cols / buf_count) instead
bool shared; bool shared;
int is_buf_16u; int is_buf_16u;
...@@ -806,6 +806,12 @@ struct CV_EXPORTS CvDTreeTrainData ...@@ -806,6 +806,12 @@ struct CV_EXPORTS CvDTreeTrainData
CvMat* counts; CvMat* counts;
CvMat* buf; CvMat* buf;
// Length, in elements, of one sub-buffer inside `buf`:
// (work_var_count + 1) runs of sample_count elements each.
// Each factor is converted to size_t before the multiplication, so the
// product is computed in size_t rather than in int.
inline size_t get_length_subbuf() const
{
    return static_cast<size_t>(work_var_count + 1) * static_cast<size_t>(sample_count);
}
CvMat* direction; CvMat* direction;
CvMat* split_buf; CvMat* split_buf;
......
...@@ -1130,13 +1130,13 @@ CvBoost::update_weights( CvBoostTree* tree ) ...@@ -1130,13 +1130,13 @@ CvBoost::update_weights( CvBoostTree* tree )
int *sample_idx_buf; int *sample_idx_buf;
const int* sample_idx = 0; const int* sample_idx = 0;
cv::AutoBuffer<uchar> inn_buf; cv::AutoBuffer<uchar> inn_buf;
size_t _buf_size = (params.boost_type == LOGIT) || (params.boost_type == GENTLE) ? data->sample_count*sizeof(int) : 0; size_t _buf_size = (params.boost_type == LOGIT) || (params.boost_type == GENTLE) ? (size_t)(data->sample_count)*sizeof(int) : 0;
if( !tree ) if( !tree )
_buf_size += n*sizeof(int); _buf_size += n*sizeof(int);
else else
{ {
if( have_subsample ) if( have_subsample )
_buf_size += data->buf->cols*(sizeof(float)+sizeof(uchar)); _buf_size += data->get_length_subbuf()*(sizeof(float)+sizeof(uchar));
} }
inn_buf.allocate(_buf_size); inn_buf.allocate(_buf_size);
uchar* cur_buf_pos = (uchar*)inn_buf; uchar* cur_buf_pos = (uchar*)inn_buf;
...@@ -1151,6 +1151,7 @@ CvBoost::update_weights( CvBoostTree* tree ) ...@@ -1151,6 +1151,7 @@ CvBoost::update_weights( CvBoostTree* tree )
sample_idx = data->get_sample_indices( data->data_root, sample_idx_buf ); sample_idx = data->get_sample_indices( data->data_root, sample_idx_buf );
} }
CvMat* dtree_data_buf = data->buf; CvMat* dtree_data_buf = data->buf;
size_t length_buf_row = data->get_length_subbuf();
if( !tree ) // before training the first tree, initialize weights and other parameters if( !tree ) // before training the first tree, initialize weights and other parameters
{ {
int* class_labels_buf = (int*)cur_buf_pos; int* class_labels_buf = (int*)cur_buf_pos;
...@@ -1189,7 +1190,7 @@ CvBoost::update_weights( CvBoostTree* tree ) ...@@ -1189,7 +1190,7 @@ CvBoost::update_weights( CvBoostTree* tree )
if (data->is_buf_16u) if (data->is_buf_16u)
{ {
unsigned short* labels = (unsigned short*)(dtree_data_buf->data.s + data->data_root->buf_idx*dtree_data_buf->cols + unsigned short* labels = (unsigned short*)(dtree_data_buf->data.s + data->data_root->buf_idx*length_buf_row +
data->data_root->offset + (data->work_var_count-1)*data->sample_count); data->data_root->offset + (data->work_var_count-1)*data->sample_count);
for( i = 0; i < n; i++ ) for( i = 0; i < n; i++ )
{ {
...@@ -1207,7 +1208,7 @@ CvBoost::update_weights( CvBoostTree* tree ) ...@@ -1207,7 +1208,7 @@ CvBoost::update_weights( CvBoostTree* tree )
} }
else else
{ {
int* labels = dtree_data_buf->data.i + data->data_root->buf_idx*dtree_data_buf->cols + int* labels = dtree_data_buf->data.i + data->data_root->buf_idx*length_buf_row +
data->data_root->offset + (data->work_var_count-1)*data->sample_count; data->data_root->offset + (data->work_var_count-1)*data->sample_count;
for( i = 0; i < n; i++ ) for( i = 0; i < n; i++ )
...@@ -1254,9 +1255,10 @@ CvBoost::update_weights( CvBoostTree* tree ) ...@@ -1254,9 +1255,10 @@ CvBoost::update_weights( CvBoostTree* tree )
if( have_subsample ) if( have_subsample )
{ {
float* values = (float*)cur_buf_pos; float* values = (float*)cur_buf_pos;
cur_buf_pos = (uchar*)(values + data->buf->cols); cur_buf_pos = (uchar*)(values + data->get_length_subbuf());
uchar* missing = cur_buf_pos; uchar* missing = cur_buf_pos;
cur_buf_pos = missing + data->buf->step; cur_buf_pos = missing + data->get_length_subbuf() * (size_t)CV_ELEM_SIZE(data->buf->type);
CvMat _sample, _mask; CvMat _sample, _mask;
// invert the subsample mask // invert the subsample mask
......
...@@ -75,11 +75,14 @@ void CvERTreeTrainData::set_data( const CvMat* _train_data, int _tflag, ...@@ -75,11 +75,14 @@ void CvERTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
int sample_all = 0, r_type, cv_n; int sample_all = 0, r_type, cv_n;
int total_c_count = 0; int total_c_count = 0;
int tree_block_size, temp_block_size, max_split_size, nv_size, cv_size = 0; int tree_block_size, temp_block_size, max_split_size, nv_size, cv_size = 0;
int ds_step, dv_step, ms_step = 0, mv_step = 0; // {data|mask}{sample|var}_step int64 ds_step, dv_step, ms_step = 0, mv_step = 0; // {data|mask}{sample|var}_step
int vi, i, size; int64 vi, i, size;
char err[100]; char err[100];
const int *sidx = 0, *vidx = 0; const int *sidx = 0, *vidx = 0;
uint64 effective_buf_size = -1;
int effective_buf_height = -1, effective_buf_width = -1;
if ( _params.use_surrogates ) if ( _params.use_surrogates )
CV_ERROR(CV_StsBadArg, "CvERTrees do not support surrogate splits"); CV_ERROR(CV_StsBadArg, "CvERTrees do not support surrogate splits");
...@@ -179,18 +182,34 @@ void CvERTreeTrainData::set_data( const CvMat* _train_data, int _tflag, ...@@ -179,18 +182,34 @@ void CvERTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
have_labels = cv_n > 0 || (ord_var_count == 1 && cat_var_count == 0) || _add_labels; have_labels = cv_n > 0 || (ord_var_count == 1 && cat_var_count == 0) || _add_labels;
work_var_count = cat_var_count + (is_classifier ? 1 : 0) + (have_labels ? 1 : 0); work_var_count = cat_var_count + (is_classifier ? 1 : 0) + (have_labels ? 1 : 0);
buf_size = (work_var_count + 1)*sample_count;
shared = _shared; shared = _shared;
buf_count = shared ? 2 : 1; buf_count = shared ? 2 : 1;
buf_size = -1; // the member buf_size is obsolete
effective_buf_size = (uint64)(work_var_count + 1)*(uint64)sample_count * buf_count; // this is the total size of "CvMat buf" to be allocated
effective_buf_width = sample_count;
effective_buf_height = work_var_count+1;
if (effective_buf_width >= effective_buf_height)
effective_buf_height *= buf_count;
else
effective_buf_width *= buf_count;
if ((uint64)effective_buf_width * (uint64)effective_buf_height != effective_buf_size)
{
CV_Error(CV_StsBadArg, "The memory buffer cannot be allocated since its size exceeds integer fields limit");
}
if ( is_buf_16u ) if ( is_buf_16u )
{ {
CV_CALL( buf = cvCreateMat( buf_count, buf_size, CV_16UC1 )); CV_CALL( buf = cvCreateMat( effective_buf_height, effective_buf_width, CV_16UC1 ));
CV_CALL( pair16u32s_ptr = (CvPair16u32s*)cvAlloc( sample_count*sizeof(pair16u32s_ptr[0]) )); CV_CALL( pair16u32s_ptr = (CvPair16u32s*)cvAlloc( sample_count*sizeof(pair16u32s_ptr[0]) ));
} }
else else
{ {
CV_CALL( buf = cvCreateMat( buf_count, buf_size, CV_32SC1 )); CV_CALL( buf = cvCreateMat( effective_buf_height, effective_buf_width, CV_32SC1 ));
CV_CALL( int_ptr = (int**)cvAlloc( sample_count*sizeof(int_ptr[0]) )); CV_CALL( int_ptr = (int**)cvAlloc( sample_count*sizeof(int_ptr[0]) ));
} }
...@@ -303,7 +322,7 @@ void CvERTreeTrainData::set_data( const CvMat* _train_data, int _tflag, ...@@ -303,7 +322,7 @@ void CvERTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
val = cvRound(t); val = cvRound(t);
if( val != t ) if( val != t )
{ {
sprintf( err, "%d-th value of %d-th (categorical) " sprintf( err, "%ld-th value of %ld-th (categorical) "
"variable is not an integer", i, vi ); "variable is not an integer", i, vi );
CV_ERROR( CV_StsBadArg, err ); CV_ERROR( CV_StsBadArg, err );
} }
...@@ -311,7 +330,7 @@ void CvERTreeTrainData::set_data( const CvMat* _train_data, int _tflag, ...@@ -311,7 +330,7 @@ void CvERTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
if( val == INT_MAX ) if( val == INT_MAX )
{ {
sprintf( err, "%d-th value of %d-th (categorical) " sprintf( err, "%ld-th value of %ld-th (categorical) "
"variable is too large", i, vi ); "variable is too large", i, vi );
CV_ERROR( CV_StsBadArg, err ); CV_ERROR( CV_StsBadArg, err );
} }
...@@ -414,7 +433,7 @@ void CvERTreeTrainData::set_data( const CvMat* _train_data, int _tflag, ...@@ -414,7 +433,7 @@ void CvERTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
if( fabs(val) >= ord_nan ) if( fabs(val) >= ord_nan )
{ {
sprintf( err, "%d-th value of %d-th (ordered) " sprintf( err, "%ld-th value of %ld-th (ordered) "
"variable (=%g) is too large", i, vi, val ); "variable (=%g) is too large", i, vi, val );
CV_ERROR( CV_StsBadArg, err ); CV_ERROR( CV_StsBadArg, err );
} }
...@@ -578,9 +597,9 @@ const int* CvERTreeTrainData::get_cat_var_data( CvDTreeNode* n, int vi, int* cat ...@@ -578,9 +597,9 @@ const int* CvERTreeTrainData::get_cat_var_data( CvDTreeNode* n, int vi, int* cat
int ci = get_var_type( vi); int ci = get_var_type( vi);
const int* cat_values = 0; const int* cat_values = 0;
if( !is_buf_16u ) if( !is_buf_16u )
cat_values = buf->data.i + n->buf_idx*buf->cols + ci*sample_count + n->offset; cat_values = buf->data.i + n->buf_idx*get_length_subbuf() + ci*sample_count + n->offset;
else { else {
const unsigned short* short_values = (const unsigned short*)(buf->data.s + n->buf_idx*buf->cols + const unsigned short* short_values = (const unsigned short*)(buf->data.s + n->buf_idx*get_length_subbuf() +
ci*sample_count + n->offset); ci*sample_count + n->offset);
for( int i = 0; i < n->sample_count; i++ ) for( int i = 0; i < n->sample_count; i++ )
cat_values_buf[i] = short_values[i]; cat_values_buf[i] = short_values[i];
...@@ -1333,6 +1352,7 @@ void CvForestERTree::split_node_data( CvDTreeNode* node ) ...@@ -1333,6 +1352,7 @@ void CvForestERTree::split_node_data( CvDTreeNode* node )
CvDTreeNode *left = 0, *right = 0; CvDTreeNode *left = 0, *right = 0;
int new_buf_idx = data->get_child_buf_idx( node ); int new_buf_idx = data->get_child_buf_idx( node );
CvMat* buf = data->buf; CvMat* buf = data->buf;
size_t length_buf_row = data->get_length_subbuf();
cv::AutoBuffer<int> temp_buf(n); cv::AutoBuffer<int> temp_buf(n);
complete_node_dir(node); complete_node_dir(node);
...@@ -1385,9 +1405,9 @@ void CvForestERTree::split_node_data( CvDTreeNode* node ) ...@@ -1385,9 +1405,9 @@ void CvForestERTree::split_node_data( CvDTreeNode* node )
if (data->is_buf_16u) if (data->is_buf_16u)
{ {
unsigned short *ldst = (unsigned short *)(buf->data.s + left->buf_idx*buf->cols + unsigned short *ldst = (unsigned short *)(buf->data.s + left->buf_idx*length_buf_row +
ci*scount + left->offset); ci*scount + left->offset);
unsigned short *rdst = (unsigned short *)(buf->data.s + right->buf_idx*buf->cols + unsigned short *rdst = (unsigned short *)(buf->data.s + right->buf_idx*length_buf_row +
ci*scount + right->offset); ci*scount + right->offset);
for( i = 0; i < n; i++ ) for( i = 0; i < n; i++ )
...@@ -1415,9 +1435,9 @@ void CvForestERTree::split_node_data( CvDTreeNode* node ) ...@@ -1415,9 +1435,9 @@ void CvForestERTree::split_node_data( CvDTreeNode* node )
} }
else else
{ {
int *ldst = buf->data.i + left->buf_idx*buf->cols + int *ldst = buf->data.i + left->buf_idx*length_buf_row +
ci*scount + left->offset; ci*scount + left->offset;
int *rdst = buf->data.i + right->buf_idx*buf->cols + int *rdst = buf->data.i + right->buf_idx*length_buf_row +
ci*scount + right->offset; ci*scount + right->offset;
for( i = 0; i < n; i++ ) for( i = 0; i < n; i++ )
...@@ -1460,9 +1480,9 @@ void CvForestERTree::split_node_data( CvDTreeNode* node ) ...@@ -1460,9 +1480,9 @@ void CvForestERTree::split_node_data( CvDTreeNode* node )
if (data->is_buf_16u) if (data->is_buf_16u)
{ {
unsigned short* ldst = (unsigned short*)(buf->data.s + left->buf_idx*buf->cols + unsigned short* ldst = (unsigned short*)(buf->data.s + left->buf_idx*length_buf_row +
pos*scount + left->offset); pos*scount + left->offset);
unsigned short* rdst = (unsigned short*)(buf->data.s + right->buf_idx*buf->cols + unsigned short* rdst = (unsigned short*)(buf->data.s + right->buf_idx*length_buf_row +
pos*scount + right->offset); pos*scount + right->offset);
for (i = 0; i < n; i++) for (i = 0; i < n; i++)
...@@ -1483,9 +1503,9 @@ void CvForestERTree::split_node_data( CvDTreeNode* node ) ...@@ -1483,9 +1503,9 @@ void CvForestERTree::split_node_data( CvDTreeNode* node )
} }
else else
{ {
int* ldst = buf->data.i + left->buf_idx*buf->cols + int* ldst = buf->data.i + left->buf_idx*length_buf_row +
pos*scount + left->offset; pos*scount + left->offset;
int* rdst = buf->data.i + right->buf_idx*buf->cols + int* rdst = buf->data.i + right->buf_idx*length_buf_row +
pos*scount + right->offset; pos*scount + right->offset;
for (i = 0; i < n; i++) for (i = 0; i < n; i++)
{ {
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment