Home | History | Annotate | Download | only in traincascade
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                        Intel License Agreement
     11 //
     12 // Copyright (C) 2000, Intel Corporation, all rights reserved.
     13 // Third party copyrights are property of their respective owners.
     14 //
     15 // Redistribution and use in source and binary forms, with or without modification,
     16 // are permitted provided that the following conditions are met:
     17 //
     18 //   * Redistribution's of source code must retain the above copyright notice,
     19 //     this list of conditions and the following disclaimer.
     20 //
     21 //   * Redistribution's in binary form must reproduce the above copyright notice,
     22 //     this list of conditions and the following disclaimer in the documentation
     23 //     and/or other materials provided with the distribution.
     24 //
     25 //   * The name of Intel Corporation may not be used to endorse or promote products
     26 //     derived from this software without specific prior written permission.
     27 //
     28 // This software is provided by the copyright holders and contributors "as is" and
     29 // any express or implied warranties, including, but not limited to, the implied
     30 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     31 // In no event shall the Intel Corporation or contributors be liable for any direct,
     32 // indirect, incidental, special, exemplary, or consequential damages
     33 // (including, but not limited to, procurement of substitute goods or services;
     34 // loss of use, data, or profits; or business interruption) however caused
     35 // and on any theory of liability, whether in contract, strict liability,
     36 // or tort (including negligence or otherwise) arising in any way out of
     37 // the use of this software, even if advised of the possibility of such damage.
     38 //
     39 //M*/
     40 
     41 #include "old_ml_precomp.hpp"
     42 
     43 static inline double
     44 log_ratio( double val )
     45 {
     46     const double eps = 1e-5;
     47 
     48     val = MAX( val, eps );
     49     val = MIN( val, 1. - eps );
     50     return log( val/(1. - val) );
     51 }
     52 
     53 
     54 CvBoostParams::CvBoostParams()
     55 {
     56     boost_type = CvBoost::REAL;
     57     weak_count = 100;
     58     weight_trim_rate = 0.95;
     59     cv_folds = 0;
     60     max_depth = 1;
     61 }
     62 
     63 
     64 CvBoostParams::CvBoostParams( int _boost_type, int _weak_count,
     65                                         double _weight_trim_rate, int _max_depth,
     66                                         bool _use_surrogates, const float* _priors )
     67 {
     68     boost_type = _boost_type;
     69     weak_count = _weak_count;
     70     weight_trim_rate = _weight_trim_rate;
     71     split_criteria = CvBoost::DEFAULT;
     72     cv_folds = 0;
     73     max_depth = _max_depth;
     74     use_surrogates = _use_surrogates;
     75     priors = _priors;
     76 }
     77 
     78 
     79 
     80 ///////////////////////////////// CvBoostTree ///////////////////////////////////
     81 
     82 CvBoostTree::CvBoostTree()
     83 {
     84     ensemble = 0;
     85 }
     86 
     87 
     88 CvBoostTree::~CvBoostTree()
     89 {
     90     clear();
     91 }
     92 
     93 
     94 void
     95 CvBoostTree::clear()
     96 {
     97     CvDTree::clear();
     98     ensemble = 0;
     99 }
    100 
    101 
    102 bool
    103 CvBoostTree::train( CvDTreeTrainData* _train_data,
    104                     const CvMat* _subsample_idx, CvBoost* _ensemble )
    105 {
    106     clear();
    107     ensemble = _ensemble;
    108     data = _train_data;
    109     data->shared = true;
    110     return do_train( _subsample_idx );
    111 }
    112 
    113 
    114 bool
    115 CvBoostTree::train( const CvMat*, int, const CvMat*, const CvMat*,
    116                     const CvMat*, const CvMat*, const CvMat*, CvDTreeParams )
    117 {
    118     assert(0);
    119     return false;
    120 }
    121 
    122 
    123 bool
    124 CvBoostTree::train( CvDTreeTrainData*, const CvMat* )
    125 {
    126     assert(0);
    127     return false;
    128 }
    129 
    130 
    131 void
    132 CvBoostTree::scale( double _scale )
    133 {
    134     CvDTreeNode* node = root;
    135 
    136     // traverse the tree and scale all the node values
    137     for(;;)
    138     {
    139         CvDTreeNode* parent;
    140         for(;;)
    141         {
    142             node->value *= _scale;
    143             if( !node->left )
    144                 break;
    145             node = node->left;
    146         }
    147 
    148         for( parent = node->parent; parent && parent->right == node;
    149             node = parent, parent = parent->parent )
    150             ;
    151 
    152         if( !parent )
    153             break;
    154 
    155         node = parent->right;
    156     }
    157 }
    158 
    159 
    160 void
    161 CvBoostTree::try_split_node( CvDTreeNode* node )
    162 {
    163     CvDTree::try_split_node( node );
    164 
    165     if( !node->left )
    166     {
    167         // if the node has not been split,
    168         // store the responses for the corresponding training samples
    169         double* weak_eval = ensemble->get_weak_response()->data.db;
    170         cv::AutoBuffer<int> inn_buf(node->sample_count);
    171         const int* labels = data->get_cv_labels( node, (int*)inn_buf );
    172         int i, count = node->sample_count;
    173         double value = node->value;
    174 
    175         for( i = 0; i < count; i++ )
    176             weak_eval[labels[i]] = value;
    177     }
    178 }
    179 
    180 
    181 double
    182 CvBoostTree::calc_node_dir( CvDTreeNode* node )
    183 {
    184     char* dir = (char*)data->direction->data.ptr;
    185     const double* weights = ensemble->get_subtree_weights()->data.db;
    186     int i, n = node->sample_count, vi = node->split->var_idx;
    187     double L, R;
    188 
    189     assert( !node->split->inversed );
    190 
    191     if( data->get_var_type(vi) >= 0 ) // split on categorical var
    192     {
    193         cv::AutoBuffer<int> inn_buf(n);
    194         const int* cat_labels = data->get_cat_var_data( node, vi, (int*)inn_buf );
    195         const int* subset = node->split->subset;
    196         double sum = 0, sum_abs = 0;
    197 
    198         for( i = 0; i < n; i++ )
    199         {
    200             int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
    201             double w = weights[i];
    202             int d = idx >= 0 ? CV_DTREE_CAT_DIR(idx,subset) : 0;
    203             sum += d*w; sum_abs += (d & 1)*w;
    204             dir[i] = (char)d;
    205         }
    206 
    207         R = (sum_abs + sum) * 0.5;
    208         L = (sum_abs - sum) * 0.5;
    209     }
    210     else // split on ordered var
    211     {
    212         cv::AutoBuffer<uchar> inn_buf(2*n*sizeof(int)+n*sizeof(float));
    213         float* values_buf = (float*)(uchar*)inn_buf;
    214         int* sorted_indices_buf = (int*)(values_buf + n);
    215         int* sample_indices_buf = sorted_indices_buf + n;
    216         const float* values = 0;
    217         const int* sorted_indices = 0;
    218         data->get_ord_var_data( node, vi, values_buf, sorted_indices_buf, &values, &sorted_indices, sample_indices_buf );
    219         int split_point = node->split->ord.split_point;
    220         int n1 = node->get_num_valid(vi);
    221 
    222         assert( 0 <= split_point && split_point < n1-1 );
    223         L = R = 0;
    224 
    225         for( i = 0; i <= split_point; i++ )
    226         {
    227             int idx = sorted_indices[i];
    228             double w = weights[idx];
    229             dir[idx] = (char)-1;
    230             L += w;
    231         }
    232 
    233         for( ; i < n1; i++ )
    234         {
    235             int idx = sorted_indices[i];
    236             double w = weights[idx];
    237             dir[idx] = (char)1;
    238             R += w;
    239         }
    240 
    241         for( ; i < n; i++ )
    242             dir[sorted_indices[i]] = (char)0;
    243     }
    244 
    245     node->maxlr = MAX( L, R );
    246     return node->split->quality/(L + R);
    247 }
    248 
    249 
    250 CvDTreeSplit*
    251 CvBoostTree::find_split_ord_class( CvDTreeNode* node, int vi, float init_quality,
    252                                     CvDTreeSplit* _split, uchar* _ext_buf )
    253 {
    254     const float epsilon = FLT_EPSILON*2;
    255 
    256     const double* weights = ensemble->get_subtree_weights()->data.db;
    257     int n = node->sample_count;
    258     int n1 = node->get_num_valid(vi);
    259 
    260     cv::AutoBuffer<uchar> inn_buf;
    261     if( !_ext_buf )
    262         inn_buf.allocate(n*(3*sizeof(int)+sizeof(float)));
    263     uchar* ext_buf = _ext_buf ? _ext_buf : (uchar*)inn_buf;
    264     float* values_buf = (float*)ext_buf;
    265     int* sorted_indices_buf = (int*)(values_buf + n);
    266     int* sample_indices_buf = sorted_indices_buf + n;
    267     const float* values = 0;
    268     const int* sorted_indices = 0;
    269     data->get_ord_var_data( node, vi, values_buf, sorted_indices_buf, &values, &sorted_indices, sample_indices_buf );
    270     int* responses_buf = sorted_indices_buf + n;
    271     const int* responses = data->get_class_labels( node, responses_buf );
    272     const double* rcw0 = weights + n;
    273     double lcw[2] = {0,0}, rcw[2];
    274     int i, best_i = -1;
    275     double best_val = init_quality;
    276     int boost_type = ensemble->get_params().boost_type;
    277     int split_criteria = ensemble->get_params().split_criteria;
    278 
    279     rcw[0] = rcw0[0]; rcw[1] = rcw0[1];
    280     for( i = n1; i < n; i++ )
    281     {
    282         int idx = sorted_indices[i];
    283         double w = weights[idx];
    284         rcw[responses[idx]] -= w;
    285     }
    286 
    287     if( split_criteria != CvBoost::GINI && split_criteria != CvBoost::MISCLASS )
    288         split_criteria = boost_type == CvBoost::DISCRETE ? CvBoost::MISCLASS : CvBoost::GINI;
    289 
    290     if( split_criteria == CvBoost::GINI )
    291     {
    292         double L = 0, R = rcw[0] + rcw[1];
    293         double lsum2 = 0, rsum2 = rcw[0]*rcw[0] + rcw[1]*rcw[1];
    294 
    295         for( i = 0; i < n1 - 1; i++ )
    296         {
    297             int idx = sorted_indices[i];
    298             double w = weights[idx], w2 = w*w;
    299             double lv, rv;
    300             idx = responses[idx];
    301             L += w; R -= w;
    302             lv = lcw[idx]; rv = rcw[idx];
    303             lsum2 += 2*lv*w + w2;
    304             rsum2 -= 2*rv*w - w2;
    305             lcw[idx] = lv + w; rcw[idx] = rv - w;
    306 
    307             if( values[i] + epsilon < values[i+1] )
    308             {
    309                 double val = (lsum2*R + rsum2*L)/(L*R);
    310                 if( best_val < val )
    311                 {
    312                     best_val = val;
    313                     best_i = i;
    314                 }
    315             }
    316         }
    317     }
    318     else
    319     {
    320         for( i = 0; i < n1 - 1; i++ )
    321         {
    322             int idx = sorted_indices[i];
    323             double w = weights[idx];
    324             idx = responses[idx];
    325             lcw[idx] += w;
    326             rcw[idx] -= w;
    327 
    328             if( values[i] + epsilon < values[i+1] )
    329             {
    330                 double val = lcw[0] + rcw[1], val2 = lcw[1] + rcw[0];
    331                 val = MAX(val, val2);
    332                 if( best_val < val )
    333                 {
    334                     best_val = val;
    335                     best_i = i;
    336                 }
    337             }
    338         }
    339     }
    340 
    341     CvDTreeSplit* split = 0;
    342     if( best_i >= 0 )
    343     {
    344         split = _split ? _split : data->new_split_ord( 0, 0.0f, 0, 0, 0.0f );
    345         split->var_idx = vi;
    346         split->ord.c = (values[best_i] + values[best_i+1])*0.5f;
    347         split->ord.split_point = best_i;
    348         split->inversed = 0;
    349         split->quality = (float)best_val;
    350     }
    351     return split;
    352 }
    353 
    354 template<typename T>
    355 class LessThanPtr
    356 {
    357 public:
    358     bool operator()(T* a, T* b) const { return *a < *b; }
    359 };
    360 
    361 CvDTreeSplit*
    362 CvBoostTree::find_split_cat_class( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )
    363 {
    364     int ci = data->get_var_type(vi);
    365     int n = node->sample_count;
    366     int mi = data->cat_count->data.i[ci];
    367 
    368     int base_size = (2*mi+3)*sizeof(double) + mi*sizeof(double*);
    369     cv::AutoBuffer<uchar> inn_buf((2*mi+3)*sizeof(double) + mi*sizeof(double*));
    370     if( !_ext_buf)
    371         inn_buf.allocate( base_size + 2*n*sizeof(int) );
    372     uchar* base_buf = (uchar*)inn_buf;
    373     uchar* ext_buf = _ext_buf ? _ext_buf : base_buf + base_size;
    374 
    375     int* cat_labels_buf = (int*)ext_buf;
    376     const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);
    377     int* responses_buf = cat_labels_buf + n;
    378     const int* responses = data->get_class_labels(node, responses_buf);
    379     double lcw[2]={0,0}, rcw[2]={0,0};
    380 
    381     double* cjk = (double*)cv::alignPtr(base_buf,sizeof(double))+2;
    382     const double* weights = ensemble->get_subtree_weights()->data.db;
    383     double** dbl_ptr = (double**)(cjk + 2*mi);
    384     int i, j, k, idx;
    385     double L = 0, R;
    386     double best_val = init_quality;
    387     int best_subset = -1, subset_i;
    388     int boost_type = ensemble->get_params().boost_type;
    389     int split_criteria = ensemble->get_params().split_criteria;
    390 
    391     // init array of counters:
    392     // c_{jk} - number of samples that have vi-th input variable = j and response = k.
    393     for( j = -1; j < mi; j++ )
    394         cjk[j*2] = cjk[j*2+1] = 0;
    395 
    396     for( i = 0; i < n; i++ )
    397     {
    398         double w = weights[i];
    399         j = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
    400         k = responses[i];
    401         cjk[j*2 + k] += w;
    402     }
    403 
    404     for( j = 0; j < mi; j++ )
    405     {
    406         rcw[0] += cjk[j*2];
    407         rcw[1] += cjk[j*2+1];
    408         dbl_ptr[j] = cjk + j*2 + 1;
    409     }
    410 
    411     R = rcw[0] + rcw[1];
    412 
    413     if( split_criteria != CvBoost::GINI && split_criteria != CvBoost::MISCLASS )
    414         split_criteria = boost_type == CvBoost::DISCRETE ? CvBoost::MISCLASS : CvBoost::GINI;
    415 
    416     // sort rows of c_jk by increasing c_j,1
    417     // (i.e. by the weight of samples in j-th category that belong to class 1)
    418     std::sort(dbl_ptr, dbl_ptr + mi, LessThanPtr<double>());
    419 
    420     for( subset_i = 0; subset_i < mi-1; subset_i++ )
    421     {
    422         idx = (int)(dbl_ptr[subset_i] - cjk)/2;
    423         const double* crow = cjk + idx*2;
    424         double w0 = crow[0], w1 = crow[1];
    425         double weight = w0 + w1;
    426 
    427         if( weight < FLT_EPSILON )
    428             continue;
    429 
    430         lcw[0] += w0; rcw[0] -= w0;
    431         lcw[1] += w1; rcw[1] -= w1;
    432 
    433         if( split_criteria == CvBoost::GINI )
    434         {
    435             double lsum2 = lcw[0]*lcw[0] + lcw[1]*lcw[1];
    436             double rsum2 = rcw[0]*rcw[0] + rcw[1]*rcw[1];
    437 
    438             L += weight;
    439             R -= weight;
    440 
    441             if( L > FLT_EPSILON && R > FLT_EPSILON )
    442             {
    443                 double val = (lsum2*R + rsum2*L)/(L*R);
    444                 if( best_val < val )
    445                 {
    446                     best_val = val;
    447                     best_subset = subset_i;
    448                 }
    449             }
    450         }
    451         else
    452         {
    453             double val = lcw[0] + rcw[1];
    454             double val2 = lcw[1] + rcw[0];
    455 
    456             val = MAX(val, val2);
    457             if( best_val < val )
    458             {
    459                 best_val = val;
    460                 best_subset = subset_i;
    461             }
    462         }
    463     }
    464 
    465     CvDTreeSplit* split = 0;
    466     if( best_subset >= 0 )
    467     {
    468         split = _split ? _split : data->new_split_cat( 0, -1.0f);
    469         split->var_idx = vi;
    470         split->quality = (float)best_val;
    471         memset( split->subset, 0, (data->max_c_count + 31)/32 * sizeof(int));
    472         for( i = 0; i <= best_subset; i++ )
    473         {
    474             idx = (int)(dbl_ptr[i] - cjk) >> 1;
    475             split->subset[idx >> 5] |= 1 << (idx & 31);
    476         }
    477     }
    478     return split;
    479 }
    480 
    481 
    482 CvDTreeSplit*
    483 CvBoostTree::find_split_ord_reg( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )
    484 {
    485     const float epsilon = FLT_EPSILON*2;
    486     const double* weights = ensemble->get_subtree_weights()->data.db;
    487     int n = node->sample_count;
    488     int n1 = node->get_num_valid(vi);
    489 
    490     cv::AutoBuffer<uchar> inn_buf;
    491     if( !_ext_buf )
    492         inn_buf.allocate(2*n*(sizeof(int)+sizeof(float)));
    493     uchar* ext_buf = _ext_buf ? _ext_buf : (uchar*)inn_buf;
    494 
    495     float* values_buf = (float*)ext_buf;
    496     int* indices_buf = (int*)(values_buf + n);
    497     int* sample_indices_buf = indices_buf + n;
    498     const float* values = 0;
    499     const int* indices = 0;
    500     data->get_ord_var_data( node, vi, values_buf, indices_buf, &values, &indices, sample_indices_buf );
    501     float* responses_buf = (float*)(indices_buf + n);
    502     const float* responses = data->get_ord_responses( node, responses_buf, sample_indices_buf );
    503 
    504     int i, best_i = -1;
    505     double L = 0, R = weights[n];
    506     double best_val = init_quality, lsum = 0, rsum = node->value*R;
    507 
    508     // compensate for missing values
    509     for( i = n1; i < n; i++ )
    510     {
    511         int idx = indices[i];
    512         double w = weights[idx];
    513         rsum -= responses[idx]*w;
    514         R -= w;
    515     }
    516 
    517     // find the optimal split
    518     for( i = 0; i < n1 - 1; i++ )
    519     {
    520         int idx = indices[i];
    521         double w = weights[idx];
    522         double t = responses[idx]*w;
    523         L += w; R -= w;
    524         lsum += t; rsum -= t;
    525 
    526         if( values[i] + epsilon < values[i+1] )
    527         {
    528             double val = (lsum*lsum*R + rsum*rsum*L)/(L*R);
    529             if( best_val < val )
    530             {
    531                 best_val = val;
    532                 best_i = i;
    533             }
    534         }
    535     }
    536 
    537     CvDTreeSplit* split = 0;
    538     if( best_i >= 0 )
    539     {
    540         split = _split ? _split : data->new_split_ord( 0, 0.0f, 0, 0, 0.0f );
    541         split->var_idx = vi;
    542         split->ord.c = (values[best_i] + values[best_i+1])*0.5f;
    543         split->ord.split_point = best_i;
    544         split->inversed = 0;
    545         split->quality = (float)best_val;
    546     }
    547     return split;
    548 }
    549 
    550 
    551 CvDTreeSplit*
    552 CvBoostTree::find_split_cat_reg( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )
    553 {
    554     const double* weights = ensemble->get_subtree_weights()->data.db;
    555     int ci = data->get_var_type(vi);
    556     int n = node->sample_count;
    557     int mi = data->cat_count->data.i[ci];
    558     int base_size = (2*mi+3)*sizeof(double) + mi*sizeof(double*);
    559     cv::AutoBuffer<uchar> inn_buf(base_size);
    560     if( !_ext_buf )
    561         inn_buf.allocate(base_size + n*(2*sizeof(int) + sizeof(float)));
    562     uchar* base_buf = (uchar*)inn_buf;
    563     uchar* ext_buf = _ext_buf ? _ext_buf : base_buf + base_size;
    564 
    565     int* cat_labels_buf = (int*)ext_buf;
    566     const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);
    567     float* responses_buf = (float*)(cat_labels_buf + n);
    568     int* sample_indices_buf = (int*)(responses_buf + n);
    569     const float* responses = data->get_ord_responses(node, responses_buf, sample_indices_buf);
    570 
    571     double* sum = (double*)cv::alignPtr(base_buf,sizeof(double)) + 1;
    572     double* counts = sum + mi + 1;
    573     double** sum_ptr = (double**)(counts + mi);
    574     double L = 0, R = 0, best_val = init_quality, lsum = 0, rsum = 0;
    575     int i, best_subset = -1, subset_i;
    576 
    577     for( i = -1; i < mi; i++ )
    578         sum[i] = counts[i] = 0;
    579 
    580     // calculate sum response and weight of each category of the input var
    581     for( i = 0; i < n; i++ )
    582     {
    583         int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
    584         double w = weights[i];
    585         double s = sum[idx] + responses[i]*w;
    586         double nc = counts[idx] + w;
    587         sum[idx] = s;
    588         counts[idx] = nc;
    589     }
    590 
    591     // calculate average response in each category
    592     for( i = 0; i < mi; i++ )
    593     {
    594         R += counts[i];
    595         rsum += sum[i];
    596         sum[i] = fabs(counts[i]) > DBL_EPSILON ? sum[i]/counts[i] : 0;
    597         sum_ptr[i] = sum + i;
    598     }
    599 
    600     std::sort(sum_ptr, sum_ptr + mi, LessThanPtr<double>());
    601 
    602     // revert back to unnormalized sums
    603     // (there should be a very little loss in accuracy)
    604     for( i = 0; i < mi; i++ )
    605         sum[i] *= counts[i];
    606 
    607     for( subset_i = 0; subset_i < mi-1; subset_i++ )
    608     {
    609         int idx = (int)(sum_ptr[subset_i] - sum);
    610         double ni = counts[idx];
    611 
    612         if( ni > FLT_EPSILON )
    613         {
    614             double s = sum[idx];
    615             lsum += s; L += ni;
    616             rsum -= s; R -= ni;
    617 
    618             if( L > FLT_EPSILON && R > FLT_EPSILON )
    619             {
    620                 double val = (lsum*lsum*R + rsum*rsum*L)/(L*R);
    621                 if( best_val < val )
    622                 {
    623                     best_val = val;
    624                     best_subset = subset_i;
    625                 }
    626             }
    627         }
    628     }
    629 
    630     CvDTreeSplit* split = 0;
    631     if( best_subset >= 0 )
    632     {
    633         split = _split ? _split : data->new_split_cat( 0, -1.0f);
    634         split->var_idx = vi;
    635         split->quality = (float)best_val;
    636         memset( split->subset, 0, (data->max_c_count + 31)/32 * sizeof(int));
    637         for( i = 0; i <= best_subset; i++ )
    638         {
    639             int idx = (int)(sum_ptr[i] - sum);
    640             split->subset[idx >> 5] |= 1 << (idx & 31);
    641         }
    642     }
    643     return split;
    644 }
    645 
    646 
    647 CvDTreeSplit*
    648 CvBoostTree::find_surrogate_split_ord( CvDTreeNode* node, int vi, uchar* _ext_buf )
    649 {
    650     const float epsilon = FLT_EPSILON*2;
    651     int n = node->sample_count;
    652     cv::AutoBuffer<uchar> inn_buf;
    653     if( !_ext_buf )
    654         inn_buf.allocate(n*(2*sizeof(int)+sizeof(float)));
    655     uchar* ext_buf = _ext_buf ? _ext_buf : (uchar*)inn_buf;
    656     float* values_buf = (float*)ext_buf;
    657     int* indices_buf = (int*)(values_buf + n);
    658     int* sample_indices_buf = indices_buf + n;
    659     const float* values = 0;
    660     const int* indices = 0;
    661     data->get_ord_var_data( node, vi, values_buf, indices_buf, &values, &indices, sample_indices_buf );
    662 
    663     const double* weights = ensemble->get_subtree_weights()->data.db;
    664     const char* dir = (char*)data->direction->data.ptr;
    665     int n1 = node->get_num_valid(vi);
    666     // LL - number of samples that both the primary and the surrogate splits send to the left
    667     // LR - ... primary split sends to the left and the surrogate split sends to the right
    668     // RL - ... primary split sends to the right and the surrogate split sends to the left
    669     // RR - ... both send to the right
    670     int i, best_i = -1, best_inversed = 0;
    671     double best_val;
    672     double LL = 0, RL = 0, LR, RR;
    673     double worst_val = node->maxlr;
    674     double sum = 0, sum_abs = 0;
    675     best_val = worst_val;
    676 
    677     for( i = 0; i < n1; i++ )
    678     {
    679         int idx = indices[i];
    680         double w = weights[idx];
    681         int d = dir[idx];
    682         sum += d*w; sum_abs += (d & 1)*w;
    683     }
    684 
    685     // sum_abs = R + L; sum = R - L
    686     RR = (sum_abs + sum)*0.5;
    687     LR = (sum_abs - sum)*0.5;
    688 
    689     // initially all the samples are sent to the right by the surrogate split,
    690     // LR of them are sent to the left by primary split, and RR - to the right.
    691     // now iteratively compute LL, LR, RL and RR for every possible surrogate split value.
    692     for( i = 0; i < n1 - 1; i++ )
    693     {
    694         int idx = indices[i];
    695         double w = weights[idx];
    696         int d = dir[idx];
    697 
    698         if( d < 0 )
    699         {
    700             LL += w; LR -= w;
    701             if( LL + RR > best_val && values[i] + epsilon < values[i+1] )
    702             {
    703                 best_val = LL + RR;
    704                 best_i = i; best_inversed = 0;
    705             }
    706         }
    707         else if( d > 0 )
    708         {
    709             RL += w; RR -= w;
    710             if( RL + LR > best_val && values[i] + epsilon < values[i+1] )
    711             {
    712                 best_val = RL + LR;
    713                 best_i = i; best_inversed = 1;
    714             }
    715         }
    716     }
    717 
    718     return best_i >= 0 && best_val > node->maxlr ? data->new_split_ord( vi,
    719         (values[best_i] + values[best_i+1])*0.5f, best_i,
    720         best_inversed, (float)best_val ) : 0;
    721 }
    722 
    723 
    724 CvDTreeSplit*
    725 CvBoostTree::find_surrogate_split_cat( CvDTreeNode* node, int vi, uchar* _ext_buf )
    726 {
    727     const char* dir = (char*)data->direction->data.ptr;
    728     const double* weights = ensemble->get_subtree_weights()->data.db;
    729     int n = node->sample_count;
    730     int i, mi = data->cat_count->data.i[data->get_var_type(vi)];
    731 
    732     int base_size = (2*mi+3)*sizeof(double);
    733     cv::AutoBuffer<uchar> inn_buf(base_size);
    734     if( !_ext_buf )
    735         inn_buf.allocate(base_size + n*sizeof(int));
    736     uchar* ext_buf = _ext_buf ? _ext_buf : (uchar*)inn_buf;
    737     int* cat_labels_buf = (int*)ext_buf;
    738     const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);
    739 
    740     // LL - number of samples that both the primary and the surrogate splits send to the left
    741     // LR - ... primary split sends to the left and the surrogate split sends to the right
    742     // RL - ... primary split sends to the right and the surrogate split sends to the left
    743     // RR - ... both send to the right
    744     CvDTreeSplit* split = data->new_split_cat( vi, 0 );
    745     double best_val = 0;
    746     double* lc = (double*)cv::alignPtr(cat_labels_buf + n, sizeof(double)) + 1;
    747     double* rc = lc + mi + 1;
    748 
    749     for( i = -1; i < mi; i++ )
    750         lc[i] = rc[i] = 0;
    751 
    752     // 1. for each category calculate the weight of samples
    753     // sent to the left (lc) and to the right (rc) by the primary split
    754     for( i = 0; i < n; i++ )
    755     {
    756         int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];
    757         double w = weights[i];
    758         int d = dir[i];
    759         double sum = lc[idx] + d*w;
    760         double sum_abs = rc[idx] + (d & 1)*w;
    761         lc[idx] = sum; rc[idx] = sum_abs;
    762     }
    763 
    764     for( i = 0; i < mi; i++ )
    765     {
    766         double sum = lc[i];
    767         double sum_abs = rc[i];
    768         lc[i] = (sum_abs - sum) * 0.5;
    769         rc[i] = (sum_abs + sum) * 0.5;
    770     }
    771 
    772     // 2. now form the split.
    773     // in each category send all the samples to the same direction as majority
    774     for( i = 0; i < mi; i++ )
    775     {
    776         double lval = lc[i], rval = rc[i];
    777         if( lval > rval )
    778         {
    779             split->subset[i >> 5] |= 1 << (i & 31);
    780             best_val += lval;
    781         }
    782         else
    783             best_val += rval;
    784     }
    785 
    786     split->quality = (float)best_val;
    787     if( split->quality <= node->maxlr )
    788         cvSetRemoveByPtr( data->split_heap, split ), split = 0;
    789 
    790     return split;
    791 }
    792 
    793 
    794 void
    795 CvBoostTree::calc_node_value( CvDTreeNode* node )
    796 {
    797     int i, n = node->sample_count;
    798     const double* weights = ensemble->get_weights()->data.db;
    799     cv::AutoBuffer<uchar> inn_buf(n*(sizeof(int) + ( data->is_classifier ? sizeof(int) : sizeof(int) + sizeof(float))));
    800     int* labels_buf = (int*)(uchar*)inn_buf;
    801     const int* labels = data->get_cv_labels(node, labels_buf);
    802     double* subtree_weights = ensemble->get_subtree_weights()->data.db;
    803     double rcw[2] = {0,0};
    804     int boost_type = ensemble->get_params().boost_type;
    805 
    806     if( data->is_classifier )
    807     {
    808         int* _responses_buf = labels_buf + n;
    809         const int* _responses = data->get_class_labels(node, _responses_buf);
    810         int m = data->get_num_classes();
    811         int* cls_count = data->counts->data.i;
    812         for( int k = 0; k < m; k++ )
    813             cls_count[k] = 0;
    814 
    815         for( i = 0; i < n; i++ )
    816         {
    817             int idx = labels[i];
    818             double w = weights[idx];
    819             int r = _responses[i];
    820             rcw[r] += w;
    821             cls_count[r]++;
    822             subtree_weights[i] = w;
    823         }
    824 
    825         node->class_idx = rcw[1] > rcw[0];
    826 
    827         if( boost_type == CvBoost::DISCRETE )
    828         {
    829             // ignore cat_map for responses, and use {-1,1},
    830             // as the whole ensemble response is computes as sign(sum_i(weak_response_i)
    831             node->value = node->class_idx*2 - 1;
    832         }
    833         else
    834         {
    835             double p = rcw[1]/(rcw[0] + rcw[1]);
    836             assert( boost_type == CvBoost::REAL );
    837 
    838             // store log-ratio of the probability
    839             node->value = 0.5*log_ratio(p);
    840         }
    841     }
    842     else
    843     {
    844         // in case of regression tree:
    845         //  * node value is 1/n*sum_i(Y_i), where Y_i is i-th response,
    846         //    n is the number of samples in the node.
    847         //  * node risk is the sum of squared errors: sum_i((Y_i - <node_value>)^2)
    848         double sum = 0, sum2 = 0, iw;
    849         float* values_buf = (float*)(labels_buf + n);
    850         int* sample_indices_buf = (int*)(values_buf + n);
    851         const float* values = data->get_ord_responses(node, values_buf, sample_indices_buf);
    852 
    853         for( i = 0; i < n; i++ )
    854         {
    855             int idx = labels[i];
    856             double w = weights[idx]/*priors[values[i] > 0]*/;
    857             double t = values[i];
    858             rcw[0] += w;
    859             subtree_weights[i] = w;
    860             sum += t*w;
    861             sum2 += t*t*w;
    862         }
    863 
    864         iw = 1./rcw[0];
    865         node->value = sum*iw;
    866         node->node_risk = sum2 - (sum*iw)*sum;
    867 
    868         // renormalize the risk, as in try_split_node the unweighted formula
    869         // sqrt(risk)/n is used, rather than sqrt(risk)/sum(weights_i)
    870         node->node_risk *= n*iw*n*iw;
    871     }
    872 
    873     // store summary weights
    874     subtree_weights[n] = rcw[0];
    875     subtree_weights[n+1] = rcw[1];
    876 }
    877 
    878 
    879 void CvBoostTree::read( CvFileStorage* fs, CvFileNode* fnode, CvBoost* _ensemble, CvDTreeTrainData* _data )
    880 {
    881     CvDTree::read( fs, fnode, _data );
    882     ensemble = _ensemble;
    883 }
    884 
    885 void CvBoostTree::read( CvFileStorage*, CvFileNode* )
    886 {
    887     assert(0);
    888 }
    889 
    890 void CvBoostTree::read( CvFileStorage* _fs, CvFileNode* _node,
    891                         CvDTreeTrainData* _data )
    892 {
    893     CvDTree::read( _fs, _node, _data );
    894 }
    895 
    896 
    897 /////////////////////////////////// CvBoost /////////////////////////////////////
    898 
    899 CvBoost::CvBoost()
    900 {
    901     data = 0;
    902     weak = 0;
    903     default_model_name = "my_boost_tree";
    904 
    905     active_vars = active_vars_abs = orig_response = sum_response = weak_eval =
    906         subsample_mask = weights = subtree_weights = 0;
    907     have_active_cat_vars = have_subsample = false;
    908 
    909     clear();
    910 }
    911 
    912 
    913 void CvBoost::prune( CvSlice slice )
    914 {
    915     if( weak && weak->total > 0 )
    916     {
    917         CvSeqReader reader;
    918         int i, count = cvSliceLength( slice, weak );
    919 
    920         cvStartReadSeq( weak, &reader );
    921         cvSetSeqReaderPos( &reader, slice.start_index );
    922 
    923         for( i = 0; i < count; i++ )
    924         {
    925             CvBoostTree* w;
    926             CV_READ_SEQ_ELEM( w, reader );
    927             delete w;
    928         }
    929 
    930         cvSeqRemoveSlice( weak, slice );
    931     }
    932 }
    933 
    934 
    935 void CvBoost::clear()
    936 {
    937     if( weak )
    938     {
    939         prune( CV_WHOLE_SEQ );
    940         cvReleaseMemStorage( &weak->storage );
    941     }
    942     if( data )
    943         delete data;
    944     weak = 0;
    945     data = 0;
    946     cvReleaseMat( &active_vars );
    947     cvReleaseMat( &active_vars_abs );
    948     cvReleaseMat( &orig_response );
    949     cvReleaseMat( &sum_response );
    950     cvReleaseMat( &weak_eval );
    951     cvReleaseMat( &subsample_mask );
    952     cvReleaseMat( &weights );
    953     cvReleaseMat( &subtree_weights );
    954 
    955     have_subsample = false;
    956 }
    957 
    958 
    959 CvBoost::~CvBoost()
    960 {
    961     clear();
    962 }
    963 
    964 
    965 CvBoost::CvBoost( const CvMat* _train_data, int _tflag,
    966                   const CvMat* _responses, const CvMat* _var_idx,
    967                   const CvMat* _sample_idx, const CvMat* _var_type,
    968                   const CvMat* _missing_mask, CvBoostParams _params )
    969 {
    970     weak = 0;
    971     data = 0;
    972     default_model_name = "my_boost_tree";
    973 
    974     active_vars = active_vars_abs = orig_response = sum_response = weak_eval =
    975         subsample_mask = weights = subtree_weights = 0;
    976 
    977     train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
    978            _var_type, _missing_mask, _params );
    979 }
    980 
    981 
    982 bool
    983 CvBoost::set_params( const CvBoostParams& _params )
    984 {
    985     bool ok = false;
    986 
    987     CV_FUNCNAME( "CvBoost::set_params" );
    988 
    989     __BEGIN__;
    990 
    991     params = _params;
    992     if( params.boost_type != DISCRETE && params.boost_type != REAL &&
    993         params.boost_type != LOGIT && params.boost_type != GENTLE )
    994         CV_ERROR( CV_StsBadArg, "Unknown/unsupported boosting type" );
    995 
    996     params.weak_count = MAX( params.weak_count, 1 );
    997     params.weight_trim_rate = MAX( params.weight_trim_rate, 0. );
    998     params.weight_trim_rate = MIN( params.weight_trim_rate, 1. );
    999     if( params.weight_trim_rate < FLT_EPSILON )
   1000         params.weight_trim_rate = 1.f;
   1001 
   1002     if( params.boost_type == DISCRETE &&
   1003         params.split_criteria != GINI && params.split_criteria != MISCLASS )
   1004         params.split_criteria = MISCLASS;
   1005     if( params.boost_type == REAL &&
   1006         params.split_criteria != GINI && params.split_criteria != MISCLASS )
   1007         params.split_criteria = GINI;
   1008     if( (params.boost_type == LOGIT || params.boost_type == GENTLE) &&
   1009         params.split_criteria != SQERR )
   1010         params.split_criteria = SQERR;
   1011 
   1012     ok = true;
   1013 
   1014     __END__;
   1015 
   1016     return ok;
   1017 }
   1018 
   1019 
   1020 bool
   1021 CvBoost::train( const CvMat* _train_data, int _tflag,
   1022               const CvMat* _responses, const CvMat* _var_idx,
   1023               const CvMat* _sample_idx, const CvMat* _var_type,
   1024               const CvMat* _missing_mask,
   1025               CvBoostParams _params, bool _update )
   1026 {
   1027     bool ok = false;
   1028     CvMemStorage* storage = 0;
   1029 
   1030     CV_FUNCNAME( "CvBoost::train" );
   1031 
   1032     __BEGIN__;
   1033 
   1034     int i;
   1035 
   1036     set_params( _params );
   1037 
   1038     cvReleaseMat( &active_vars );
   1039     cvReleaseMat( &active_vars_abs );
   1040 
   1041     if( !_update || !data )
   1042     {
   1043         clear();
   1044         data = new CvDTreeTrainData( _train_data, _tflag, _responses, _var_idx,
   1045             _sample_idx, _var_type, _missing_mask, _params, true, true );
   1046 
   1047         if( data->get_num_classes() != 2 )
   1048             CV_ERROR( CV_StsNotImplemented,
   1049             "Boosted trees can only be used for 2-class classification." );
   1050         CV_CALL( storage = cvCreateMemStorage() );
   1051         weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );
   1052         storage = 0;
   1053     }
   1054     else
   1055     {
   1056         data->set_data( _train_data, _tflag, _responses, _var_idx,
   1057             _sample_idx, _var_type, _missing_mask, _params, true, true, true );
   1058     }
   1059 
   1060     if ( (_params.boost_type == LOGIT) || (_params.boost_type == GENTLE) )
   1061         data->do_responses_copy();
   1062 
   1063     update_weights( 0 );
   1064 
   1065     for( i = 0; i < params.weak_count; i++ )
   1066     {
   1067         CvBoostTree* tree = new CvBoostTree;
   1068         if( !tree->train( data, subsample_mask, this ) )
   1069         {
   1070             delete tree;
   1071             break;
   1072         }
   1073         //cvCheckArr( get_weak_response());
   1074         cvSeqPush( weak, &tree );
   1075         update_weights( tree );
   1076         trim_weights();
   1077         if( cvCountNonZero(subsample_mask) == 0 )
   1078             break;
   1079     }
   1080 
   1081     if(weak->total > 0)
   1082     {
   1083         get_active_vars(); // recompute active_vars* maps and condensed_idx's in the splits.
   1084         data->is_classifier = true;
   1085         data->free_train_data();
   1086         ok = true;
   1087     }
   1088     else
   1089         clear();
   1090 
   1091     __END__;
   1092 
   1093     return ok;
   1094 }
   1095 
   1096 bool CvBoost::train( CvMLData* _data,
   1097              CvBoostParams _params,
   1098              bool update )
   1099 {
   1100     bool result = false;
   1101 
   1102     CV_FUNCNAME( "CvBoost::train" );
   1103 
   1104     __BEGIN__;
   1105 
   1106     const CvMat* values = _data->get_values();
   1107     const CvMat* response = _data->get_responses();
   1108     const CvMat* missing = _data->get_missing();
   1109     const CvMat* var_types = _data->get_var_types();
   1110     const CvMat* train_sidx = _data->get_train_sample_idx();
   1111     const CvMat* var_idx = _data->get_var_idx();
   1112 
   1113     CV_CALL( result = train( values, CV_ROW_SAMPLE, response, var_idx,
   1114         train_sidx, var_types, missing, _params, update ) );
   1115 
   1116     __END__;
   1117 
   1118     return result;
   1119 }
   1120 
   1121 void CvBoost::initialize_weights(double (&p)[2])
   1122 {
   1123     p[0] = 1.;
   1124     p[1] = 1.;
   1125 }
   1126 
   1127 void
   1128 CvBoost::update_weights( CvBoostTree* tree )
   1129 {
   1130     CV_FUNCNAME( "CvBoost::update_weights" );
   1131 
   1132     __BEGIN__;
   1133 
   1134     int i, n = data->sample_count;
   1135     double sumw = 0.;
   1136     int step = 0;
   1137     float* fdata = 0;
   1138     int *sample_idx_buf;
   1139     const int* sample_idx = 0;
   1140     cv::AutoBuffer<uchar> inn_buf;
   1141     size_t _buf_size = (params.boost_type == LOGIT) || (params.boost_type == GENTLE) ? (size_t)(data->sample_count)*sizeof(int) : 0;
   1142     if( !tree )
   1143         _buf_size += n*sizeof(int);
   1144     else
   1145     {
   1146         if( have_subsample )
   1147             _buf_size += data->get_length_subbuf()*(sizeof(float)+sizeof(uchar));
   1148     }
   1149     inn_buf.allocate(_buf_size);
   1150     uchar* cur_buf_pos = (uchar*)inn_buf;
   1151 
   1152     if ( (params.boost_type == LOGIT) || (params.boost_type == GENTLE) )
   1153     {
   1154         step = CV_IS_MAT_CONT(data->responses_copy->type) ?
   1155             1 : data->responses_copy->step / CV_ELEM_SIZE(data->responses_copy->type);
   1156         fdata = data->responses_copy->data.fl;
   1157         sample_idx_buf = (int*)cur_buf_pos;
   1158         cur_buf_pos = (uchar*)(sample_idx_buf + data->sample_count);
   1159         sample_idx = data->get_sample_indices( data->data_root, sample_idx_buf );
   1160     }
   1161     CvMat* dtree_data_buf = data->buf;
   1162     size_t length_buf_row = data->get_length_subbuf();
   1163     if( !tree ) // before training the first tree, initialize weights and other parameters
   1164     {
   1165         int* class_labels_buf = (int*)cur_buf_pos;
   1166         cur_buf_pos = (uchar*)(class_labels_buf + n);
   1167         const int* class_labels = data->get_class_labels(data->data_root, class_labels_buf);
   1168         // in case of logitboost and gentle adaboost each weak tree is a regression tree,
   1169         // so we need to convert class labels to floating-point values
   1170 
   1171         double w0 = 1./ n;
   1172         double p[2] = { 1., 1. };
   1173         initialize_weights(p);
   1174 
   1175         cvReleaseMat( &orig_response );
   1176         cvReleaseMat( &sum_response );
   1177         cvReleaseMat( &weak_eval );
   1178         cvReleaseMat( &subsample_mask );
   1179         cvReleaseMat( &weights );
   1180         cvReleaseMat( &subtree_weights );
   1181 
   1182         CV_CALL( orig_response = cvCreateMat( 1, n, CV_32S ));
   1183         CV_CALL( weak_eval = cvCreateMat( 1, n, CV_64F ));
   1184         CV_CALL( subsample_mask = cvCreateMat( 1, n, CV_8U ));
   1185         CV_CALL( weights = cvCreateMat( 1, n, CV_64F ));
   1186         CV_CALL( subtree_weights = cvCreateMat( 1, n + 2, CV_64F ));
   1187 
   1188         if( data->have_priors )
   1189         {
   1190             // compute weight scale for each class from their prior probabilities
   1191             int c1 = 0;
   1192             for( i = 0; i < n; i++ )
   1193                 c1 += class_labels[i];
   1194             p[0] = data->priors->data.db[0]*(c1 < n ? 1./(n - c1) : 0.);
   1195             p[1] = data->priors->data.db[1]*(c1 > 0 ? 1./c1 : 0.);
   1196             p[0] /= p[0] + p[1];
   1197             p[1] = 1. - p[0];
   1198         }
   1199 
   1200         if (data->is_buf_16u)
   1201         {
   1202             unsigned short* labels = (unsigned short*)(dtree_data_buf->data.s + data->data_root->buf_idx*length_buf_row +
   1203                 data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count);
   1204             for( i = 0; i < n; i++ )
   1205             {
   1206                 // save original categorical responses {0,1}, convert them to {-1,1}
   1207                 orig_response->data.i[i] = class_labels[i]*2 - 1;
   1208                 // make all the samples active at start.
   1209                 // later, in trim_weights() deactivate/reactive again some, if need
   1210                 subsample_mask->data.ptr[i] = (uchar)1;
   1211                 // make all the initial weights the same.
   1212                 weights->data.db[i] = w0*p[class_labels[i]];
   1213                 // set the labels to find (from within weak tree learning proc)
   1214                 // the particular sample weight, and where to store the response.
   1215                 labels[i] = (unsigned short)i;
   1216             }
   1217         }
   1218         else
   1219         {
   1220             int* labels = dtree_data_buf->data.i + data->data_root->buf_idx*length_buf_row +
   1221                 data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count;
   1222 
   1223             for( i = 0; i < n; i++ )
   1224             {
   1225                 // save original categorical responses {0,1}, convert them to {-1,1}
   1226                 orig_response->data.i[i] = class_labels[i]*2 - 1;
   1227                 // make all the samples active at start.
   1228                 // later, in trim_weights() deactivate/reactive again some, if need
   1229                 subsample_mask->data.ptr[i] = (uchar)1;
   1230                 // make all the initial weights the same.
   1231                 weights->data.db[i] = w0*p[class_labels[i]];
   1232                 // set the labels to find (from within weak tree learning proc)
   1233                 // the particular sample weight, and where to store the response.
   1234                 labels[i] = i;
   1235             }
   1236         }
   1237 
   1238         if( params.boost_type == LOGIT )
   1239         {
   1240             CV_CALL( sum_response = cvCreateMat( 1, n, CV_64F ));
   1241 
   1242             for( i = 0; i < n; i++ )
   1243             {
   1244                 sum_response->data.db[i] = 0;
   1245                 fdata[sample_idx[i]*step] = orig_response->data.i[i] > 0 ? 2.f : -2.f;
   1246             }
   1247 
   1248             // in case of logitboost each weak tree is a regression tree.
   1249             // the target function values are recalculated for each of the trees
   1250             data->is_classifier = false;
   1251         }
   1252         else if( params.boost_type == GENTLE )
   1253         {
   1254             for( i = 0; i < n; i++ )
   1255                 fdata[sample_idx[i]*step] = (float)orig_response->data.i[i];
   1256 
   1257             data->is_classifier = false;
   1258         }
   1259     }
   1260     else
   1261     {
   1262         // at this moment, for all the samples that participated in the training of the most
   1263         // recent weak classifier we know the responses. For other samples we need to compute them
   1264         if( have_subsample )
   1265         {
   1266             float* values = (float*)cur_buf_pos;
   1267             cur_buf_pos = (uchar*)(values + data->get_length_subbuf());
   1268             uchar* missing = cur_buf_pos;
   1269             cur_buf_pos = missing + data->get_length_subbuf() * (size_t)CV_ELEM_SIZE(data->buf->type);
   1270 
   1271             CvMat _sample, _mask;
   1272 
   1273             // invert the subsample mask
   1274             cvXorS( subsample_mask, cvScalar(1.), subsample_mask );
   1275             data->get_vectors( subsample_mask, values, missing, 0 );
   1276 
   1277             _sample = cvMat( 1, data->var_count, CV_32F );
   1278             _mask = cvMat( 1, data->var_count, CV_8U );
   1279 
   1280             // run tree through all the non-processed samples
   1281             for( i = 0; i < n; i++ )
   1282                 if( subsample_mask->data.ptr[i] )
   1283                 {
   1284                     _sample.data.fl = values;
   1285                     _mask.data.ptr = missing;
   1286                     values += _sample.cols;
   1287                     missing += _mask.cols;
   1288                     weak_eval->data.db[i] = tree->predict( &_sample, &_mask, true )->value;
   1289                 }
   1290         }
   1291 
   1292         // now update weights and other parameters for each type of boosting
   1293         if( params.boost_type == DISCRETE )
   1294         {
   1295             // Discrete AdaBoost:
   1296             //   weak_eval[i] (=f(x_i)) is in {-1,1}
   1297             //   err = sum(w_i*(f(x_i) != y_i))/sum(w_i)
   1298             //   C = log((1-err)/err)
   1299             //   w_i *= exp(C*(f(x_i) != y_i))
   1300 
   1301             double C, err = 0.;
   1302             double scale[] = { 1., 0. };
   1303 
   1304             for( i = 0; i < n; i++ )
   1305             {
   1306                 double w = weights->data.db[i];
   1307                 sumw += w;
   1308                 err += w*(weak_eval->data.db[i] != orig_response->data.i[i]);
   1309             }
   1310 
   1311             if( sumw != 0 )
   1312                 err /= sumw;
   1313             C = err = -log_ratio( err );
   1314             scale[1] = exp(err);
   1315 
   1316             sumw = 0;
   1317             for( i = 0; i < n; i++ )
   1318             {
   1319                 double w = weights->data.db[i]*
   1320                     scale[weak_eval->data.db[i] != orig_response->data.i[i]];
   1321                 sumw += w;
   1322                 weights->data.db[i] = w;
   1323             }
   1324 
   1325             tree->scale( C );
   1326         }
   1327         else if( params.boost_type == REAL )
   1328         {
   1329             // Real AdaBoost:
   1330             //   weak_eval[i] = f(x_i) = 0.5*log(p(x_i)/(1-p(x_i))), p(x_i)=P(y=1|x_i)
   1331             //   w_i *= exp(-y_i*f(x_i))
   1332 
   1333             for( i = 0; i < n; i++ )
   1334                 weak_eval->data.db[i] *= -orig_response->data.i[i];
   1335 
   1336             cvExp( weak_eval, weak_eval );
   1337 
   1338             for( i = 0; i < n; i++ )
   1339             {
   1340                 double w = weights->data.db[i]*weak_eval->data.db[i];
   1341                 sumw += w;
   1342                 weights->data.db[i] = w;
   1343             }
   1344         }
   1345         else if( params.boost_type == LOGIT )
   1346         {
   1347             // LogitBoost:
   1348             //   weak_eval[i] = f(x_i) in [-z_max,z_max]
   1349             //   sum_response = F(x_i).
   1350             //   F(x_i) += 0.5*f(x_i)
   1351             //   p(x_i) = exp(F(x_i))/(exp(F(x_i)) + exp(-F(x_i))=1/(1+exp(-2*F(x_i)))
   1352             //   reuse weak_eval: weak_eval[i] <- p(x_i)
   1353             //   w_i = p(x_i)*1(1 - p(x_i))
   1354             //   z_i = ((y_i+1)/2 - p(x_i))/(p(x_i)*(1 - p(x_i)))
   1355             //   store z_i to the data->data_root as the new target responses
   1356 
   1357             const double lb_weight_thresh = FLT_EPSILON;
   1358             const double lb_z_max = 10.;
   1359             /*float* responses_buf = data->get_resp_float_buf();
   1360             const float* responses = 0;
   1361             data->get_ord_responses(data->data_root, responses_buf, &responses);*/
   1362 
   1363             /*if( weak->total == 7 )
   1364                 putchar('*');*/
   1365 
   1366             for( i = 0; i < n; i++ )
   1367             {
   1368                 double s = sum_response->data.db[i] + 0.5*weak_eval->data.db[i];
   1369                 sum_response->data.db[i] = s;
   1370                 weak_eval->data.db[i] = -2*s;
   1371             }
   1372 
   1373             cvExp( weak_eval, weak_eval );
   1374 
   1375             for( i = 0; i < n; i++ )
   1376             {
   1377                 double p = 1./(1. + weak_eval->data.db[i]);
   1378                 double w = p*(1 - p), z;
   1379                 w = MAX( w, lb_weight_thresh );
   1380                 weights->data.db[i] = w;
   1381                 sumw += w;
   1382                 if( orig_response->data.i[i] > 0 )
   1383                 {
   1384                     z = 1./p;
   1385                     fdata[sample_idx[i]*step] = (float)MIN(z, lb_z_max);
   1386                 }
   1387                 else
   1388                 {
   1389                     z = 1./(1-p);
   1390                     fdata[sample_idx[i]*step] = (float)-MIN(z, lb_z_max);
   1391                 }
   1392             }
   1393         }
   1394         else
   1395         {
   1396             // Gentle AdaBoost:
   1397             //   weak_eval[i] = f(x_i) in [-1,1]
   1398             //   w_i *= exp(-y_i*f(x_i))
   1399             assert( params.boost_type == GENTLE );
   1400 
   1401             for( i = 0; i < n; i++ )
   1402                 weak_eval->data.db[i] *= -orig_response->data.i[i];
   1403 
   1404             cvExp( weak_eval, weak_eval );
   1405 
   1406             for( i = 0; i < n; i++ )
   1407             {
   1408                 double w = weights->data.db[i] * weak_eval->data.db[i];
   1409                 weights->data.db[i] = w;
   1410                 sumw += w;
   1411             }
   1412         }
   1413     }
   1414 
   1415     // renormalize weights
   1416     if( sumw > FLT_EPSILON )
   1417     {
   1418         sumw = 1./sumw;
   1419         for( i = 0; i < n; ++i )
   1420             weights->data.db[i] *= sumw;
   1421     }
   1422 
   1423     __END__;
   1424 }
   1425 
   1426 
   1427 void
   1428 CvBoost::trim_weights()
   1429 {
   1430     //CV_FUNCNAME( "CvBoost::trim_weights" );
   1431 
   1432     __BEGIN__;
   1433 
   1434     int i, count = data->sample_count, nz_count = 0;
   1435     double sum, threshold;
   1436 
   1437     if( params.weight_trim_rate <= 0. || params.weight_trim_rate >= 1. )
   1438         EXIT;
   1439 
   1440     // use weak_eval as temporary buffer for sorted weights
   1441     cvCopy( weights, weak_eval );
   1442 
   1443     std::sort(weak_eval->data.db, weak_eval->data.db + count);
   1444 
   1445     // as weight trimming occurs immediately after updating the weights,
   1446     // where they are renormalized, we assume that the weight sum = 1.
   1447     sum = 1. - params.weight_trim_rate;
   1448 
   1449     for( i = 0; i < count; i++ )
   1450     {
   1451         double w = weak_eval->data.db[i];
   1452         if( sum <= 0 )
   1453             break;
   1454         sum -= w;
   1455     }
   1456 
   1457     threshold = i < count ? weak_eval->data.db[i] : DBL_MAX;
   1458 
   1459     for( i = 0; i < count; i++ )
   1460     {
   1461         double w = weights->data.db[i];
   1462         int f = w >= threshold;
   1463         subsample_mask->data.ptr[i] = (uchar)f;
   1464         nz_count += f;
   1465     }
   1466 
   1467     have_subsample = nz_count < count;
   1468 
   1469     __END__;
   1470 }
   1471 
   1472 
   1473 const CvMat*
   1474 CvBoost::get_active_vars( bool absolute_idx )
   1475 {
   1476     CvMat* mask = 0;
   1477     CvMat* inv_map = 0;
   1478     CvMat* result = 0;
   1479 
   1480     CV_FUNCNAME( "CvBoost::get_active_vars" );
   1481 
   1482     __BEGIN__;
   1483 
   1484     if( !weak )
   1485         CV_ERROR( CV_StsError, "The boosted tree ensemble has not been trained yet" );
   1486 
   1487     if( !active_vars || !active_vars_abs )
   1488     {
   1489         CvSeqReader reader;
   1490         int i, j, nactive_vars;
   1491         CvBoostTree* wtree;
   1492         const CvDTreeNode* node;
   1493 
   1494         assert(!active_vars && !active_vars_abs);
   1495         mask = cvCreateMat( 1, data->var_count, CV_8U );
   1496         inv_map = cvCreateMat( 1, data->var_count, CV_32S );
   1497         cvZero( mask );
   1498         cvSet( inv_map, cvScalar(-1) );
   1499 
   1500         // first pass: compute the mask of used variables
   1501         cvStartReadSeq( weak, &reader );
   1502         for( i = 0; i < weak->total; i++ )
   1503         {
   1504             CV_READ_SEQ_ELEM(wtree, reader);
   1505 
   1506             node = wtree->get_root();
   1507             assert( node != 0 );
   1508             for(;;)
   1509             {
   1510                 const CvDTreeNode* parent;
   1511                 for(;;)
   1512                 {
   1513                     CvDTreeSplit* split = node->split;
   1514                     for( ; split != 0; split = split->next )
   1515                         mask->data.ptr[split->var_idx] = 1;
   1516                     if( !node->left )
   1517                         break;
   1518                     node = node->left;
   1519                 }
   1520 
   1521                 for( parent = node->parent; parent && parent->right == node;
   1522                     node = parent, parent = parent->parent )
   1523                     ;
   1524 
   1525                 if( !parent )
   1526                     break;
   1527 
   1528                 node = parent->right;
   1529             }
   1530         }
   1531 
   1532         nactive_vars = cvCountNonZero(mask);
   1533 
   1534         //if ( nactive_vars > 0 )
   1535         {
   1536             active_vars = cvCreateMat( 1, nactive_vars, CV_32S );
   1537             active_vars_abs = cvCreateMat( 1, nactive_vars, CV_32S );
   1538 
   1539             have_active_cat_vars = false;
   1540 
   1541             for( i = j = 0; i < data->var_count; i++ )
   1542             {
   1543                 if( mask->data.ptr[i] )
   1544                 {
   1545                     active_vars->data.i[j] = i;
   1546                     active_vars_abs->data.i[j] = data->var_idx ? data->var_idx->data.i[i] : i;
   1547                     inv_map->data.i[i] = j;
   1548                     if( data->var_type->data.i[i] >= 0 )
   1549                         have_active_cat_vars = true;
   1550                     j++;
   1551                 }
   1552             }
   1553 
   1554 
   1555             // second pass: now compute the condensed indices
   1556             cvStartReadSeq( weak, &reader );
   1557             for( i = 0; i < weak->total; i++ )
   1558             {
   1559                 CV_READ_SEQ_ELEM(wtree, reader);
   1560                 node = wtree->get_root();
   1561                 for(;;)
   1562                 {
   1563                     const CvDTreeNode* parent;
   1564                     for(;;)
   1565                     {
   1566                         CvDTreeSplit* split = node->split;
   1567                         for( ; split != 0; split = split->next )
   1568                         {
   1569                             split->condensed_idx = inv_map->data.i[split->var_idx];
   1570                             assert( split->condensed_idx >= 0 );
   1571                         }
   1572 
   1573                         if( !node->left )
   1574                             break;
   1575                         node = node->left;
   1576                     }
   1577 
   1578                     for( parent = node->parent; parent && parent->right == node;
   1579                         node = parent, parent = parent->parent )
   1580                         ;
   1581 
   1582                     if( !parent )
   1583                         break;
   1584 
   1585                     node = parent->right;
   1586                 }
   1587             }
   1588         }
   1589     }
   1590 
   1591     result = absolute_idx ? active_vars_abs : active_vars;
   1592 
   1593     __END__;
   1594 
   1595     cvReleaseMat( &mask );
   1596     cvReleaseMat( &inv_map );
   1597 
   1598     return result;
   1599 }
   1600 
   1601 
   1602 float
   1603 CvBoost::predict( const CvMat* _sample, const CvMat* _missing,
   1604                   CvMat* weak_responses, CvSlice slice,
   1605                   bool raw_mode, bool return_sum ) const
   1606 {
   1607     float value = -FLT_MAX;
   1608 
   1609     CvSeqReader reader;
   1610     double sum = 0;
   1611     int wstep = 0;
   1612     const float* sample_data;
   1613 
   1614     if( !weak )
   1615         CV_Error( CV_StsError, "The boosted tree ensemble has not been trained yet" );
   1616 
   1617     if( !CV_IS_MAT(_sample) || CV_MAT_TYPE(_sample->type) != CV_32FC1 ||
   1618         (_sample->cols != 1 && _sample->rows != 1) ||
   1619         (_sample->cols + _sample->rows - 1 != data->var_all && !raw_mode) ||
   1620         (active_vars && _sample->cols + _sample->rows - 1 != active_vars->cols && raw_mode) )
   1621             CV_Error( CV_StsBadArg,
   1622         "the input sample must be 1d floating-point vector with the same "
   1623         "number of elements as the total number of variables or "
   1624         "as the number of variables used for training" );
   1625 
   1626     if( _missing )
   1627     {
   1628         if( !CV_IS_MAT(_missing) || !CV_IS_MASK_ARR(_missing) ||
   1629             !CV_ARE_SIZES_EQ(_missing, _sample) )
   1630             CV_Error( CV_StsBadArg,
   1631             "the missing data mask must be 8-bit vector of the same size as input sample" );
   1632     }
   1633 
   1634     int i, weak_count = cvSliceLength( slice, weak );
   1635     if( weak_count >= weak->total )
   1636     {
   1637         weak_count = weak->total;
   1638         slice.start_index = 0;
   1639     }
   1640 
   1641     if( weak_responses )
   1642     {
   1643         if( !CV_IS_MAT(weak_responses) ||
   1644             CV_MAT_TYPE(weak_responses->type) != CV_32FC1 ||
   1645             (weak_responses->cols != 1 && weak_responses->rows != 1) ||
   1646             weak_responses->cols + weak_responses->rows - 1 != weak_count )
   1647             CV_Error( CV_StsBadArg,
   1648             "The output matrix of weak classifier responses must be valid "
   1649             "floating-point vector of the same number of components as the length of input slice" );
   1650         wstep = CV_IS_MAT_CONT(weak_responses->type) ? 1 : weak_responses->step/sizeof(float);
   1651     }
   1652 
   1653     int var_count = active_vars->cols;
   1654     const int* vtype = data->var_type->data.i;
   1655     const int* cmap = data->cat_map->data.i;
   1656     const int* cofs = data->cat_ofs->data.i;
   1657 
   1658     cv::Mat sample = cv::cvarrToMat(_sample);
   1659     cv::Mat missing;
   1660     if(!_missing)
   1661         missing = cv::cvarrToMat(_missing);
   1662 
   1663     // if need, preprocess the input vector
   1664     if( !raw_mode )
   1665     {
   1666         int sstep, mstep = 0;
   1667         const float* src_sample;
   1668         const uchar* src_mask = 0;
   1669         float* dst_sample;
   1670         uchar* dst_mask;
   1671         const int* vidx = active_vars->data.i;
   1672         const int* vidx_abs = active_vars_abs->data.i;
   1673         bool have_mask = _missing != 0;
   1674 
   1675         sample = cv::Mat(1, var_count, CV_32FC1);
   1676         missing = cv::Mat(1, var_count, CV_8UC1);
   1677 
   1678         dst_sample = sample.ptr<float>();
   1679         dst_mask = missing.ptr<uchar>();
   1680 
   1681         src_sample = _sample->data.fl;
   1682         sstep = CV_IS_MAT_CONT(_sample->type) ? 1 : _sample->step/sizeof(src_sample[0]);
   1683 
   1684         if( _missing )
   1685         {
   1686             src_mask = _missing->data.ptr;
   1687             mstep = CV_IS_MAT_CONT(_missing->type) ? 1 : _missing->step;
   1688         }
   1689 
   1690         for( i = 0; i < var_count; i++ )
   1691         {
   1692             int idx = vidx[i], idx_abs = vidx_abs[i];
   1693             float val = src_sample[idx_abs*sstep];
   1694             int ci = vtype[idx];
   1695             uchar m = src_mask ? src_mask[idx_abs*mstep] : (uchar)0;
   1696 
   1697             if( ci >= 0 )
   1698             {
   1699                 int a = cofs[ci], b = (ci+1 >= data->cat_ofs->cols) ? data->cat_map->cols : cofs[ci+1],
   1700                     c = a;
   1701                 int ival = cvRound(val);
   1702                 if ( (ival != val) && (!m) )
   1703                     CV_Error( CV_StsBadArg,
   1704                         "one of input categorical variable is not an integer" );
   1705 
   1706                 while( a < b )
   1707                 {
   1708                     c = (a + b) >> 1;
   1709                     if( ival < cmap[c] )
   1710                         b = c;
   1711                     else if( ival > cmap[c] )
   1712                         a = c+1;
   1713                     else
   1714                         break;
   1715                 }
   1716 
   1717                 if( c < 0 || ival != cmap[c] )
   1718                 {
   1719                     m = 1;
   1720                     have_mask = true;
   1721                 }
   1722                 else
   1723                 {
   1724                     val = (float)(c - cofs[ci]);
   1725                 }
   1726             }
   1727 
   1728             dst_sample[i] = val;
   1729             dst_mask[i] = m;
   1730         }
   1731 
   1732         if( !have_mask )
   1733             missing.release();
   1734     }
   1735     else
   1736     {
   1737         if( !CV_IS_MAT_CONT(_sample->type & (_missing ? _missing->type : -1)) )
   1738             CV_Error( CV_StsBadArg, "In raw mode the input vectors must be continuous" );
   1739     }
   1740 
   1741     cvStartReadSeq( weak, &reader );
   1742     cvSetSeqReaderPos( &reader, slice.start_index );
   1743 
   1744     sample_data = sample.ptr<float>();
   1745 
   1746     if( !have_active_cat_vars && missing.empty() && !weak_responses )
   1747     {
   1748         for( i = 0; i < weak_count; i++ )
   1749         {
   1750             CvBoostTree* wtree;
   1751             const CvDTreeNode* node;
   1752             CV_READ_SEQ_ELEM( wtree, reader );
   1753 
   1754             node = wtree->get_root();
   1755             while( node->left )
   1756             {
   1757                 CvDTreeSplit* split = node->split;
   1758                 int vi = split->condensed_idx;
   1759                 float val = sample_data[vi];
   1760                 int dir = val <= split->ord.c ? -1 : 1;
   1761                 if( split->inversed )
   1762                     dir = -dir;
   1763                 node = dir < 0 ? node->left : node->right;
   1764             }
   1765             sum += node->value;
   1766         }
   1767     }
   1768     else
   1769     {
   1770         const int* avars = active_vars->data.i;
   1771         const uchar* m = !missing.empty() ? missing.ptr<uchar>() : 0;
   1772 
   1773         // full-featured version
   1774         for( i = 0; i < weak_count; i++ )
   1775         {
   1776             CvBoostTree* wtree;
   1777             const CvDTreeNode* node;
   1778             CV_READ_SEQ_ELEM( wtree, reader );
   1779 
   1780             node = wtree->get_root();
   1781             while( node->left )
   1782             {
   1783                 const CvDTreeSplit* split = node->split;
   1784                 int dir = 0;
   1785                 for( ; !dir && split != 0; split = split->next )
   1786                 {
   1787                     int vi = split->condensed_idx;
   1788                     int ci = vtype[avars[vi]];
   1789                     float val = sample_data[vi];
   1790                     if( m && m[vi] )
   1791                         continue;
   1792                     if( ci < 0 ) // ordered
   1793                         dir = val <= split->ord.c ? -1 : 1;
   1794                     else // categorical
   1795                     {
   1796                         int c = cvRound(val);
   1797                         dir = CV_DTREE_CAT_DIR(c, split->subset);
   1798                     }
   1799                     if( split->inversed )
   1800                         dir = -dir;
   1801                 }
   1802 
   1803                 if( !dir )
   1804                 {
   1805                     int diff = node->right->sample_count - node->left->sample_count;
   1806                     dir = diff < 0 ? -1 : 1;
   1807                 }
   1808                 node = dir < 0 ? node->left : node->right;
   1809             }
   1810             if( weak_responses )
   1811                 weak_responses->data.fl[i*wstep] = (float)node->value;
   1812             sum += node->value;
   1813         }
   1814     }
   1815 
   1816     if( return_sum )
   1817         value = (float)sum;
   1818     else
   1819     {
   1820         int cls_idx = sum >= 0;
   1821         if( raw_mode )
   1822             value = (float)cls_idx;
   1823         else
   1824             value = (float)cmap[cofs[vtype[data->var_count]] + cls_idx];
   1825     }
   1826 
   1827     return value;
   1828 }
   1829 
   1830 float CvBoost::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
   1831 {
   1832     float err = 0;
   1833     const CvMat* values = _data->get_values();
   1834     const CvMat* response = _data->get_responses();
   1835     const CvMat* missing = _data->get_missing();
   1836     const CvMat* sample_idx = (type == CV_TEST_ERROR) ? _data->get_test_sample_idx() : _data->get_train_sample_idx();
   1837     const CvMat* var_types = _data->get_var_types();
   1838     int* sidx = sample_idx ? sample_idx->data.i : 0;
   1839     int r_step = CV_IS_MAT_CONT(response->type) ?
   1840                 1 : response->step / CV_ELEM_SIZE(response->type);
   1841     bool is_classifier = var_types->data.ptr[var_types->cols-1] == CV_VAR_CATEGORICAL;
   1842     int sample_count = sample_idx ? sample_idx->cols : 0;
   1843     sample_count = (type == CV_TRAIN_ERROR && sample_count == 0) ? values->rows : sample_count;
   1844     float* pred_resp = 0;
   1845     if( resp && (sample_count > 0) )
   1846     {
   1847         resp->resize( sample_count );
   1848         pred_resp = &((*resp)[0]);
   1849     }
   1850     if ( is_classifier )
   1851     {
   1852         for( int i = 0; i < sample_count; i++ )
   1853         {
   1854             CvMat sample, miss;
   1855             int si = sidx ? sidx[i] : i;
   1856             cvGetRow( values, &sample, si );
   1857             if( missing )
   1858                 cvGetRow( missing, &miss, si );
   1859             float r = (float)predict( &sample, missing ? &miss : 0 );
   1860             if( pred_resp )
   1861                 pred_resp[i] = r;
   1862             int d = fabs((double)r - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
   1863             err += d;
   1864         }
   1865         err = sample_count ? err / (float)sample_count * 100 : -FLT_MAX;
   1866     }
   1867     else
   1868     {
   1869         for( int i = 0; i < sample_count; i++ )
   1870         {
   1871             CvMat sample, miss;
   1872             int si = sidx ? sidx[i] : i;
   1873             cvGetRow( values, &sample, si );
   1874             if( missing )
   1875                 cvGetRow( missing, &miss, si );
   1876             float r = (float)predict( &sample, missing ? &miss : 0 );
   1877             if( pred_resp )
   1878                 pred_resp[i] = r;
   1879             float d = r - response->data.fl[si*r_step];
   1880             err += d*d;
   1881         }
   1882         err = sample_count ? err / (float)sample_count : -FLT_MAX;
   1883     }
   1884     return err;
   1885 }
   1886 
   1887 void CvBoost::write_params( CvFileStorage* fs ) const
   1888 {
   1889     const char* boost_type_str =
   1890         params.boost_type == DISCRETE ? "DiscreteAdaboost" :
   1891         params.boost_type == REAL ? "RealAdaboost" :
   1892         params.boost_type == LOGIT ? "LogitBoost" :
   1893         params.boost_type == GENTLE ? "GentleAdaboost" : 0;
   1894 
   1895     const char* split_crit_str =
   1896         params.split_criteria == DEFAULT ? "Default" :
   1897         params.split_criteria == GINI ? "Gini" :
   1898         params.boost_type == MISCLASS ? "Misclassification" :
   1899         params.boost_type == SQERR ? "SquaredErr" : 0;
   1900 
   1901     if( boost_type_str )
   1902         cvWriteString( fs, "boosting_type", boost_type_str );
   1903     else
   1904         cvWriteInt( fs, "boosting_type", params.boost_type );
   1905 
   1906     if( split_crit_str )
   1907         cvWriteString( fs, "splitting_criteria", split_crit_str );
   1908     else
   1909         cvWriteInt( fs, "splitting_criteria", params.split_criteria );
   1910 
   1911     cvWriteInt( fs, "ntrees", weak->total );
   1912     cvWriteReal( fs, "weight_trimming_rate", params.weight_trim_rate );
   1913 
   1914     data->write_params( fs );
   1915 }
   1916 
   1917 
   1918 void CvBoost::read_params( CvFileStorage* fs, CvFileNode* fnode )
   1919 {
   1920     CV_FUNCNAME( "CvBoost::read_params" );
   1921 
   1922     __BEGIN__;
   1923 
   1924     CvFileNode* temp;
   1925 
   1926     if( !fnode || !CV_NODE_IS_MAP(fnode->tag) )
   1927         return;
   1928 
   1929     data = new CvDTreeTrainData();
   1930     CV_CALL( data->read_params(fs, fnode));
   1931     data->shared = true;
   1932 
   1933     params.max_depth = data->params.max_depth;
   1934     params.min_sample_count = data->params.min_sample_count;
   1935     params.max_categories = data->params.max_categories;
   1936     params.priors = data->params.priors;
   1937     params.regression_accuracy = data->params.regression_accuracy;
   1938     params.use_surrogates = data->params.use_surrogates;
   1939 
   1940     temp = cvGetFileNodeByName( fs, fnode, "boosting_type" );
   1941     if( !temp )
   1942         return;
   1943 
   1944     if( temp && CV_NODE_IS_STRING(temp->tag) )
   1945     {
   1946         const char* boost_type_str = cvReadString( temp, "" );
   1947         params.boost_type = strcmp( boost_type_str, "DiscreteAdaboost" ) == 0 ? DISCRETE :
   1948                             strcmp( boost_type_str, "RealAdaboost" ) == 0 ? REAL :
   1949                             strcmp( boost_type_str, "LogitBoost" ) == 0 ? LOGIT :
   1950                             strcmp( boost_type_str, "GentleAdaboost" ) == 0 ? GENTLE : -1;
   1951     }
   1952     else
   1953         params.boost_type = cvReadInt( temp, -1 );
   1954 
   1955     if( params.boost_type < DISCRETE || params.boost_type > GENTLE )
   1956         CV_ERROR( CV_StsBadArg, "Unknown boosting type" );
   1957 
   1958     temp = cvGetFileNodeByName( fs, fnode, "splitting_criteria" );
   1959     if( temp && CV_NODE_IS_STRING(temp->tag) )
   1960     {
   1961         const char* split_crit_str = cvReadString( temp, "" );
   1962         params.split_criteria = strcmp( split_crit_str, "Default" ) == 0 ? DEFAULT :
   1963                                 strcmp( split_crit_str, "Gini" ) == 0 ? GINI :
   1964                                 strcmp( split_crit_str, "Misclassification" ) == 0 ? MISCLASS :
   1965                                 strcmp( split_crit_str, "SquaredErr" ) == 0 ? SQERR : -1;
   1966     }
   1967     else
   1968         params.split_criteria = cvReadInt( temp, -1 );
   1969 
   1970     if( params.split_criteria < DEFAULT || params.boost_type > SQERR )
   1971         CV_ERROR( CV_StsBadArg, "Unknown boosting type" );
   1972 
   1973     params.weak_count = cvReadIntByName( fs, fnode, "ntrees" );
   1974     params.weight_trim_rate = cvReadRealByName( fs, fnode, "weight_trimming_rate", 0. );
   1975 
   1976     __END__;
   1977 }
   1978 
   1979 
   1980 
   1981 void
   1982 CvBoost::read( CvFileStorage* fs, CvFileNode* node )
   1983 {
   1984     CV_FUNCNAME( "CvBoost::read" );
   1985 
   1986     __BEGIN__;
   1987 
   1988     CvSeqReader reader;
   1989     CvFileNode* trees_fnode;
   1990     CvMemStorage* storage;
   1991     int i, ntrees;
   1992 
   1993     clear();
   1994     read_params( fs, node );
   1995 
   1996     if( !data )
   1997         EXIT;
   1998 
   1999     trees_fnode = cvGetFileNodeByName( fs, node, "trees" );
   2000     if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
   2001         CV_ERROR( CV_StsParseError, "<trees> tag is missing" );
   2002 
   2003     cvStartReadSeq( trees_fnode->data.seq, &reader );
   2004     ntrees = trees_fnode->data.seq->total;
   2005 
   2006     if( ntrees != params.weak_count )
   2007         CV_ERROR( CV_StsUnmatchedSizes,
   2008         "The number of trees stored does not match <ntrees> tag value" );
   2009 
   2010     CV_CALL( storage = cvCreateMemStorage() );
   2011     weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );
   2012 
   2013     for( i = 0; i < ntrees; i++ )
   2014     {
   2015         CvBoostTree* tree = new CvBoostTree();
   2016         CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, this, data ));
   2017         CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
   2018         cvSeqPush( weak, &tree );
   2019     }
   2020     get_active_vars();
   2021 
   2022     __END__;
   2023 }
   2024 
   2025 
   2026 void
   2027 CvBoost::write( CvFileStorage* fs, const char* name ) const
   2028 {
   2029     CV_FUNCNAME( "CvBoost::write" );
   2030 
   2031     __BEGIN__;
   2032 
   2033     CvSeqReader reader;
   2034     int i;
   2035 
   2036     cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_BOOSTING );
   2037 
   2038     if( !weak )
   2039         CV_ERROR( CV_StsBadArg, "The classifier has not been trained yet" );
   2040 
   2041     write_params( fs );
   2042     cvStartWriteStruct( fs, "trees", CV_NODE_SEQ );
   2043 
   2044     cvStartReadSeq( weak, &reader );
   2045 
   2046     for( i = 0; i < weak->total; i++ )
   2047     {
   2048         CvBoostTree* tree;
   2049         CV_READ_SEQ_ELEM( tree, reader );
   2050         cvStartWriteStruct( fs, 0, CV_NODE_MAP );
   2051         tree->write( fs );
   2052         cvEndWriteStruct( fs );
   2053     }
   2054 
   2055     cvEndWriteStruct( fs );
   2056     cvEndWriteStruct( fs );
   2057 
   2058     __END__;
   2059 }
   2060 
   2061 
   2062 CvMat*
   2063 CvBoost::get_weights()
   2064 {
   2065     return weights;
   2066 }
   2067 
   2068 
   2069 CvMat*
   2070 CvBoost::get_subtree_weights()
   2071 {
   2072     return subtree_weights;
   2073 }
   2074 
   2075 
   2076 CvMat*
   2077 CvBoost::get_weak_response()
   2078 {
   2079     return weak_eval;
   2080 }
   2081 
   2082 
   2083 const CvBoostParams&
   2084 CvBoost::get_params() const
   2085 {
   2086     return params;
   2087 }
   2088 
   2089 CvSeq* CvBoost::get_weak_predictors()
   2090 {
   2091     return weak;
   2092 }
   2093 
   2094 const CvDTreeTrainData* CvBoost::get_data() const
   2095 {
   2096     return data;
   2097 }
   2098 
   2099 using namespace cv;
   2100 
   2101 CvBoost::CvBoost( const Mat& _train_data, int _tflag,
   2102                const Mat& _responses, const Mat& _var_idx,
   2103                const Mat& _sample_idx, const Mat& _var_type,
   2104                const Mat& _missing_mask,
   2105                CvBoostParams _params )
   2106 {
   2107     weak = 0;
   2108     data = 0;
   2109     default_model_name = "my_boost_tree";
   2110     active_vars = active_vars_abs = orig_response = sum_response = weak_eval =
   2111         subsample_mask = weights = subtree_weights = 0;
   2112 
   2113     train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
   2114           _var_type, _missing_mask, _params );
   2115 }
   2116 
   2117 
   2118 bool
   2119 CvBoost::train( const Mat& _train_data, int _tflag,
   2120                const Mat& _responses, const Mat& _var_idx,
   2121                const Mat& _sample_idx, const Mat& _var_type,
   2122                const Mat& _missing_mask,
   2123                CvBoostParams _params, bool _update )
   2124 {
   2125     train_data_hdr = _train_data;
   2126     train_data_mat = _train_data;
   2127     responses_hdr = _responses;
   2128     responses_mat = _responses;
   2129 
   2130     CvMat vidx = _var_idx, sidx = _sample_idx, vtype = _var_type, mmask = _missing_mask;
   2131 
   2132     return train(&train_data_hdr, _tflag, &responses_hdr, vidx.data.ptr ? &vidx : 0,
   2133           sidx.data.ptr ? &sidx : 0, vtype.data.ptr ? &vtype : 0,
   2134           mmask.data.ptr ? &mmask : 0, _params, _update);
   2135 }
   2136 
   2137 float
   2138 CvBoost::predict( const Mat& _sample, const Mat& _missing,
   2139                   const Range& slice, bool raw_mode, bool return_sum ) const
   2140 {
   2141     CvMat sample = _sample, mmask = _missing;
   2142     /*if( weak_responses )
   2143     {
   2144         int weak_count = cvSliceLength( slice, weak );
   2145         if( weak_count >= weak->total )
   2146         {
   2147             weak_count = weak->total;
   2148             slice.start_index = 0;
   2149         }
   2150 
   2151         if( !(weak_responses->data && weak_responses->type() == CV_32FC1 &&
   2152               (weak_responses->cols == 1 || weak_responses->rows == 1) &&
   2153               weak_responses->cols + weak_responses->rows - 1 == weak_count) )
   2154             weak_responses->create(weak_count, 1, CV_32FC1);
   2155         pwr = &(wr = *weak_responses);
   2156     }*/
   2157     return predict(&sample, _missing.empty() ? 0 : &mmask, 0,
   2158                    slice == Range::all() ? CV_WHOLE_SEQ : cvSlice(slice.start, slice.end),
   2159                    raw_mode, return_sum);
   2160 }
   2161 
   2162 /* End of file. */
   2163