1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // Intel License Agreement 11 // 12 // Copyright (C) 2000, Intel Corporation, all rights reserved. 13 // Third party copyrights are property of their respective owners. 14 // 15 // Redistribution and use in source and binary forms, with or without modification, 16 // are permitted provided that the following conditions are met: 17 // 18 // * Redistribution's of source code must retain the above copyright notice, 19 // this list of conditions and the following disclaimer. 20 // 21 // * Redistribution's in binary form must reproduce the above copyright notice, 22 // this list of conditions and the following disclaimer in the documentation 23 // and/or other materials provided with the distribution. 24 // 25 // * The name of Intel Corporation may not be used to endorse or promote products 26 // derived from this software without specific prior written permission. 27 // 28 // This software is provided by the copyright holders and contributors "as is" and 29 // any express or implied warranties, including, but not limited to, the implied 30 // warranties of merchantability and fitness for a particular purpose are disclaimed. 31 // In no event shall the Intel Corporation or contributors be liable for any direct, 32 // indirect, incidental, special, exemplary, or consequential damages 33 // (including, but not limited to, procurement of substitute goods or services; 34 // loss of use, data, or profits; or business interruption) however caused 35 // and on any theory of liability, whether in contract, strict liability, 36 // or tort (including negligence or otherwise) arising in any way out of 37 // the use of this software, even if advised of the possibility of such damage. 38 // 39 //M*/ 40 41 #include "old_ml_precomp.hpp" 42 43 static inline double 44 log_ratio( double val ) 45 { 46 const double eps = 1e-5; 47 48 val = MAX( val, eps ); 49 val = MIN( val, 1. - eps ); 50 return log( val/(1. - val) ); 51 } 52 53 54 CvBoostParams::CvBoostParams() 55 { 56 boost_type = CvBoost::REAL; 57 weak_count = 100; 58 weight_trim_rate = 0.95; 59 cv_folds = 0; 60 max_depth = 1; 61 } 62 63 64 CvBoostParams::CvBoostParams( int _boost_type, int _weak_count, 65 double _weight_trim_rate, int _max_depth, 66 bool _use_surrogates, const float* _priors ) 67 { 68 boost_type = _boost_type; 69 weak_count = _weak_count; 70 weight_trim_rate = _weight_trim_rate; 71 split_criteria = CvBoost::DEFAULT; 72 cv_folds = 0; 73 max_depth = _max_depth; 74 use_surrogates = _use_surrogates; 75 priors = _priors; 76 } 77 78 79 80 ///////////////////////////////// CvBoostTree /////////////////////////////////// 81 82 CvBoostTree::CvBoostTree() 83 { 84 ensemble = 0; 85 } 86 87 88 CvBoostTree::~CvBoostTree() 89 { 90 clear(); 91 } 92 93 94 void 95 CvBoostTree::clear() 96 { 97 CvDTree::clear(); 98 ensemble = 0; 99 } 100 101 102 bool 103 CvBoostTree::train( CvDTreeTrainData* _train_data, 104 const CvMat* _subsample_idx, CvBoost* _ensemble ) 105 { 106 clear(); 107 ensemble = _ensemble; 108 data = _train_data; 109 data->shared = true; 110 return do_train( _subsample_idx ); 111 } 112 113 114 bool 115 CvBoostTree::train( const CvMat*, int, const CvMat*, const CvMat*, 116 const CvMat*, const CvMat*, const CvMat*, CvDTreeParams ) 117 { 118 assert(0); 119 return false; 120 } 121 122 123 bool 124 CvBoostTree::train( CvDTreeTrainData*, const CvMat* ) 125 { 126 assert(0); 127 return false; 128 } 129 130 131 void 132 CvBoostTree::scale( double _scale ) 133 { 134 CvDTreeNode* node = root; 135 136 // traverse the tree and scale all the node values 137 for(;;) 138 { 139 CvDTreeNode* parent; 140 for(;;) 141 { 142 node->value *= _scale; 143 if( !node->left ) 144 break; 145 node = node->left; 146 } 147 148 for( parent = node->parent; parent && parent->right == node; 149 node = parent, parent = parent->parent ) 150 ; 151 152 if( !parent ) 153 break; 154 155 node = parent->right; 156 } 157 } 158 159 160 void 161 CvBoostTree::try_split_node( CvDTreeNode* node ) 162 { 163 CvDTree::try_split_node( node ); 164 165 if( !node->left ) 166 { 167 // if the node has not been split, 168 // store the responses for the corresponding training samples 169 double* weak_eval = ensemble->get_weak_response()->data.db; 170 cv::AutoBuffer<int> inn_buf(node->sample_count); 171 const int* labels = data->get_cv_labels( node, (int*)inn_buf ); 172 int i, count = node->sample_count; 173 double value = node->value; 174 175 for( i = 0; i < count; i++ ) 176 weak_eval[labels[i]] = value; 177 } 178 } 179 180 181 double 182 CvBoostTree::calc_node_dir( CvDTreeNode* node ) 183 { 184 char* dir = (char*)data->direction->data.ptr; 185 const double* weights = ensemble->get_subtree_weights()->data.db; 186 int i, n = node->sample_count, vi = node->split->var_idx; 187 double L, R; 188 189 assert( !node->split->inversed ); 190 191 if( data->get_var_type(vi) >= 0 ) // split on categorical var 192 { 193 cv::AutoBuffer<int> inn_buf(n); 194 const int* cat_labels = data->get_cat_var_data( node, vi, (int*)inn_buf ); 195 const int* subset = node->split->subset; 196 double sum = 0, sum_abs = 0; 197 198 for( i = 0; i < n; i++ ) 199 { 200 int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i]; 201 double w = weights[i]; 202 int d = idx >= 0 ? CV_DTREE_CAT_DIR(idx,subset) : 0; 203 sum += d*w; sum_abs += (d & 1)*w; 204 dir[i] = (char)d; 205 } 206 207 R = (sum_abs + sum) * 0.5; 208 L = (sum_abs - sum) * 0.5; 209 } 210 else // split on ordered var 211 { 212 cv::AutoBuffer<uchar> inn_buf(2*n*sizeof(int)+n*sizeof(float)); 213 float* values_buf = (float*)(uchar*)inn_buf; 214 int* sorted_indices_buf = (int*)(values_buf + n); 215 int* sample_indices_buf = sorted_indices_buf + n; 216 const float* values = 0; 217 const int* sorted_indices = 0; 218 data->get_ord_var_data( node, vi, values_buf, sorted_indices_buf, &values, &sorted_indices, sample_indices_buf ); 219 int split_point = node->split->ord.split_point; 220 int n1 = node->get_num_valid(vi); 221 222 assert( 0 <= split_point && split_point < n1-1 ); 223 L = R = 0; 224 225 for( i = 0; i <= split_point; i++ ) 226 { 227 int idx = sorted_indices[i]; 228 double w = weights[idx]; 229 dir[idx] = (char)-1; 230 L += w; 231 } 232 233 for( ; i < n1; i++ ) 234 { 235 int idx = sorted_indices[i]; 236 double w = weights[idx]; 237 dir[idx] = (char)1; 238 R += w; 239 } 240 241 for( ; i < n; i++ ) 242 dir[sorted_indices[i]] = (char)0; 243 } 244 245 node->maxlr = MAX( L, R ); 246 return node->split->quality/(L + R); 247 } 248 249 250 CvDTreeSplit* 251 CvBoostTree::find_split_ord_class( CvDTreeNode* node, int vi, float init_quality, 252 CvDTreeSplit* _split, uchar* _ext_buf ) 253 { 254 const float epsilon = FLT_EPSILON*2; 255 256 const double* weights = ensemble->get_subtree_weights()->data.db; 257 int n = node->sample_count; 258 int n1 = node->get_num_valid(vi); 259 260 cv::AutoBuffer<uchar> inn_buf; 261 if( !_ext_buf ) 262 inn_buf.allocate(n*(3*sizeof(int)+sizeof(float))); 263 uchar* ext_buf = _ext_buf ? _ext_buf : (uchar*)inn_buf; 264 float* values_buf = (float*)ext_buf; 265 int* sorted_indices_buf = (int*)(values_buf + n); 266 int* sample_indices_buf = sorted_indices_buf + n; 267 const float* values = 0; 268 const int* sorted_indices = 0; 269 data->get_ord_var_data( node, vi, values_buf, sorted_indices_buf, &values, &sorted_indices, sample_indices_buf ); 270 int* responses_buf = sorted_indices_buf + n; 271 const int* responses = data->get_class_labels( node, responses_buf ); 272 const double* rcw0 = weights + n; 273 double lcw[2] = {0,0}, rcw[2]; 274 int i, best_i = -1; 275 double best_val = init_quality; 276 int boost_type = ensemble->get_params().boost_type; 277 int split_criteria = ensemble->get_params().split_criteria; 278 279 rcw[0] = rcw0[0]; rcw[1] = rcw0[1]; 280 for( i = n1; i < n; i++ ) 281 { 282 int idx = sorted_indices[i]; 283 double w = weights[idx]; 284 rcw[responses[idx]] -= w; 285 } 286 287 if( split_criteria != CvBoost::GINI && split_criteria != CvBoost::MISCLASS ) 288 split_criteria = boost_type == CvBoost::DISCRETE ? CvBoost::MISCLASS : CvBoost::GINI; 289 290 if( split_criteria == CvBoost::GINI ) 291 { 292 double L = 0, R = rcw[0] + rcw[1]; 293 double lsum2 = 0, rsum2 = rcw[0]*rcw[0] + rcw[1]*rcw[1]; 294 295 for( i = 0; i < n1 - 1; i++ ) 296 { 297 int idx = sorted_indices[i]; 298 double w = weights[idx], w2 = w*w; 299 double lv, rv; 300 idx = responses[idx]; 301 L += w; R -= w; 302 lv = lcw[idx]; rv = rcw[idx]; 303 lsum2 += 2*lv*w + w2; 304 rsum2 -= 2*rv*w - w2; 305 lcw[idx] = lv + w; rcw[idx] = rv - w; 306 307 if( values[i] + epsilon < values[i+1] ) 308 { 309 double val = (lsum2*R + rsum2*L)/(L*R); 310 if( best_val < val ) 311 { 312 best_val = val; 313 best_i = i; 314 } 315 } 316 } 317 } 318 else 319 { 320 for( i = 0; i < n1 - 1; i++ ) 321 { 322 int idx = sorted_indices[i]; 323 double w = weights[idx]; 324 idx = responses[idx]; 325 lcw[idx] += w; 326 rcw[idx] -= w; 327 328 if( values[i] + epsilon < values[i+1] ) 329 { 330 double val = lcw[0] + rcw[1], val2 = lcw[1] + rcw[0]; 331 val = MAX(val, val2); 332 if( best_val < val ) 333 { 334 best_val = val; 335 best_i = i; 336 } 337 } 338 } 339 } 340 341 CvDTreeSplit* split = 0; 342 if( best_i >= 0 ) 343 { 344 split = _split ? _split : data->new_split_ord( 0, 0.0f, 0, 0, 0.0f ); 345 split->var_idx = vi; 346 split->ord.c = (values[best_i] + values[best_i+1])*0.5f; 347 split->ord.split_point = best_i; 348 split->inversed = 0; 349 split->quality = (float)best_val; 350 } 351 return split; 352 } 353 354 template<typename T> 355 class LessThanPtr 356 { 357 public: 358 bool operator()(T* a, T* b) const { return *a < *b; } 359 }; 360 361 CvDTreeSplit* 362 CvBoostTree::find_split_cat_class( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf ) 363 { 364 int ci = data->get_var_type(vi); 365 int n = node->sample_count; 366 int mi = data->cat_count->data.i[ci]; 367 368 int base_size = (2*mi+3)*sizeof(double) + mi*sizeof(double*); 369 cv::AutoBuffer<uchar> inn_buf((2*mi+3)*sizeof(double) + mi*sizeof(double*)); 370 if( !_ext_buf) 371 inn_buf.allocate( base_size + 2*n*sizeof(int) ); 372 uchar* base_buf = (uchar*)inn_buf; 373 uchar* ext_buf = _ext_buf ? _ext_buf : base_buf + base_size; 374 375 int* cat_labels_buf = (int*)ext_buf; 376 const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf); 377 int* responses_buf = cat_labels_buf + n; 378 const int* responses = data->get_class_labels(node, responses_buf); 379 double lcw[2]={0,0}, rcw[2]={0,0}; 380 381 double* cjk = (double*)cv::alignPtr(base_buf,sizeof(double))+2; 382 const double* weights = ensemble->get_subtree_weights()->data.db; 383 double** dbl_ptr = (double**)(cjk + 2*mi); 384 int i, j, k, idx; 385 double L = 0, R; 386 double best_val = init_quality; 387 int best_subset = -1, subset_i; 388 int boost_type = ensemble->get_params().boost_type; 389 int split_criteria = ensemble->get_params().split_criteria; 390 391 // init array of counters: 392 // c_{jk} - number of samples that have vi-th input variable = j and response = k. 393 for( j = -1; j < mi; j++ ) 394 cjk[j*2] = cjk[j*2+1] = 0; 395 396 for( i = 0; i < n; i++ ) 397 { 398 double w = weights[i]; 399 j = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i]; 400 k = responses[i]; 401 cjk[j*2 + k] += w; 402 } 403 404 for( j = 0; j < mi; j++ ) 405 { 406 rcw[0] += cjk[j*2]; 407 rcw[1] += cjk[j*2+1]; 408 dbl_ptr[j] = cjk + j*2 + 1; 409 } 410 411 R = rcw[0] + rcw[1]; 412 413 if( split_criteria != CvBoost::GINI && split_criteria != CvBoost::MISCLASS ) 414 split_criteria = boost_type == CvBoost::DISCRETE ? CvBoost::MISCLASS : CvBoost::GINI; 415 416 // sort rows of c_jk by increasing c_j,1 417 // (i.e. by the weight of samples in j-th category that belong to class 1) 418 std::sort(dbl_ptr, dbl_ptr + mi, LessThanPtr<double>()); 419 420 for( subset_i = 0; subset_i < mi-1; subset_i++ ) 421 { 422 idx = (int)(dbl_ptr[subset_i] - cjk)/2; 423 const double* crow = cjk + idx*2; 424 double w0 = crow[0], w1 = crow[1]; 425 double weight = w0 + w1; 426 427 if( weight < FLT_EPSILON ) 428 continue; 429 430 lcw[0] += w0; rcw[0] -= w0; 431 lcw[1] += w1; rcw[1] -= w1; 432 433 if( split_criteria == CvBoost::GINI ) 434 { 435 double lsum2 = lcw[0]*lcw[0] + lcw[1]*lcw[1]; 436 double rsum2 = rcw[0]*rcw[0] + rcw[1]*rcw[1]; 437 438 L += weight; 439 R -= weight; 440 441 if( L > FLT_EPSILON && R > FLT_EPSILON ) 442 { 443 double val = (lsum2*R + rsum2*L)/(L*R); 444 if( best_val < val ) 445 { 446 best_val = val; 447 best_subset = subset_i; 448 } 449 } 450 } 451 else 452 { 453 double val = lcw[0] + rcw[1]; 454 double val2 = lcw[1] + rcw[0]; 455 456 val = MAX(val, val2); 457 if( best_val < val ) 458 { 459 best_val = val; 460 best_subset = subset_i; 461 } 462 } 463 } 464 465 CvDTreeSplit* split = 0; 466 if( best_subset >= 0 ) 467 { 468 split = _split ? _split : data->new_split_cat( 0, -1.0f); 469 split->var_idx = vi; 470 split->quality = (float)best_val; 471 memset( split->subset, 0, (data->max_c_count + 31)/32 * sizeof(int)); 472 for( i = 0; i <= best_subset; i++ ) 473 { 474 idx = (int)(dbl_ptr[i] - cjk) >> 1; 475 split->subset[idx >> 5] |= 1 << (idx & 31); 476 } 477 } 478 return split; 479 } 480 481 482 CvDTreeSplit* 483 CvBoostTree::find_split_ord_reg( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf ) 484 { 485 const float epsilon = FLT_EPSILON*2; 486 const double* weights = ensemble->get_subtree_weights()->data.db; 487 int n = node->sample_count; 488 int n1 = node->get_num_valid(vi); 489 490 cv::AutoBuffer<uchar> inn_buf; 491 if( !_ext_buf ) 492 inn_buf.allocate(2*n*(sizeof(int)+sizeof(float))); 493 uchar* ext_buf = _ext_buf ? _ext_buf : (uchar*)inn_buf; 494 495 float* values_buf = (float*)ext_buf; 496 int* indices_buf = (int*)(values_buf + n); 497 int* sample_indices_buf = indices_buf + n; 498 const float* values = 0; 499 const int* indices = 0; 500 data->get_ord_var_data( node, vi, values_buf, indices_buf, &values, &indices, sample_indices_buf ); 501 float* responses_buf = (float*)(indices_buf + n); 502 const float* responses = data->get_ord_responses( node, responses_buf, sample_indices_buf ); 503 504 int i, best_i = -1; 505 double L = 0, R = weights[n]; 506 double best_val = init_quality, lsum = 0, rsum = node->value*R; 507 508 // compensate for missing values 509 for( i = n1; i < n; i++ ) 510 { 511 int idx = indices[i]; 512 double w = weights[idx]; 513 rsum -= responses[idx]*w; 514 R -= w; 515 } 516 517 // find the optimal split 518 for( i = 0; i < n1 - 1; i++ ) 519 { 520 int idx = indices[i]; 521 double w = weights[idx]; 522 double t = responses[idx]*w; 523 L += w; R -= w; 524 lsum += t; rsum -= t; 525 526 if( values[i] + epsilon < values[i+1] ) 527 { 528 double val = (lsum*lsum*R + rsum*rsum*L)/(L*R); 529 if( best_val < val ) 530 { 531 best_val = val; 532 best_i = i; 533 } 534 } 535 } 536 537 CvDTreeSplit* split = 0; 538 if( best_i >= 0 ) 539 { 540 split = _split ? _split : data->new_split_ord( 0, 0.0f, 0, 0, 0.0f ); 541 split->var_idx = vi; 542 split->ord.c = (values[best_i] + values[best_i+1])*0.5f; 543 split->ord.split_point = best_i; 544 split->inversed = 0; 545 split->quality = (float)best_val; 546 } 547 return split; 548 } 549 550 551 CvDTreeSplit* 552 CvBoostTree::find_split_cat_reg( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf ) 553 { 554 const double* weights = ensemble->get_subtree_weights()->data.db; 555 int ci = data->get_var_type(vi); 556 int n = node->sample_count; 557 int mi = data->cat_count->data.i[ci]; 558 int base_size = (2*mi+3)*sizeof(double) + mi*sizeof(double*); 559 cv::AutoBuffer<uchar> inn_buf(base_size); 560 if( !_ext_buf ) 561 inn_buf.allocate(base_size + n*(2*sizeof(int) + sizeof(float))); 562 uchar* base_buf = (uchar*)inn_buf; 563 uchar* ext_buf = _ext_buf ? _ext_buf : base_buf + base_size; 564 565 int* cat_labels_buf = (int*)ext_buf; 566 const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf); 567 float* responses_buf = (float*)(cat_labels_buf + n); 568 int* sample_indices_buf = (int*)(responses_buf + n); 569 const float* responses = data->get_ord_responses(node, responses_buf, sample_indices_buf); 570 571 double* sum = (double*)cv::alignPtr(base_buf,sizeof(double)) + 1; 572 double* counts = sum + mi + 1; 573 double** sum_ptr = (double**)(counts + mi); 574 double L = 0, R = 0, best_val = init_quality, lsum = 0, rsum = 0; 575 int i, best_subset = -1, subset_i; 576 577 for( i = -1; i < mi; i++ ) 578 sum[i] = counts[i] = 0; 579 580 // calculate sum response and weight of each category of the input var 581 for( i = 0; i < n; i++ ) 582 { 583 int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i]; 584 double w = weights[i]; 585 double s = sum[idx] + responses[i]*w; 586 double nc = counts[idx] + w; 587 sum[idx] = s; 588 counts[idx] = nc; 589 } 590 591 // calculate average response in each category 592 for( i = 0; i < mi; i++ ) 593 { 594 R += counts[i]; 595 rsum += sum[i]; 596 sum[i] = fabs(counts[i]) > DBL_EPSILON ? sum[i]/counts[i] : 0; 597 sum_ptr[i] = sum + i; 598 } 599 600 std::sort(sum_ptr, sum_ptr + mi, LessThanPtr<double>()); 601 602 // revert back to unnormalized sums 603 // (there should be a very little loss in accuracy) 604 for( i = 0; i < mi; i++ ) 605 sum[i] *= counts[i]; 606 607 for( subset_i = 0; subset_i < mi-1; subset_i++ ) 608 { 609 int idx = (int)(sum_ptr[subset_i] - sum); 610 double ni = counts[idx]; 611 612 if( ni > FLT_EPSILON ) 613 { 614 double s = sum[idx]; 615 lsum += s; L += ni; 616 rsum -= s; R -= ni; 617 618 if( L > FLT_EPSILON && R > FLT_EPSILON ) 619 { 620 double val = (lsum*lsum*R + rsum*rsum*L)/(L*R); 621 if( best_val < val ) 622 { 623 best_val = val; 624 best_subset = subset_i; 625 } 626 } 627 } 628 } 629 630 CvDTreeSplit* split = 0; 631 if( best_subset >= 0 ) 632 { 633 split = _split ? _split : data->new_split_cat( 0, -1.0f); 634 split->var_idx = vi; 635 split->quality = (float)best_val; 636 memset( split->subset, 0, (data->max_c_count + 31)/32 * sizeof(int)); 637 for( i = 0; i <= best_subset; i++ ) 638 { 639 int idx = (int)(sum_ptr[i] - sum); 640 split->subset[idx >> 5] |= 1 << (idx & 31); 641 } 642 } 643 return split; 644 } 645 646 647 CvDTreeSplit* 648 CvBoostTree::find_surrogate_split_ord( CvDTreeNode* node, int vi, uchar* _ext_buf ) 649 { 650 const float epsilon = FLT_EPSILON*2; 651 int n = node->sample_count; 652 cv::AutoBuffer<uchar> inn_buf; 653 if( !_ext_buf ) 654 inn_buf.allocate(n*(2*sizeof(int)+sizeof(float))); 655 uchar* ext_buf = _ext_buf ? _ext_buf : (uchar*)inn_buf; 656 float* values_buf = (float*)ext_buf; 657 int* indices_buf = (int*)(values_buf + n); 658 int* sample_indices_buf = indices_buf + n; 659 const float* values = 0; 660 const int* indices = 0; 661 data->get_ord_var_data( node, vi, values_buf, indices_buf, &values, &indices, sample_indices_buf ); 662 663 const double* weights = ensemble->get_subtree_weights()->data.db; 664 const char* dir = (char*)data->direction->data.ptr; 665 int n1 = node->get_num_valid(vi); 666 // LL - number of samples that both the primary and the surrogate splits send to the left 667 // LR - ... primary split sends to the left and the surrogate split sends to the right 668 // RL - ... primary split sends to the right and the surrogate split sends to the left 669 // RR - ... both send to the right 670 int i, best_i = -1, best_inversed = 0; 671 double best_val; 672 double LL = 0, RL = 0, LR, RR; 673 double worst_val = node->maxlr; 674 double sum = 0, sum_abs = 0; 675 best_val = worst_val; 676 677 for( i = 0; i < n1; i++ ) 678 { 679 int idx = indices[i]; 680 double w = weights[idx]; 681 int d = dir[idx]; 682 sum += d*w; sum_abs += (d & 1)*w; 683 } 684 685 // sum_abs = R + L; sum = R - L 686 RR = (sum_abs + sum)*0.5; 687 LR = (sum_abs - sum)*0.5; 688 689 // initially all the samples are sent to the right by the surrogate split, 690 // LR of them are sent to the left by primary split, and RR - to the right. 691 // now iteratively compute LL, LR, RL and RR for every possible surrogate split value. 692 for( i = 0; i < n1 - 1; i++ ) 693 { 694 int idx = indices[i]; 695 double w = weights[idx]; 696 int d = dir[idx]; 697 698 if( d < 0 ) 699 { 700 LL += w; LR -= w; 701 if( LL + RR > best_val && values[i] + epsilon < values[i+1] ) 702 { 703 best_val = LL + RR; 704 best_i = i; best_inversed = 0; 705 } 706 } 707 else if( d > 0 ) 708 { 709 RL += w; RR -= w; 710 if( RL + LR > best_val && values[i] + epsilon < values[i+1] ) 711 { 712 best_val = RL + LR; 713 best_i = i; best_inversed = 1; 714 } 715 } 716 } 717 718 return best_i >= 0 && best_val > node->maxlr ? data->new_split_ord( vi, 719 (values[best_i] + values[best_i+1])*0.5f, best_i, 720 best_inversed, (float)best_val ) : 0; 721 } 722 723 724 CvDTreeSplit* 725 CvBoostTree::find_surrogate_split_cat( CvDTreeNode* node, int vi, uchar* _ext_buf ) 726 { 727 const char* dir = (char*)data->direction->data.ptr; 728 const double* weights = ensemble->get_subtree_weights()->data.db; 729 int n = node->sample_count; 730 int i, mi = data->cat_count->data.i[data->get_var_type(vi)]; 731 732 int base_size = (2*mi+3)*sizeof(double); 733 cv::AutoBuffer<uchar> inn_buf(base_size); 734 if( !_ext_buf ) 735 inn_buf.allocate(base_size + n*sizeof(int)); 736 uchar* ext_buf = _ext_buf ? _ext_buf : (uchar*)inn_buf; 737 int* cat_labels_buf = (int*)ext_buf; 738 const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf); 739 740 // LL - number of samples that both the primary and the surrogate splits send to the left 741 // LR - ... primary split sends to the left and the surrogate split sends to the right 742 // RL - ... primary split sends to the right and the surrogate split sends to the left 743 // RR - ... both send to the right 744 CvDTreeSplit* split = data->new_split_cat( vi, 0 ); 745 double best_val = 0; 746 double* lc = (double*)cv::alignPtr(cat_labels_buf + n, sizeof(double)) + 1; 747 double* rc = lc + mi + 1; 748 749 for( i = -1; i < mi; i++ ) 750 lc[i] = rc[i] = 0; 751 752 // 1. for each category calculate the weight of samples 753 // sent to the left (lc) and to the right (rc) by the primary split 754 for( i = 0; i < n; i++ ) 755 { 756 int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i]; 757 double w = weights[i]; 758 int d = dir[i]; 759 double sum = lc[idx] + d*w; 760 double sum_abs = rc[idx] + (d & 1)*w; 761 lc[idx] = sum; rc[idx] = sum_abs; 762 } 763 764 for( i = 0; i < mi; i++ ) 765 { 766 double sum = lc[i]; 767 double sum_abs = rc[i]; 768 lc[i] = (sum_abs - sum) * 0.5; 769 rc[i] = (sum_abs + sum) * 0.5; 770 } 771 772 // 2. now form the split. 773 // in each category send all the samples to the same direction as majority 774 for( i = 0; i < mi; i++ ) 775 { 776 double lval = lc[i], rval = rc[i]; 777 if( lval > rval ) 778 { 779 split->subset[i >> 5] |= 1 << (i & 31); 780 best_val += lval; 781 } 782 else 783 best_val += rval; 784 } 785 786 split->quality = (float)best_val; 787 if( split->quality <= node->maxlr ) 788 cvSetRemoveByPtr( data->split_heap, split ), split = 0; 789 790 return split; 791 } 792 793 794 void 795 CvBoostTree::calc_node_value( CvDTreeNode* node ) 796 { 797 int i, n = node->sample_count; 798 const double* weights = ensemble->get_weights()->data.db; 799 cv::AutoBuffer<uchar> inn_buf(n*(sizeof(int) + ( data->is_classifier ? sizeof(int) : sizeof(int) + sizeof(float)))); 800 int* labels_buf = (int*)(uchar*)inn_buf; 801 const int* labels = data->get_cv_labels(node, labels_buf); 802 double* subtree_weights = ensemble->get_subtree_weights()->data.db; 803 double rcw[2] = {0,0}; 804 int boost_type = ensemble->get_params().boost_type; 805 806 if( data->is_classifier ) 807 { 808 int* _responses_buf = labels_buf + n; 809 const int* _responses = data->get_class_labels(node, _responses_buf); 810 int m = data->get_num_classes(); 811 int* cls_count = data->counts->data.i; 812 for( int k = 0; k < m; k++ ) 813 cls_count[k] = 0; 814 815 for( i = 0; i < n; i++ ) 816 { 817 int idx = labels[i]; 818 double w = weights[idx]; 819 int r = _responses[i]; 820 rcw[r] += w; 821 cls_count[r]++; 822 subtree_weights[i] = w; 823 } 824 825 node->class_idx = rcw[1] > rcw[0]; 826 827 if( boost_type == CvBoost::DISCRETE ) 828 { 829 // ignore cat_map for responses, and use {-1,1}, 830 // as the whole ensemble response is computes as sign(sum_i(weak_response_i) 831 node->value = node->class_idx*2 - 1; 832 } 833 else 834 { 835 double p = rcw[1]/(rcw[0] + rcw[1]); 836 assert( boost_type == CvBoost::REAL ); 837 838 // store log-ratio of the probability 839 node->value = 0.5*log_ratio(p); 840 } 841 } 842 else 843 { 844 // in case of regression tree: 845 // * node value is 1/n*sum_i(Y_i), where Y_i is i-th response, 846 // n is the number of samples in the node. 847 // * node risk is the sum of squared errors: sum_i((Y_i - <node_value>)^2) 848 double sum = 0, sum2 = 0, iw; 849 float* values_buf = (float*)(labels_buf + n); 850 int* sample_indices_buf = (int*)(values_buf + n); 851 const float* values = data->get_ord_responses(node, values_buf, sample_indices_buf); 852 853 for( i = 0; i < n; i++ ) 854 { 855 int idx = labels[i]; 856 double w = weights[idx]/*priors[values[i] > 0]*/; 857 double t = values[i]; 858 rcw[0] += w; 859 subtree_weights[i] = w; 860 sum += t*w; 861 sum2 += t*t*w; 862 } 863 864 iw = 1./rcw[0]; 865 node->value = sum*iw; 866 node->node_risk = sum2 - (sum*iw)*sum; 867 868 // renormalize the risk, as in try_split_node the unweighted formula 869 // sqrt(risk)/n is used, rather than sqrt(risk)/sum(weights_i) 870 node->node_risk *= n*iw*n*iw; 871 } 872 873 // store summary weights 874 subtree_weights[n] = rcw[0]; 875 subtree_weights[n+1] = rcw[1]; 876 } 877 878 879 void CvBoostTree::read( CvFileStorage* fs, CvFileNode* fnode, CvBoost* _ensemble, CvDTreeTrainData* _data ) 880 { 881 CvDTree::read( fs, fnode, _data ); 882 ensemble = _ensemble; 883 } 884 885 void CvBoostTree::read( CvFileStorage*, CvFileNode* ) 886 { 887 assert(0); 888 } 889 890 void CvBoostTree::read( CvFileStorage* _fs, CvFileNode* _node, 891 CvDTreeTrainData* _data ) 892 { 893 CvDTree::read( _fs, _node, _data ); 894 } 895 896 897 /////////////////////////////////// CvBoost ///////////////////////////////////// 898 899 CvBoost::CvBoost() 900 { 901 data = 0; 902 weak = 0; 903 default_model_name = "my_boost_tree"; 904 905 active_vars = active_vars_abs = orig_response = sum_response = weak_eval = 906 subsample_mask = weights = subtree_weights = 0; 907 have_active_cat_vars = have_subsample = false; 908 909 clear(); 910 } 911 912 913 void CvBoost::prune( CvSlice slice ) 914 { 915 if( weak && weak->total > 0 ) 916 { 917 CvSeqReader reader; 918 int i, count = cvSliceLength( slice, weak ); 919 920 cvStartReadSeq( weak, &reader ); 921 cvSetSeqReaderPos( &reader, slice.start_index ); 922 923 for( i = 0; i < count; i++ ) 924 { 925 CvBoostTree* w; 926 CV_READ_SEQ_ELEM( w, reader ); 927 delete w; 928 } 929 930 cvSeqRemoveSlice( weak, slice ); 931 } 932 } 933 934 935 void CvBoost::clear() 936 { 937 if( weak ) 938 { 939 prune( CV_WHOLE_SEQ ); 940 cvReleaseMemStorage( &weak->storage ); 941 } 942 if( data ) 943 delete data; 944 weak = 0; 945 data = 0; 946 cvReleaseMat( &active_vars ); 947 cvReleaseMat( &active_vars_abs ); 948 cvReleaseMat( &orig_response ); 949 cvReleaseMat( &sum_response ); 950 cvReleaseMat( &weak_eval ); 951 cvReleaseMat( &subsample_mask ); 952 cvReleaseMat( &weights ); 953 cvReleaseMat( &subtree_weights ); 954 955 have_subsample = false; 956 } 957 958 959 CvBoost::~CvBoost() 960 { 961 clear(); 962 } 963 964 965 CvBoost::CvBoost( const CvMat* _train_data, int _tflag, 966 const CvMat* _responses, const CvMat* _var_idx, 967 const CvMat* _sample_idx, const CvMat* _var_type, 968 const CvMat* _missing_mask, CvBoostParams _params ) 969 { 970 weak = 0; 971 data = 0; 972 default_model_name = "my_boost_tree"; 973 974 active_vars = active_vars_abs = orig_response = sum_response = weak_eval = 975 subsample_mask = weights = subtree_weights = 0; 976 977 train( _train_data, _tflag, _responses, _var_idx, _sample_idx, 978 _var_type, _missing_mask, _params ); 979 } 980 981 982 bool 983 CvBoost::set_params( const CvBoostParams& _params ) 984 { 985 bool ok = false; 986 987 CV_FUNCNAME( "CvBoost::set_params" ); 988 989 __BEGIN__; 990 991 params = _params; 992 if( params.boost_type != DISCRETE && params.boost_type != REAL && 993 params.boost_type != LOGIT && params.boost_type != GENTLE ) 994 CV_ERROR( CV_StsBadArg, "Unknown/unsupported boosting type" ); 995 996 params.weak_count = MAX( params.weak_count, 1 ); 997 params.weight_trim_rate = MAX( params.weight_trim_rate, 0. ); 998 params.weight_trim_rate = MIN( params.weight_trim_rate, 1. ); 999 if( params.weight_trim_rate < FLT_EPSILON ) 1000 params.weight_trim_rate = 1.f; 1001 1002 if( params.boost_type == DISCRETE && 1003 params.split_criteria != GINI && params.split_criteria != MISCLASS ) 1004 params.split_criteria = MISCLASS; 1005 if( params.boost_type == REAL && 1006 params.split_criteria != GINI && params.split_criteria != MISCLASS ) 1007 params.split_criteria = GINI; 1008 if( (params.boost_type == LOGIT || params.boost_type == GENTLE) && 1009 params.split_criteria != SQERR ) 1010 params.split_criteria = SQERR; 1011 1012 ok = true; 1013 1014 __END__; 1015 1016 return ok; 1017 } 1018 1019 1020 bool 1021 CvBoost::train( const CvMat* _train_data, int _tflag, 1022 const CvMat* _responses, const CvMat* _var_idx, 1023 const CvMat* _sample_idx, const CvMat* _var_type, 1024 const CvMat* _missing_mask, 1025 CvBoostParams _params, bool _update ) 1026 { 1027 bool ok = false; 1028 CvMemStorage* storage = 0; 1029 1030 CV_FUNCNAME( "CvBoost::train" ); 1031 1032 __BEGIN__; 1033 1034 int i; 1035 1036 set_params( _params ); 1037 1038 cvReleaseMat( &active_vars ); 1039 cvReleaseMat( &active_vars_abs ); 1040 1041 if( !_update || !data ) 1042 { 1043 clear(); 1044 data = new CvDTreeTrainData( _train_data, _tflag, _responses, _var_idx, 1045 _sample_idx, _var_type, _missing_mask, _params, true, true ); 1046 1047 if( data->get_num_classes() != 2 ) 1048 CV_ERROR( CV_StsNotImplemented, 1049 "Boosted trees can only be used for 2-class classification." ); 1050 CV_CALL( storage = cvCreateMemStorage() ); 1051 weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage ); 1052 storage = 0; 1053 } 1054 else 1055 { 1056 data->set_data( _train_data, _tflag, _responses, _var_idx, 1057 _sample_idx, _var_type, _missing_mask, _params, true, true, true ); 1058 } 1059 1060 if ( (_params.boost_type == LOGIT) || (_params.boost_type == GENTLE) ) 1061 data->do_responses_copy(); 1062 1063 update_weights( 0 ); 1064 1065 for( i = 0; i < params.weak_count; i++ ) 1066 { 1067 CvBoostTree* tree = new CvBoostTree; 1068 if( !tree->train( data, subsample_mask, this ) ) 1069 { 1070 delete tree; 1071 break; 1072 } 1073 //cvCheckArr( get_weak_response()); 1074 cvSeqPush( weak, &tree ); 1075 update_weights( tree ); 1076 trim_weights(); 1077 if( cvCountNonZero(subsample_mask) == 0 ) 1078 break; 1079 } 1080 1081 if(weak->total > 0) 1082 { 1083 get_active_vars(); // recompute active_vars* maps and condensed_idx's in the splits. 1084 data->is_classifier = true; 1085 data->free_train_data(); 1086 ok = true; 1087 } 1088 else 1089 clear(); 1090 1091 __END__; 1092 1093 return ok; 1094 } 1095 1096 bool CvBoost::train( CvMLData* _data, 1097 CvBoostParams _params, 1098 bool update ) 1099 { 1100 bool result = false; 1101 1102 CV_FUNCNAME( "CvBoost::train" ); 1103 1104 __BEGIN__; 1105 1106 const CvMat* values = _data->get_values(); 1107 const CvMat* response = _data->get_responses(); 1108 const CvMat* missing = _data->get_missing(); 1109 const CvMat* var_types = _data->get_var_types(); 1110 const CvMat* train_sidx = _data->get_train_sample_idx(); 1111 const CvMat* var_idx = _data->get_var_idx(); 1112 1113 CV_CALL( result = train( values, CV_ROW_SAMPLE, response, var_idx, 1114 train_sidx, var_types, missing, _params, update ) ); 1115 1116 __END__; 1117 1118 return result; 1119 } 1120 1121 void CvBoost::initialize_weights(double (&p)[2]) 1122 { 1123 p[0] = 1.; 1124 p[1] = 1.; 1125 } 1126 1127 void 1128 CvBoost::update_weights( CvBoostTree* tree ) 1129 { 1130 CV_FUNCNAME( "CvBoost::update_weights" ); 1131 1132 __BEGIN__; 1133 1134 int i, n = data->sample_count; 1135 double sumw = 0.; 1136 int step = 0; 1137 float* fdata = 0; 1138 int *sample_idx_buf; 1139 const int* sample_idx = 0; 1140 cv::AutoBuffer<uchar> inn_buf; 1141 size_t _buf_size = (params.boost_type == LOGIT) || (params.boost_type == GENTLE) ? (size_t)(data->sample_count)*sizeof(int) : 0; 1142 if( !tree ) 1143 _buf_size += n*sizeof(int); 1144 else 1145 { 1146 if( have_subsample ) 1147 _buf_size += data->get_length_subbuf()*(sizeof(float)+sizeof(uchar)); 1148 } 1149 inn_buf.allocate(_buf_size); 1150 uchar* cur_buf_pos = (uchar*)inn_buf; 1151 1152 if ( (params.boost_type == LOGIT) || (params.boost_type == GENTLE) ) 1153 { 1154 step = CV_IS_MAT_CONT(data->responses_copy->type) ? 1155 1 : data->responses_copy->step / CV_ELEM_SIZE(data->responses_copy->type); 1156 fdata = data->responses_copy->data.fl; 1157 sample_idx_buf = (int*)cur_buf_pos; 1158 cur_buf_pos = (uchar*)(sample_idx_buf + data->sample_count); 1159 sample_idx = data->get_sample_indices( data->data_root, sample_idx_buf ); 1160 } 1161 CvMat* dtree_data_buf = data->buf; 1162 size_t length_buf_row = data->get_length_subbuf(); 1163 if( !tree ) // before training the first tree, initialize weights and other parameters 1164 { 1165 int* class_labels_buf = (int*)cur_buf_pos; 1166 cur_buf_pos = (uchar*)(class_labels_buf + n); 1167 const int* class_labels = data->get_class_labels(data->data_root, class_labels_buf); 1168 // in case of logitboost and gentle adaboost each weak tree is a regression tree, 1169 // so we need to convert class labels to floating-point values 1170 1171 double w0 = 1./ n; 1172 double p[2] = { 1., 1. }; 1173 initialize_weights(p); 1174 1175 cvReleaseMat( &orig_response ); 1176 cvReleaseMat( &sum_response ); 1177 cvReleaseMat( &weak_eval ); 1178 cvReleaseMat( &subsample_mask ); 1179 cvReleaseMat( &weights ); 1180 cvReleaseMat( &subtree_weights ); 1181 1182 CV_CALL( orig_response = cvCreateMat( 1, n, CV_32S )); 1183 CV_CALL( weak_eval = cvCreateMat( 1, n, CV_64F )); 1184 CV_CALL( subsample_mask = cvCreateMat( 1, n, CV_8U )); 1185 CV_CALL( weights = cvCreateMat( 1, n, CV_64F )); 1186 CV_CALL( subtree_weights = cvCreateMat( 1, n + 2, CV_64F )); 1187 1188 if( data->have_priors ) 1189 { 1190 // compute weight scale for each class from their prior probabilities 1191 int c1 = 0; 1192 for( i = 0; i < n; i++ ) 1193 c1 += class_labels[i]; 1194 p[0] = data->priors->data.db[0]*(c1 < n ? 1./(n - c1) : 0.); 1195 p[1] = data->priors->data.db[1]*(c1 > 0 ? 1./c1 : 0.); 1196 p[0] /= p[0] + p[1]; 1197 p[1] = 1. - p[0]; 1198 } 1199 1200 if (data->is_buf_16u) 1201 { 1202 unsigned short* labels = (unsigned short*)(dtree_data_buf->data.s + data->data_root->buf_idx*length_buf_row + 1203 data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count); 1204 for( i = 0; i < n; i++ ) 1205 { 1206 // save original categorical responses {0,1}, convert them to {-1,1} 1207 orig_response->data.i[i] = class_labels[i]*2 - 1; 1208 // make all the samples active at start. 1209 // later, in trim_weights() deactivate/reactive again some, if need 1210 subsample_mask->data.ptr[i] = (uchar)1; 1211 // make all the initial weights the same. 1212 weights->data.db[i] = w0*p[class_labels[i]]; 1213 // set the labels to find (from within weak tree learning proc) 1214 // the particular sample weight, and where to store the response. 1215 labels[i] = (unsigned short)i; 1216 } 1217 } 1218 else 1219 { 1220 int* labels = dtree_data_buf->data.i + data->data_root->buf_idx*length_buf_row + 1221 data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count; 1222 1223 for( i = 0; i < n; i++ ) 1224 { 1225 // save original categorical responses {0,1}, convert them to {-1,1} 1226 orig_response->data.i[i] = class_labels[i]*2 - 1; 1227 // make all the samples active at start. 1228 // later, in trim_weights() deactivate/reactive again some, if need 1229 subsample_mask->data.ptr[i] = (uchar)1; 1230 // make all the initial weights the same. 1231 weights->data.db[i] = w0*p[class_labels[i]]; 1232 // set the labels to find (from within weak tree learning proc) 1233 // the particular sample weight, and where to store the response. 1234 labels[i] = i; 1235 } 1236 } 1237 1238 if( params.boost_type == LOGIT ) 1239 { 1240 CV_CALL( sum_response = cvCreateMat( 1, n, CV_64F )); 1241 1242 for( i = 0; i < n; i++ ) 1243 { 1244 sum_response->data.db[i] = 0; 1245 fdata[sample_idx[i]*step] = orig_response->data.i[i] > 0 ? 2.f : -2.f; 1246 } 1247 1248 // in case of logitboost each weak tree is a regression tree. 1249 // the target function values are recalculated for each of the trees 1250 data->is_classifier = false; 1251 } 1252 else if( params.boost_type == GENTLE ) 1253 { 1254 for( i = 0; i < n; i++ ) 1255 fdata[sample_idx[i]*step] = (float)orig_response->data.i[i]; 1256 1257 data->is_classifier = false; 1258 } 1259 } 1260 else 1261 { 1262 // at this moment, for all the samples that participated in the training of the most 1263 // recent weak classifier we know the responses. For other samples we need to compute them 1264 if( have_subsample ) 1265 { 1266 float* values = (float*)cur_buf_pos; 1267 cur_buf_pos = (uchar*)(values + data->get_length_subbuf()); 1268 uchar* missing = cur_buf_pos; 1269 cur_buf_pos = missing + data->get_length_subbuf() * (size_t)CV_ELEM_SIZE(data->buf->type); 1270 1271 CvMat _sample, _mask; 1272 1273 // invert the subsample mask 1274 cvXorS( subsample_mask, cvScalar(1.), subsample_mask ); 1275 data->get_vectors( subsample_mask, values, missing, 0 ); 1276 1277 _sample = cvMat( 1, data->var_count, CV_32F ); 1278 _mask = cvMat( 1, data->var_count, CV_8U ); 1279 1280 // run tree through all the non-processed samples 1281 for( i = 0; i < n; i++ ) 1282 if( subsample_mask->data.ptr[i] ) 1283 { 1284 _sample.data.fl = values; 1285 _mask.data.ptr = missing; 1286 values += _sample.cols; 1287 missing += _mask.cols; 1288 weak_eval->data.db[i] = tree->predict( &_sample, &_mask, true )->value; 1289 } 1290 } 1291 1292 // now update weights and other parameters for each type of boosting 1293 if( params.boost_type == DISCRETE ) 1294 { 1295 // Discrete AdaBoost: 1296 // weak_eval[i] (=f(x_i)) is in {-1,1} 1297 // err = sum(w_i*(f(x_i) != y_i))/sum(w_i) 1298 // C = log((1-err)/err) 1299 // w_i *= exp(C*(f(x_i) != y_i)) 1300 1301 double C, err = 0.; 1302 double scale[] = { 1., 0. }; 1303 1304 for( i = 0; i < n; i++ ) 1305 { 1306 double w = weights->data.db[i]; 1307 sumw += w; 1308 err += w*(weak_eval->data.db[i] != orig_response->data.i[i]); 1309 } 1310 1311 if( sumw != 0 ) 1312 err /= sumw; 1313 C = err = -log_ratio( err ); 1314 scale[1] = exp(err); 1315 1316 sumw = 0; 1317 for( i = 0; i < n; i++ ) 1318 { 1319 double w = weights->data.db[i]* 1320 scale[weak_eval->data.db[i] != orig_response->data.i[i]]; 1321 sumw += w; 1322 weights->data.db[i] = w; 1323 } 1324 1325 tree->scale( C ); 1326 } 1327 else if( params.boost_type == REAL ) 1328 { 1329 // Real AdaBoost: 1330 // weak_eval[i] = f(x_i) = 0.5*log(p(x_i)/(1-p(x_i))), p(x_i)=P(y=1|x_i) 1331 // w_i *= exp(-y_i*f(x_i)) 1332 1333 for( i = 0; i < n; i++ ) 1334 weak_eval->data.db[i] *= -orig_response->data.i[i]; 1335 1336 cvExp( weak_eval, weak_eval ); 1337 1338 for( i = 0; i < n; i++ ) 1339 { 1340 double w = weights->data.db[i]*weak_eval->data.db[i]; 1341 sumw += w; 1342 weights->data.db[i] = w; 1343 } 1344 } 1345 else if( params.boost_type == LOGIT ) 1346 { 1347 // LogitBoost: 1348 // weak_eval[i] = f(x_i) in [-z_max,z_max] 1349 // sum_response = F(x_i). 1350 // F(x_i) += 0.5*f(x_i) 1351 // p(x_i) = exp(F(x_i))/(exp(F(x_i)) + exp(-F(x_i))=1/(1+exp(-2*F(x_i))) 1352 // reuse weak_eval: weak_eval[i] <- p(x_i) 1353 // w_i = p(x_i)*1(1 - p(x_i)) 1354 // z_i = ((y_i+1)/2 - p(x_i))/(p(x_i)*(1 - p(x_i))) 1355 // store z_i to the data->data_root as the new target responses 1356 1357 const double lb_weight_thresh = FLT_EPSILON; 1358 const double lb_z_max = 10.; 1359 /*float* responses_buf = data->get_resp_float_buf(); 1360 const float* responses = 0; 1361 data->get_ord_responses(data->data_root, responses_buf, &responses);*/ 1362 1363 /*if( weak->total == 7 ) 1364 putchar('*');*/ 1365 1366 for( i = 0; i < n; i++ ) 1367 { 1368 double s = sum_response->data.db[i] + 0.5*weak_eval->data.db[i]; 1369 sum_response->data.db[i] = s; 1370 weak_eval->data.db[i] = -2*s; 1371 } 1372 1373 cvExp( weak_eval, weak_eval ); 1374 1375 for( i = 0; i < n; i++ ) 1376 { 1377 double p = 1./(1. + weak_eval->data.db[i]); 1378 double w = p*(1 - p), z; 1379 w = MAX( w, lb_weight_thresh ); 1380 weights->data.db[i] = w; 1381 sumw += w; 1382 if( orig_response->data.i[i] > 0 ) 1383 { 1384 z = 1./p; 1385 fdata[sample_idx[i]*step] = (float)MIN(z, lb_z_max); 1386 } 1387 else 1388 { 1389 z = 1./(1-p); 1390 fdata[sample_idx[i]*step] = (float)-MIN(z, lb_z_max); 1391 } 1392 } 1393 } 1394 else 1395 { 1396 // Gentle AdaBoost: 1397 // weak_eval[i] = f(x_i) in [-1,1] 1398 // w_i *= exp(-y_i*f(x_i)) 1399 assert( params.boost_type == GENTLE ); 1400 1401 for( i = 0; i < n; i++ ) 1402 weak_eval->data.db[i] *= -orig_response->data.i[i]; 1403 1404 cvExp( weak_eval, weak_eval ); 1405 1406 for( i = 0; i < n; i++ ) 1407 { 1408 double w = weights->data.db[i] * weak_eval->data.db[i]; 1409 weights->data.db[i] = w; 1410 sumw += w; 1411 } 1412 } 1413 } 1414 1415 // renormalize weights 1416 if( sumw > FLT_EPSILON ) 1417 { 1418 sumw = 1./sumw; 1419 for( i = 0; i < n; ++i ) 1420 weights->data.db[i] *= sumw; 1421 } 1422 1423 __END__; 1424 } 1425 1426 1427 void 1428 CvBoost::trim_weights() 1429 { 1430 //CV_FUNCNAME( "CvBoost::trim_weights" ); 1431 1432 __BEGIN__; 1433 1434 int i, count = data->sample_count, nz_count = 0; 1435 double sum, threshold; 1436 1437 if( params.weight_trim_rate <= 0. || params.weight_trim_rate >= 1. ) 1438 EXIT; 1439 1440 // use weak_eval as temporary buffer for sorted weights 1441 cvCopy( weights, weak_eval ); 1442 1443 std::sort(weak_eval->data.db, weak_eval->data.db + count); 1444 1445 // as weight trimming occurs immediately after updating the weights, 1446 // where they are renormalized, we assume that the weight sum = 1. 1447 sum = 1. - params.weight_trim_rate; 1448 1449 for( i = 0; i < count; i++ ) 1450 { 1451 double w = weak_eval->data.db[i]; 1452 if( sum <= 0 ) 1453 break; 1454 sum -= w; 1455 } 1456 1457 threshold = i < count ? weak_eval->data.db[i] : DBL_MAX; 1458 1459 for( i = 0; i < count; i++ ) 1460 { 1461 double w = weights->data.db[i]; 1462 int f = w >= threshold; 1463 subsample_mask->data.ptr[i] = (uchar)f; 1464 nz_count += f; 1465 } 1466 1467 have_subsample = nz_count < count; 1468 1469 __END__; 1470 } 1471 1472 1473 const CvMat* 1474 CvBoost::get_active_vars( bool absolute_idx ) 1475 { 1476 CvMat* mask = 0; 1477 CvMat* inv_map = 0; 1478 CvMat* result = 0; 1479 1480 CV_FUNCNAME( "CvBoost::get_active_vars" ); 1481 1482 __BEGIN__; 1483 1484 if( !weak ) 1485 CV_ERROR( CV_StsError, "The boosted tree ensemble has not been trained yet" ); 1486 1487 if( !active_vars || !active_vars_abs ) 1488 { 1489 CvSeqReader reader; 1490 int i, j, nactive_vars; 1491 CvBoostTree* wtree; 1492 const CvDTreeNode* node; 1493 1494 assert(!active_vars && !active_vars_abs); 1495 mask = cvCreateMat( 1, data->var_count, CV_8U ); 1496 inv_map = cvCreateMat( 1, data->var_count, CV_32S ); 1497 cvZero( mask ); 1498 cvSet( inv_map, cvScalar(-1) ); 1499 1500 // first pass: compute the mask of used variables 1501 cvStartReadSeq( weak, &reader ); 1502 for( i = 0; i < weak->total; i++ ) 1503 { 1504 CV_READ_SEQ_ELEM(wtree, reader); 1505 1506 node = wtree->get_root(); 1507 assert( node != 0 ); 1508 for(;;) 1509 { 1510 const CvDTreeNode* parent; 1511 for(;;) 1512 { 1513 CvDTreeSplit* split = node->split; 1514 for( ; split != 0; split = split->next ) 1515 mask->data.ptr[split->var_idx] = 1; 1516 if( !node->left ) 1517 break; 1518 node = node->left; 1519 } 1520 1521 for( parent = node->parent; parent && parent->right == node; 1522 node = parent, parent = parent->parent ) 1523 ; 1524 1525 if( !parent ) 1526 break; 1527 1528 node = parent->right; 1529 } 1530 } 1531 1532 nactive_vars = cvCountNonZero(mask); 1533 1534 //if ( nactive_vars > 0 ) 1535 { 1536 active_vars = cvCreateMat( 1, nactive_vars, CV_32S ); 1537 active_vars_abs = cvCreateMat( 1, nactive_vars, CV_32S ); 1538 1539 have_active_cat_vars = false; 1540 1541 for( i = j = 0; i < data->var_count; i++ ) 1542 { 1543 if( mask->data.ptr[i] ) 1544 { 1545 active_vars->data.i[j] = i; 1546 active_vars_abs->data.i[j] = data->var_idx ? data->var_idx->data.i[i] : i; 1547 inv_map->data.i[i] = j; 1548 if( data->var_type->data.i[i] >= 0 ) 1549 have_active_cat_vars = true; 1550 j++; 1551 } 1552 } 1553 1554 1555 // second pass: now compute the condensed indices 1556 cvStartReadSeq( weak, &reader ); 1557 for( i = 0; i < weak->total; i++ ) 1558 { 1559 CV_READ_SEQ_ELEM(wtree, reader); 1560 node = wtree->get_root(); 1561 for(;;) 1562 { 1563 const CvDTreeNode* parent; 1564 for(;;) 1565 { 1566 CvDTreeSplit* split = node->split; 1567 for( ; split != 0; split = split->next ) 1568 { 1569 split->condensed_idx = inv_map->data.i[split->var_idx]; 1570 assert( split->condensed_idx >= 0 ); 1571 } 1572 1573 if( !node->left ) 1574 break; 1575 node = node->left; 1576 } 1577 1578 for( parent = node->parent; parent && parent->right == node; 1579 node = parent, parent = parent->parent ) 1580 ; 1581 1582 if( !parent ) 1583 break; 1584 1585 node = parent->right; 1586 } 1587 } 1588 } 1589 } 1590 1591 result = absolute_idx ? active_vars_abs : active_vars; 1592 1593 __END__; 1594 1595 cvReleaseMat( &mask ); 1596 cvReleaseMat( &inv_map ); 1597 1598 return result; 1599 } 1600 1601 1602 float 1603 CvBoost::predict( const CvMat* _sample, const CvMat* _missing, 1604 CvMat* weak_responses, CvSlice slice, 1605 bool raw_mode, bool return_sum ) const 1606 { 1607 float value = -FLT_MAX; 1608 1609 CvSeqReader reader; 1610 double sum = 0; 1611 int wstep = 0; 1612 const float* sample_data; 1613 1614 if( !weak ) 1615 CV_Error( CV_StsError, "The boosted tree ensemble has not been trained yet" ); 1616 1617 if( !CV_IS_MAT(_sample) || CV_MAT_TYPE(_sample->type) != CV_32FC1 || 1618 (_sample->cols != 1 && _sample->rows != 1) || 1619 (_sample->cols + _sample->rows - 1 != data->var_all && !raw_mode) || 1620 (active_vars && _sample->cols + _sample->rows - 1 != active_vars->cols && raw_mode) ) 1621 CV_Error( CV_StsBadArg, 1622 "the input sample must be 1d floating-point vector with the same " 1623 "number of elements as the total number of variables or " 1624 "as the number of variables used for training" ); 1625 1626 if( _missing ) 1627 { 1628 if( !CV_IS_MAT(_missing) || !CV_IS_MASK_ARR(_missing) || 1629 !CV_ARE_SIZES_EQ(_missing, _sample) ) 1630 CV_Error( CV_StsBadArg, 1631 "the missing data mask must be 8-bit vector of the same size as input sample" ); 1632 } 1633 1634 int i, weak_count = cvSliceLength( slice, weak ); 1635 if( weak_count >= weak->total ) 1636 { 1637 weak_count = weak->total; 1638 slice.start_index = 0; 1639 } 1640 1641 if( weak_responses ) 1642 { 1643 if( !CV_IS_MAT(weak_responses) || 1644 CV_MAT_TYPE(weak_responses->type) != CV_32FC1 || 1645 (weak_responses->cols != 1 && weak_responses->rows != 1) || 1646 weak_responses->cols + weak_responses->rows - 1 != weak_count ) 1647 CV_Error( CV_StsBadArg, 1648 "The output matrix of weak classifier responses must be valid " 1649 "floating-point vector of the same number of components as the length of input slice" ); 1650 wstep = CV_IS_MAT_CONT(weak_responses->type) ? 1 : weak_responses->step/sizeof(float); 1651 } 1652 1653 int var_count = active_vars->cols; 1654 const int* vtype = data->var_type->data.i; 1655 const int* cmap = data->cat_map->data.i; 1656 const int* cofs = data->cat_ofs->data.i; 1657 1658 cv::Mat sample = cv::cvarrToMat(_sample); 1659 cv::Mat missing; 1660 if(!_missing) 1661 missing = cv::cvarrToMat(_missing); 1662 1663 // if need, preprocess the input vector 1664 if( !raw_mode ) 1665 { 1666 int sstep, mstep = 0; 1667 const float* src_sample; 1668 const uchar* src_mask = 0; 1669 float* dst_sample; 1670 uchar* dst_mask; 1671 const int* vidx = active_vars->data.i; 1672 const int* vidx_abs = active_vars_abs->data.i; 1673 bool have_mask = _missing != 0; 1674 1675 sample = cv::Mat(1, var_count, CV_32FC1); 1676 missing = cv::Mat(1, var_count, CV_8UC1); 1677 1678 dst_sample = sample.ptr<float>(); 1679 dst_mask = missing.ptr<uchar>(); 1680 1681 src_sample = _sample->data.fl; 1682 sstep = CV_IS_MAT_CONT(_sample->type) ? 1 : _sample->step/sizeof(src_sample[0]); 1683 1684 if( _missing ) 1685 { 1686 src_mask = _missing->data.ptr; 1687 mstep = CV_IS_MAT_CONT(_missing->type) ? 1 : _missing->step; 1688 } 1689 1690 for( i = 0; i < var_count; i++ ) 1691 { 1692 int idx = vidx[i], idx_abs = vidx_abs[i]; 1693 float val = src_sample[idx_abs*sstep]; 1694 int ci = vtype[idx]; 1695 uchar m = src_mask ? src_mask[idx_abs*mstep] : (uchar)0; 1696 1697 if( ci >= 0 ) 1698 { 1699 int a = cofs[ci], b = (ci+1 >= data->cat_ofs->cols) ? data->cat_map->cols : cofs[ci+1], 1700 c = a; 1701 int ival = cvRound(val); 1702 if ( (ival != val) && (!m) ) 1703 CV_Error( CV_StsBadArg, 1704 "one of input categorical variable is not an integer" ); 1705 1706 while( a < b ) 1707 { 1708 c = (a + b) >> 1; 1709 if( ival < cmap[c] ) 1710 b = c; 1711 else if( ival > cmap[c] ) 1712 a = c+1; 1713 else 1714 break; 1715 } 1716 1717 if( c < 0 || ival != cmap[c] ) 1718 { 1719 m = 1; 1720 have_mask = true; 1721 } 1722 else 1723 { 1724 val = (float)(c - cofs[ci]); 1725 } 1726 } 1727 1728 dst_sample[i] = val; 1729 dst_mask[i] = m; 1730 } 1731 1732 if( !have_mask ) 1733 missing.release(); 1734 } 1735 else 1736 { 1737 if( !CV_IS_MAT_CONT(_sample->type & (_missing ? _missing->type : -1)) ) 1738 CV_Error( CV_StsBadArg, "In raw mode the input vectors must be continuous" ); 1739 } 1740 1741 cvStartReadSeq( weak, &reader ); 1742 cvSetSeqReaderPos( &reader, slice.start_index ); 1743 1744 sample_data = sample.ptr<float>(); 1745 1746 if( !have_active_cat_vars && missing.empty() && !weak_responses ) 1747 { 1748 for( i = 0; i < weak_count; i++ ) 1749 { 1750 CvBoostTree* wtree; 1751 const CvDTreeNode* node; 1752 CV_READ_SEQ_ELEM( wtree, reader ); 1753 1754 node = wtree->get_root(); 1755 while( node->left ) 1756 { 1757 CvDTreeSplit* split = node->split; 1758 int vi = split->condensed_idx; 1759 float val = sample_data[vi]; 1760 int dir = val <= split->ord.c ? -1 : 1; 1761 if( split->inversed ) 1762 dir = -dir; 1763 node = dir < 0 ? node->left : node->right; 1764 } 1765 sum += node->value; 1766 } 1767 } 1768 else 1769 { 1770 const int* avars = active_vars->data.i; 1771 const uchar* m = !missing.empty() ? missing.ptr<uchar>() : 0; 1772 1773 // full-featured version 1774 for( i = 0; i < weak_count; i++ ) 1775 { 1776 CvBoostTree* wtree; 1777 const CvDTreeNode* node; 1778 CV_READ_SEQ_ELEM( wtree, reader ); 1779 1780 node = wtree->get_root(); 1781 while( node->left ) 1782 { 1783 const CvDTreeSplit* split = node->split; 1784 int dir = 0; 1785 for( ; !dir && split != 0; split = split->next ) 1786 { 1787 int vi = split->condensed_idx; 1788 int ci = vtype[avars[vi]]; 1789 float val = sample_data[vi]; 1790 if( m && m[vi] ) 1791 continue; 1792 if( ci < 0 ) // ordered 1793 dir = val <= split->ord.c ? -1 : 1; 1794 else // categorical 1795 { 1796 int c = cvRound(val); 1797 dir = CV_DTREE_CAT_DIR(c, split->subset); 1798 } 1799 if( split->inversed ) 1800 dir = -dir; 1801 } 1802 1803 if( !dir ) 1804 { 1805 int diff = node->right->sample_count - node->left->sample_count; 1806 dir = diff < 0 ? -1 : 1; 1807 } 1808 node = dir < 0 ? node->left : node->right; 1809 } 1810 if( weak_responses ) 1811 weak_responses->data.fl[i*wstep] = (float)node->value; 1812 sum += node->value; 1813 } 1814 } 1815 1816 if( return_sum ) 1817 value = (float)sum; 1818 else 1819 { 1820 int cls_idx = sum >= 0; 1821 if( raw_mode ) 1822 value = (float)cls_idx; 1823 else 1824 value = (float)cmap[cofs[vtype[data->var_count]] + cls_idx]; 1825 } 1826 1827 return value; 1828 } 1829 1830 float CvBoost::calc_error( CvMLData* _data, int type, std::vector<float> *resp ) 1831 { 1832 float err = 0; 1833 const CvMat* values = _data->get_values(); 1834 const CvMat* response = _data->get_responses(); 1835 const CvMat* missing = _data->get_missing(); 1836 const CvMat* sample_idx = (type == CV_TEST_ERROR) ? _data->get_test_sample_idx() : _data->get_train_sample_idx(); 1837 const CvMat* var_types = _data->get_var_types(); 1838 int* sidx = sample_idx ? sample_idx->data.i : 0; 1839 int r_step = CV_IS_MAT_CONT(response->type) ? 1840 1 : response->step / CV_ELEM_SIZE(response->type); 1841 bool is_classifier = var_types->data.ptr[var_types->cols-1] == CV_VAR_CATEGORICAL; 1842 int sample_count = sample_idx ? sample_idx->cols : 0; 1843 sample_count = (type == CV_TRAIN_ERROR && sample_count == 0) ? values->rows : sample_count; 1844 float* pred_resp = 0; 1845 if( resp && (sample_count > 0) ) 1846 { 1847 resp->resize( sample_count ); 1848 pred_resp = &((*resp)[0]); 1849 } 1850 if ( is_classifier ) 1851 { 1852 for( int i = 0; i < sample_count; i++ ) 1853 { 1854 CvMat sample, miss; 1855 int si = sidx ? sidx[i] : i; 1856 cvGetRow( values, &sample, si ); 1857 if( missing ) 1858 cvGetRow( missing, &miss, si ); 1859 float r = (float)predict( &sample, missing ? &miss : 0 ); 1860 if( pred_resp ) 1861 pred_resp[i] = r; 1862 int d = fabs((double)r - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1; 1863 err += d; 1864 } 1865 err = sample_count ? err / (float)sample_count * 100 : -FLT_MAX; 1866 } 1867 else 1868 { 1869 for( int i = 0; i < sample_count; i++ ) 1870 { 1871 CvMat sample, miss; 1872 int si = sidx ? sidx[i] : i; 1873 cvGetRow( values, &sample, si ); 1874 if( missing ) 1875 cvGetRow( missing, &miss, si ); 1876 float r = (float)predict( &sample, missing ? &miss : 0 ); 1877 if( pred_resp ) 1878 pred_resp[i] = r; 1879 float d = r - response->data.fl[si*r_step]; 1880 err += d*d; 1881 } 1882 err = sample_count ? err / (float)sample_count : -FLT_MAX; 1883 } 1884 return err; 1885 } 1886 1887 void CvBoost::write_params( CvFileStorage* fs ) const 1888 { 1889 const char* boost_type_str = 1890 params.boost_type == DISCRETE ? "DiscreteAdaboost" : 1891 params.boost_type == REAL ? "RealAdaboost" : 1892 params.boost_type == LOGIT ? "LogitBoost" : 1893 params.boost_type == GENTLE ? "GentleAdaboost" : 0; 1894 1895 const char* split_crit_str = 1896 params.split_criteria == DEFAULT ? "Default" : 1897 params.split_criteria == GINI ? "Gini" : 1898 params.boost_type == MISCLASS ? "Misclassification" : 1899 params.boost_type == SQERR ? "SquaredErr" : 0; 1900 1901 if( boost_type_str ) 1902 cvWriteString( fs, "boosting_type", boost_type_str ); 1903 else 1904 cvWriteInt( fs, "boosting_type", params.boost_type ); 1905 1906 if( split_crit_str ) 1907 cvWriteString( fs, "splitting_criteria", split_crit_str ); 1908 else 1909 cvWriteInt( fs, "splitting_criteria", params.split_criteria ); 1910 1911 cvWriteInt( fs, "ntrees", weak->total ); 1912 cvWriteReal( fs, "weight_trimming_rate", params.weight_trim_rate ); 1913 1914 data->write_params( fs ); 1915 } 1916 1917 1918 void CvBoost::read_params( CvFileStorage* fs, CvFileNode* fnode ) 1919 { 1920 CV_FUNCNAME( "CvBoost::read_params" ); 1921 1922 __BEGIN__; 1923 1924 CvFileNode* temp; 1925 1926 if( !fnode || !CV_NODE_IS_MAP(fnode->tag) ) 1927 return; 1928 1929 data = new CvDTreeTrainData(); 1930 CV_CALL( data->read_params(fs, fnode)); 1931 data->shared = true; 1932 1933 params.max_depth = data->params.max_depth; 1934 params.min_sample_count = data->params.min_sample_count; 1935 params.max_categories = data->params.max_categories; 1936 params.priors = data->params.priors; 1937 params.regression_accuracy = data->params.regression_accuracy; 1938 params.use_surrogates = data->params.use_surrogates; 1939 1940 temp = cvGetFileNodeByName( fs, fnode, "boosting_type" ); 1941 if( !temp ) 1942 return; 1943 1944 if( temp && CV_NODE_IS_STRING(temp->tag) ) 1945 { 1946 const char* boost_type_str = cvReadString( temp, "" ); 1947 params.boost_type = strcmp( boost_type_str, "DiscreteAdaboost" ) == 0 ? DISCRETE : 1948 strcmp( boost_type_str, "RealAdaboost" ) == 0 ? REAL : 1949 strcmp( boost_type_str, "LogitBoost" ) == 0 ? LOGIT : 1950 strcmp( boost_type_str, "GentleAdaboost" ) == 0 ? GENTLE : -1; 1951 } 1952 else 1953 params.boost_type = cvReadInt( temp, -1 ); 1954 1955 if( params.boost_type < DISCRETE || params.boost_type > GENTLE ) 1956 CV_ERROR( CV_StsBadArg, "Unknown boosting type" ); 1957 1958 temp = cvGetFileNodeByName( fs, fnode, "splitting_criteria" ); 1959 if( temp && CV_NODE_IS_STRING(temp->tag) ) 1960 { 1961 const char* split_crit_str = cvReadString( temp, "" ); 1962 params.split_criteria = strcmp( split_crit_str, "Default" ) == 0 ? DEFAULT : 1963 strcmp( split_crit_str, "Gini" ) == 0 ? GINI : 1964 strcmp( split_crit_str, "Misclassification" ) == 0 ? MISCLASS : 1965 strcmp( split_crit_str, "SquaredErr" ) == 0 ? SQERR : -1; 1966 } 1967 else 1968 params.split_criteria = cvReadInt( temp, -1 ); 1969 1970 if( params.split_criteria < DEFAULT || params.boost_type > SQERR ) 1971 CV_ERROR( CV_StsBadArg, "Unknown boosting type" ); 1972 1973 params.weak_count = cvReadIntByName( fs, fnode, "ntrees" ); 1974 params.weight_trim_rate = cvReadRealByName( fs, fnode, "weight_trimming_rate", 0. ); 1975 1976 __END__; 1977 } 1978 1979 1980 1981 void 1982 CvBoost::read( CvFileStorage* fs, CvFileNode* node ) 1983 { 1984 CV_FUNCNAME( "CvBoost::read" ); 1985 1986 __BEGIN__; 1987 1988 CvSeqReader reader; 1989 CvFileNode* trees_fnode; 1990 CvMemStorage* storage; 1991 int i, ntrees; 1992 1993 clear(); 1994 read_params( fs, node ); 1995 1996 if( !data ) 1997 EXIT; 1998 1999 trees_fnode = cvGetFileNodeByName( fs, node, "trees" ); 2000 if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) ) 2001 CV_ERROR( CV_StsParseError, "<trees> tag is missing" ); 2002 2003 cvStartReadSeq( trees_fnode->data.seq, &reader ); 2004 ntrees = trees_fnode->data.seq->total; 2005 2006 if( ntrees != params.weak_count ) 2007 CV_ERROR( CV_StsUnmatchedSizes, 2008 "The number of trees stored does not match <ntrees> tag value" ); 2009 2010 CV_CALL( storage = cvCreateMemStorage() ); 2011 weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage ); 2012 2013 for( i = 0; i < ntrees; i++ ) 2014 { 2015 CvBoostTree* tree = new CvBoostTree(); 2016 CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, this, data )); 2017 CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader ); 2018 cvSeqPush( weak, &tree ); 2019 } 2020 get_active_vars(); 2021 2022 __END__; 2023 } 2024 2025 2026 void 2027 CvBoost::write( CvFileStorage* fs, const char* name ) const 2028 { 2029 CV_FUNCNAME( "CvBoost::write" ); 2030 2031 __BEGIN__; 2032 2033 CvSeqReader reader; 2034 int i; 2035 2036 cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_BOOSTING ); 2037 2038 if( !weak ) 2039 CV_ERROR( CV_StsBadArg, "The classifier has not been trained yet" ); 2040 2041 write_params( fs ); 2042 cvStartWriteStruct( fs, "trees", CV_NODE_SEQ ); 2043 2044 cvStartReadSeq( weak, &reader ); 2045 2046 for( i = 0; i < weak->total; i++ ) 2047 { 2048 CvBoostTree* tree; 2049 CV_READ_SEQ_ELEM( tree, reader ); 2050 cvStartWriteStruct( fs, 0, CV_NODE_MAP ); 2051 tree->write( fs ); 2052 cvEndWriteStruct( fs ); 2053 } 2054 2055 cvEndWriteStruct( fs ); 2056 cvEndWriteStruct( fs ); 2057 2058 __END__; 2059 } 2060 2061 2062 CvMat* 2063 CvBoost::get_weights() 2064 { 2065 return weights; 2066 } 2067 2068 2069 CvMat* 2070 CvBoost::get_subtree_weights() 2071 { 2072 return subtree_weights; 2073 } 2074 2075 2076 CvMat* 2077 CvBoost::get_weak_response() 2078 { 2079 return weak_eval; 2080 } 2081 2082 2083 const CvBoostParams& 2084 CvBoost::get_params() const 2085 { 2086 return params; 2087 } 2088 2089 CvSeq* CvBoost::get_weak_predictors() 2090 { 2091 return weak; 2092 } 2093 2094 const CvDTreeTrainData* CvBoost::get_data() const 2095 { 2096 return data; 2097 } 2098 2099 using namespace cv; 2100 2101 CvBoost::CvBoost( const Mat& _train_data, int _tflag, 2102 const Mat& _responses, const Mat& _var_idx, 2103 const Mat& _sample_idx, const Mat& _var_type, 2104 const Mat& _missing_mask, 2105 CvBoostParams _params ) 2106 { 2107 weak = 0; 2108 data = 0; 2109 default_model_name = "my_boost_tree"; 2110 active_vars = active_vars_abs = orig_response = sum_response = weak_eval = 2111 subsample_mask = weights = subtree_weights = 0; 2112 2113 train( _train_data, _tflag, _responses, _var_idx, _sample_idx, 2114 _var_type, _missing_mask, _params ); 2115 } 2116 2117 2118 bool 2119 CvBoost::train( const Mat& _train_data, int _tflag, 2120 const Mat& _responses, const Mat& _var_idx, 2121 const Mat& _sample_idx, const Mat& _var_type, 2122 const Mat& _missing_mask, 2123 CvBoostParams _params, bool _update ) 2124 { 2125 train_data_hdr = _train_data; 2126 train_data_mat = _train_data; 2127 responses_hdr = _responses; 2128 responses_mat = _responses; 2129 2130 CvMat vidx = _var_idx, sidx = _sample_idx, vtype = _var_type, mmask = _missing_mask; 2131 2132 return train(&train_data_hdr, _tflag, &responses_hdr, vidx.data.ptr ? &vidx : 0, 2133 sidx.data.ptr ? &sidx : 0, vtype.data.ptr ? &vtype : 0, 2134 mmask.data.ptr ? &mmask : 0, _params, _update); 2135 } 2136 2137 float 2138 CvBoost::predict( const Mat& _sample, const Mat& _missing, 2139 const Range& slice, bool raw_mode, bool return_sum ) const 2140 { 2141 CvMat sample = _sample, mmask = _missing; 2142 /*if( weak_responses ) 2143 { 2144 int weak_count = cvSliceLength( slice, weak ); 2145 if( weak_count >= weak->total ) 2146 { 2147 weak_count = weak->total; 2148 slice.start_index = 0; 2149 } 2150 2151 if( !(weak_responses->data && weak_responses->type() == CV_32FC1 && 2152 (weak_responses->cols == 1 || weak_responses->rows == 1) && 2153 weak_responses->cols + weak_responses->rows - 1 == weak_count) ) 2154 weak_responses->create(weak_count, 1, CV_32FC1); 2155 pwr = &(wr = *weak_responses); 2156 }*/ 2157 return predict(&sample, _missing.empty() ? 0 : &mmask, 0, 2158 slice == Range::all() ? CV_WHOLE_SEQ : cvSlice(slice.start, slice.end), 2159 raw_mode, return_sum); 2160 } 2161 2162 /* End of file. */ 2163