/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "onyx_int.h"
#include "vp8/common/threading.h"
#include "vp8/common/common.h"
#include "vp8/common/extend.h"

#if CONFIG_MULTITHREAD

extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
                                         TOKENEXTRA **t, int recon_yoffset,
                                         int recon_uvoffset);
extern int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x,
                                          TOKENEXTRA **t);
extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
extern void vp8_build_block_offsets(MACROBLOCK *x);
extern void vp8_setup_block_ptrs(MACROBLOCK *x);

extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);

static THREAD_FUNCTION loopfilter_thread(void *p_data)
{
    VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
    VP8_COMMON *cm = &cpi->common;

    while (1)
    {
        if (cpi->b_multi_threaded == 0)
            break;

        if (sem_wait(&cpi->h_event_start_lpf) == 0)
        {
            if (cpi->b_multi_threaded == FALSE) // we're shutting down
                break;

            loopfilter_frame(cpi, cm);

            sem_post(&cpi->h_event_end_lpf);
        }
    }

    return 0;
}

static THREAD_FUNCTION thread_encoding_proc(void *p_data)
{
    int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
    VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
    MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
    ENTROPY_CONTEXT_PLANES mb_row_left_context;

    const int nsync = cpi->mt_sync_range;
    //printf("Started thread %d\n", ithread);

    while (1)
    {
        if (cpi->b_multi_threaded == 0)
            break;

        //if(WaitForSingleObject(cpi->h_event_mbrencoding[ithread], INFINITE) == WAIT_OBJECT_0)
        if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0)
        {
            VP8_COMMON *cm = &cpi->common;
            int mb_row;
            MACROBLOCK *x = &mbri->mb;
            MACROBLOCKD *xd = &x->e_mbd;
            TOKENEXTRA *tp;

            int *segment_counts = mbri->segment_counts;
            int *totalrate = &mbri->totalrate;

            if (cpi->b_multi_threaded == FALSE) // we're shutting down
                break;

            // Each thread encodes every (encoding_thread_count + 1)th MB row;
            // worker ithread starts at row (ithread + 1), row 0 belongs to the
            // main thread.
            for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
            {
                int i;
                int recon_yoffset, recon_uvoffset;
                int mb_col;
                int ref_fb_idx = cm->lst_fb_idx;
                int dst_fb_idx = cm->new_fb_idx;
                int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
                int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
                volatile int *last_row_current_mb_col;
                INT64 activity_sum = 0;

                tp = cpi->tok + (mb_row * (cm->mb_cols * 16 * 24));

                last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];

                // reset above block coeffs
                xd->above_context = cm->above_context;
                xd->left_context = &mb_row_left_context;

                vp8_zero(mb_row_left_context);

                xd->up_available = (mb_row != 0);
                recon_yoffset = (mb_row * recon_y_stride * 16);
                recon_uvoffset = (mb_row * recon_uv_stride * 8);

                cpi->tplist[mb_row].start = tp;

                //printf("Thread mb_row = %d\n", mb_row);

                // for each macroblock col in image
                for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
                {
                    int seg_map_index = (mb_row * cm->mb_cols);

                    // Once every nsync columns, wait until the thread encoding
                    // the row above is at least nsync MBs ahead (or has
                    // finished its row).
                    if ((mb_col & (nsync - 1)) == 0)
                    {
                        while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != cm->mb_cols - 1)
                        {
                            x86_pause_hint();
                            thread_sleep(0);
                        }
                    }

                    // Distance of MB from the various image edges.
                    // These are specified in 1/8th pel units as they are always
                    // compared to values that are in 1/8th pel units.
                    xd->mb_to_left_edge = -((mb_col * 16) << 3);
                    xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
                    xd->mb_to_top_edge = -((mb_row * 16) << 3);
                    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;

                    // Set up limit values for motion vectors, used to prevent
                    // them from extending outside the UMV borders.
                    x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
                    x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
                    x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
                    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);

                    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
                    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
                    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
                    xd->left_available = (mb_col != 0);

                    x->rddiv = cpi->RDDIV;
                    x->rdmult = cpi->RDMULT;

                    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
                        activity_sum += vp8_activity_masking(cpi, x);

                    // Is segmentation enabled?
                    // MB level adjustment to quantizer.
                    if (xd->segmentation_enabled)
                    {
                        // Set segment id in xd->mbmi.segment_id for the current
                        // MB (with range checking).
                        if (cpi->segmentation_map[seg_map_index + mb_col] <= 3)
                            xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[seg_map_index + mb_col];
                        else
                            xd->mode_info_context->mbmi.segment_id = 0;

                        vp8cx_mb_init_quantizer(cpi, x);
                    }
                    else
                        xd->mode_info_context->mbmi.segment_id = 0; // Set to Segment 0 by default

                    x->active_ptr = cpi->active_map + seg_map_index + mb_col;

                    if (cm->frame_type == KEY_FRAME)
                    {
                        *totalrate += vp8cx_encode_intra_macro_block(cpi, x, &tp);
#ifdef MODE_STATS
                        y_modes[xd->mbmi.mode] ++;
#endif
                    }
                    else
                    {
                        *totalrate += vp8cx_encode_inter_macroblock(cpi, x, &tp, recon_yoffset, recon_uvoffset);

#ifdef MODE_STATS
                        inter_y_modes[xd->mbmi.mode] ++;

                        if (xd->mbmi.mode == SPLITMV)
                        {
                            int b;

                            for (b = 0; b < xd->mbmi.partition_count; b++)
                            {
                                inter_b_modes[x->partition->bmi[b].mode] ++;
                            }
                        }
#endif

                        // Count of last ref frame 0,0 usage
                        if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
                            cpi->inter_zz_count++;

                        // Special case code for cyclic refresh. If cyclic update
                        // is enabled, copy xd->mbmi.segment_id (which may have
                        // been updated based on mode during
                        // vp8cx_encode_inter_macroblock()) back into the global
                        // segmentation map.
                        if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
                        {
                            const MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
                            cpi->segmentation_map[seg_map_index + mb_col] = mbmi->segment_id;

                            // If the block has been refreshed, mark it as clean
                            // (the magnitude of the negative value influences
                            // how long it will be before we consider another
                            // refresh); else, if it was coded (last frame 0,0)
                            // and has not already been refreshed, mark it as a
                            // candidate for cleanup next time (marked 0);
                            // otherwise mark it as dirty (1).
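                            // A compact sketch of the map values assigned below:
                            //   -1  block was just refreshed (clean)
                            //    0  coded as last-frame (0,0); refresh candidate
                            //    1  dirty, still needs a refresh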
                            if (mbmi->segment_id)
                                cpi->cyclic_refresh_map[seg_map_index + mb_col] = -1;
                            else if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
                            {
                                if (cpi->cyclic_refresh_map[seg_map_index + mb_col] == 1)
                                    cpi->cyclic_refresh_map[seg_map_index + mb_col] = 0;
                            }
                            else
                                cpi->cyclic_refresh_map[seg_map_index + mb_col] = 1;
                        }
                    }
                    cpi->tplist[mb_row].stop = tp;

                    x->gf_active_ptr++; // Increment pointer into gf usage flags structure for next mb

                    for (i = 0; i < 16; i++)
                        vpx_memcpy(&xd->mode_info_context->bmi[i], &xd->block[i].bmi, sizeof(xd->block[i].bmi));

                    // adjust to the next column of macroblocks
                    x->src.y_buffer += 16;
                    x->src.u_buffer += 8;
                    x->src.v_buffer += 8;

                    recon_yoffset += 16;
                    recon_uvoffset += 8;

                    // Keep track of segment usage
                    segment_counts[xd->mode_info_context->mbmi.segment_id]++;

                    // skip to next mb
                    xd->mode_info_context++;
                    x->partition_info++;
                    xd->above_context++;

                    // Publish this row's progress so the row below can sync
                    // against it.
                    cpi->mt_current_mb_col[mb_row] = mb_col;
                }

                //extend the recon for intra prediction
                vp8_extend_mb_row(
                    &cm->yv12_fb[dst_fb_idx],
                    xd->dst.y_buffer + 16,
                    xd->dst.u_buffer + 8,
                    xd->dst.v_buffer + 8);

                // this is to account for the border
                xd->mode_info_context++;
                x->partition_info++;
                x->activity_sum += activity_sum;

                // Skip ahead to the start of this thread's next row.
                x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
                x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
                x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;

                xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
                x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;

                if (mb_row == cm->mb_rows - 1)
                {
                    //SetEvent(cpi->h_event_main);
                    sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
                }
            }
        }
    }

    //printf("exit thread %d\n", ithread);
    return 0;
}

// Copy the per-macroblock encoder state a worker thread needs from the main
// thread's MACROBLOCK into its own copy.
static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
{
    MACROBLOCK *x = mbsrc;
    MACROBLOCK *z = mbdst;
    int i;

    z->ss = x->ss;
    z->ss_count = x->ss_count;
    z->searches_per_step = x->searches_per_step;
    z->errorperbit = x->errorperbit;

    z->sadperbit16 = x->sadperbit16;
    z->sadperbit4 = x->sadperbit4;
    z->errthresh = x->errthresh;

    /*
    z->mv_col_min = x->mv_col_min;
    z->mv_col_max = x->mv_col_max;
    z->mv_row_min = x->mv_row_min;
    z->mv_row_max = x->mv_row_max;
    z->vector_range = x->vector_range;
    */

    z->vp8_short_fdct4x4 = x->vp8_short_fdct4x4;
    z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4;
    z->short_walsh4x4 = x->short_walsh4x4;
    z->quantize_b = x->quantize_b;
    z->optimize = x->optimize;

    /*
    z->mvc = x->mvc;
    z->src.y_buffer = x->src.y_buffer;
    z->src.u_buffer = x->src.u_buffer;
    z->src.v_buffer = x->src.v_buffer;
    */

    vpx_memcpy(z->mvcosts, x->mvcosts, sizeof(x->mvcosts));
    z->mvcost[0] = &z->mvcosts[0][mv_max+1];
    z->mvcost[1] = &z->mvcosts[1][mv_max+1];
    z->mvsadcost[0] = &z->mvsadcosts[0][mv_max+1];
    z->mvsadcost[1] = &z->mvsadcosts[1][mv_max+1];

    vpx_memcpy(z->token_costs, x->token_costs, sizeof(x->token_costs));
    vpx_memcpy(z->inter_bmode_costs, x->inter_bmode_costs, sizeof(x->inter_bmode_costs));
    //memcpy(z->mvcosts, x->mvcosts, sizeof(x->mvcosts));
    //memcpy(z->mvcost, x->mvcost, sizeof(x->mvcost));
    vpx_memcpy(z->mbmode_cost, x->mbmode_cost, sizeof(x->mbmode_cost));
    vpx_memcpy(z->intra_uv_mode_cost, x->intra_uv_mode_cost, sizeof(x->intra_uv_mode_cost));
    vpx_memcpy(z->bmode_costs, x->bmode_costs, sizeof(x->bmode_costs));

    for (i = 0; i < 25; i++)
    {
        z->block[i].quant = x->block[i].quant;
        z->block[i].quant_fast = x->block[i].quant_fast;
        z->block[i].quant_shift = x->block[i].quant_shift;
        z->block[i].zbin = x->block[i].zbin;
        z->block[i].zrun_zbin_boost = x->block[i].zrun_zbin_boost;
        z->block[i].round = x->block[i].round;
        /*
        z->block[i].src = x->block[i].src;
        */
        z->block[i].src_stride = x->block[i].src_stride;
        z->block[i].force_empty = x->block[i].force_empty;
    }

    {
        MACROBLOCKD *xd = &x->e_mbd;
        MACROBLOCKD *zd = &z->e_mbd;

        /*
        zd->mode_info_context = xd->mode_info_context;
        zd->mode_info = xd->mode_info;

        zd->mode_info_stride = xd->mode_info_stride;
        zd->frame_type = xd->frame_type;
        zd->up_available = xd->up_available;
        zd->left_available = xd->left_available;
        zd->left_context = xd->left_context;
        zd->last_frame_dc = xd->last_frame_dc;
        zd->last_frame_dccons = xd->last_frame_dccons;
        zd->gold_frame_dc = xd->gold_frame_dc;
        zd->gold_frame_dccons = xd->gold_frame_dccons;
        zd->mb_to_left_edge = xd->mb_to_left_edge;
        zd->mb_to_right_edge = xd->mb_to_right_edge;
        zd->mb_to_top_edge = xd->mb_to_top_edge;
        zd->mb_to_bottom_edge = xd->mb_to_bottom_edge;
        zd->gf_active_ptr = xd->gf_active_ptr;
        zd->frames_since_golden = xd->frames_since_golden;
        zd->frames_till_alt_ref_frame = xd->frames_till_alt_ref_frame;
        */
        zd->subpixel_predict = xd->subpixel_predict;
        zd->subpixel_predict8x4 = xd->subpixel_predict8x4;
        zd->subpixel_predict8x8 = xd->subpixel_predict8x8;
        zd->subpixel_predict16x16 = xd->subpixel_predict16x16;
        zd->segmentation_enabled = xd->segmentation_enabled;
        zd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
        vpx_memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));

        for (i = 0; i < 25; i++)
        {
            zd->block[i].dequant = xd->block[i].dequant;
        }
    }
}

void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
                               MACROBLOCK *x,
                               MB_ROW_COMP *mbr_ei,
                               int mb_row,
                               int count)
{
    VP8_COMMON *const cm = &cpi->common;
    MACROBLOCKD *const xd = &x->e_mbd;
    int i;
    (void) mb_row;

    for (i = 0; i < count; i++)
    {
        MACROBLOCK *mb = &mbr_ei[i].mb;
        MACROBLOCKD *mbd = &mb->e_mbd;

        mbd->subpixel_predict = xd->subpixel_predict;
        mbd->subpixel_predict8x4 = xd->subpixel_predict8x4;
        mbd->subpixel_predict8x8 = xd->subpixel_predict8x8;
        mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
#if CONFIG_RUNTIME_CPU_DETECT
        mbd->rtcd = xd->rtcd;
#endif
        mb->gf_active_ptr = x->gf_active_ptr;

        mb->vector_range = 32;

        vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
        mbr_ei[i].totalrate = 0;

        mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1);

        mbd->mode_info_context = cm->mi + x->e_mbd.mode_info_stride * (i + 1);
        mbd->mode_info_stride = cm->mode_info_stride;

        mbd->frame_type = cm->frame_type;

        mbd->frames_since_golden = cm->frames_since_golden;
        mbd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;

        mb->src = *cpi->Source;
        mbd->pre = cm->yv12_fb[cm->lst_fb_idx];
        mbd->dst = cm->yv12_fb[cm->new_fb_idx];

        // Worker i starts on MB row (i + 1); offset the source pointers
        // accordingly.
        mb->src.y_buffer += 16 * x->src.y_stride * (i + 1);
        mb->src.u_buffer += 8 * x->src.uv_stride * (i + 1);
        mb->src.v_buffer += 8 * x->src.uv_stride * (i + 1);

        vp8_build_block_offsets(mb);

        vp8_setup_block_dptrs(mbd);

        vp8_setup_block_ptrs(mb);

        mb->activity_sum = 0;

        mbd->left_context = &cm->left_context;
        mb->mvc = cm->fc.mvc;

        setup_mbby_copy(&mbr_ei[i].mb, x);
    }
}

void vp8cx_create_encoder_threads(VP8_COMP *cpi)
{
    const VP8_COMMON *cm = &cpi->common;

    cpi->b_multi_threaded = 0;
    cpi->encoding_thread_count = 0;
    cpi->processor_core_count = 32; //vp8_get_proc_core_count();

    if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
    {
        int ithread;
        int th_count = cpi->oxcf.multi_threaded - 1;

        if (cpi->oxcf.multi_threaded > cpi->processor_core_count)
            th_count = cpi->processor_core_count - 1;

        /* we have th_count + 1 (main) threads processing one row each */
        /* no point in having more threads than the sync range allows */
        if (th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1))
        {
            th_count = (cm->mb_cols / cpi->mt_sync_range) - 1;
        }

        if (th_count == 0)
            return;

        CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count));
        CHECK_MEM_ERROR(cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * th_count));
        CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
        vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
        CHECK_MEM_ERROR(cpi->en_thread_data,
                        vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
        CHECK_MEM_ERROR(cpi->mt_current_mb_col,
                        vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));

        sem_init(&cpi->h_event_end_encoding, 0, 0);

        cpi->b_multi_threaded = 1;
        cpi->encoding_thread_count = th_count;

        /*
        printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n",
               (cpi->encoding_thread_count + 1));
        */

        for (ithread = 0; ithread < th_count; ithread++)
        {
            ENCODETHREAD_DATA *ethd = &cpi->en_thread_data[ithread];

            sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
            ethd->ithread = ithread;
            ethd->ptr1 = (void *)cpi;
            ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread];

            pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd);
        }

        {
            LPFTHREAD_DATA *lpfthd = &cpi->lpf_thread_data;

            sem_init(&cpi->h_event_start_lpf, 0, 0);
            sem_init(&cpi->h_event_end_lpf, 0, 0);

            lpfthd->ptr1 = (void *)cpi;
            pthread_create(&cpi->h_filter_thread, 0, loopfilter_thread, lpfthd);
        }
    }
}

void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
{
    if (cpi->b_multi_threaded)
    {
        //shutdown other threads
        cpi->b_multi_threaded = 0;
        {
            int i;

            for (i = 0; i < cpi->encoding_thread_count; i++)
            {
                //SetEvent(cpi->h_event_mbrencoding[i]);
                sem_post(&cpi->h_event_start_encoding[i]);
                pthread_join(cpi->h_encoding_thread[i], 0);

                sem_destroy(&cpi->h_event_start_encoding[i]);
            }

            sem_post(&cpi->h_event_start_lpf);
            pthread_join(cpi->h_filter_thread, 0);
        }

        sem_destroy(&cpi->h_event_end_encoding);
        sem_destroy(&cpi->h_event_end_lpf);
        sem_destroy(&cpi->h_event_start_lpf);

        //free thread related resources
        vpx_free(cpi->h_event_start_encoding);
        vpx_free(cpi->h_encoding_thread);
        vpx_free(cpi->mb_row_ei);
        vpx_free(cpi->en_thread_data);
        vpx_free(cpi->mt_current_mb_col);
    }
}
#endif // CONFIG_MULTITHREAD
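/* Usage sketch (an assumption about the driver code elsewhere in the encoder,
 * not something defined in this file): per encoded frame, the driver posts
 * h_event_start_encoding[i] once for each worker, encodes its own share of MB
 * rows (every (encoding_thread_count + 1)th row, starting at row 0), and then
 * blocks on h_event_end_encoding, which thread_encoding_proc() posts from the
 * thread that encodes the last MB row. The loop filter thread is driven the
 * same way via h_event_start_lpf / h_event_end_lpf around loopfilter_frame().
 */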