/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "av1/encoder/av1_multi_thread.h"
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/ethread.h"
#include "av1/encoder/rdopt.h"
#include "aom_dsp/aom_dsp_common.h"

static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
  for (int i = 0; i < REFERENCE_MODES; i++)
    td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];

  for (int i = 0; i < REF_FRAMES; i++)
    td->rd_counts.global_motion_used[i] +=
        td_t->rd_counts.global_motion_used[i];

  td->rd_counts.compound_ref_used_flag |=
      td_t->rd_counts.compound_ref_used_flag;
  td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;
}

static void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
  AV1_COMMON *cm = &cpi->common;
  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
  const int mib_size = cm->seq_params.mib_size;
  const int frame_lf_count =
      av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
  for (int row = 0; row < cm->tile_rows; row++) {
    for (int col = 0; col < cm->tile_cols; col++) {
      TileDataEnc *tile_data = &cpi->tile_data[row * cm->tile_cols + col];
      const TileInfo *const tile_info = &tile_data->tile_info;
      for (int mi_row = tile_info->mi_row_start;
           mi_row < tile_info->mi_row_end; mi_row += mib_size) {
        if (mi_row == tile_info->mi_row_start)
          av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
        for (int mi_col = tile_info->mi_col_start;
             mi_col < tile_info->mi_col_end; mi_col += mib_size) {
          const int idx_str = cm->mi_stride * mi_row + mi_col;
          MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
          MB_MODE_INFO *mbmi = mi[0];
          if (mbmi->skip == 1 && (mbmi->sb_type == cm->seq_params.sb_size)) {
            for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
              mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
            mbmi->delta_lf_from_base = xd->delta_lf_from_base;
          } else {
            if (cm->delta_q_info.delta_lf_multi) {
              for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
                xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
            } else {
              xd->delta_lf_from_base = mbmi->delta_lf_from_base;
            }
          }
        }
      }
    }
  }
}

void av1_row_mt_sync_read_dummy(struct AV1RowMTSyncData *const row_mt_sync,
                                int r, int c) {
  (void)row_mt_sync;
  (void)r;
  (void)c;
  return;
}

void av1_row_mt_sync_write_dummy(struct AV1RowMTSyncData *const row_mt_sync,
                                 int r, int c, const int cols) {
  (void)row_mt_sync;
  (void)r;
  (void)c;
  (void)cols;
  return;
}

void av1_row_mt_sync_read(AV1RowMTSync *const row_mt_sync, int r, int c) {
#if CONFIG_MULTITHREAD
  const int nsync = row_mt_sync->sync_range;

  if (r) {
    pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1];
    pthread_mutex_lock(mutex);

    while (c > row_mt_sync->cur_col[r - 1] - nsync) {
      pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex);
    }
    pthread_mutex_unlock(mutex);
  }
#else
  (void)row_mt_sync;
  (void)r;
  (void)c;
#endif  // CONFIG_MULTITHREAD
}
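
// Writer side of the row synchronization protocol: after finishing
// superblock column c of row r, the owning worker publishes its progress in
// cur_col[r] and signals any worker blocked on row r + 1 in
// av1_row_mt_sync_read() above. With sync_range == 1 (set in
// av1_row_mt_sync_mem_alloc() below), row r may start superblock c only once
// row r - 1 has completed column c + 1, which in effect covers the above and
// above-right coding dependencies between superblock rows.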
void av1_row_mt_sync_write(AV1RowMTSync *const row_mt_sync, int r, int c,
                           const int cols) {
#if CONFIG_MULTITHREAD
  const int nsync = row_mt_sync->sync_range;
  int cur;
  // Only signal when there are enough encoded blocks for the next row to run.
  int sig = 1;

  if (c < cols - 1) {
    cur = c;
    if (c % nsync) sig = 0;
  } else {
    cur = cols + nsync;
  }

  if (sig) {
    pthread_mutex_lock(&row_mt_sync->mutex_[r]);

    row_mt_sync->cur_col[r] = cur;

    pthread_cond_signal(&row_mt_sync->cond_[r]);
    pthread_mutex_unlock(&row_mt_sync->mutex_[r]);
  }
#else
  (void)row_mt_sync;
  (void)r;
  (void)c;
  (void)cols;
#endif  // CONFIG_MULTITHREAD
}

// Allocate memory for row synchronization.
void av1_row_mt_sync_mem_alloc(AV1RowMTSync *row_mt_sync, AV1_COMMON *cm,
                               int rows) {
  row_mt_sync->rows = rows;
#if CONFIG_MULTITHREAD
  {
    int i;

    CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
                    aom_malloc(sizeof(*row_mt_sync->mutex_) * rows));
    if (row_mt_sync->mutex_) {
      for (i = 0; i < rows; ++i) {
        pthread_mutex_init(&row_mt_sync->mutex_[i], NULL);
      }
    }

    CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
                    aom_malloc(sizeof(*row_mt_sync->cond_) * rows));
    if (row_mt_sync->cond_) {
      for (i = 0; i < rows; ++i) {
        pthread_cond_init(&row_mt_sync->cond_[i], NULL);
      }
    }
  }
#endif  // CONFIG_MULTITHREAD

  CHECK_MEM_ERROR(cm, row_mt_sync->cur_col,
                  aom_malloc(sizeof(*row_mt_sync->cur_col) * rows));

  // Set up nsync.
  row_mt_sync->sync_range = 1;
}

// Deallocate row based multi-threading synchronization related mutex and data.
void av1_row_mt_sync_mem_dealloc(AV1RowMTSync *row_mt_sync) {
  if (row_mt_sync != NULL) {
#if CONFIG_MULTITHREAD
    int i;

    if (row_mt_sync->mutex_ != NULL) {
      for (i = 0; i < row_mt_sync->rows; ++i) {
        pthread_mutex_destroy(&row_mt_sync->mutex_[i]);
      }
      aom_free(row_mt_sync->mutex_);
    }
    if (row_mt_sync->cond_ != NULL) {
      for (i = 0; i < row_mt_sync->rows; ++i) {
        pthread_cond_destroy(&row_mt_sync->cond_[i]);
      }
      aom_free(row_mt_sync->cond_);
    }
#endif  // CONFIG_MULTITHREAD
    aom_free(row_mt_sync->cur_col);
    // Clear the structure, as the source of this call may be a dynamic change
    // in tiles, in which case this call will be followed by an _alloc() that
    // may fail.
    av1_zero(*row_mt_sync);
  }
}
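
// Seed each worker with a starting tile in round-robin order; for example,
// 4 workers over 3 tiles gives thread_id_to_tile_id = { 0, 1, 2, 0 }. In row
// multi-threading this is only the initial assignment: workers migrate
// between tiles at run time via switch_tile_and_get_next_job() below.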
static void assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
                                  int num_tiles, int num_workers) {
  int tile_id = 0;
  int i;

  for (i = 0; i < num_workers; i++) {
    multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++;
    if (tile_id == num_tiles) tile_id = 0;
  }
}

static int get_next_job(AV1_COMP *const cpi, int *current_mi_row,
                        int cur_tile_id) {
  AV1_COMMON *const cm = &cpi->common;
  TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
  AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info;

  if (row_mt_info->current_mi_row < this_tile->tile_info.mi_row_end) {
    *current_mi_row = row_mt_info->current_mi_row;
    row_mt_info->num_threads_working++;
    row_mt_info->current_mi_row += cm->seq_params.mib_size;
    return 1;
  }
  return 0;
}
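
// When a worker exhausts its current tile, it looks for another tile with
// spare capacity. A tile can usefully occupy at most
// min((sb_cols + 1) / 2, sb_rows) threads: under the row synchronization
// above, consecutive superblock rows advance with roughly a two-column
// stagger, so threads beyond that bound would only block in
// av1_row_mt_sync_read().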
static void switch_tile_and_get_next_job(AV1_COMP *const cpi, int *cur_tile_id,
                                         int *current_mi_row,
                                         int *end_of_frame) {
  AV1_COMMON *const cm = &cpi->common;
  const int tile_cols = cm->tile_cols;
  const int tile_rows = cm->tile_rows;

  int tile_id = -1;  // Stores the tile ID with the minimum processing done.
  int max_mis_to_encode = 0;
  int min_num_threads_working = INT_MAX;

  for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
    for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
      int tile_index = tile_row * tile_cols + tile_col;
      TileDataEnc *this_tile = &cpi->tile_data[tile_index];
      AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info;
      int num_sb_rows_in_tile =
          av1_get_sb_rows_in_tile(cm, this_tile->tile_info);
      int num_sb_cols_in_tile =
          av1_get_sb_cols_in_tile(cm, this_tile->tile_info);
      int theoretical_limit_on_threads =
          AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
      int num_threads_working = row_mt_info->num_threads_working;
      if (num_threads_working < theoretical_limit_on_threads) {
        int num_mis_to_encode =
            this_tile->tile_info.mi_row_end - row_mt_info->current_mi_row;

        // The tile to be processed by this thread is selected on the basis
        // of job availability:
        // 1) If jobs are available, the tile with the minimum number of
        // threads working on it is chosen. If two or more tiles have the
        // same number of threads working on them, the tile with the maximum
        // number of jobs available is chosen.
        // 2) If no jobs are available, end_of_frame is reached.
        if (num_mis_to_encode > 0) {
          if (num_threads_working < min_num_threads_working) {
            min_num_threads_working = num_threads_working;
            max_mis_to_encode = 0;
          }
          if (num_threads_working == min_num_threads_working &&
              num_mis_to_encode > max_mis_to_encode) {
            tile_id = tile_index;
            max_mis_to_encode = num_mis_to_encode;
          }
        }
      }
    }
  }
  if (tile_id == -1) {
    *end_of_frame = 1;
  } else {
    // Update cur_tile_id to the next tile to be processed, which is the
    // least-processed tile.
    *cur_tile_id = tile_id;
    get_next_job(cpi, current_mi_row, *cur_tile_id);
  }
}

static int enc_row_mt_worker_hook(void *arg1, void *unused) {
  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
  AV1_COMP *const cpi = thread_data->cpi;
  AV1_COMMON *const cm = &cpi->common;

  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
  int thread_id = thread_data->thread_id;
  int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
  (void)unused;

  assert(cur_tile_id != -1);

  int end_of_frame = 0;
  while (1) {
    int current_mi_row = -1;
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(cpi->row_mt_mutex_);
#endif
    if (!get_next_job(cpi, &current_mi_row, cur_tile_id)) {
      // No jobs are available for the current tile. Query the status of
      // other tiles and get the next job if available.
      switch_tile_and_get_next_job(cpi, &cur_tile_id, &current_mi_row,
                                   &end_of_frame);
    }
#if CONFIG_MULTITHREAD
    pthread_mutex_unlock(cpi->row_mt_mutex_);
#endif
    if (end_of_frame == 1) break;

    TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
    int tile_row = this_tile->tile_info.tile_row;
    int tile_col = this_tile->tile_info.tile_col;

    assert(current_mi_row != -1 &&
           current_mi_row <= this_tile->tile_info.mi_row_end);

    ThreadData *td = thread_data->td;

    td->mb.e_mbd.tile_ctx = td->tctx;
    td->mb.tile_pb_ctx = &this_tile->tctx;
    if (this_tile->allow_update_cdf) {
      td->mb.row_ctx = this_tile->row_ctx;
      if (current_mi_row == this_tile->tile_info.mi_row_start)
        memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
    } else {
      memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
    }

    av1_init_above_context(cm, &td->mb.e_mbd, tile_row);

    // Disable exhaustive search speed features for row-based multi-threading
    // of the encoder.
    td->mb.m_search_count_ptr = NULL;
    td->mb.ex_search_count_ptr = NULL;

    cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
    av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator);

    av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row);
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(cpi->row_mt_mutex_);
#endif
    this_tile->row_mt_info.num_threads_working--;
#if CONFIG_MULTITHREAD
    pthread_mutex_unlock(cpi->row_mt_mutex_);
#endif
  }

  return 1;
}
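
// Worker hook for tile-based multi-threading: worker i encodes tiles
// i, i + num_workers, i + 2 * num_workers, ... in raster order. Unlike the
// row-MT hook above, no cross-thread synchronization is needed here because
// tiles are coded independently of each other.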
static int enc_worker_hook(void *arg1, void *unused) {
  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
  AV1_COMP *const cpi = thread_data->cpi;
  const AV1_COMMON *const cm = &cpi->common;
  const int tile_cols = cm->tile_cols;
  const int tile_rows = cm->tile_rows;
  int t;

  (void)unused;

  for (t = thread_data->start; t < tile_rows * tile_cols;
       t += cpi->num_workers) {
    int tile_row = t / tile_cols;
    int tile_col = t % tile_cols;

    TileDataEnc *const this_tile =
        &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
    thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
    thread_data->td->mb.tile_pb_ctx = &this_tile->tctx;
    av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
  }

  return 1;
}
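
// Create the worker pool once per encoder instance. Worker 0 is the main
// thread and reuses cpi->td; only workers 1..num_workers-1 receive their own
// ThreadData allocations below.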
static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
  AV1_COMMON *const cm = &cpi->common;
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();

  CHECK_MEM_ERROR(cm, cpi->workers,
                  aom_malloc(num_workers * sizeof(*cpi->workers)));

  CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
                  aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));

#if CONFIG_MULTITHREAD
  if (cpi->row_mt == 1) {
    if (cpi->row_mt_mutex_ == NULL) {
      CHECK_MEM_ERROR(cm, cpi->row_mt_mutex_,
                      aom_malloc(sizeof(*(cpi->row_mt_mutex_))));
      if (cpi->row_mt_mutex_) pthread_mutex_init(cpi->row_mt_mutex_, NULL);
    }
  }
#endif

  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *const worker = &cpi->workers[i];
    EncWorkerData *const thread_data = &cpi->tile_thr_data[i];

    ++cpi->num_workers;
    winterface->init(worker);
    worker->thread_name = "aom enc worker";

    thread_data->cpi = cpi;
    thread_data->thread_id = i;

    if (i > 0) {
      // Allocate thread data.
      CHECK_MEM_ERROR(cm, thread_data->td,
                      aom_memalign(32, sizeof(*thread_data->td)));
      av1_zero(*thread_data->td);

      // Set up pc_tree.
      thread_data->td->pc_tree = NULL;
      av1_setup_pc_tree(cm, thread_data->td);

      CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
                      (uint8_t *)aom_memalign(
                          16, MAX_MB_PLANE * MAX_SB_SQUARE *
                                  sizeof(*thread_data->td->above_pred_buf)));
      CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
                      (uint8_t *)aom_memalign(
                          16, MAX_MB_PLANE * MAX_SB_SQUARE *
                                  sizeof(*thread_data->td->left_pred_buf)));

      CHECK_MEM_ERROR(
          cm, thread_data->td->wsrc_buf,
          (int32_t *)aom_memalign(
              16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));

      CHECK_MEM_ERROR(cm, thread_data->td->inter_modes_info,
                      (InterModesInfo *)aom_malloc(
                          sizeof(*thread_data->td->inter_modes_info)));

      for (int x = 0; x < 2; x++)
        for (int y = 0; y < 2; y++)
          CHECK_MEM_ERROR(
              cm, thread_data->td->hash_value_buffer[x][y],
              (uint32_t *)aom_malloc(
                  AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
                  sizeof(*thread_data->td->hash_value_buffer[0][0])));

      CHECK_MEM_ERROR(
          cm, thread_data->td->mask_buf,
          (int32_t *)aom_memalign(
              16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
      // Allocate frame counters in thread data.
      CHECK_MEM_ERROR(cm, thread_data->td->counts,
                      aom_calloc(1, sizeof(*thread_data->td->counts)));

      // Allocate buffers used by palette coding mode.
      CHECK_MEM_ERROR(
          cm, thread_data->td->palette_buffer,
          aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));

      CHECK_MEM_ERROR(
          cm, thread_data->td->tmp_conv_dst,
          aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
                               sizeof(*thread_data->td->tmp_conv_dst)));
      for (int j = 0; j < 2; ++j) {
        CHECK_MEM_ERROR(
            cm, thread_data->td->tmp_obmc_bufs[j],
            aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
                                 sizeof(*thread_data->td->tmp_obmc_bufs[j])));
      }

      // Create threads.
      if (!winterface->reset(worker))
        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
                           "Tile encoder thread creation failed");
    } else {
      // Main thread acts as a worker and uses the thread data in cpi.
      thread_data->td = &cpi->td;
    }
    if (cpi->row_mt == 1)
      CHECK_MEM_ERROR(
          cm, thread_data->td->tctx,
          (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx)));
    winterface->sync(worker);
  }
}

static void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  // Encode a frame.
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *const worker = &cpi->workers[i];
    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;

    // Set the starting tile for each thread.
    thread_data->start = i;

    if (i == 0)
      winterface->execute(worker);
    else
      winterface->launch(worker);
  }
}

static void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  int had_error = 0;

  // Encoding ends.
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *const worker = &cpi->workers[i];
    had_error |= !winterface->sync(worker);
  }

  if (had_error)
    aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
                       "Failed to encode tile data");
}
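
// Fold per-thread statistics back into the main thread's context once all
// workers have synced: frame counts, RD counts, and the txb split counter.
// Worker 0 shares cpi->td, so only workers with their own ThreadData need
// accumulation.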
static void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) {
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *const worker = &cpi->workers[i];
    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
    cpi->intrabc_used |= thread_data->td->intrabc_used;
    // Accumulate counters.
    if (i > 0) {
      av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
      accumulate_rd_opt(&cpi->td, thread_data->td);
      cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
#if CONFIG_SPEED_STATS
      cpi->td.mb.tx_search_count += thread_data->td->mb.tx_search_count;
#endif  // CONFIG_SPEED_STATS
    }
  }
}

static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
                                int num_workers) {
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *const worker = &cpi->workers[i];
    EncWorkerData *const thread_data = &cpi->tile_thr_data[i];

    worker->hook = hook;
    worker->data1 = thread_data;
    worker->data2 = NULL;

    thread_data->td->intrabc_used = 0;

    // Before encoding a frame, copy the thread data from cpi.
    if (thread_data->td != &cpi->td) {
      thread_data->td->mb = cpi->td.mb;
      thread_data->td->rd_counts = cpi->td.rd_counts;
      thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
      thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
      thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;

      thread_data->td->mb.inter_modes_info = thread_data->td->inter_modes_info;
      for (int x = 0; x < 2; x++) {
        for (int y = 0; y < 2; y++) {
          memcpy(thread_data->td->hash_value_buffer[x][y],
                 cpi->td.mb.hash_value_buffer[x][y],
                 AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
                     sizeof(*thread_data->td->hash_value_buffer[0][0]));
          thread_data->td->mb.hash_value_buffer[x][y] =
              thread_data->td->hash_value_buffer[x][y];
        }
      }
      thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
    }
    if (thread_data->td->counts != &cpi->counts) {
      memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
    }

    if (i > 0) {
      thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
      thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
      for (int j = 0; j < 2; ++j) {
        thread_data->td->mb.tmp_obmc_bufs[j] =
            thread_data->td->tmp_obmc_bufs[j];
      }

      thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
      for (int j = 0; j < 2; ++j) {
        thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
            thread_data->td->mb.tmp_obmc_bufs[j];
      }
    }
  }
}
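
// Entry point for tile-based multi-threading: at most one worker per tile,
// capped by cpi->oxcf.max_threads. The per-frame sequence is
// prepare -> launch -> sync -> accumulate; the worker pool itself is created
// on the first frame and reused for subsequent frames.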
void av1_encode_tiles_mt(AV1_COMP *cpi) {
  AV1_COMMON *const cm = &cpi->common;
  const int tile_cols = cm->tile_cols;
  const int tile_rows = cm->tile_rows;
  int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols * tile_rows);

  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
    av1_alloc_tile_data(cpi);

  av1_init_tile_data(cpi);
  // Only run once to create threads and allocate thread data.
  if (cpi->num_workers == 0) {
    create_enc_workers(cpi, num_workers);
  } else {
    num_workers = AOMMIN(num_workers, cpi->num_workers);
  }
  prepare_enc_workers(cpi, enc_worker_hook, num_workers);
  launch_enc_workers(cpi, num_workers);
  sync_enc_workers(cpi, num_workers);
  accumulate_counters_enc_workers(cpi, num_workers);
}

// Accumulate frame counts. FRAME_COUNTS consists solely of 'unsigned int'
// members, so we treat it as an array, and sum over the whole length.
void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts,
                                 const FRAME_COUNTS *counts) {
  unsigned int *const acc = (unsigned int *)acc_counts;
  const unsigned int *const cnt = (const unsigned int *)counts;

  const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int);

  for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i];
}

void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
  AV1_COMMON *const cm = &cpi->common;
  const int tile_cols = cm->tile_cols;
  const int tile_rows = cm->tile_rows;
  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
  int num_workers = 0;
  int total_num_threads_row_mt = 0;
  int max_sb_rows = 0;

  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
    av1_row_mt_mem_dealloc(cpi);
    av1_alloc_tile_data(cpi);
  }

  av1_init_tile_data(cpi);

  for (int row = 0; row < tile_rows; row++) {
    for (int col = 0; col < tile_cols; col++) {
      TileDataEnc *tile_data = &cpi->tile_data[row * cm->tile_cols + col];
      int num_sb_rows_in_tile =
          av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
      int num_sb_cols_in_tile =
          av1_get_sb_cols_in_tile(cm, tile_data->tile_info);
      total_num_threads_row_mt +=
          AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
      max_sb_rows = AOMMAX(max_sb_rows, num_sb_rows_in_tile);
    }
  }
  // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of
  // post-processing stages in the encoder is quite low, so limiting the
  // number of threads to the theoretical limit in row-mt does not have much
  // impact on the post-processing multi-threading stage. Need to revisit
  // this when post-processing time starts shooting up.
  num_workers = AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt);

  if (multi_thread_ctxt->allocated_tile_cols != tile_cols ||
      multi_thread_ctxt->allocated_tile_rows != tile_rows ||
      multi_thread_ctxt->allocated_sb_rows != max_sb_rows) {
    av1_row_mt_mem_dealloc(cpi);
    av1_row_mt_mem_alloc(cpi, max_sb_rows);
  }

  memset(multi_thread_ctxt->thread_id_to_tile_id, -1,
         sizeof(*multi_thread_ctxt->thread_id_to_tile_id) * MAX_NUM_THREADS);

  for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
    for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
      int tile_id = tile_row * tile_cols + tile_col;
      TileDataEnc *this_tile = &cpi->tile_data[tile_id];

      // Initialize cur_col to -1 for all rows.
      memset(this_tile->row_mt_sync.cur_col, -1,
             sizeof(*this_tile->row_mt_sync.cur_col) * max_sb_rows);
      this_tile->row_mt_info.current_mi_row = this_tile->tile_info.mi_row_start;
      this_tile->row_mt_info.num_threads_working = 0;

      av1_inter_mode_data_init(this_tile);
      av1_zero_above_context(cm, &cpi->td.mb.e_mbd,
                             this_tile->tile_info.mi_col_start,
                             this_tile->tile_info.mi_col_end, tile_row);
      this_tile->m_search_count = 0;   // Count of motion search hits.
      this_tile->ex_search_count = 0;  // Exhaustive mesh search hits.
    }
  }

  // Only run once to create threads and allocate thread data.
  if (cpi->num_workers == 0) {
    create_enc_workers(cpi, num_workers);
  } else {
    num_workers = AOMMIN(num_workers, cpi->num_workers);
  }
  assign_tile_to_thread(multi_thread_ctxt, tile_cols * tile_rows, num_workers);
  prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers);
  launch_enc_workers(cpi, num_workers);
  sync_enc_workers(cpi, num_workers);
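
  // With row-MT, a tile's superblock rows are encoded out of order across
  // threads, so the per-superblock delta loop-filter state is reconciled
  // here by a sequential raster-order pass (update_delta_lf_for_row_mt()
  // above) before the bitstream is packed.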
  if (cm->delta_q_info.delta_lf_present_flag) update_delta_lf_for_row_mt(cpi);
  accumulate_counters_enc_workers(cpi, num_workers);
}